Tutorials
This section provides in-depth tutorials covering various aspects of GraphEm.
Graph Embedding Deep Dive
Understanding GraphEm’s Embedding Algorithm
GraphEm combines spectral methods with force-directed layout for high-quality embeddings:
import graphem as ge
import numpy as np
import matplotlib.pyplot as plt
# Create a graph with known structure
edges = ge.generate_sbm(n_per_block=50, num_blocks=3, p_in=0.15, p_out=0.02)
n_vertices = 150
# Create embedder with detailed parameters
embedder = ge.GraphEmbedder(
edges=edges,
n_vertices=n_vertices,
dimension=2,
L_min=5.0, # Shorter edges for tighter layout
k_attr=0.8, # Strong attraction within communities
k_inter=0.05, # Weak repulsion between communities
knn_k=20 # More neighbors for better structure
)
# Monitor convergence
positions_history = []
for i in range(0, 100, 10):
embedder.run_layout(num_iterations=10)
positions_history.append(np.array(embedder.positions))
# Final visualization
embedder.display_layout()
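The snapshots stored in positions_history can also be used to check convergence numerically. A minimal sketch, assuming the loop above has already run: the mean per-node displacement between consecutive checkpoints should shrink as the layout settles.
import numpy as np

# Mean node displacement between consecutive layout checkpoints
displacements = [
    np.linalg.norm(positions_history[i + 1] - positions_history[i], axis=1).mean()
    for i in range(len(positions_history) - 1)
]
for step, d in enumerate(displacements, 1):
    print(f"Checkpoint {step}: mean displacement {d:.4f}")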
Parameter Tuning Guide
Understanding how parameters affect the embedding:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Test different parameter combinations
params_to_test = [
{'L_min': 1.0, 'k_attr': 0.2, 'k_inter': 0.1, 'title': 'Loose Layout'},
{'L_min': 5.0, 'k_attr': 0.5, 'k_inter': 0.2, 'title': 'Balanced Layout'},
{'L_min': 10.0, 'k_attr': 0.8, 'k_inter': 0.5, 'title': 'Tight Layout'}
]
fig = make_subplots(rows=1, cols=3,
subplot_titles=[p['title'] for p in params_to_test],
specs=[[{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}]])
for i, params in enumerate(params_to_test, 1):
embedder = ge.GraphEmbedder(
edges=edges, n_vertices=n_vertices, dimension=2,
L_min=params['L_min'], k_attr=params['k_attr'], k_inter=params['k_inter']
)
embedder.run_layout(num_iterations=50)
pos = np.array(embedder.positions)
fig.add_trace(go.Scatter(x=pos[:, 0], y=pos[:, 1], mode='markers',
name=params['title']), row=1, col=i)
fig.show()
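Visual inspection can be backed up with a simple numeric check. The sketch below is illustrative rather than part of GraphEm's API: mean_edge_length is a hypothetical helper, and edges is assumed to be the (m, 2) integer array produced by the generators above. Tighter layouts should yield shorter average edge lengths.
def mean_edge_length(embedder, edges):
    """Hypothetical helper: average Euclidean edge length of a finished layout."""
    pos = np.array(embedder.positions)
    diffs = pos[edges[:, 0]] - pos[edges[:, 1]]
    return float(np.linalg.norm(diffs, axis=1).mean())

for params in params_to_test:
    embedder = ge.GraphEmbedder(
        edges=edges, n_vertices=n_vertices, dimension=2,
        L_min=params['L_min'], k_attr=params['k_attr'], k_inter=params['k_inter']
    )
    embedder.run_layout(num_iterations=50)
    print(f"{params['title']:16}: mean edge length {mean_edge_length(embedder, edges):.2f}")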
Working with Large Networks
Handling Networks with 10k+ Nodes
For large networks, optimize performance and memory usage:
# Generate a large scale-free network
large_edges = ge.generate_ba(n=10000, m=5)
n_vertices = 10000
# Optimized embedder for large graphs
large_embedder = ge.GraphEmbedder(
edges=large_edges,
n_vertices=n_vertices,
dimension=2, # 2D is faster than 3D
L_min=2.0,
k_attr=0.3,
k_inter=0.1,
knn_k=8, # Fewer neighbors for speed
sample_size=512, # Larger sample for accuracy
batch_size=4096, # Large batches for efficiency
verbose=True # Monitor progress
)
# Progressive refinement
print("Initial layout...")
large_embedder.run_layout(num_iterations=20)
print("Refinement...")
large_embedder.k_attr = 0.5 # Increase attraction for refinement
large_embedder.run_layout(num_iterations=30)
# Sample visualization (full graph would be too dense)
pos = np.array(large_embedder.positions)
sample_nodes = np.random.choice(n_vertices, 1000, replace=False)
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(
    x=pos[sample_nodes, 0],
    y=pos[sample_nodes, 1],
    mode='markers',
    marker=dict(size=2)
))
fig.update_layout(title="Sample of 1000 nodes from a 10k-node network")
fig.show()
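Coloring the sampled nodes by degree often makes the structure of a large layout easier to read. A small sketch building on the variables above; the degree computation assumes large_edges is an (m, 2) integer array.
# Node degrees from the edge list, used to color the sampled scatter plot
degrees = np.bincount(large_edges.ravel(), minlength=n_vertices)

fig = go.Figure(data=go.Scatter(
    x=pos[sample_nodes, 0],
    y=pos[sample_nodes, 1],
    mode='markers',
    marker=dict(size=3, color=degrees[sample_nodes], colorscale='Viridis',
                colorbar=dict(title='Degree'))
))
fig.update_layout(title="Sampled nodes colored by degree")
fig.show()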
Memory-Efficient Processing
For extremely large networks, use chunked processing:
def embed_large_network_chunked(edges, n_vertices, chunk_size=5000):
"""Embed very large networks in chunks."""
if n_vertices <= chunk_size:
# Small enough to process normally
embedder = ge.GraphEmbedder(edges=edges, n_vertices=n_vertices)
embedder.run_layout(num_iterations=50)
return embedder.positions
# For very large networks, use progressive approach
print(f"Processing {n_vertices} nodes in chunks of {chunk_size}")
# Start with a subgraph
node_subset = np.random.choice(n_vertices, chunk_size, replace=False)
mask = np.isin(edges[:, 0], node_subset) & np.isin(edges[:, 1], node_subset)
subset_edges = edges[mask]
# Remap node IDs to 0-based consecutive
old_to_new = {old: new for new, old in enumerate(node_subset)}
remapped_edges = np.array([[old_to_new[e[0]], old_to_new[e[1]]]
for e in subset_edges])
# Embed subset
embedder = ge.GraphEmbedder(edges=remapped_edges, n_vertices=len(node_subset))
embedder.run_layout(num_iterations=100)
# This is a simplified example - full implementation would
# gradually add nodes and refine positions
return embedder.positions
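One way to grow this into a full pipeline is to give each node outside the initial chunk a starting position at the centroid of its already-embedded neighbors, then run a short refinement pass over the whole graph. The helper below is a hypothetical illustration of that idea, not part of GraphEm; a vectorized version would be preferable for very large graphs.
import numpy as np

def place_remaining_nodes(edges, n_vertices, node_subset, subset_positions):
    """Place nodes outside node_subset at the centroid of their embedded neighbors."""
    dim = subset_positions.shape[1]
    positions = np.random.randn(n_vertices, dim)   # fallback: random placement
    positions[node_subset] = subset_positions

    embedded = np.zeros(n_vertices, dtype=bool)
    embedded[node_subset] = True

    for node in range(n_vertices):
        if embedded[node]:
            continue
        # Neighbors of this node that already have trusted positions
        nbrs = np.concatenate([edges[edges[:, 0] == node, 1],
                               edges[edges[:, 1] == node, 0]])
        nbrs = nbrs[embedded[nbrs]]
        if len(nbrs) > 0:
            positions[node] = positions[nbrs].mean(axis=0)
    return positions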
Influence Maximization Applications
Network Robustness Analysis
Analyze how network structure affects influence spread:
import networkx as nx

def analyze_network_robustness(generator, params, attack_strategies):
"""Analyze robustness under different attack strategies."""
# Generate base network
edges = generator(**params)
n_vertices = params['n']
G = nx.Graph()
G.add_nodes_from(range(n_vertices))
G.add_edges_from(edges)
results = {}
for strategy_name, attack_function in attack_strategies.items():
# Remove nodes according to strategy
nodes_to_remove = attack_function(G, int(0.1 * n_vertices)) # Remove 10%
G_attacked = G.copy()
G_attacked.remove_nodes_from(nodes_to_remove)
# Recompute largest connected component
largest_cc = max(nx.connected_components(G_attacked), key=len)
G_cc = G_attacked.subgraph(largest_cc).copy()
# Test influence spread in remaining network
if len(G_cc) > 50: # Only if significant network remains
            cc_edges = np.array(list(G_cc.edges()))
            # Remap node IDs to consecutive 0-based indices for the embedder
            node_mapping = {old: new for new, old in enumerate(G_cc.nodes())}
            reverse_mapping = {new: old for old, new in node_mapping.items()}
            remapped_edges = np.array([[node_mapping[u], node_mapping[v]]
                                       for u, v in cc_edges])
            embedder = ge.GraphEmbedder(edges=remapped_edges, n_vertices=len(G_cc))
            seeds = ge.graphem_seed_selection(embedder, k=min(10, len(G_cc)//10))
            # Map seeds back to original node labels before estimating influence on G_cc
            seeds = [reverse_mapping[s] for s in seeds]
            influence, _ = ge.ndlib_estimated_influence(G_cc, seeds, p=0.1)
results[strategy_name] = {
'remaining_nodes': len(G_cc),
'influence': influence,
'influence_fraction': influence / len(G_cc)
}
else:
results[strategy_name] = {
'remaining_nodes': len(G_cc),
'influence': 0,
'influence_fraction': 0.0
}
return results
# Define attack strategies
attack_strategies = {
'Random': lambda G, k: np.random.choice(list(G.nodes()), k, replace=False),
'High Degree': lambda G, k: sorted(G.nodes(), key=G.degree, reverse=True)[:k],
    # Betweenness is computed once per call, not once per node
    'High Betweenness': lambda G, k: sorted(G.nodes(),
                                            key=nx.betweenness_centrality(G).get,
                                            reverse=True)[:k]
}
# Test on different network types
network_types = [
('Scale-Free', ge.generate_ba, {'n': 500, 'm': 3}),
('Small-World', ge.generate_ws, {'n': 500, 'k': 6, 'p': 0.1}),
('Random', ge.erdos_renyi_graph, {'n': 500, 'p': 0.012})
]
for net_name, generator, params in network_types:
print(f"\n{net_name} Network:")
results = analyze_network_robustness(generator, params, attack_strategies)
for attack, data in results.items():
print(f" {attack:15}: {data['remaining_nodes']:3d} nodes, "
f"{data['influence_fraction']:.1%} influenced")
Centrality Analysis
Comparing Embedding-Based and Traditional Centralities
from graphem.benchmark import benchmark_correlations
from graphem.visualization import report_full_correlation_matrix
import pandas as pd
# Generate different network types for comparison
networks = [
('Erdős–Rényi', ge.erdos_renyi_graph, {'n': 300, 'p': 0.02}),
('Scale-Free', ge.generate_ba, {'n': 300, 'm': 2}),
('Small-World', ge.generate_ws, {'n': 300, 'k': 4, 'p': 0.1}),
('Community', ge.generate_sbm, {'n_per_block': 100, 'num_blocks': 3, 'p_in': 0.1, 'p_out': 0.01})
]
correlation_results = {}
for net_name, generator, params in networks:
print(f"Analyzing {net_name} network...")
# Run benchmark
results = benchmark_correlations(
graph_generator=generator,
graph_params=params,
dim=2,
num_iterations=50
)
# Compute correlation matrix
correlation_matrix = report_full_correlation_matrix(
results['radii'],
results['degree'],
results['betweenness'],
results['eigenvector'],
results['pagerank'],
results['closeness'],
results['node_load']
)
correlation_results[net_name] = correlation_matrix
# Compare radial centrality correlations across network types
radial_correlations = pd.DataFrame({
net_name: corr_matrix.loc['Radius']
for net_name, corr_matrix in correlation_results.items()
})
print("\nRadial Centrality Correlations Across Network Types:")
print(radial_correlations.round(3))
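A heatmap is often easier to scan than the printed matrices. A minimal sketch for one network type, assuming report_full_correlation_matrix returns a labeled pandas DataFrame:
import plotly.graph_objects as go

corr = correlation_results['Scale-Free']
fig = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=list(corr.columns),
    y=list(corr.index),
    colorscale='RdBu', zmin=-1, zmax=1
))
fig.update_layout(title='Centrality correlations (Scale-Free network)')
fig.show()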
Custom Graph Generators
Creating Domain-Specific Networks
def generate_hierarchical_network(levels=3, branching=3, intra_level_prob=0.1):
"""Generate a hierarchical network structure."""
nodes_per_level = [branching ** i for i in range(levels)]
total_nodes = sum(nodes_per_level)
edges = []
level_starts = [0]
# Create hierarchical connections
for level in range(levels - 1):
level_start = level_starts[level]
level_size = nodes_per_level[level]
# Connect each node in current level to nodes in next level
for i in range(level_size):
current_node = level_start + i
# Each node connects to 'branching' nodes in next level
start_next = level_starts[level] + level_size + i * branching
for j in range(branching):
if start_next + j < total_nodes:
edges.append([current_node, start_next + j])
level_starts.append(level_starts[-1] + level_size)
# Add intra-level connections
for level in range(levels):
level_start = level_starts[level]
level_size = nodes_per_level[level]
for i in range(level_size):
for j in range(i + 1, level_size):
if np.random.random() < intra_level_prob:
edges.append([level_start + i, level_start + j])
return np.array(edges)
# Test the custom generator
hier_edges = generate_hierarchical_network(levels=4, branching=2, intra_level_prob=0.2)
# Embed and visualize
embedder = ge.GraphEmbedder(
edges=hier_edges,
n_vertices=hier_edges.max() + 1,
dimension=2,
L_min=3.0,
k_attr=0.7,
k_inter=0.1
)
embedder.run_layout(num_iterations=80)
embedder.display_layout()
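A quick sanity check with NetworkX confirms the generator produced the expected number of nodes and a connected hierarchy (a small illustrative check, assuming networkx is installed):
import networkx as nx

G_hier = nx.Graph()
G_hier.add_nodes_from(range(int(hier_edges.max()) + 1))
G_hier.add_edges_from(hier_edges.tolist())
print(f"Nodes: {G_hier.number_of_nodes()}, edges: {G_hier.number_of_edges()}")
print(f"Connected: {nx.is_connected(G_hier)}")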
Performance Optimization
GPU Acceleration Tips
import jax
# Check available devices
print("Available devices:", jax.devices())
# For consistent GPU usage across runs
def setup_gpu_embedding(edges, n_vertices, device_id=0):
"""Setup embedder with specific GPU device."""
# Force specific device if multiple GPUs available
if len(jax.devices('gpu')) > 1:
device = jax.devices('gpu')[device_id]
with jax.default_device(device):
embedder = ge.GraphEmbedder(
edges=edges,
n_vertices=n_vertices,
batch_size=8192, # Larger batches for GPU
sample_size=1024 # Larger samples for GPU
)
return embedder
else:
return ge.GraphEmbedder(edges=edges, n_vertices=n_vertices)
# Example with large graph
large_edges = ge.generate_ba(n=20000, m=4)
embedder = setup_gpu_embedding(large_edges, 20000)
# Time the embedding
import time
start_time = time.time()
embedder.run_layout(num_iterations=50)
end_time = time.time()
print(f"Embedding 20k nodes took {end_time - start_time:.2f} seconds")
Profiling and Optimization
# Profile memory usage and computation time
def profile_embedding(edges, n_vertices, iterations=50):
"""Profile embedding performance."""
    import os
    import time
    import psutil
process = psutil.Process(os.getpid())
# Memory before
mem_before = process.memory_info().rss / 1024 / 1024 # MB
# Time embedding
start_time = time.time()
embedder = ge.GraphEmbedder(edges=edges, n_vertices=n_vertices)
embedder.run_layout(num_iterations=iterations)
end_time = time.time()
# Memory after
mem_after = process.memory_info().rss / 1024 / 1024 # MB
return {
'time': end_time - start_time,
'memory_used': mem_after - mem_before,
'final_memory': mem_after
}
# Test different graph sizes
sizes = [500, 1000, 2000, 5000]
for n in sizes:
edges = ge.generate_ba(n=n, m=3)
stats = profile_embedding(edges, n, iterations=30)
print(f"n={n:4d}: {stats['time']:5.2f}s, "
f"{stats['memory_used']:6.1f}MB used, "
f"{stats['final_memory']:6.1f}MB total")
Next Steps
Explore the examples for complete working applications
Check the API Reference for detailed function documentation
See the Contributing to GraphEm guide to help improve GraphEm
Run the benchmarks with python run_benchmarks.py to reproduce research results