Source code for graphem_rapids.benchmark

"""
Benchmark functionality for GraphEm Rapids.
"""

import time
import logging
import numpy as np
import scipy.sparse as sp
import networkx as nx
from scipy import stats

from .backends.embedder_pytorch import GraphEmbedderPyTorch
from .influence import graphem_seed_selection, ndlib_estimated_influence, greedy_seed_selection

logger = logging.getLogger(__name__)


def run_benchmark(graph_generator, graph_params, dim=3, L_min=10.0, k_attr=0.5,
                  k_inter=0.1, n_neighbors=15, sample_size=512, num_iterations=40,
                  backend='pytorch', **kwargs):
    """
    Run a benchmark on the given graph using GraphEm Rapids.

    Parameters
    ----------
    graph_generator : callable
        Function to generate a graph (returns sparse adjacency matrix)
    graph_params : dict
        Parameters for the graph generator
    dim : int, default=3
        Number of embedding components (dimensions)
    L_min : float, default=10.0
        Minimum spring length parameter
    k_attr : float, default=0.5
        Attraction force constant
    k_inter : float, default=0.1
        Intersection repulsion force constant
    n_neighbors : int, default=15
        Number of nearest neighbors for intersection detection
    sample_size : int, default=512
        Sample size for kNN computation
    num_iterations : int, default=40
        Number of layout iterations
    backend : str, default='pytorch'
        Backend to use ('pytorch', 'cuvs', or 'auto')
    **kwargs
        Additional parameters passed to the embedder

    Returns
    -------
    dict
        Benchmark results including timings and graph metrics
    """
    logger.info("Running benchmark with %s...", graph_generator.__name__)

    # Generate the graph (returns sparse adjacency matrix)
    start_time = time.time()
    adjacency = graph_generator(**graph_params)

    # Count vertices and edges
    n = adjacency.shape[0]
    m = adjacency.nnz // 2  # Divide by 2 for undirected graphs
    logger.info("Generated graph with %d vertices and %d edges", n, m)

    # Convert to NetworkX graph for centrality calculations
    nx_graph = nx.from_scipy_sparse_array(adjacency)

    # Calculate centrality measures
    logger.info("Calculating centrality measures...")
    degree = np.array([d for _, d in nx_graph.degree()])

    betweenness = np.zeros(n)
    btw_dict = nx.betweenness_centrality(nx_graph)
    for i, val in btw_dict.items():
        betweenness[i] = val

    eigenvector = np.zeros(n)
    try:
        eig_dict = nx.eigenvector_centrality_numpy(nx_graph)
        for i, val in eig_dict.items():
            eigenvector[i] = val
    except (nx.NetworkXError, nx.AmbiguousSolution) as e:
        logger.warning("Eigenvector centrality calculation failed: %s", e)
        logger.warning("Setting eigenvector centrality to degree centrality as fallback")
        # Use degree centrality as a fallback
        deg_dict = nx.degree_centrality(nx_graph)
        for i, val in deg_dict.items():
            eigenvector[i] = val

    pagerank = np.zeros(n)
    pr_dict = nx.pagerank(nx_graph)
    for i, val in pr_dict.items():
        pagerank[i] = val

    closeness = np.zeros(n)
    close_dict = nx.closeness_centrality(nx_graph)
    for i, val in close_dict.items():
        closeness[i] = val

    node_load = np.zeros(n)
    node_load_dict = nx.load_centrality(nx_graph)
    for i, val in node_load_dict.items():
        node_load[i] = val

    # Create embedder
    logger.info("Creating embedder...")

    # Ensure adjacency is in CSR format
    if not sp.isspmatrix_csr(adjacency):
        adjacency = adjacency.tocsr()

    # Note: this function currently instantiates the PyTorch embedder directly;
    # the `backend` argument is recorded in the results for reporting.
    embedder = GraphEmbedderPyTorch(
        adjacency=adjacency,
        n_components=dim,
        L_min=L_min,
        k_attr=k_attr,
        k_inter=k_inter,
        n_neighbors=n_neighbors,
        sample_size=sample_size,
        verbose=True,
        **kwargs
    )

    # Run layout and get embedding
    logger.info("Running layout for %d iterations...", num_iterations)
    layout_start = time.time()
    embedder.run_layout(num_iterations=num_iterations)
    layout_time = time.time() - layout_start

    # Get positions and calculate radial distances
    positions = np.array(
        embedder.positions.cpu().numpy()
        if hasattr(embedder.positions, 'cpu')
        else embedder.positions
    )
    radii = np.linalg.norm(positions, axis=1)

    # Return benchmark data
    result = {
        'n': n,
        'm': m,
        'density': 2 * m / (n * (n - 1)) if n > 1 else 0.0,
        'avg_degree': 2 * m / n if n > 0 else 0.0,
        'layout_time': layout_time,
        'graph_type': graph_generator.__name__,
        'n_components': dim,
        'backend': backend,
        'radii': radii,
        'positions': positions,
        'degree': degree,
        'betweenness': betweenness,
        'eigenvector': eigenvector,
        'pagerank': pagerank,
        'closeness': closeness,
        'node_load': node_load
    }

    total_time = time.time() - start_time
    result['total_time'] = total_time
    logger.info("Benchmark completed in %.2f seconds", total_time)

    return result
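
# Example usage (a minimal sketch; `erdos_renyi_graph` below is a hypothetical
# generator, not part of this module -- any callable returning a scipy.sparse
# adjacency matrix satisfies the `graph_generator` contract):
#
#     import networkx as nx
#
#     def erdos_renyi_graph(n, p, seed=None):
#         """Return the CSR adjacency matrix of a G(n, p) random graph."""
#         g = nx.erdos_renyi_graph(n, p, seed=seed)
#         return nx.to_scipy_sparse_array(g, format='csr')
#
#     results = run_benchmark(erdos_renyi_graph, {'n': 500, 'p': 0.02},
#                             dim=3, num_iterations=40)
#     print(results['layout_time'], results['density'])
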
def benchmark_correlations(graph_generator, graph_params, dim=2, L_min=10.0,
                           k_attr=0.5, k_inter=0.1, n_neighbors=15, sample_size=512,
                           num_iterations=40, backend='pytorch', **kwargs):
    """
    Run a benchmark to calculate correlations between embedding radii
    and centrality measures.

    Parameters
    ----------
    graph_generator : callable
        Function to generate a graph (returns sparse adjacency matrix)
    graph_params : dict
        Parameters for the graph generator
    dim : int, default=2
        Number of embedding components (dimensions)
    L_min, k_attr, k_inter : float
        Force-directed layout parameters
    n_neighbors : int, default=15
        Number of nearest neighbors for intersection detection
    sample_size : int, default=512
        Sample size for kNN computation
    num_iterations : int, default=40
        Number of layout iterations
    backend : str, default='pytorch'
        Backend to use
    **kwargs
        Additional embedder parameters

    Returns
    -------
    dict
        Benchmark results with correlation coefficients for centrality measures
    """
    # Run the benchmark to get basic metrics
    results = run_benchmark(
        graph_generator, graph_params,
        dim=dim,
        L_min=L_min,
        k_attr=k_attr,
        k_inter=k_inter,
        n_neighbors=n_neighbors,
        sample_size=sample_size,
        num_iterations=num_iterations,
        backend=backend,
        **kwargs
    )

    # Calculate Spearman rank correlations between radial distances
    # and each centrality measure
    radii = results['radii']
    correlations = {}
    for measure in ('degree', 'betweenness', 'eigenvector',
                    'pagerank', 'closeness', 'node_load'):
        rho, p = stats.spearmanr(radii, results[measure])
        correlations[measure] = {'rho': rho, 'p': p}

    # Add correlations to results
    results['correlations'] = correlations
    return results
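
# Example usage (a minimal sketch, reusing the hypothetical `erdos_renyi_graph`
# generator from the sketch above; a negative Spearman rho for a measure would
# mean nodes with higher values of that measure sit closer to the origin of
# the embedding):
#
#     results = benchmark_correlations(erdos_renyi_graph, {'n': 500, 'p': 0.02},
#                                      dim=2, num_iterations=40)
#     for measure, corr in results['correlations'].items():
#         print(f"{measure}: rho={corr['rho']:.3f} (p={corr['p']:.2e})")
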
def run_influence_benchmark(graph_generator, graph_params, k=10, p=0.1,
                            iterations=200, dim=3, num_layout_iterations=20,
                            layout_params=None, backend='pytorch'):
    """
    Run a benchmark comparing influence maximization methods.

    Parameters
    ----------
    graph_generator : callable
        Function to generate a graph (returns sparse adjacency matrix)
    graph_params : dict
        Parameters for the graph generator
    k : int, default=10
        Number of seed nodes to select for influence maximization
    p : float, default=0.1
        Propagation probability for influence diffusion
    iterations : int, default=200
        Number of Monte Carlo iterations for influence estimation
    dim : int, default=3
        Number of embedding components (dimensions)
    num_layout_iterations : int, default=20
        Number of force-directed layout iterations
    layout_params : dict, optional
        Additional parameters for embedder initialization (L_min, k_attr, k_inter, etc.)
    backend : str, default='pytorch'
        Backend to use for embedding ('pytorch', 'cuvs', or 'auto')

    Returns
    -------
    dict
        Benchmark results comparing GraphEm vs. greedy seed selection,
        with influence metrics
    """
    logger.info("Running influence benchmark with %s...", graph_generator.__name__)

    # Generate the graph (returns sparse adjacency matrix)
    start_time = time.time()
    adjacency = graph_generator(**graph_params)

    # Count vertices and edges
    n = adjacency.shape[0]
    m = adjacency.nnz // 2  # Divide by 2 for undirected graphs
    logger.info("Generated graph with %d vertices and %d edges", n, m)

    # Convert to NetworkX graph
    nx_graph = nx.from_scipy_sparse_array(adjacency)

    # Default layout parameters
    if layout_params is None:
        layout_params = {
            'L_min': 10.0,
            'k_attr': 0.5,
            'k_inter': 0.1,
            'n_neighbors': 15,
            'sample_size': 512,
            'batch_size': 1024
        }

    # Create embedder
    logger.info("Creating embedder...")

    # Ensure adjacency is in CSR format
    if not sp.isspmatrix_csr(adjacency):
        adjacency = adjacency.tocsr()

    # Note: as in run_benchmark, the PyTorch embedder is instantiated directly;
    # the `backend` argument is recorded in the results for reporting.
    embedder = GraphEmbedderPyTorch(
        adjacency=adjacency,
        n_components=dim,
        verbose=True,
        **layout_params
    )

    # Run GraphEm-based seed selection
    logger.info("Running GraphEm seed selection...")
    graphem_start = time.time()
    graphem_seeds = graphem_seed_selection(embedder, k, num_iterations=num_layout_iterations)
    graphem_time = time.time() - graphem_start

    # Run greedy seed selection
    logger.info("Running greedy seed selection...")
    greedy_start = time.time()
    greedy_seeds, greedy_iters = greedy_seed_selection(nx_graph, k, p, iterations)
    greedy_time = time.time() - greedy_start

    # Evaluate influence for GraphEm seeds
    logger.info("Evaluating GraphEm influence...")
    graphem_eval_start = time.time()
    graphem_influence, _ = ndlib_estimated_influence(nx_graph, graphem_seeds, p, iterations)
    graphem_eval_time = time.time() - graphem_eval_start

    # Evaluate influence for greedy seeds
    logger.info("Evaluating Greedy influence...")
    greedy_eval_start = time.time()
    greedy_influence, _ = ndlib_estimated_influence(nx_graph, greedy_seeds, p, iterations)
    greedy_eval_time = time.time() - greedy_eval_start

    # Calculate random baseline (averaged over 10 random seed sets)
    logger.info("Evaluating random baseline...")
    random_influences = []
    for _ in range(10):
        random_seeds = np.random.choice(n, k, replace=False)
        random_influence, _ = ndlib_estimated_influence(nx_graph, random_seeds, p, iterations)
        random_influences.append(random_influence)
    random_influence = np.mean(random_influences)

    # Compile results
    results = {
        'graph_type': graph_generator.__name__,
        'n': n,
        'm': m,
        'backend': backend,
        'graphem_seeds': graphem_seeds,
        'greedy_seeds': greedy_seeds,
        'graphem_influence': graphem_influence,
        'greedy_influence': greedy_influence,
        'random_influence': random_influence,
        'graphem_time': graphem_time,
        'greedy_time': greedy_time,
        'graphem_eval_time': graphem_eval_time,
        'greedy_eval_time': greedy_eval_time,
        'greedy_iterations': greedy_iters,
        'graphem_norm_influence': graphem_influence / n,
        'greedy_norm_influence': greedy_influence / n,
        'random_norm_influence': random_influence / n
    }

    # Calculate efficiency ratios (normalized influence per second of selection time)
    results['graphem_efficiency'] = results['graphem_norm_influence'] / graphem_time if graphem_time > 0 else 0
    results['greedy_efficiency'] = results['greedy_norm_influence'] / greedy_time if greedy_time > 0 else 0

    total_time = time.time() - start_time
    results['total_time'] = total_time
    logger.info("Influence benchmark completed")

    return results
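
# Example usage (a minimal sketch, again reusing the hypothetical
# `erdos_renyi_graph` generator from the first sketch; note that `p` here is
# the propagation probability for diffusion, distinct from the edge
# probability inside `graph_params`):
#
#     results = run_influence_benchmark(
#         erdos_renyi_graph, {'n': 500, 'p': 0.02},
#         k=10, p=0.1, iterations=100,
#     )
#     print("GraphEm:", results['graphem_norm_influence'])
#     print("Greedy: ", results['greedy_norm_influence'])
#     print("Random: ", results['random_norm_influence'])
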