From 19289d82bf207a573de250c91f80e87fab55ada4 Mon Sep 17 00:00:00 2001 From: lfischerr Date: Wed, 25 Mar 2026 16:59:30 +0100 Subject: [PATCH 01/10] add second stage and tests to DoublyStochastic --- .../operators/doubly_stochastic.py | 45 ++++++++- tests/test_doubly_stochastic.py | 96 ++++++++++++++++--- 2 files changed, 124 insertions(+), 17 deletions(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 0213d7a..99afa93 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -2,8 +2,11 @@ import numpy as np from ..graph import Graph from .base import GraphOperator +import networkx as nx +# Z. 48-57: +# https://gitlab.liris.cnrs.fr/coregraphie/netbone/-/blob/main/netbone/structural/doubly_stochastic.py?ref_type=heads @dataclass(slots=True) class DoublyStochastic(GraphOperator): """ @@ -30,6 +33,7 @@ class DoublyStochastic(GraphOperator): Copy metadata frame if present. Default True. 
""" + backbone_method: bool = False tolerance: float = 1e-5 max_iter: int = 10_000 copy_meta: bool = True @@ -84,14 +88,12 @@ def apply(self, G: Graph) -> Graph: # Only check rows/cols that have edges (others stay 0 and are irrelevant) if row_has_edges.any(): - rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & - (row_sums[row_has_edges] <= max_thres)) + rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & (row_sums[row_has_edges] <= max_thres)) else: rows_ok = True if col_has_edges.any(): - cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & - (col_sums[col_has_edges] <= max_thres)) + cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & (col_sums[col_has_edges] <= max_thres)) else: cols_ok = True @@ -105,6 +107,41 @@ def apply(self, G: Graph) -> Graph: # col scaling A_scaled.data *= c[A_scaled.indices] + # step 2 + if self.backbone_method: + i = 0 + + rows, cols = A_scaled.nonzero() + vals = A_scaled.data + + order = np.argsort(vals)[::-1] + rows = rows[order] + cols = cols[order] + vals = vals[order] + print(rows, cols, order) + + if not G.directed: + G_filtered = nx.Graph() + while ( + nx.number_connected_components(G_filtered) != 1 + or len(G_filtered) < A_scaled.shape[0] + or not nx.is_connected(G_filtered) + ): + if i == A_scaled.shape[0]: + break + G_filtered.add_edge(rows[i], cols[i], weight=vals[i]) + i += 1 + G_csr = nx.to_scipy_sparse_array(G_filtered) + + return Graph.from_csr( + G_csr, + directed=G.directed, + weighted=True, + mode=G.mode, + # meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), + sym_op="max", + ) + return Graph.from_csr( A_scaled, directed=G.directed, diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index c0febd5..bfa99b2 100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -16,12 +16,15 @@ def _csr(data, rows, cols, n): # ----------------- Positive dense matrix: converges to ~doubly stochastic ----------------- def 
test_doubly_stochastic_converges_on_positive_dense(): # Strictly positive, symmetric 4x4 (undirected) - M = np.array([ - [0.2, 0.8, 0.5, 0.3], - [0.7, 0.1, 0.4, 0.6], - [0.3, 0.9, 0.2, 0.5], - [0.5, 0.2, 0.7, 0.4], - ], dtype=float) + M = np.array( + [ + [0.2, 0.8, 0.5, 0.3], + [0.7, 0.1, 0.4, 0.6], + [0.3, 0.9, 0.2, 0.5], + [0.5, 0.2, 0.7, 0.4], + ], + dtype=float, + ) # Zero the diagonal (typical adjacency semantics) np.fill_diagonal(M, 0.0) @@ -43,6 +46,49 @@ def test_doubly_stochastic_converges_on_positive_dense(): assert not G.directed and G.weighted +def test_doubly_stochastic_with_backbone_method(): + # Strictly positive, asymmetric matrix (to make backbone relevant) + M = np.array( + [ + [0.2, 0.8, 0.5, 0.3], + [0.7, 0.1, 0.4, 0.6], + [0.3, 0.9, 0.2, 0.5], + [0.5, 0.2, 0.7, 0.4], + ], + dtype=float, + ) + + # Zero diagonal + np.fill_diagonal(M, 0.0) + + G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + + G = op.apply(G0) + A = G.adj + + # No NaNs or infs + assert np.isfinite(A.data).all() + + # Check that only backbone edges remain (sparser than original) + assert A.nnz <= G0.adj.nnz + + # Rows/cols should still approximately sum to 1 (on non-isolated nodes) + row_sums = np.asarray(A.sum(axis=1)).ravel() + col_sums = np.asarray(A.sum(axis=0)).ravel() + + # Only check nodes that still have edges + nonzero_rows = row_sums > 0 + nonzero_cols = col_sums > 0 + + assert np.allclose(row_sums[nonzero_rows], row_sums[nonzero_rows][0], atol=1e-6) + assert np.allclose(col_sums[nonzero_cols], col_sums[nonzero_cols][0], atol=1e-6) + + # Graph properties preserved + assert not G.directed and G.weighted + + # ----------------- Sparse graph with isolates: zero rows/cols remain zero, others ~1 ----------------- def test_doubly_stochastic_sparse_with_isolates(): # 5 nodes, node 4 is isolated @@ -65,7 +111,7 @@ def 
test_doubly_stochastic_sparse_with_isolates(): col_sums = np.asarray(A2.sum(axis=0)).ravel() # Indices with edges - rows_with = (np.diff(A2.indptr) > 0) + rows_with = np.diff(A2.indptr) > 0 cols_with = (sp.csc_matrix(A2).indptr[1:] - sp.csc_matrix(A2).indptr[:-1]) > 0 # Non-isolated rows/cols sum ~1 @@ -81,6 +127,35 @@ def test_doubly_stochastic_sparse_with_isolates(): assert not G.directed and G.weighted +def test_doubly_stochastic_sparse_with_isolates_backbone(): + A = _csr( + data=[0.4, 0.6, 0.3, 0.7, 0.2, 0.5], + rows=[0, 0, 1, 1, 2, 3], + cols=[1, 2, 2, 3, 3, 2], + n=5, + ) + G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + G = op.apply(G0) + A2 = G.adj + + assert np.isfinite(A2.data).all() + + row_sums = np.asarray(A2.sum(axis=1)).ravel() + col_sums = np.asarray(A2.sum(axis=0)).ravel() + rows_with = np.diff(A2.indptr) > 0 + cols_with = (sp.csc_matrix(A2).indptr[1:] - sp.csc_matrix(A2).indptr[:-1]) > 0 + + if rows_with.any(): + assert np.all(row_sums[rows_with] > 0) + if cols_with.any(): + assert np.all(col_sums[cols_with] > 0) + + # Isolated node (4) stays isolated (not in the graph) + assert len(row_sums) == 4 + + # ----------------- Directed case: rows and cols ~1 for nonzero rows/cols ----------------- def test_doubly_stochastic_directed_graph_unsolvable(): # Directed 4x4 with zeros on diagonal, not symmetric @@ -100,12 +175,7 @@ def test_doubly_stochastic_directed_graph_unsolvable(): # No NaNs or infs assert np.isfinite(A2.data).all() - expected_result = np.array([ - [0. , 0.5, 0.5, 0. ], - [0. , 0. , 1. , 0. ], - [0.5, 0. , 0. , 0.5], - [0. , 1. , 0. , 0. 
] - ]) + expected_result = np.array([[0.0, 0.5, 0.5, 0.0], [0.0, 0.0, 1.0, 0.0], [0.5, 0.0, 0.0, 0.5], [0.0, 1.0, 0.0, 0.0]]) assert np.allclose(A2.toarray(), expected_result, atol=1e-4) # Directed flag preserved From 701fa0b8eaf732504a86b29bb62a4be01f7fdaf7 Mon Sep 17 00:00:00 2001 From: lfischerr Date: Wed, 25 Mar 2026 17:02:29 +0100 Subject: [PATCH 02/10] fix linting --- graphconstructor/operators/doubly_stochastic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 99afa93..a4622b0 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -1,8 +1,8 @@ from dataclasses import dataclass +import networkx as nx import numpy as np from ..graph import Graph from .base import GraphOperator -import networkx as nx # Z. 48-57: From ebbe3a6d669595ecbb8099201f66827f5f36c72b Mon Sep 17 00:00:00 2001 From: lfischerr Date: Sun, 12 Apr 2026 20:10:46 +0200 Subject: [PATCH 03/10] fix break statement, add metadata and adjust test --- graphconstructor/operators/doubly_stochastic.py | 9 +++++---- tests/test_doubly_stochastic.py | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index a4622b0..89341eb 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -118,8 +118,6 @@ def apply(self, G: Graph) -> Graph: rows = rows[order] cols = cols[order] vals = vals[order] - print(rows, cols, order) - if not G.directed: G_filtered = nx.Graph() while ( @@ -127,10 +125,13 @@ def apply(self, G: Graph) -> Graph: or len(G_filtered) < A_scaled.shape[0] or not nx.is_connected(G_filtered) ): - if i == A_scaled.shape[0]: + if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)): break G_filtered.add_edge(rows[i], cols[i], 
weight=vals[i]) i += 1 + + # add isolated nodes + G_filtered.add_nodes_from(range(G.n_nodes)) G_csr = nx.to_scipy_sparse_array(G_filtered) return Graph.from_csr( @@ -138,7 +139,7 @@ def apply(self, G: Graph) -> Graph: directed=G.directed, weighted=True, mode=G.mode, - # meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), + meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), sym_op="max", ) diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index bfa99b2..a3ffffd 100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -152,8 +152,11 @@ def test_doubly_stochastic_sparse_with_isolates_backbone(): if cols_with.any(): assert np.all(col_sums[cols_with] > 0) - # Isolated node (4) stays isolated (not in the graph) - assert len(row_sums) == 4 + # Isolated node (4) stays isolated but is still part of the graph + G_nx = G.to_networkx() + + assert len(row_sums) == 5 + assert len(G_nx.edges(4)) == 0 # ----------------- Directed case: rows and cols ~1 for nonzero rows/cols ----------------- From 0f60b381f40cc8226c6a1a1f3b5e6db4610c3269 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 16:52:57 +0200 Subject: [PATCH 04/10] break into two classes (normalize and backbone) --- .../operators/doubly_stochastic.py | 145 ++++++++++++------ 1 file changed, 102 insertions(+), 43 deletions(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 89341eb..3ca0505 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -5,10 +5,8 @@ from ..graph import Graph from .base import GraphOperator -# Z.
48-57: -# https://gitlab.liris.cnrs.fr/coregraphie/netbone/-/blob/main/netbone/structural/doubly_stochastic.py?ref_type=heads @dataclass(slots=True) -class DoublyStochastic(GraphOperator): +class DoublyStochasticNormalize(GraphOperator): """ Alternating row/column normalization (Sinkhorn-Knopp) to make the adjacency approximately doubly stochastic. Works with CSR without densifying. @@ -33,7 +31,6 @@ class DoublyStochastic(GraphOperator): Copy metadata frame if present. Default True. """ - backbone_method: bool = False tolerance: float = 1e-5 max_iter: int = 10_000 copy_meta: bool = True @@ -44,9 +41,9 @@ def apply(self, G: Graph) -> Graph: A = G.adj.tocsr(copy=False) if A.shape[0] != A.shape[1]: - raise TypeError("DoublyStochastic requires a square adjacency.") + raise TypeError("DoublyStochasticNormalize requires a square adjacency.") if (A.data < 0).any(): - raise ValueError("DoublyStochastic requires nonnegative weights.") + raise ValueError("DoublyStochasticNormalize requires nonnegative weights.") n = A.shape[0] if A.nnz == 0: @@ -67,18 +64,20 @@ def apply(self, G: Graph) -> Graph: # Precompute nnz masks for convergence checks under sparsity row_has_edges = np.array(A.sum(axis=1) > 0).T[0] col_has_edges = np.array(A.sum(axis=0) > 0)[0] + # The csc conversions above could be heavy; build once A_T = A.T.tocsr(copy=False) min_thres = 1.0 - self.tolerance max_thres = 1.0 + self.tolerance - # Parameter for stabilitation + # Parameter for stabilization MAX_FACTOR = 1e50 for _ in range(self.max_iter): c[col_has_edges] = 1 / A_T.dot(r)[col_has_edges] r[row_has_edges] = 1 / A.dot(c)[row_has_edges] + if np.any(np.abs(r) > MAX_FACTOR) or np.any(np.abs(c) > MAX_FACTOR): break # avoid overflow; close enough @@ -88,12 +87,18 @@ def apply(self, G: Graph) -> Graph: # Only check rows/cols that have edges (others stay 0 and are irrelevant) if row_has_edges.any(): - rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & (row_sums[row_has_edges] <= max_thres)) + rows_ok = 
np.all( + (row_sums[row_has_edges] >= min_thres) + & (row_sums[row_has_edges] <= max_thres) + ) else: rows_ok = True if col_has_edges.any(): - cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & (col_sums[col_has_edges] <= max_thres)) + cols_ok = np.all( + (col_sums[col_has_edges] >= min_thres) + & (col_sums[col_has_edges] <= max_thres) + ) else: cols_ok = True @@ -102,49 +107,103 @@ def apply(self, G: Graph) -> Graph: # Apply scaling once: A' = diag(r) * A * diag(c) (CSR-friendly) A_scaled = A.copy() + # row scaling A_scaled.data *= np.repeat(r, np.diff(A_scaled.indptr)) + # col scaling A_scaled.data *= c[A_scaled.indices] - # step 2 - if self.backbone_method: - i = 0 - - rows, cols = A_scaled.nonzero() - vals = A_scaled.data - - order = np.argsort(vals)[::-1] - rows = rows[order] - cols = cols[order] - vals = vals[order] - if not G.directed: - G_filtered = nx.Graph() - while ( - nx.number_connected_components(G_filtered) != 1 - or len(G_filtered) < A_scaled.shape[0] - or not nx.is_connected(G_filtered) - ): - if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)): - break - G_filtered.add_edge(rows[i], cols[i], weight=vals[i]) - i += 1 - - # add isolated nodes - G_filtered.add_nodes_from(range(G.n_nodes)) - G_csr = nx.to_scipy_sparse_array(G_filtered) + return Graph.from_csr( + A_scaled, + directed=G.directed, + weighted=True, + mode=G.mode, + meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), + sym_op="max", + ) - return Graph.from_csr( - G_csr, - directed=G.directed, - weighted=True, - mode=G.mode, - meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), - sym_op="max", + +@dataclass(slots=True) +class DoublyStochasticBackbone(GraphOperator): + """ + Backbone extraction based on doubly-stochastic edge scores. 
+ + First applies DoublyStochasticNormalize, then sorts edges by normalized score + and adds strongest edges until the graph becomes connected, following the + behavior of the previous DoublyStochastic(backbone_method=True) branch. + + References + ---------- + - Coscia, M. (2025). "The Atlas for the Inspiring Network Scientist." + - Yassin, A. (2023). "An evaluation tool for backbone extraction techniques + in weighted complex networks." Scientific Reports. + + Parameters + ---------- + tolerance : float + Band tolerance passed to DoublyStochasticNormalize. Default 1e-5. + max_iter : int + Maximum Sinkhorn iterations passed to DoublyStochasticNormalize. + Default 10_000. + copy_meta : bool + Copy metadata frame if present. Default True. + """ + + tolerance: float = 1e-5 + max_iter: int = 10_000 + copy_meta: bool = True + supported_modes = ["similarity"] + + def apply(self, G: Graph) -> Graph: + self._check_mode_supported(G) + + normalized = DoublyStochasticNormalize( + tolerance=self.tolerance, + max_iter=self.max_iter, + copy_meta=self.copy_meta, + ).apply(G) + + A_scaled = normalized.adj.tocsr(copy=False) + + i = 0 + rows, cols = A_scaled.nonzero() + vals = A_scaled.data + + order = np.argsort(vals)[::-1] + rows = rows[order] + cols = cols[order] + vals = vals[order] + + if not G.directed: + G_filtered = nx.Graph() + + while ( + nx.number_connected_components(G_filtered) != 1 + or len(G_filtered) < A_scaled.shape[0] + or not nx.is_connected(G_filtered) + ): + if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)): + break + + G_filtered.add_edge(rows[i], cols[i], weight=vals[i]) + i += 1 + + # add isolated nodes + G_filtered.add_nodes_from(range(G.n_nodes)) + + else: + raise NotImplementedError( + "DoublyStochasticBackbone currently supports only undirected graphs." 
) + G_csr = nx.to_scipy_sparse_array( + G_filtered, + nodelist=list(range(G.n_nodes)), + ) + return Graph.from_csr( - A_scaled, + G_csr, directed=G.directed, weighted=True, mode=G.mode, From 6b482cf9c89e06e068ce19cfe51ae3841b01428a Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 16:56:50 +0200 Subject: [PATCH 05/10] adjust tests --- graphconstructor/operators/__init__.py | 5 +++-- tests/test_doubly_stochastic.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/graphconstructor/operators/__init__.py b/graphconstructor/operators/__init__.py index 9e3c09e..7838bec 100644 --- a/graphconstructor/operators/__init__.py +++ b/graphconstructor/operators/__init__.py @@ -1,6 +1,6 @@ from .base import GraphOperator from .disparity import DisparityFilter -from .doubly_stochastic import DoublyStochastic +from .doubly_stochastic import DoublyStochasticNormalize, DoublyStochasticBackbone from .knn_selector import KNNSelector from .locally_adaptive_sparsification import LocallyAdaptiveSparsification from .marginal_likelihood import MarginalLikelihoodFilter @@ -11,7 +11,8 @@ __all__ = [ "DisparityFilter", - "DoublyStochastic", + "DoublyStochasticNormalize", + "DoublyStochasticBackbone", "GraphOperator", "KNNSelector", "LocallyAdaptiveSparsification", diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index a3ffffd..02538a0 100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -3,7 +3,7 @@ import pytest import scipy.sparse as sp from graphconstructor import Graph -from graphconstructor.operators import DoublyStochastic +from graphconstructor.operators import DoublyStochasticNormalize, DoublyStochasticBackbone def _csr(data, rows, cols, n): @@ -29,7 +29,7 @@ def test_doubly_stochastic_converges_on_positive_dense(): np.fill_diagonal(M, 0.0) G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, 
max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A = G.adj @@ -63,7 +63,7 @@ def test_doubly_stochastic_with_backbone_method(): G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A = G.adj @@ -100,7 +100,7 @@ def test_doubly_stochastic_sparse_with_isolates(): ) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -136,7 +136,7 @@ def test_doubly_stochastic_sparse_with_isolates_backbone(): ) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -171,7 +171,7 @@ def test_doubly_stochastic_directed_graph_unsolvable(): ) G0 = Graph.from_csr(A, directed=True, weighted=True, mode="similarity") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -189,7 +189,7 @@ def test_doubly_stochastic_directed_graph_unsolvable(): def test_doubly_stochastic_rejects_negative_weights(): A = _csr([-0.2, 0.5], [0, 1], [1, 0], 2) G0 = Graph.from_csr(A, directed=True, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() with pytest.raises(ValueError, match="nonnegative"): op.apply(G0) @@ -197,7 +197,7 @@ def test_doubly_stochastic_rejects_negative_weights(): def test_doubly_stochastic_rejects_distances(): A = _csr([0.2, 0.5], [0, 1], [1, 0], 2) G0 = Graph.from_csr(A, directed=True, weighted=True, 
mode="distance", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() with pytest.raises(ValueError, match="only supports modes"): op.apply(G0) @@ -206,7 +206,7 @@ def test_doubly_stochastic_rejects_distances(): def test_doubly_stochastic_all_zero_matrix_noop(): A = sp.csr_matrix((4, 4), dtype=float) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() G = op.apply(G0) assert G.adj.nnz == 0 assert G.adj.shape == (4, 4) @@ -219,7 +219,7 @@ def test_doubly_stochastic_preserves_flags_and_copies_metadata(): A = _csr([0.4, 0.6, 0.3], [0, 1, 2], [1, 2, 0], 3) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", meta=meta, sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, copy_meta=True) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000, copy_meta=True) G = op.apply(G0) # Flags From ddf37c1763e6f73c863b97f8a3d8ab6b98b8209b Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 16:58:42 +0200 Subject: [PATCH 06/10] linting --- graphconstructor/operators/__init__.py | 2 +- tests/test_doubly_stochastic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/graphconstructor/operators/__init__.py b/graphconstructor/operators/__init__.py index 7838bec..88475b6 100644 --- a/graphconstructor/operators/__init__.py +++ b/graphconstructor/operators/__init__.py @@ -1,6 +1,6 @@ from .base import GraphOperator from .disparity import DisparityFilter -from .doubly_stochastic import DoublyStochasticNormalize, DoublyStochasticBackbone +from .doubly_stochastic import DoublyStochasticBackbone, DoublyStochasticNormalize from .knn_selector import KNNSelector from .locally_adaptive_sparsification import LocallyAdaptiveSparsification from .marginal_likelihood import MarginalLikelihoodFilter diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index 02538a0..6772bdf 
100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -3,7 +3,7 @@ import pytest import scipy.sparse as sp from graphconstructor import Graph -from graphconstructor.operators import DoublyStochasticNormalize, DoublyStochasticBackbone +from graphconstructor.operators import DoublyStochasticBackbone, DoublyStochasticNormalize def _csr(data, rows, cols, n): From 180383d7ba478640e574a80c54ee0919ccdc5a28 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 17:24:50 +0200 Subject: [PATCH 07/10] add tests --- tests/test_doubly_stochastic.py | 80 ++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index 6772bdf..2a5c792 100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -61,7 +61,13 @@ def test_doubly_stochastic_with_backbone_method(): # Zero diagonal np.fill_diagonal(M, 0.0) - G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") + G0 = Graph.from_dense( + M, + directed=False, + weighted=True, + mode="similarity", + sym_op="max", + ) op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) @@ -71,19 +77,17 @@ def test_doubly_stochastic_with_backbone_method(): # No NaNs or infs assert np.isfinite(A.data).all() - # Check that only backbone edges remain (sparser than original) + # Backbone should be no denser than original graph assert A.nnz <= G0.adj.nnz - # Rows/cols should still approximately sum to 1 (on non-isolated nodes) - row_sums = np.asarray(A.sum(axis=1)).ravel() - col_sums = np.asarray(A.sum(axis=0)).ravel() + # For this connected 4-node graph, the backbone should connect all nodes. 
+ assert G.is_connected() - # Only check nodes that still have edges - nonzero_rows = row_sums > 0 - nonzero_cols = col_sums > 0 - - assert np.allclose(row_sums[nonzero_rows], row_sums[nonzero_rows][0], atol=1e-6) - assert np.allclose(col_sums[nonzero_cols], col_sums[nonzero_cols][0], atol=1e-6) + # The backbone implementation adds strongest edges until + # connected, a connected undirected graph should end up with at least n-1 + # and at most the original number of edges. + assert G.n_edges >= G.n_nodes - 1 + assert G.n_edges <= G0.n_edges # Graph properties preserved assert not G.directed and G.weighted @@ -159,6 +163,60 @@ def test_doubly_stochastic_sparse_with_isolates_backbone(): assert len(G_nx.edges(4)) == 0 +def test_doubly_stochastic_backbone_keeps_adding_edges_until_connected(): + # This graph has two very strong edges, 0--1 and 2--3, plus one weak + # bridge, 1--2. + A = _csr( + data=[100.0, 100.0, 100.0, 100.0, 1.0, 1.0], + rows=[0, 1, 2, 3, 1, 2], + cols=[1, 0, 3, 2, 2, 1], + n=4, + ) + G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) + G = op.apply(G0) + + assert G.is_connected() + assert G.n_edges == 3 + + # The weak bridge is required for connectivity and should not be skipped. + assert G.adj[1, 2] > 0 + assert G.adj[2, 1] > 0 + + +def test_doubly_stochastic_backbone_preserves_node_order_after_networkx_conversion(): + # Node 4 is part of the strongest edge. Without an explicit nodelist in + # nx.to_scipy_sparse_array, NetworkX insertion order can remap this edge + # to different matrix indices. 
+ A = _csr( + data=[10.0, 10.0, 1.0, 1.0, 1.0, 1.0], + rows=[0, 4, 0, 1, 1, 4], + cols=[4, 0, 1, 0, 4, 1], + n=5, + ) + G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) + G = op.apply(G0) + + assert G.adj.shape == (5, 5) + + # Node labels should still correspond to the original matrix indices. + # In particular, node 4 should still be connected to the component, + # not silently remapped to another row/column. + assert G.adj[0, 4] > 0 or G.adj[1, 4] > 0 + assert np.asarray(G.adj[4].sum()).ravel()[0] > 0 + + # Node 2 and node 3 were isolated in the input and should remain isolated. + row_sums = np.asarray(G.adj.sum(axis=1)).ravel() + col_sums = np.asarray(G.adj.sum(axis=0)).ravel() + + assert row_sums[2] == 0.0 + assert col_sums[2] == 0.0 + assert row_sums[3] == 0.0 + assert col_sums[3] == 0.0 + # ----------------- Directed case: rows and cols ~1 for nonzero rows/cols ----------------- def test_doubly_stochastic_directed_graph_unsolvable(): # Directed 4x4 with zeros on diagonal, not symmetric From 7776bba19a56a6d125f25347bde116a9208fc3ae Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 17:27:26 +0200 Subject: [PATCH 08/10] refactor backbone part --- .../operators/doubly_stochastic.py | 76 +++++++++++-------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 3ca0505..997fbeb 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -1,6 +1,7 @@ from dataclasses import dataclass import networkx as nx import numpy as np +import scipy.sparse as sp from ..graph import Graph from .base import GraphOperator @@ -158,6 +159,11 @@ class DoublyStochasticBackbone(GraphOperator): def apply(self, G: Graph) -> Graph: self._check_mode_supported(G) + if G.directed: + raise 
NotImplementedError( + "DoublyStochasticBackbone currently supports only undirected graphs." + ) + normalized = DoublyStochasticNormalize( tolerance=self.tolerance, max_iter=self.max_iter, @@ -165,37 +171,7 @@ def apply(self, G: Graph) -> Graph: ).apply(G) A_scaled = normalized.adj.tocsr(copy=False) - - i = 0 - rows, cols = A_scaled.nonzero() - vals = A_scaled.data - - order = np.argsort(vals)[::-1] - rows = rows[order] - cols = cols[order] - vals = vals[order] - - if not G.directed: - G_filtered = nx.Graph() - - while ( - nx.number_connected_components(G_filtered) != 1 - or len(G_filtered) < A_scaled.shape[0] - or not nx.is_connected(G_filtered) - ): - if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)): - break - - G_filtered.add_edge(rows[i], cols[i], weight=vals[i]) - i += 1 - - # add isolated nodes - G_filtered.add_nodes_from(range(G.n_nodes)) - - else: - raise NotImplementedError( - "DoublyStochasticBackbone currently supports only undirected graphs." - ) + G_filtered = self._extract_undirected_backbone(A_scaled, G.n_nodes) G_csr = nx.to_scipy_sparse_array( G_filtered, @@ -210,3 +186,41 @@ def apply(self, G: Graph) -> Graph: meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), sym_op="max", ) + + @staticmethod + def _extract_undirected_backbone(A_scaled, n_nodes: int) -> nx.Graph: + """ + Extract an undirected backbone from normalized edge scores. + + Uses only the upper triangle of the adjacency matrix so that each + undirected edge is considered once. + """ + G_filtered = nx.Graph() + G_filtered.add_nodes_from(range(n_nodes)) + + if n_nodes <= 1 or A_scaled.nnz == 0: + return G_filtered + + # Use only one orientation per undirected edge and ignore self-loops. 
+ A_upper = sp.triu(A_scaled, k=1).tocoo() + + if A_upper.nnz == 0: + return G_filtered + + rows = A_upper.row + cols = A_upper.col + vals = A_upper.data + + order = np.argsort(vals)[::-1] + + for idx in order: + if nx.is_connected(G_filtered): + break + + G_filtered.add_edge( + int(rows[idx]), + int(cols[idx]), + weight=float(vals[idx]), + ) + + return G_filtered From b373fb6730c32dc837e74a415c66676a3e5a7853 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 17:38:32 +0200 Subject: [PATCH 09/10] add warnings and some linting --- .../operators/doubly_stochastic.py | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 997fbeb..d80c527 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -2,6 +2,7 @@ import networkx as nx import numpy as np import scipy.sparse as sp +import warnings from ..graph import Graph from .base import GraphOperator @@ -66,7 +67,7 @@ def apply(self, G: Graph) -> Graph: row_has_edges = np.array(A.sum(axis=1) > 0).T[0] col_has_edges = np.array(A.sum(axis=0) > 0)[0] - # The csc conversions above could be heavy; build once + # Build transposed CSR once for repeated column-sum products A_T = A.T.tocsr(copy=False) min_thres = 1.0 - self.tolerance @@ -75,12 +76,18 @@ def apply(self, G: Graph) -> Graph: # Parameter for stabilization MAX_FACTOR = 1e50 + converged = False for _ in range(self.max_iter): c[col_has_edges] = 1 / A_T.dot(r)[col_has_edges] r[row_has_edges] = 1 / A.dot(c)[row_has_edges] if np.any(np.abs(r) > MAX_FACTOR) or np.any(np.abs(c) > MAX_FACTOR): - break # avoid overflow; close enough + warnings.warn( + "DoublyStochasticNormalize stopped early because scaling factors " + "became very large. 
Result may not be doubly stochastic.", + RuntimeWarning, + ) + break # Convergence check (band and tol) row_sums = r * (A.dot(c)) @@ -104,8 +111,15 @@ def apply(self, G: Graph) -> Graph: cols_ok = True if rows_ok and cols_ok: + converged = True break + if not converged: + warnings.warn( + "DoublyStochasticNormalize did not converge within max_iter.", + RuntimeWarning, + ) + # Apply scaling once: A' = diag(r) * A * diag(c) (CSR-friendly) A_scaled = A.copy() @@ -115,6 +129,9 @@ def apply(self, G: Graph) -> Graph: # col scaling A_scaled.data *= c[A_scaled.indices] + # TODO: For undirected graphs, Graph.from_csr symmetrizes A_scaled again. + # This may change row/column sums after normalization. A future revision should + # either use symmetric scaling or allow Graph construction without re-symmetrizing. return Graph.from_csr( A_scaled, directed=G.directed, @@ -131,8 +148,9 @@ class DoublyStochasticBackbone(GraphOperator): Backbone extraction based on doubly-stochastic edge scores. First applies DoublyStochasticNormalize, then sorts edges by normalized score - and adds strongest edges until the graph becomes connected, following the - behavior of the previous DoublyStochastic(backbone_method=True) branch. + and adds strongest edges until the graph becomes connected, or until no candidate + edges remain. For disconnected inputs, the result is a strongest-edge forest + over the available components. 
References ---------- @@ -176,6 +194,8 @@ def apply(self, G: Graph) -> Graph: G_csr = nx.to_scipy_sparse_array( G_filtered, nodelist=list(range(G.n_nodes)), + weight="weight", + format="csr", ) return Graph.from_csr( @@ -214,7 +234,7 @@ def _extract_undirected_backbone(A_scaled, n_nodes: int) -> nx.Graph: order = np.argsort(vals)[::-1] for idx in order: - if nx.is_connected(G_filtered): + if nx.is_connected(G_filtered): # TODO: For large graphs, this check could be expensive break G_filtered.add_edge( From 27aef7b7fc9fcfbdcd53343457163d0e023e5733 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 24 Apr 2026 17:42:08 +0200 Subject: [PATCH 10/10] linting --- graphconstructor/operators/doubly_stochastic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index d80c527..406525c 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -1,8 +1,8 @@ +import warnings from dataclasses import dataclass import networkx as nx import numpy as np import scipy.sparse as sp -import warnings from ..graph import Graph from .base import GraphOperator