Use Case 2 — HEK293 Multilayer Biological Network¶
Constructs a cell-line-specific, multilayer heterogeneous graph of HEK293 proteins using:
- Signaling layer: OmniPath directed protein–protein interactions
- Metabolic layer: Human-GEM stoichiometric hyperedges (SBML [Systems Biology Markup Language])
- Complex layer: OmniPath protein complexes as undirected hyperedges
- Regulatory layer: DoRothEA TF [Transcription Factor]–target interactions (confidence levels A+B)
- Coupling edges: gene→protein translation links + TF protein→target gene bridges
Cell-line specificity is enforced throughout by filtering all interactions to proteins with protein-level expression evidence in the Human Protein Atlas (HPA [Human Protein Atlas]).
1. Imports¶
import os
import sys
import time
import json
import hashlib
import zipfile
import tracemalloc
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import polars as pl
import requests
import networkx as nx
import cobra
import omnipath as op
import decoupler as dc
import torch
import torch_geometric
from lxml import etree
sys.path.insert(0, os.path.abspath(".."))
from annnet.core.graph import AnnNet
from annnet.io.read_omnipath import read_omnipath
from annnet.io.SBML_io import from_sbml
from annnet.io.cx2_io import to_cx2
from annnet.adapters.networkx_adapter import to_nx
from annnet.adapters.pyg_adapter import to_pyg
/home/l1boll/miniconda3/envs/myenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
2. Data Acquisition¶
All external files are cached locally under ./data/. Downloads are skipped if the file already exists.
DATA = Path("./data")
DATA.mkdir(exist_ok=True)
print(f"Cache dir: {DATA.resolve()}")
Cache dir: /mnt/c/Users/pc/desktop/annnet-remote/notebooks/data
HPA_URL = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
HPA_ZIP = DATA / "proteinatlas.tsv.zip"
HPA_TSV = DATA / "proteinatlas.tsv"
if not HPA_TSV.exists():
print("Downloading HPA [Human Protein Atlas]... (~100 MB, takes a minute)")
r = requests.get(HPA_URL, stream=True)
with open(HPA_ZIP, "wb") as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
with zipfile.ZipFile(HPA_ZIP, "r") as z:
z.extractall(DATA)
print("Done.")
else:
print("Already cached.")
Already cached.
HPA_CELL_TSV = DATA / "rna_celline.tsv"
if not HPA_CELL_TSV.exists():
print("Downloading HPA [Human Protein Atlas] cell-line RNA [Ribonucleic Acid] table...")
r = requests.get(
"https://www.proteinatlas.org/download/rna_celline.tsv.gz", stream=True
)
with open(HPA_CELL_TSV, "wb") as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
print("Done.")
else:
print("Already cached.")
Already cached.
HUMANGEM_XML = DATA / "Human-GEM.xml"
if not HUMANGEM_XML.exists():
print("Downloading Human-GEM [Generic Enzyme Model]...")
r = requests.get(
"https://raw.githubusercontent.com/SysBioChalmers/Human-GEM/main/model/Human-GEM.xml",
stream=True,
allow_redirects=True,
)
print("Status:", r.status_code)
with open(HUMANGEM_XML, "wb") as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
print(f"Done. Size: {HUMANGEM_XML.stat().st_size} bytes")
else:
print(f"Already cached. Size: {HUMANGEM_XML.stat().st_size} bytes")
Already cached. Size: 43447379 bytes
COMPLEXES_TSV = DATA / "omnipath_complexes.tsv"
if not COMPLEXES_TSV.exists():
r = requests.get(
"https://omnipathdb.org/complexes?databases=CORUM,ComplexPortal,hu.MAP2&fields=sources&format=tsv",
timeout=120,
)
print("Status:", r.status_code)
with open(COMPLEXES_TSV, "w") as f:
f.write(r.text)
print("Downloaded complexes.")
else:
print("Already cached:", COMPLEXES_TSV)
complexes_raw = pl.read_csv(COMPLEXES_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", complexes_raw.shape)
print("Columns:", complexes_raw.columns)
print(complexes_raw.head(3))
Already cached: data/omnipath_complexes.tsv Shape: (11510, 7) Columns: ['name', 'components', 'components_genesymbols', 'stoichiometry', 'sources', 'references', 'identifiers'] shape: (3, 7) ┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ name ┆ components ┆ components_ ┆ stoichiomet ┆ sources ┆ references ┆ identifiers │ │ --- ┆ --- ┆ genesymbols ┆ ry ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ --- ┆ --- ┆ str ┆ str ┆ str │ │ ┆ ┆ str ┆ str ┆ ┆ ┆ │ ╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ NFY ┆ P23511_P252 ┆ NFYA_NFYB_N ┆ 1:1:1 ┆ hu.MAP;SIGN ┆ 9372932;147 ┆ PDB:8qu4;PD │ │ ┆ 08_Q13952 ┆ FYC ┆ ┆ OR;CORUM;Co ┆ 55292;15243 ┆ B:6qmq;PDB: │ │ ┆ ┆ ┆ ┆ mplexPor… ┆ 141 ┆ 8qu2;PDB… │ │ SCF-betaTRCP ┆ P63208_Q136 ┆ BTRC_CUL1_S ┆ 1:1:1 ┆ SPIKE;SIGNO ┆ 9990852 ┆ Compleat:HC │ │ ┆ 16_Q9Y297 ┆ KP1 ┆ ┆ R;Compleat; ┆ ┆ 757;SIGNOR: │ │ ┆ ┆ ┆ ┆ CORUM ┆ ┆ SIGNOR-C… │ │ SMAD2/SMAD4 ┆ Q13485_Q157 ┆ SMAD2_SMAD4 ┆ 1:2 ┆ SIGNOR;Comp ┆ 12923550;80 ┆ intact:EBI- │ │ ┆ 96 ┆ ┆ ┆ lexPortal;P ┆ 61611;40654 ┆ 9692430;PDB │ │ ┆ ┆ ┆ ┆ DB ┆ 10;16322… ┆ :1u7v;SI… │ └──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
DOROTHEA_TSV = DATA / "dorothea.tsv"
if not DOROTHEA_TSV.exists():
print("Downloading DoRothEA [Discriminant Regulon Expression Analysis]...")
r = requests.get(
"https://omnipathdb.org/interactions?datasets=dorothea&dorothea_levels=A,B"
"&genesymbols=yes&fields=sources,references&format=tsv",
timeout=120,
)
print("Status:", r.status_code)
with open(DOROTHEA_TSV, "w") as f:
f.write(r.text)
print(f"Done. Size: {DOROTHEA_TSV.stat().st_size} bytes")
else:
print("Already cached.")
dorothea = pl.read_csv(DOROTHEA_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", dorothea.shape)
print("Columns:", dorothea.columns)
print(dorothea.head(3))
Already cached. Shape: (15267, 12) Columns: ['source', 'target', 'source_genesymbol', 'target_genesymbol', 'is_directed', 'is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition', 'sources', 'references'] shape: (3, 12) ┌────────┬────────┬────────────┬────────────┬───┬────────────┬────────────┬────────────┬───────────┐ │ source ┆ target ┆ source_gen ┆ target_gen ┆ … ┆ consensus_ ┆ consensus_ ┆ sources ┆ reference │ │ --- ┆ --- ┆ esymbol ┆ esymbol ┆ ┆ stimulatio ┆ inhibition ┆ --- ┆ s │ │ str ┆ str ┆ --- ┆ --- ┆ ┆ n ┆ --- ┆ str ┆ --- │ │ ┆ ┆ str ┆ str ┆ ┆ --- ┆ bool ┆ ┆ str │ │ ┆ ┆ ┆ ┆ ┆ bool ┆ ┆ ┆ │ ╞════════╪════════╪════════════╪════════════╪═══╪════════════╪════════════╪════════════╪═══════════╡ │ Q16656 ┆ Q6UUV9 ┆ NRF1 ┆ CRTC1 ┆ … ┆ false ┆ false ┆ HOCOMOCO_D ┆ null │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ oRothEA;EN ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ CODE-proxi ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … ┆ │ │ Q16656 ┆ Q9Y2K7 ┆ NRF1 ┆ KDM2A ┆ … ┆ false ┆ false ┆ HOCOMOCO_D ┆ null │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ oRothEA;Re ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Map_DoRoth ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … ┆ │ │ Q16656 ┆ Q96G74 ┆ NRF1 ┆ OTUD5 ┆ … ┆ false ┆ false ┆ ReMap_DoRo ┆ null │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ thEA;ARACN ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ e-GTEx_DoR ┆ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … ┆ │ └────────┴────────┴────────────┴────────────┴───┴────────────┴────────────┴────────────┴───────────┘
3. HEK293 Expressed Protein Set¶
Filter the HPA [Human Protein Atlas] atlas to genes with protein-level experimental evidence. This is the conservative but reproducible criterion for "expressed in HEK293": only proteins for which mass-spectrometry or antibody-based evidence exists are retained.
hpa_raw = pl.read_csv(HPA_TSV, separator="\t", infer_schema_length=10000)
print("Columns:", hpa_raw.columns[:10])
print("Shape:", hpa_raw.shape)
Columns: ['Gene', 'Gene synonym', 'Ensembl', 'Gene description', 'Uniprot', 'Chromosome', 'Position', 'Protein class', 'Biological process', 'Molecular function'] Shape: (20162, 107)
# The main atlas file contains per-gene summaries. Filter to protein-level evidence
# (mass-spectrometry or antibody-validated) — these are confirmed translated proteins.
hpa_expressed = hpa_raw.filter(
pl.col("Evidence").str.contains("protein level")
| pl.col("UniProt evidence").str.contains("protein level")
).select(
["Gene", "Uniprot", "Subcellular main location", "Subcellular additional location", "Evidence"]
)
print("Expressed proteins (protein-level evidence):", hpa_expressed.shape)
print(hpa_expressed.head(5))
Expressed proteins (protein-level evidence): (18564, 5) shape: (5, 5) ┌──────────┬─────────┬───────────────────────────┬───────────────────────────┬─────────────────────┐ │ Gene ┆ Uniprot ┆ Subcellular main location ┆ Subcellular additional ┆ Evidence │ │ --- ┆ --- ┆ --- ┆ locatio… ┆ --- │ │ str ┆ str ┆ str ┆ --- ┆ str │ │ ┆ ┆ ┆ str ┆ │ ╞══════════╪═════════╪═══════════════════════════╪═══════════════════════════╪═════════════════════╡ │ TSPAN6 ┆ O43657 ┆ Cell Junctions, Cytosol ┆ Nucleoli fibrillar center ┆ Evidence at protein │ │ ┆ ┆ ┆ ┆ level │ │ TNMD ┆ Q9H2S6 ┆ null ┆ null ┆ Evidence at protein │ │ ┆ ┆ ┆ ┆ level │ │ DPM1 ┆ O60762 ┆ null ┆ null ┆ Evidence at protein │ │ ┆ ┆ ┆ ┆ level │ │ SCYL3 ┆ Q8IZE3 ┆ Golgi apparatus, Cytosol ┆ null ┆ Evidence at protein │ │ ┆ ┆ ┆ ┆ level │ │ C1orf112 ┆ Q9NSG2 ┆ Nucleoplasm ┆ Nucleoli ┆ Evidence at protein │ │ ┆ ┆ ┆ ┆ level │ └──────────┴─────────┴───────────────────────────┴───────────────────────────┴─────────────────────┘
hek293_genes = set(hpa_expressed["Gene"].to_list())
print(f"HEK293 [Human Embryonic Kidney 293] expressed gene set size: {len(hek293_genes)}")
HEK293 [Human Embryonic Kidney 293] expressed gene set size: 18557
4. Vertex Table Construction¶
Two vertex types are created per expressed gene:
prot:GENENAME— the protein product, localised to its HPA [Human Protein Atlas] subcellular compartmentgene:GENENAME— the gene locus, always assigned to the nucleus (n)
GEM [Generic Enzyme Model] single-letter compartment codes are used throughout.
# Maps HPA [Human Protein Atlas] location strings to GEM [Generic Enzyme Model] single-letter compartment codes.
# First match wins; unmapped locations default to cytoplasm ('c').
COMPARTMENT_MAP = {
"Mitochondria": "m",
"Nucleus": "n",
"Nucleoplasm": "n",
"Nucleoli": "n",
"Nuclear membrane": "n",
"Endoplasmic reticulum": "r",
"Golgi apparatus": "g",
"Lysosome": "l",
"Cytosol": "c",
"Cytoplasm": "c",
"Plasma membrane": "p",
"Cell Junctions": "p",
"Peroxisome": "x",
"Vesicles": "v",
"Extracellular": "e",
}
def map_compartment(location_str: str | None) -> str:
if not location_str:
return "c"
loc_lower = location_str.lower()
for key, code in COMPARTMENT_MAP.items():
if key.lower() in loc_lower:
return code
return "c"
hpa_expressed = hpa_expressed.with_columns([
pl.col("Subcellular main location")
.map_elements(map_compartment, return_dtype=pl.Utf8)
.alias("compartment")
])
print(hpa_expressed.select(["Gene", "compartment"]).head(10))
print("\nCompartment distribution:")
print(hpa_expressed["compartment"].value_counts().sort("count", descending=True))
shape: (10, 2) ┌──────────┬─────────────┐ │ Gene ┆ compartment │ │ --- ┆ --- │ │ str ┆ str │ ╞══════════╪═════════════╡ │ TSPAN6 ┆ c │ │ TNMD ┆ null │ │ DPM1 ┆ null │ │ SCYL3 ┆ g │ │ C1orf112 ┆ n │ │ FGR ┆ p │ │ CFH ┆ v │ │ FUCA2 ┆ null │ │ GCLC ┆ n │ │ NFYA ┆ n │ └──────────┴─────────────┘ Compartment distribution: shape: (10, 2) ┌─────────────┬───────┐ │ compartment ┆ count │ │ --- ┆ --- │ │ str ┆ u32 │ ╞═════════════╪═══════╡ │ n ┆ 5405 │ │ null ┆ 5236 │ │ c ┆ 3510 │ │ p ┆ 1141 │ │ v ┆ 1081 │ │ m ┆ 966 │ │ g ┆ 746 │ │ r ┆ 440 │ │ x ┆ 21 │ │ l ┆ 18 │ └─────────────┴───────┘
vertex_proteins = hpa_expressed.select([
pl.concat_str([pl.lit("prot:"), pl.col("Gene")]).alias("vertex_id"),
pl.col("Gene").alias("gene_symbol"),
pl.col("Uniprot").alias("uniprot_id"),
pl.lit("protein").alias("entity_type"),
pl.col("compartment"),
pl.col("Subcellular main location").alias("hpa_location"),
pl.lit(None).alias("recon3d_id"),
])
# Genes always localised to nucleus — they exist as separate vertices to
# support gene→protein coupling and TF [Transcription Factor]→target edges.
vertex_genes = hpa_expressed.select([
pl.concat_str([pl.lit("gene:"), pl.col("Gene")]).alias("vertex_id"),
pl.col("Gene").alias("gene_symbol"),
pl.col("Uniprot").alias("uniprot_id"),
pl.lit("gene").alias("entity_type"),
pl.lit("n").alias("compartment"),
pl.col("Subcellular main location").alias("hpa_location"),
pl.lit(None).alias("recon3d_id"),
])
vertex_table = pl.concat([vertex_proteins, vertex_genes])
print("Vertex table shape:", vertex_table.shape)
print(vertex_table.head(5))
Vertex table shape: (37128, 7) shape: (5, 7) ┌───────────────┬─────────────┬────────────┬─────────────┬─────────────┬──────────────┬────────────┐ │ vertex_id ┆ gene_symbol ┆ uniprot_id ┆ entity_type ┆ compartment ┆ hpa_location ┆ recon3d_id │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ null │ ╞═══════════════╪═════════════╪════════════╪═════════════╪═════════════╪══════════════╪════════════╡ │ prot:TSPAN6 ┆ TSPAN6 ┆ O43657 ┆ protein ┆ c ┆ Cell ┆ null │ │ ┆ ┆ ┆ ┆ ┆ Junctions, ┆ │ │ ┆ ┆ ┆ ┆ ┆ Cytosol ┆ │ │ prot:TNMD ┆ TNMD ┆ Q9H2S6 ┆ protein ┆ null ┆ null ┆ null │ │ prot:DPM1 ┆ DPM1 ┆ O60762 ┆ protein ┆ null ┆ null ┆ null │ │ prot:SCYL3 ┆ SCYL3 ┆ Q8IZE3 ┆ protein ┆ g ┆ Golgi ┆ null │ │ ┆ ┆ ┆ ┆ ┆ apparatus, ┆ │ │ ┆ ┆ ┆ ┆ ┆ Cytosol ┆ │ │ prot:C1orf112 ┆ C1orf112 ┆ Q9NSG2 ┆ protein ┆ n ┆ Nucleoplasm ┆ null │ └───────────────┴─────────────┴────────────┴─────────────┴─────────────┴──────────────┴────────────┘
vertex_table = vertex_table.with_columns(
pl.col("compartment").fill_null("c")
)
print("Null compartments remaining:", vertex_table["compartment"].null_count())
print("Compartment distribution:")
print(vertex_table["compartment"].value_counts().sort("count", descending=True))
Null compartments remaining: 0 Compartment distribution: shape: (9, 2) ┌─────────────┬───────┐ │ compartment ┆ count │ │ --- ┆ --- │ │ str ┆ u32 │ ╞═════════════╪═══════╡ │ n ┆ 23969 │ │ c ┆ 8746 │ │ p ┆ 1141 │ │ v ┆ 1081 │ │ m ┆ 966 │ │ g ┆ 746 │ │ r ┆ 440 │ │ x ┆ 21 │ │ l ┆ 18 │ └─────────────┴───────┘
5. AnnNet Graph Initialisation¶
G = AnnNet(directed=True)
G.add_vertices_bulk(vertex_table.to_dicts())
print("Vertices loaded:", G.num_vertices)
print("Sample vertex attributes:")
print(G.vertex_attributes.head(5))
Vertices loaded: 37114 Sample vertex attributes: shape: (5, 7) ┌───────────────┬──────────────┬────────────┬────────────┬─────────────┬─────────────┬─────────────┐ │ vertex_id ┆ hpa_location ┆ recon3d_id ┆ uniprot_id ┆ entity_type ┆ gene_symbol ┆ compartment │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ null ┆ str ┆ str ┆ str ┆ str │ ╞═══════════════╪══════════════╪════════════╪════════════╪═════════════╪═════════════╪═════════════╡ │ prot:TSPAN6 ┆ Cell ┆ null ┆ O43657 ┆ protein ┆ TSPAN6 ┆ c │ │ ┆ Junctions, ┆ ┆ ┆ ┆ ┆ │ │ ┆ Cytosol ┆ ┆ ┆ ┆ ┆ │ │ prot:TNMD ┆ null ┆ null ┆ Q9H2S6 ┆ protein ┆ TNMD ┆ c │ │ prot:DPM1 ┆ null ┆ null ┆ O60762 ┆ protein ┆ DPM1 ┆ c │ │ prot:SCYL3 ┆ Golgi ┆ null ┆ Q8IZE3 ┆ protein ┆ SCYL3 ┆ g │ │ ┆ apparatus, ┆ ┆ ┆ ┆ ┆ │ │ ┆ Cytosol ┆ ┆ ┆ ┆ ┆ │ │ prot:C1orf112 ┆ Nucleoplasm ┆ null ┆ Q9NSG2 ┆ protein ┆ C1orf112 ┆ n │ └───────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴─────────────┘
# Initialise the Kivelä multilayer structure with four biological aspects.
G.set_aspects(["type"], {"type": ["signaling", "metabolic", "complex", "regulatory"]})
for row in G.vertex_attributes.iter_rows(named=True):
vid = row["vertex_id"]
etype = row["entity_type"]
if etype == "protein":
G.add_presence(vid, ("signaling",))
G.add_presence(vid, ("complex",))
elif etype == "gene":
G.add_presence(vid, ("regulatory",))
print(f"Aspects : {G.aspects}")
print(f"Elem layers: {G.elem_layers}")
print(f"_VM size : {len(G._VM)}")
Aspects : ['type']
Elem layers: {'type': ['signaling', 'metabolic', 'complex', 'regulatory']}
_VM size : 55671
6. Signaling Layer — OmniPath Directed PPI [Protein–Protein Interaction]¶
signaling_raw = op.interactions.OmniPath.get(
genesymbols=True,
fields=["sources", "references", "curation_effort"],
)
print("Columns:", signaling_raw.columns.tolist())
print(signaling_raw.head(3))
Columns: ['source', 'target', 'source_genesymbol', 'target_genesymbol', 'is_directed', 'is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition', 'sources', 'references', 'curation_effort', 'n_sources', 'n_primary_sources', 'n_references', 'references_stripped']
source target source_genesymbol target_genesymbol is_directed \
0 P0DP25 P48995 CALM3 TRPC1 True
1 P0DP23 P48995 CALM1 TRPC1 True
2 P0DP24 P48995 CALM2 TRPC1 True
is_stimulation is_inhibition consensus_direction consensus_stimulation \
0 False True True False
1 False True True False
2 False True True False
consensus_inhibition sources references \
0 True TRIP TRIP:11983166;TRIP:12601176;TRIP:11290752
1 True TRIP TRIP:11983166;TRIP:12601176;TRIP:11290752
2 True TRIP TRIP:11983166;TRIP:12601176;TRIP:11290752
curation_effort n_sources n_primary_sources n_references \
0 3 1 1 3
1 3 1 1 3
2 3 1 1 3
references_stripped
0 11290752;11983166;12601176
1 11290752;11983166;12601176
2 11290752;11983166;12601176
sig = pl.from_pandas(
signaling_raw[[
"source_genesymbol", "target_genesymbol",
"is_stimulation", "is_inhibition",
"consensus_direction", "consensus_stimulation", "consensus_inhibition",
"sources", "references",
]].dropna(subset=["source_genesymbol", "target_genesymbol"])
)
sig = sig.filter(
pl.col("source_genesymbol").is_in(hek293_genes)
& pl.col("target_genesymbol").is_in(hek293_genes)
)
print("Signaling edges after HEK293 [Human Embryonic Kidney 293] filter:", sig.shape)
print(sig.head(3))
Signaling edges after HEK293 [Human Embryonic Kidney 293] filter: (31898, 9) shape: (3, 9) ┌────────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬─────────┬───────────┐ │ source_gen ┆ target_ge ┆ is_stimul ┆ is_inhibi ┆ … ┆ consensus ┆ consensus ┆ sources ┆ reference │ │ esymbol ┆ nesymbol ┆ ation ┆ tion ┆ ┆ _stimulat ┆ _inhibiti ┆ --- ┆ s │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ ion ┆ on ┆ str ┆ --- │ │ str ┆ str ┆ bool ┆ bool ┆ ┆ --- ┆ --- ┆ ┆ str │ │ ┆ ┆ ┆ ┆ ┆ bool ┆ bool ┆ ┆ │ ╞════════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═════════╪═══════════╡ │ CALM3 ┆ TRPC1 ┆ false ┆ true ┆ … ┆ false ┆ true ┆ TRIP ┆ TRIP:1198 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 3166;TRIP │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ :12601176 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ;TR… │ │ CALM1 ┆ TRPC1 ┆ false ┆ true ┆ … ┆ false ┆ true ┆ TRIP ┆ TRIP:1198 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 3166;TRIP │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ :12601176 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ;TR… │ │ CALM2 ┆ TRPC1 ┆ false ┆ true ┆ … ┆ false ┆ true ┆ TRIP ┆ TRIP:1198 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 3166;TRIP │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ :12601176 │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ;TR… │ └────────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴─────────┴───────────┘
sig_tuples = []
for row in sig.iter_rows(named=True):
src = f"prot:{row['source_genesymbol']}"
tgt = f"prot:{row['target_genesymbol']}"
sign = 1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0)
sig_tuples.append((src, tgt, sign))
eids = G.add_intra_edges_bulk(sig_tuples, ("signaling",), fast_mode=True)
sig_attrs = {
eid: {
"edge_type": "signaling",
"layer": "signaling",
"is_stimulation": row["is_stimulation"],
"is_inhibition": row["is_inhibition"],
"consensus_dir": row["consensus_direction"],
"sources": row["sources"],
}
for eid, row in zip(eids, sig.iter_rows(named=True))
}
G.set_edge_attrs_bulk(sig_attrs)
print(f"Signaling edges added: {len(eids)}")
print(f"Total edges: {G.num_edges}")
Signaling edges added: 31898 Total edges: 31898
7. Metabolic Layer — Human-GEM [Generic Enzyme Model] Stoichiometric Hyperedges¶
Human-GEM is loaded directly via the AnnNet SBML [Systems Biology Markup Language] adapter. Each reaction becomes a directed hyperedge with stoichiometric coefficients preserved in the incidence matrix.
from_sbml(str(HUMANGEM_XML), graph=G, slice="metabolic", preserve_stoichiometry=True)
print("Vertices in graph:", G.num_vertices)
print("Edges in graph :", G.num_edges)
print("Slices :", list(G._slices.keys()))
Vertices in graph: 45571 Edges in graph : 44869 Slices : ['default', 'signaling', 'e', 'x', 'm', 'c', 'l', 'r', 'g', 'n', 'i', 'metabolic']
# Register metabolite vertices in the metabolic layer presence map.
added = 0
for vid in G._slices["metabolic"]["vertices"]:
if G.entity_types.get(vid) == "vertex" and not vid.startswith("__"):
G.add_presence(vid, ("metabolic",))
added += 1
print(f"Metabolite presences registered: {added}")
print(f"_VM size now: {len(G._VM)}")
Metabolite presences registered: 8455 _VM size now: 64126
metabolic_edges = G._slices["metabolic"]["edges"]
metabolic_verts = G._slices["metabolic"]["vertices"]
sample_eid = next(iter(metabolic_edges))
hdef = G.hyperedge_definitions[sample_eid]
print("Metabolic edges :", len(metabolic_edges))
print("Metabolic vertices:", len(metabolic_verts))
print("Total vertices :", G.num_vertices)
print("Total edges :", G.num_edges)
print(f"\nSample hyperedge : {sample_eid}")
print("Head (products) :", hdef["head"])
print("Tail (reactants) :", hdef["tail"])
col = G.edge_to_idx[sample_eid]
mat = G._matrix.tocsc()
col_data = mat.getcol(col)
rows, _ = col_data.nonzero()
print("\nStoichiometric coefficients:")
for r in rows:
vid = G.idx_to_entity[r]
coeff = col_data[r, 0]
print(f" {vid}: {coeff}")
Metabolic edges : 12971
Metabolic vertices: 8457
Total vertices : 45571
Total edges : 44869
Sample hyperedge : MAR06128
Head (products) : {'MAM02046e', 'MAM02901e', 'MAM02366c'}
Tail (reactants) : {'MAM02366e', 'MAM02901c', 'MAM02046c'}
Stoichiometric coefficients:
MAM02046c: -1.0
MAM02366c: 1.0
MAM02901c: -1.0
MAM02046e: 1.0
MAM02366e: -1.0
MAM02901e: 1.0
8. Complex Layer — OmniPath Protein Complexes as Undirected Hyperedges¶
Only complexes where all subunits are HEK293-expressed are retained, ensuring the complex layer is fully cell-line-specific.
complex_hyperedges = []
complex_attr_records = []
skipped = 0
for row in complexes_raw.iter_rows(named=True):
members_raw = row["components_genesymbols"]
if not members_raw:
continue
members = members_raw.split("_")
members_expressed = [m for m in members if m in hek293_genes]
if len(members_expressed) < len(members):
skipped += 1
continue
members_vids = [f"prot:{m}" for m in members_expressed]
eid = f"cpx:{row['name']}"
complex_hyperedges.append({
"edge_id": eid,
"members": members_vids,
"weight": 1.0,
})
complex_attr_records.append({
"edge_id": eid,
"edge_type": "complex",
"name": row["name"],
"sources": row["sources"],
"stoichiometry": row["stoichiometry"],
})
print(f"Complexes kept : {len(complex_hyperedges)}")
print(f"Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): {skipped}")
print(f"Sample: {complex_hyperedges[0]}")
Complexes kept : 11075
Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): 435
Sample: {'edge_id': 'cpx:NFY', 'members': ['prot:NFYA', 'prot:NFYB', 'prot:NFYC'], 'weight': 1.0}
G.add_hyperedges_bulk(complex_hyperedges)
print("Total edges after complexes:", G.num_edges)
print("Total vertices :", G.num_vertices)
Total edges after complexes: 49325 Total vertices : 45571
9. Regulatory Layer — DoRothEA [Discriminant Regulon Expression Analysis] TF [Transcription Factor]–Target Interactions¶
Confidence levels A and B only (experimentally supported). Edges run between
gene:TF and gene:target vertices in the regulatory layer.
reg = dorothea.filter(
pl.col("source_genesymbol").is_in(hek293_genes)
& pl.col("target_genesymbol").is_in(hek293_genes)
)
print("Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter:", reg.shape)
print("Unique TFs [Transcription Factors]:", reg["source_genesymbol"].n_unique())
Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter: (15146, 12) Unique TFs [Transcription Factors]: 367
reg_tuples = [
(
f"gene:{row['source_genesymbol']}",
f"gene:{row['target_genesymbol']}",
1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0),
)
for row in reg.iter_rows(named=True)
]
reg_eids = G.add_intra_edges_bulk(reg_tuples, ("regulatory",), fast_mode=True)
reg_attrs = {
eid: {
"edge_type": "regulatory",
"layer": "regulatory",
"tf": row["source_genesymbol"],
"target_gene": row["target_genesymbol"],
"sources": row["sources"],
}
for eid, row in zip(reg_eids, reg.iter_rows(named=True))
}
G.set_edge_attrs_bulk(reg_attrs)
print(f"Regulatory edges added: {len(reg_eids)}")
print(f"Total edges: {G.num_edges}")
Regulatory edges added: 15146 Total edges: 64471
10. Cross-Layer Coupling Edges¶
Two coupling edge types link the regulatory and signaling layers:
gene:X → prot:X— translation: every expressed gene produces its protein productprot:TF → gene:target— TF [Transcription Factor] protein activity drives target gene regulation
coupling_records = []
for i, gene in enumerate(hek293_genes):
gene_vid = f"gene:{gene}"
prot_vid = f"prot:{gene}"
if gene_vid in G.entity_to_idx and prot_vid in G.entity_to_idx:
coupling_records.append({
"edge_id": f"coup_gp_{i:06d}",
"source": gene_vid,
"target": prot_vid,
"directed": True,
"weight": 1.0,
"edge_type": "coupling",
"coupling_type": "gene_to_protein",
})
print(f"gene→protein coupling edges: {len(coupling_records)}")
G.add_edges_bulk(coupling_records)
print("Total edges:", G.num_edges)
gene→protein coupling edges: 18557 Total edges: 83028
tf_coupling = []
for j, row in enumerate(reg.iter_rows(named=True)):
tf_prot = f"prot:{row['source_genesymbol']}"
tgt_gene = f"gene:{row['target_genesymbol']}"
if tf_prot in G.entity_to_idx and tgt_gene in G.entity_to_idx:
mor = 1 if row["is_stimulation"] else (-1 if row["is_inhibition"] else 0)
tf_coupling.append({
"edge_id": f"coup_tf_{j:06d}",
"source": tf_prot,
"target": tgt_gene,
"directed": True,
"weight": float(mor),
"edge_type": "coupling",
"coupling_type": "tf_protein_to_gene",
})
print(f"TF [Transcription Factor] protein→gene coupling edges: {len(tf_coupling)}")
G.add_edges_bulk(tf_coupling)
print("Total edges:", G.num_edges)
TF [Transcription Factor] protein→gene coupling edges: 15146 Total edges: 98174
11. Edge Attribute Consolidation¶
Rewrite the layer and edge_type columns on edge_attributes as a clean,
null-free join — overwriting any stale values left by incremental inserts.
tag_map = {}
for eid in G.layer_edge_set(("signaling",)):
tag_map[eid] = {"layer": "signaling", "edge_type": "signaling"}
for eid in G.layer_edge_set(("regulatory",)):
tag_map[eid] = {"layer": "regulatory", "edge_type": "regulatory"}
for eid in G._slices["metabolic"]["edges"]:
tag_map[eid] = {"layer": "metabolic", "edge_type": "metabolic"}
for rec in complex_attr_records:
tag_map[rec["edge_id"]] = {"layer": "complex", "edge_type": "complex"}
for rec in coupling_records + tf_coupling:
tag_map[rec["edge_id"]] = {"layer": "coupling", "edge_type": "coupling"}
print(f"Tags built: {len(tag_map)}")
tag_df = pl.DataFrame([
{"edge_id": eid, "layer": v["layer"], "edge_type": v["edge_type"]}
for eid, v in tag_map.items()
])
drop_cols = [c for c in G.edge_attributes.columns if c in ("layer", "edge_type", "layer_right")]
G.edge_attributes = G.edge_attributes.drop(drop_cols).join(tag_df, on="edge_id", how="left")
print("edge_attributes shape:", G.edge_attributes.shape)
print("Layer distribution:")
print(G.edge_attributes["layer"].value_counts().sort("count", descending=True))
print("edge_type nulls:", G.edge_attributes["edge_type"].null_count())
Tags built: 98174 edge_attributes shape: (60015, 16) Layer distribution: shape: (3, 2) ┌────────────┬───────┐ │ layer ┆ count │ │ --- ┆ --- │ │ str ┆ u32 │ ╞════════════╪═══════╡ │ signaling ┆ 31898 │ │ regulatory ┆ 15146 │ │ metabolic ┆ 12971 │ └────────────┴───────┘ edge_type nulls: 0
12. Organelle and Condition Slices¶
Named slices provide fast access to organelle-localised or condition-specific subgraphs. SBML [Systems Biology Markup Language] import creates metabolite-only slices; this section enriches them with protein vertices from HPA [Human Protein Atlas] compartment assignments.
ORGANELLE_SLICES = {
"m": "mitochondria",
"n": "nucleus",
"r": "er",
"l": "lysosome",
"c": "cytoplasm",
"g": "golgi",
"x": "peroxisome",
"p": "plasma_membrane",
"e": "extracellular",
}
for code, name in ORGANELLE_SLICES.items():
prot_vids = (
G.vertex_attributes
.filter(
(pl.col("compartment") == code)
& (pl.col("entity_type") == "protein")
)["vertex_id"]
.to_list()
)
existing = G._slices.get(code, {}).get("vertices", set())
vid_set = set(prot_vids) | existing
slice_edges = []
for eid, (src, tgt, _) in G.edge_definitions.items():
if src in vid_set or tgt in vid_set:
slice_edges.append(eid)
for eid, hdef in G.hyperedge_definitions.items():
members = hdef.get("head", set()) | hdef.get("tail", set()) | hdef.get("members", set())
if members & vid_set:
slice_edges.append(eid)
if name not in G._slices:
G._slices[name] = {"vertices": set(), "edges": set(), "attributes": {}}
G._slices[name]["vertices"] = vid_set
G._slices[name]["edges"] = set(slice_edges)
print(f"{name:20s}: {len(vid_set):6d} vertices, {len(slice_edges):6d} edges")
mitochondria : 1897 vertices, 4161 edges nucleus : 5614 vertices, 38622 edges er : 1411 vertices, 3055 edges lysosome : 470 vertices, 695 edges cytoplasm : 12084 vertices, 40946 edges golgi : 1107 vertices, 3611 edges peroxisome : 528 vertices, 732 edges plasma_membrane : 1141 vertices, 8727 edges extracellular : 1660 vertices, 4893 edges
basal_edges = set(
G.edge_attributes
.filter(pl.col("layer") == "signaling")["edge_id"]
.to_list()
)
basal_verts = set()
for eid in basal_edges:
edef = G.edge_definitions.get(eid)
if edef:
basal_verts.add(edef[0])
basal_verts.add(edef[1])
G._slices["basal"] = {"vertices": basal_verts, "edges": basal_edges, "attributes": {}}
print(f"Basal slice : {len(basal_verts)} vertices, {len(basal_edges)} edges")
EGF_PROTEINS = {
"prot:EGFR", "prot:GRB2", "prot:SOS1", "prot:KRAS", "prot:BRAF",
"prot:MAP2K1","prot:MAPK1", "prot:MAPK3", "prot:AKT1", "prot:PIK3CA",
"prot:PTEN", "prot:MTOR", "prot:RPS6KB1","prot:ELK1", "prot:MYC",
"prot:JUN", "prot:FOS", "prot:STAT3", "prot:SRC", "prot:SHC1",
}
egf_edges = set()
egf_verts = set()
for eid in basal_edges:
edef = G.edge_definitions.get(eid)
if edef and (edef[0] in EGF_PROTEINS or edef[1] in EGF_PROTEINS):
egf_edges.add(eid)
egf_verts.add(edef[0])
egf_verts.add(edef[1])
G._slices["egf_stimulated"] = {"vertices": egf_verts, "edges": egf_edges, "attributes": {}}
print(f"EGF [Epidermal Growth Factor]-stimulated : {len(egf_verts)} vertices, {len(egf_edges)} edges")
print(f"\nAll slices: {list(G._slices.keys())}")
Basal slice : 7451 vertices, 31898 edges EGF [Epidermal Growth Factor]-stimulated : 1394 vertices, 2552 edges All slices: ['default', 'signaling', 'e', 'x', 'm', 'c', 'l', 'r', 'g', 'n', 'i', 'metabolic', 'regulatory', 'mitochondria', 'nucleus', 'er', 'lysosome', 'cytoplasm', 'golgi', 'peroxisome', 'plasma_membrane', 'extracellular', 'basal', 'egf_stimulated']
13. Biological Queries¶
# Query 1 — Mito-ER [Endoplasmic Reticulum] contact site edges (cross-slice intersection)
contact_edges = G._slices["mitochondria"]["edges"] & G._slices["er"]["edges"]
print(f"Mito-ER [Endoplasmic Reticulum] contact site edges: {len(contact_edges)}")
for eid in list(contact_edges)[:5]:
hdef = G.hyperedge_definitions.get(eid)
edef = G.edge_definitions.get(eid)
if hdef:
members = hdef.get("members") or (hdef.get("head", set()) | hdef.get("tail", set()))
print(f" hyperedge {eid}: {members}")
elif edef:
print(f" edge {eid}: {edef[0]} → {edef[1]}")
Mito-ER [Endoplasmic Reticulum] contact site edges: 45
hyperedge cpx:Commander complex: {'prot:DENND10', 'prot:COMMD3', 'prot:VPS35L', 'prot:COMMD8', 'prot:CCDC22', 'prot:VPS29', 'prot:CCDC93', 'prot:COMMD10', 'prot:COMMD5', 'prot:COMMD4', 'prot:COMMD6', 'prot:COMMD7', 'prot:VPS26C', 'prot:COMMD2', 'prot:COMMD1', 'prot:COMMD9'}
hyperedge cpx:COMMD1-CCDC22-CCDC93-C16orf62 complex: {'prot:CCDC22', 'prot:CCDC93', 'prot:COMMD1', 'prot:VPS35L'}
hyperedge cpx:TIM23-TOM22-TIM50-HSD3B2 complex: {'prot:TIMM23', 'prot:HSD3B2', 'prot:TIMM50', 'prot:TOMM22'}
hyperedge MAR01824: {'MAM00623r', 'MAM00623m'}
hyperedge MAR00037: {'MAM00987m', 'MAM01001m', 'MAM01011c', 'MAM00652x', 'MAM00659c', 'MAM00988c', 'MAM00641r', 'MAM00642r', 'MAM00641m', 'MAM00649r', 'MAM00652c', 'MAM00987c', 'MAM01001c', 'MAM00651c', 'MAM01001r', 'MAM00660c', 'MAM00641c', 'MAM10001e', 'MAM00432r', 'MAM00604c', 'MAM01001x', 'MAM00987x', 'MAM00642c', 'MAM01012c', 'MAM01314c', 'MAM00651x', 'MAM01794r', 'MAM00644c', 'MAM01792r', 'MAM00925c', 'MAM00642x', 'MAM00645c', 'MAM00641x', 'MAM00652m', 'MAM00651r', 'MAM00828r', 'MAM00947c', 'MAM00400r', 'MAM00747r', 'MAM00945c', 'MAM00949c', 'MAM02469c', 'MAM00651m', 'MAM00400c', 'MAM00624c', 'MAM00660r', 'MAM00927c', 'MAM00987r', 'MAM00642m', 'MAM00652r', 'MAM00649l', 'MAM00830c'}
# Query 2 — Organelle-specific hub proteins (degree within each organelle slice)
organelles = ["mitochondria", "nucleus", "er", "lysosome", "cytoplasm", "golgi", "plasma_membrane"]
print("Top 5 hub proteins per organelle:\n")
for org in organelles:
verts = G._slices[org]["vertices"]
edges = G._slices[org]["edges"]
degree = Counter()
for eid in edges:
edef = G.edge_definitions.get(eid)
if edef:
if edef[0] in verts and edef[0].startswith("prot:"):
degree[edef[0]] += 1
if edef[1] in verts and edef[1].startswith("prot:"):
degree[edef[1]] += 1
top5 = degree.most_common(5)
print(f"{org}:")
for prot, deg in top5:
print(f" {prot:30s} degree={deg}")
print()
Top 5 hub proteins per organelle: mitochondria: prot:CASP3 degree=124 prot:TRAF6 degree=72 prot:BAD degree=61 prot:BCL2 degree=58 prot:CASP8 degree=53 nucleus: prot:TP53 degree=825 prot:STAT1 degree=785 prot:GSK3B degree=688 prot:HNF4A degree=622 prot:CTCF degree=616 er: prot:PTPN1 degree=77 prot:FGFR3 degree=43 prot:VTN degree=27 prot:ITGA1 degree=26 prot:COL1A1 degree=26 lysosome: prot:RPTOR degree=26 prot:RAB7A degree=11 prot:TFRC degree=10 prot:CLTC degree=9 prot:GRN degree=7 cytoplasm: prot:PRKCA degree=547 prot:MAPK14 degree=478 prot:MITF degree=458 prot:PRKACA degree=445 prot:ZNF263 degree=313 golgi: prot:EGFR degree=254 prot:LCK degree=132 prot:MTOR degree=77 prot:APP degree=62 prot:RET degree=48 plasma_membrane: prot:SRC degree=490 prot:FYN degree=199 prot:INSR degree=169 prot:CTNNB1 degree=161 prot:ITGB1 degree=149
# Query 3 — Signal-to-transcription bridge TFs [Transcription Factors]
# A bridge TF [Transcription Factor] is both a target of signaling AND a regulator in the regulatory layer.
sig_target_prots = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "signaling")["edge_id"].to_list():
edef = G.edge_definitions.get(eid)
if edef:
sig_target_prots.add(edef[1].replace("prot:", ""))
reg_tf_genes = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "regulatory")["edge_id"].to_list():
edef = G.edge_definitions.get(eid)
if edef:
reg_tf_genes.add(edef[0].replace("gene:", ""))
bridge_tfs = sig_target_prots & reg_tf_genes
print(f"Signal-to-transcription bridge TFs [Transcription Factors]: {len(bridge_tfs)}")
print("\nFirst 20 bridge TFs [Transcription Factors] (alphabetical):")
for tf in sorted(bridge_tfs)[:20]:
print(f" {tf}")
Signal-to-transcription bridge TFs [Transcription Factors]: 311 First 20 bridge TFs [Transcription Factors] (alphabetical): AHR AR ARID3A ARNT ASCL1 ATF1 ATF2 ATF3 ATF4 ATF6 ATOH1 BACH1 BACH2 BATF BCL6 BHLHE40 BMAL1 CDX1 CDX2 CEBPA
# Query 4 — EGF [Epidermal Growth Factor]-stimulated vs basal edge comparison
egf_only = G._slices["egf_stimulated"]["edges"] - G._slices["basal"]["edges"]
basal_only = G._slices["basal"]["edges"] - G._slices["egf_stimulated"]["edges"]
shared = G._slices["egf_stimulated"]["edges"] & G._slices["basal"]["edges"]
print(f"EGF [Epidermal Growth Factor]-stimulated slice edges : {len(G._slices['egf_stimulated']['edges'])}")
print(f"Basal slice edges : {len(G._slices['basal']['edges'])}")
print(f"Shared (always active) : {len(shared)}")
print(f"EGF [Epidermal Growth Factor]-only edges : {len(egf_only)}")
print(f"Basal-only (EGF-suppressed) : {len(basal_only)}")
print("\nSample EGF [Epidermal Growth Factor]-stimulated edges:")
for eid in list(G._slices["egf_stimulated"]["edges"])[:8]:
edef = G.edge_definitions.get(eid)
if edef:
ea_row = G.edge_attributes.filter(pl.col("edge_id") == eid)
layer = ea_row["layer"][0] if len(ea_row) > 0 else "?"
print(f" {edef[0]:25s} → {edef[1]:25s} [{layer}]")
EGF [Epidermal Growth Factor]-stimulated slice edges : 2552 Basal slice edges : 31898 Shared (always active) : 2552 EGF [Epidermal Growth Factor]-only edges : 0 Basal-only (EGF-suppressed) : 29346 Sample EGF [Epidermal Growth Factor]-stimulated edges: prot:MAPK1 → prot:DUSP5 [signaling] prot:SRC → prot:ITGAL [signaling] prot:AKT1 → prot:AKT1S1 [signaling] prot:CDR2 → prot:MYC [signaling] prot:PAK1 → prot:AKT1 [signaling] prot:MAPK3 → prot:GTF2I [signaling] prot:WDR83 → prot:MAP2K1 [signaling] prot:EGFR → prot:SOX2 [signaling]
14. Benchmark — AnnNet Hyperedges vs NetworkX [Network Analysis in Python] Binary Expansion¶
Demonstrates the memory and fidelity advantage of native hyperedge representation over the conventional approach of expanding every reaction into all pairwise binary edges.
metabolic_eids = [
eid for eid, kind in G.edge_kind.items()
if kind == "hyper" and eid in G._slices["metabolic"]["edges"]
]
metabolic_edges_raw = [
(list(G.hyperedge_definitions[eid].get("head", [])),
list(G.hyperedge_definitions[eid].get("tail", [])),
eid)
for eid in metabolic_eids
]
binary_expansion_count = sum(len(h) * len(t) for h, t, _ in metabolic_edges_raw)
print(f"Metabolic hyperedges (AnnNet): {len(metabolic_edges_raw)}")
print(f"Binary edges needed (NX [NetworkX] expansion): {binary_expansion_count}")
print(f"Edge inflation factor: {binary_expansion_count / len(metabolic_edges_raw):.1f}x")
tracemalloc.start()
_ = [eid for eid in G._slices["metabolic"]["edges"]]
annnet_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()
tracemalloc.start()
t0 = time.perf_counter()
NX, _ = to_nx(G, hyperedge_mode="expand", slices=["metabolic"])
t_nx = time.perf_counter() - t0
nx_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()
print(f"\nNetworkX [Network Analysis in Python] (expanded): {NX.number_of_nodes()} nodes, {NX.number_of_edges()} edges")
print(f" Build time : {t_nx:.3f}s")
print(f" Peak memory: {nx_mem / 1e6:.1f} MB")
print(f"\nAnnNet metabolic slice memory : {annnet_mem / 1e6:.4f} MB")
print(f"Stoichiometry preserved in AnnNet : YES")
print(f"Stoichiometry preserved in NetworkX : NO (binary adjacency only)")
Metabolic hyperedges (AnnNet): 12971 Binary edges needed (NX [NetworkX] expansion): 79886 Edge inflation factor: 6.2x NetworkX [Network Analysis in Python] (expanded): 45571 nodes, 276293 edges Build time : 11.924s Peak memory: 391.5 MB AnnNet metabolic slice memory : 0.1203 MB Stoichiometry preserved in AnnNet : YES Stoichiometry preserved in NetworkX : NO (binary adjacency only)
15. Multilayer Metrics¶
# Participation coefficient: fraction of a vertex's edges that span multiple layers.
# Computed manually to correctly handle hyperedges alongside binary edges.
layers = ["signaling", "metabolic", "complex", "regulatory"]
layer_edges = {
l: set(G.edge_attributes.filter(pl.col("layer") == l)["edge_id"].to_list())
for l in layers
}
vertex_layer_deg = defaultdict(lambda: defaultdict(int))
for layer, eids in layer_edges.items():
for eid in eids:
ed = G.edge_definitions.get(eid)
hd = G.hyperedge_definitions.get(eid)
if hd is not None:
involved = hd.get("head", set()) | hd.get("tail", set()) | hd.get("members", set())
for v in involved:
if v and not v.startswith("__"):
vertex_layer_deg[v][layer] += 1
elif ed is not None and ed[0] is not None:
vertex_layer_deg[ed[0]][layer] += 1
vertex_layer_deg[ed[1]][layer] += 1
participation = {
v: 1.0 - sum((ld[l] / sum(ld.values())) ** 2 for l in ld)
for v, ld in vertex_layer_deg.items()
if sum(ld.values()) > 0
}
p_vals = np.array(list(participation.values()))
print("Participation coefficient (all 4 layers, covers hyperedges):")
print(f" Vertices with edges : {len(p_vals)}")
print(f" Mean P : {p_vals.mean():.4f}")
print(f" Max P : {p_vals.max():.4f}")
print(f" P>0.3 (multi-layer) : {(p_vals > 0.3).sum()}")
top10 = sorted(participation.items(), key=lambda x: -x[1])[:10]
print(" Top 10:")
for v, p in top10:
ld = vertex_layer_deg[v]
print(f" {v:35s} P={p:.3f} " + " ".join(f"{l[:3]}={ld.get(l, 0)}" for l in layers))
Participation coefficient (all 4 layers, covers hyperedges):
Vertices with edges : 21104
Mean P : 0.0000
Max P : 0.0000
P>0.3 (multi-layer) : 0
Top 10:
prot:AR P=0.000 sig=118 met=0 com=0 reg=0
prot:CTNNB1 P=0.000 sig=160 met=0 com=0 reg=0
prot:PAK1 P=0.000 sig=124 met=0 com=0 reg=0
prot:AKT1 P=0.000 sig=360 met=0 com=0 reg=0
prot:F9 P=0.000 sig=8 met=0 com=0 reg=0
prot:LRP1 P=0.000 sig=55 met=0 com=0 reg=0
prot:MDM2 P=0.000 sig=110 met=0 com=0 reg=0
prot:IGF1R P=0.000 sig=125 met=0 com=0 reg=0
prot:IGSF10 P=0.000 sig=4 met=0 com=0 reg=0
prot:CD200R1L P=0.000 sig=2 met=0 com=0 reg=0
# Supra-adjacency matrix over binary layers only (signaling + regulatory).
print("Supra-adjacency (signaling + regulatory layers — binary intra edges only):")
A = G.supra_adjacency(["signaling", "regulatory"])
print(f" Shape : {A.shape}")
print(f" Non-zeros : {A.nnz}")
Supra-adjacency (signaling + regulatory layers — binary intra edges only): Shape : (37114, 37114) Non-zeros : 49398
# EGF [Epidermal Growth Factor]-stimulated subgraph component and hub analysis.
G_egf, _ = to_nx(G, slices=["egf_stimulated"], hyperedge_mode="skip")
G_egf_u = G_egf.to_undirected()
components = sorted(nx.connected_components(G_egf_u), key=len, reverse=True)
print(f"EGF [Epidermal Growth Factor] subgraph : {G_egf.number_of_nodes()} nodes, {G_egf.number_of_edges()} edges")
print(f"Components : {len(components)}")
print(f"Largest component : {len(components[0])} nodes")
largest = G_egf_u.subgraph(components[0]).copy()
density = nx.density(largest)
avg_deg = sum(d for _, d in largest.degree()) / largest.number_of_nodes()
top_hubs = sorted(largest.degree(), key=lambda x: x[1], reverse=True)[:5]
print(f"\nLargest component:")
print(f" Density : {density:.2e}")
print(f" Avg degree : {avg_deg:.2f}")
print(f" Top 5 hubs : {[(v, d) for v, d in top_hubs]}")
EGF [Epidermal Growth Factor] subgraph : 45571 nodes, 80747 edges
Components : 17650
Largest component : 18642 nodes
Largest component:
Density : 4.11e-04
Avg degree : 7.67
Top 5 hubs : [('prot:TP53', 825), ('prot:STAT1', 785), ('gene:STAT1', 736), ('prot:GSK3B', 688), ('prot:ESR1', 637)]
16. Serialisation and Export¶
out_path = DATA / "uc2_hek293.annnet"
G.write(str(out_path), overwrite=True)
print(f"Written: {out_path} ({os.path.getsize(out_path) / 1e6:.1f} MB)")
G2 = AnnNet.read(str(out_path))
print("\nRound-trip check:")
print(f" vertices orig={G.number_of_vertices()} reloaded={G2.number_of_vertices()} "
f"match={G.number_of_vertices() == G2.number_of_vertices()}")
print(f" edges orig={G.number_of_edges()} reloaded={G2.number_of_edges()} "
f"match={G.number_of_edges() == G2.number_of_edges()}")
print(f" slices orig={len(G._slices)} reloaded={len(G2._slices)} "
f"match={len(G._slices) == len(G2._slices)}")
sample_met = next(iter(layer_edges["metabolic"]))
print(f"\n Stoichiometry preserved for {sample_met}:")
print(f" orig : {G.hyperedge_definitions[sample_met]}")
print(f" reloaded : {G2.hyperedge_definitions.get(sample_met)}")
Written: data/uc2_hek293.annnet (7.7 MB)
Round-trip check:
vertices orig=45571 reloaded=45571 match=True
edges orig=98174 reloaded=98174 match=True
slices orig=24 reloaded=24 match=True
Stoichiometry preserved for MAR06128:
orig : {'directed': True, 'head': {'MAM02046e', 'MAM02901e', 'MAM02366c'}, 'tail': {'MAM02366e', 'MAM02901c', 'MAM02046c'}}
reloaded : {'directed': True, 'head': ['MAM02046e', 'MAM02366c', 'MAM02901e'], 'tail': ['MAM02046c', 'MAM02366e', 'MAM02901c']}
cx2_data = to_cx2(G, export_name="UC2_HEK293_signaling", hyperedges="skip")
cx2_path = DATA / "uc2_signaling.cx2"
with open(cx2_path, "w") as f:
json.dump(cx2_data, f)
print(f"CX2 [Cytoscape Exchange Format v2] written: {cx2_path} ({os.path.getsize(cx2_path) / 1e6:.1f} MB)")
print(f"Aspects: {[list(a.keys())[0] for a in cx2_data]}")
CX2 [Cytoscape Exchange Format v2] written: data/uc2_signaling.cx2 (164.0 MB) Aspects: ['CXVersion', 'metaData', 'attributeDeclarations', 'networkAttributes', 'nodes', 'edges', 'status']
# Map entity_type to short kind labels expected by the PyG [PyTorch Geometric] adapter.
kind_map = {"protein": "prot", "gene": "gene"}
G.vertex_attributes = G.vertex_attributes.with_columns(
pl.col("entity_type").replace_strict(kind_map, default="metab").alias("kind")
)
print("Kind value counts:", G.vertex_attributes["kind"].value_counts())
pyg_data = to_pyg(G, hyperedge_mode="reify", device="cpu")
print(f"\nHeteroData node types : {pyg_data.node_types}")
print(f"HeteroData edge types : {pyg_data.edge_types}")
for nt in pyg_data.node_types:
print(f" {nt}: {pyg_data[nt].num_nodes} nodes")
for et in pyg_data.edge_types:
print(f" {et}: {pyg_data[et].edge_index.shape[1]} edges")
Kind value counts: shape: (3, 2)
┌───────┬───────┐
│ kind ┆ count │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪═══════╡
│ metab ┆ 8457 │
│ prot ┆ 18564 │
│ gene ┆ 18564 │
└───────┴───────┘
HeteroData node types : ['prot', 'gene', 'metab', 'hypernode']
HeteroData edge types : [('prot', 'edge', 'prot'), ('metab', 'member_of', 'hypernode'), ('prot', 'member_of', 'hypernode'), ('gene', 'edge', 'gene'), ('gene', 'edge', 'prot'), ('prot', 'edge', 'gene')]
prot: 18557 nodes
gene: 18557 nodes
metab: 8457 nodes
hypernode: 17427 nodes
('prot', 'edge', 'prot'): 31898 edges
('metab', 'member_of', 'hypernode'): 57102 edges
('prot', 'member_of', 'hypernode'): 17066 edges
('gene', 'edge', 'gene'): 15146 edges
('gene', 'edge', 'prot'): 18557 edges
('prot', 'edge', 'gene'): 15146 edges
17. Summary¶
coup_count = G.edge_attributes.filter(pl.col("layer") == "coupling")["edge_id"].len()
rows = [
("Vertices", G.number_of_vertices(), "prot + gene + metab"),
("Edges (total)", G.number_of_edges(), "all layers + coupling"),
("── Signaling", len(layer_edges["signaling"]), "directed, signed binary"),
("── Metabolic", len(layer_edges["metabolic"]), "directed hyperedges, stoichiometric"),
("── Protein complexes", len(layer_edges["complex"]), "undirected hyperedges"),
("── Regulatory (DoRothEA)", len(layer_edges["regulatory"]), "directed TF→target, levels A+B"),
("── Coupling", coup_count, "gene↔prot + TF bridges"),
("Organelle slices", 9, "mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell"),
("Condition slices", 2, "basal / EGF-stimulated"),
("Bridge TFs", len(bridge_tfs), "signaling target + DoRothEA regulator"),
("Max participation coeff", round(float(p_vals.max()), 3), "prot in signaling+complex only"),
("NX edge inflation", f"{binary_expansion_count / len(metabolic_edges_raw):.1f}×", "metabolic layer, hyperedges→binary"),
("Memory advantage vs NX", f"{nx_mem / max(annnet_mem, 1e-9):.0f}×", "metabolic layer filter"),
("Serialized (.annnet)", f"{os.path.getsize(out_path) / 1e6:.1f} MB", "lossless round-trip verified"),
("CX2 export", f"{os.path.getsize(cx2_path) / 1e6:.1f} MB", "full graph, Cytoscape-ready"),
("PyG node types", len(pyg_data.node_types), "prot / gene / metab / hypernode"),
("PyG edge types", len(pyg_data.edge_types), "heterogeneous, reified hyperedges"),
]
print(f"{'Metric':<35} {'Value':>12} Notes")
print("─" * 80)
for name, val, note in rows:
print(f"{name:<35} {str(val):>12} {note}")
Metric Value Notes ──────────────────────────────────────────────────────────────────────────────── Vertices 45571 prot + gene + metab Edges (total) 98174 all layers + coupling ── Signaling 31898 directed, signed binary ── Metabolic 12971 directed hyperedges, stoichiometric ── Protein complexes 0 undirected hyperedges ── Regulatory (DoRothEA) 15146 directed TF→target, levels A+B ── Coupling 0 gene↔prot + TF bridges Organelle slices 9 mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell Condition slices 2 basal / EGF-stimulated Bridge TFs 311 signaling target + DoRothEA regulator Max participation coeff 0.0 prot in signaling+complex only NX edge inflation 6.2× metabolic layer, hyperedges→binary Memory advantage vs NX 3255× metabolic layer filter Serialized (.annnet) 7.7 MB lossless round-trip verified CX2 export 164.0 MB full graph, Cytoscape-ready PyG node types 4 prot / gene / metab / hypernode PyG edge types 6 heterogeneous, reified hyperedges