Use Case 2 — HEK293 Multilayer Biological Network¶

Constructs a cell-line-specific, multilayer heterogeneous graph of HEK293 proteins using:

Signaling layer: OmniPath directed protein–protein interactions
Metabolic layer: Human-GEM stoichiometric hyperedges (SBML [Systems Biology Markup Language])
Complex layer: OmniPath protein complexes as undirected hyperedges
Regulatory layer: DoRothEA TF [Transcription Factor]–target interactions (confidence levels A+B)
Coupling edges: gene→protein translation links + TF protein→target gene bridges

Cell-line specificity is enforced throughout by filtering all interactions to proteins with protein-level expression evidence in the Human Protein Atlas (HPA [Human Protein Atlas]).

1. Imports¶

In [1]:

Copied!





import os
import sys
import time
import json
import hashlib
import zipfile
import tracemalloc
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import polars as pl
import requests
import networkx as nx

import cobra
import omnipath as op
import decoupler as dc
import torch
import torch_geometric
from lxml import etree

sys.path.insert(0, os.path.abspath(".."))
from annnet.core.graph import AnnNet
from annnet.io.read_omnipath import read_omnipath
from annnet.io.SBML_io import from_sbml
from annnet.io.cx2_io import to_cx2
from annnet.adapters.networkx_adapter import to_nx
from annnet.adapters.pyg_adapter import to_pyg
import os
import sys
import time
import json
import hashlib
import zipfile
import tracemalloc
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import polars as pl
import requests
import networkx as nx

import cobra
import omnipath as op
import decoupler as dc
import torch
import torch_geometric
from lxml import etree

sys.path.insert(0, os.path.abspath(".."))
from annnet.core.graph import AnnNet
from annnet.io.read_omnipath import read_omnipath
from annnet.io.SBML_io import from_sbml
from annnet.io.cx2_io import to_cx2
from annnet.adapters.networkx_adapter import to_nx
from annnet.adapters.pyg_adapter import to_pyg

/home/l1boll/miniconda3/envs/myenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

2. Data Acquisition¶

All external files are cached locally under ./data/. Downloads are skipped if the file already exists.

In [2]:

Copied!

DATA = Path("./data")
DATA.mkdir(exist_ok=True)
print(f"Cache dir: {DATA.resolve()}")
DATA = Path("./data")
DATA.mkdir(exist_ok=True)
print(f"Cache dir: {DATA.resolve()}")

Cache dir: /mnt/c/Users/pc/desktop/annnet-remote/notebooks/data

In [3]:

Copied!





HPA_URL = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
HPA_ZIP = DATA / "proteinatlas.tsv.zip"
HPA_TSV = DATA / "proteinatlas.tsv"

if not HPA_TSV.exists():
    print("Downloading HPA [Human Protein Atlas]... (~100 MB, takes a minute)")
    r = requests.get(HPA_URL, stream=True)
    with open(HPA_ZIP, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    with zipfile.ZipFile(HPA_ZIP, "r") as z:
        z.extractall(DATA)
    print("Done.")
else:
    print("Already cached.")
HPA_URL = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
HPA_ZIP = DATA / "proteinatlas.tsv.zip"
HPA_TSV = DATA / "proteinatlas.tsv"

if not HPA_TSV.exists():
    print("Downloading HPA [Human Protein Atlas]... (~100 MB, takes a minute)")
    r = requests.get(HPA_URL, stream=True)
    with open(HPA_ZIP, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    with zipfile.ZipFile(HPA_ZIP, "r") as z:
        z.extractall(DATA)
    print("Done.")
else:
    print("Already cached.")

Already cached.

In [4]:

Copied!





HPA_CELL_TSV = DATA / "rna_celline.tsv"

if not HPA_CELL_TSV.exists():
    print("Downloading HPA [Human Protein Atlas] cell-line RNA [Ribonucleic Acid] table...")
    r = requests.get(
        "https://www.proteinatlas.org/download/rna_celline.tsv.gz", stream=True
    )
    with open(HPA_CELL_TSV, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    print("Done.")
else:
    print("Already cached.")
HPA_CELL_TSV = DATA / "rna_celline.tsv"

if not HPA_CELL_TSV.exists():
    print("Downloading HPA [Human Protein Atlas] cell-line RNA [Ribonucleic Acid] table...")
    r = requests.get(
        "https://www.proteinatlas.org/download/rna_celline.tsv.gz", stream=True
    )
    with open(HPA_CELL_TSV, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    print("Done.")
else:
    print("Already cached.")

Already cached.

In [5]:

Copied!





HUMANGEM_XML = DATA / "Human-GEM.xml"

if not HUMANGEM_XML.exists():
    print("Downloading Human-GEM [Generic Enzyme Model]...")
    r = requests.get(
        "https://raw.githubusercontent.com/SysBioChalmers/Human-GEM/main/model/Human-GEM.xml",
        stream=True,
        allow_redirects=True,
    )
    print("Status:", r.status_code)
    with open(HUMANGEM_XML, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    print(f"Done. Size: {HUMANGEM_XML.stat().st_size} bytes")
else:
    print(f"Already cached. Size: {HUMANGEM_XML.stat().st_size} bytes")
HUMANGEM_XML = DATA / "Human-GEM.xml"

if not HUMANGEM_XML.exists():
    print("Downloading Human-GEM [Generic Enzyme Model]...")
    r = requests.get(
        "https://raw.githubusercontent.com/SysBioChalmers/Human-GEM/main/model/Human-GEM.xml",
        stream=True,
        allow_redirects=True,
    )
    print("Status:", r.status_code)
    with open(HUMANGEM_XML, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    print(f"Done. Size: {HUMANGEM_XML.stat().st_size} bytes")
else:
    print(f"Already cached. Size: {HUMANGEM_XML.stat().st_size} bytes")

Already cached. Size: 43447379 bytes

In [6]:

Copied!





COMPLEXES_TSV = DATA / "omnipath_complexes.tsv"

if not COMPLEXES_TSV.exists():
    r = requests.get(
        "https://omnipathdb.org/complexes?databases=CORUM,ComplexPortal,hu.MAP2&fields=sources&format=tsv",
        timeout=120,
    )
    print("Status:", r.status_code)
    with open(COMPLEXES_TSV, "w") as f:
        f.write(r.text)
    print("Downloaded complexes.")
else:
    print("Already cached:", COMPLEXES_TSV)

complexes_raw = pl.read_csv(COMPLEXES_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", complexes_raw.shape)
print("Columns:", complexes_raw.columns)
print(complexes_raw.head(3))
COMPLEXES_TSV = DATA / "omnipath_complexes.tsv"

if not COMPLEXES_TSV.exists():
    r = requests.get(
        "https://omnipathdb.org/complexes?databases=CORUM,ComplexPortal,hu.MAP2&fields=sources&format=tsv",
        timeout=120,
    )
    print("Status:", r.status_code)
    with open(COMPLEXES_TSV, "w") as f:
        f.write(r.text)
    print("Downloaded complexes.")
else:
    print("Already cached:", COMPLEXES_TSV)

complexes_raw = pl.read_csv(COMPLEXES_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", complexes_raw.shape)
print("Columns:", complexes_raw.columns)
print(complexes_raw.head(3))

Already cached: data/omnipath_complexes.tsv
Shape: (11510, 7)
Columns: ['name', 'components', 'components_genesymbols', 'stoichiometry', 'sources', 'references', 'identifiers']
shape: (3, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ name         ┆ components  ┆ components_ ┆ stoichiomet ┆ sources     ┆ references  ┆ identifiers │
│ ---          ┆ ---         ┆ genesymbols ┆ ry          ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ ---         ┆ ---         ┆ str         ┆ str         ┆ str         │
│              ┆             ┆ str         ┆ str         ┆             ┆             ┆             │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ NFY          ┆ P23511_P252 ┆ NFYA_NFYB_N ┆ 1:1:1       ┆ hu.MAP;SIGN ┆ 9372932;147 ┆ PDB:8qu4;PD │
│              ┆ 08_Q13952   ┆ FYC         ┆             ┆ OR;CORUM;Co ┆ 55292;15243 ┆ B:6qmq;PDB: │
│              ┆             ┆             ┆             ┆ mplexPor…   ┆ 141         ┆ 8qu2;PDB…   │
│ SCF-betaTRCP ┆ P63208_Q136 ┆ BTRC_CUL1_S ┆ 1:1:1       ┆ SPIKE;SIGNO ┆ 9990852     ┆ Compleat:HC │
│              ┆ 16_Q9Y297   ┆ KP1         ┆             ┆ R;Compleat; ┆             ┆ 757;SIGNOR: │
│              ┆             ┆             ┆             ┆ CORUM       ┆             ┆ SIGNOR-C…   │
│ SMAD2/SMAD4  ┆ Q13485_Q157 ┆ SMAD2_SMAD4 ┆ 1:2         ┆ SIGNOR;Comp ┆ 12923550;80 ┆ intact:EBI- │
│              ┆ 96          ┆             ┆             ┆ lexPortal;P ┆ 61611;40654 ┆ 9692430;PDB │
│              ┆             ┆             ┆             ┆ DB          ┆ 10;16322…   ┆ :1u7v;SI…   │
└──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘

In [7]:

Copied!





DOROTHEA_TSV = DATA / "dorothea.tsv"

if not DOROTHEA_TSV.exists():
    print("Downloading DoRothEA [Discriminant Regulon Expression Analysis]...")
    r = requests.get(
        "https://omnipathdb.org/interactions?datasets=dorothea&dorothea_levels=A,B"
        "&genesymbols=yes&fields=sources,references&format=tsv",
        timeout=120,
    )
    print("Status:", r.status_code)
    with open(DOROTHEA_TSV, "w") as f:
        f.write(r.text)
    print(f"Done. Size: {DOROTHEA_TSV.stat().st_size} bytes")
else:
    print("Already cached.")

dorothea = pl.read_csv(DOROTHEA_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", dorothea.shape)
print("Columns:", dorothea.columns)
print(dorothea.head(3))
DOROTHEA_TSV = DATA / "dorothea.tsv"

if not DOROTHEA_TSV.exists():
    print("Downloading DoRothEA [Discriminant Regulon Expression Analysis]...")
    r = requests.get(
        "https://omnipathdb.org/interactions?datasets=dorothea&dorothea_levels=A,B"
        "&genesymbols=yes&fields=sources,references&format=tsv",
        timeout=120,
    )
    print("Status:", r.status_code)
    with open(DOROTHEA_TSV, "w") as f:
        f.write(r.text)
    print(f"Done. Size: {DOROTHEA_TSV.stat().st_size} bytes")
else:
    print("Already cached.")

dorothea = pl.read_csv(DOROTHEA_TSV, separator="\t", infer_schema_length=5000)
print("Shape:", dorothea.shape)
print("Columns:", dorothea.columns)
print(dorothea.head(3))

Already cached.
Shape: (15267, 12)
Columns: ['source', 'target', 'source_genesymbol', 'target_genesymbol', 'is_directed', 'is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition', 'sources', 'references']
shape: (3, 12)
┌────────┬────────┬────────────┬────────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ source ┆ target ┆ source_gen ┆ target_gen ┆ … ┆ consensus_ ┆ consensus_ ┆ sources    ┆ reference │
│ ---    ┆ ---    ┆ esymbol    ┆ esymbol    ┆   ┆ stimulatio ┆ inhibition ┆ ---        ┆ s         │
│ str    ┆ str    ┆ ---        ┆ ---        ┆   ┆ n          ┆ ---        ┆ str        ┆ ---       │
│        ┆        ┆ str        ┆ str        ┆   ┆ ---        ┆ bool       ┆            ┆ str       │
│        ┆        ┆            ┆            ┆   ┆ bool       ┆            ┆            ┆           │
╞════════╪════════╪════════════╪════════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ Q16656 ┆ Q6UUV9 ┆ NRF1       ┆ CRTC1      ┆ … ┆ false      ┆ false      ┆ HOCOMOCO_D ┆ null      │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ oRothEA;EN ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ CODE-proxi ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ …          ┆           │
│ Q16656 ┆ Q9Y2K7 ┆ NRF1       ┆ KDM2A      ┆ … ┆ false      ┆ false      ┆ HOCOMOCO_D ┆ null      │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ oRothEA;Re ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ Map_DoRoth ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ …          ┆           │
│ Q16656 ┆ Q96G74 ┆ NRF1       ┆ OTUD5      ┆ … ┆ false      ┆ false      ┆ ReMap_DoRo ┆ null      │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ thEA;ARACN ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ e-GTEx_DoR ┆           │
│        ┆        ┆            ┆            ┆   ┆            ┆            ┆ …          ┆           │
└────────┴────────┴────────────┴────────────┴───┴────────────┴────────────┴────────────┴───────────┘

3. HEK293 Expressed Protein Set¶

Filter the HPA [Human Protein Atlas] atlas to genes with protein-level experimental evidence. This is the conservative but reproducible criterion for "expressed in HEK293": only proteins for which mass-spectrometry or antibody-based evidence exists are retained.

In [8]:

Copied!

hpa_raw = pl.read_csv(HPA_TSV, separator="\t", infer_schema_length=10000)
print("Columns:", hpa_raw.columns[:10])
print("Shape:", hpa_raw.shape)
hpa_raw = pl.read_csv(HPA_TSV, separator="\t", infer_schema_length=10000)
print("Columns:", hpa_raw.columns[:10])
print("Shape:", hpa_raw.shape)

Columns: ['Gene', 'Gene synonym', 'Ensembl', 'Gene description', 'Uniprot', 'Chromosome', 'Position', 'Protein class', 'Biological process', 'Molecular function']
Shape: (20162, 107)

In [9]:

Copied!





# The main atlas file contains per-gene summaries. Filter to protein-level evidence
# (mass-spectrometry or antibody-validated) — these are confirmed translated proteins.
hpa_expressed = hpa_raw.filter(
    pl.col("Evidence").str.contains("protein level")
    | pl.col("UniProt evidence").str.contains("protein level")
).select(
    ["Gene", "Uniprot", "Subcellular main location", "Subcellular additional location", "Evidence"]
)

print("Expressed proteins (protein-level evidence):", hpa_expressed.shape)
print(hpa_expressed.head(5))
# The main atlas file contains per-gene summaries. Filter to protein-level evidence
# (mass-spectrometry or antibody-validated) — these are confirmed translated proteins.
hpa_expressed = hpa_raw.filter(
    pl.col("Evidence").str.contains("protein level")
    | pl.col("UniProt evidence").str.contains("protein level")
).select(
    ["Gene", "Uniprot", "Subcellular main location", "Subcellular additional location", "Evidence"]
)

print("Expressed proteins (protein-level evidence):", hpa_expressed.shape)
print(hpa_expressed.head(5))

Expressed proteins (protein-level evidence): (18564, 5)
shape: (5, 5)
┌──────────┬─────────┬───────────────────────────┬───────────────────────────┬─────────────────────┐
│ Gene     ┆ Uniprot ┆ Subcellular main location ┆ Subcellular additional    ┆ Evidence            │
│ ---      ┆ ---     ┆ ---                       ┆ locatio…                  ┆ ---                 │
│ str      ┆ str     ┆ str                       ┆ ---                       ┆ str                 │
│          ┆         ┆                           ┆ str                       ┆                     │
╞══════════╪═════════╪═══════════════════════════╪═══════════════════════════╪═════════════════════╡
│ TSPAN6   ┆ O43657  ┆ Cell Junctions, Cytosol   ┆ Nucleoli fibrillar center ┆ Evidence at protein │
│          ┆         ┆                           ┆                           ┆ level               │
│ TNMD     ┆ Q9H2S6  ┆ null                      ┆ null                      ┆ Evidence at protein │
│          ┆         ┆                           ┆                           ┆ level               │
│ DPM1     ┆ O60762  ┆ null                      ┆ null                      ┆ Evidence at protein │
│          ┆         ┆                           ┆                           ┆ level               │
│ SCYL3    ┆ Q8IZE3  ┆ Golgi apparatus, Cytosol  ┆ null                      ┆ Evidence at protein │
│          ┆         ┆                           ┆                           ┆ level               │
│ C1orf112 ┆ Q9NSG2  ┆ Nucleoplasm               ┆ Nucleoli                  ┆ Evidence at protein │
│          ┆         ┆                           ┆                           ┆ level               │
└──────────┴─────────┴───────────────────────────┴───────────────────────────┴─────────────────────┘

In [10]:

Copied!

hek293_genes = set(hpa_expressed["Gene"].to_list())
print(f"HEK293 [Human Embryonic Kidney 293] expressed gene set size: {len(hek293_genes)}")
hek293_genes = set(hpa_expressed["Gene"].to_list())
print(f"HEK293 [Human Embryonic Kidney 293] expressed gene set size: {len(hek293_genes)}")

HEK293 [Human Embryonic Kidney 293] expressed gene set size: 18557

4. Vertex Table Construction¶

Two vertex types are created per expressed gene:

prot:GENENAME — the protein product, localised to its HPA [Human Protein Atlas] subcellular compartment
gene:GENENAME — the gene locus, always assigned to the nucleus (n)

GEM [Generic Enzyme Model] single-letter compartment codes are used throughout.

In [11]:

Copied!





# Maps HPA [Human Protein Atlas] location strings to GEM [Generic Enzyme Model] single-letter compartment codes.
# First match wins; unmapped locations default to cytoplasm ('c').
COMPARTMENT_MAP = {
    "Mitochondria":        "m",
    "Nucleus":             "n",
    "Nucleoplasm":         "n",
    "Nucleoli":            "n",
    "Nuclear membrane":    "n",
    "Endoplasmic reticulum": "r",
    "Golgi apparatus":     "g",
    "Lysosome":            "l",
    "Cytosol":             "c",
    "Cytoplasm":           "c",
    "Plasma membrane":     "p",
    "Cell Junctions":      "p",
    "Peroxisome":          "x",
    "Vesicles":            "v",
    "Extracellular":       "e",
}


def map_compartment(location_str: str | None) -> str:
    if not location_str:
        return "c"
    loc_lower = location_str.lower()
    for key, code in COMPARTMENT_MAP.items():
        if key.lower() in loc_lower:
            return code
    return "c"


hpa_expressed = hpa_expressed.with_columns([
    pl.col("Subcellular main location")
      .map_elements(map_compartment, return_dtype=pl.Utf8)
      .alias("compartment")
])

print(hpa_expressed.select(["Gene", "compartment"]).head(10))
print("\nCompartment distribution:")
print(hpa_expressed["compartment"].value_counts().sort("count", descending=True))
# Maps HPA [Human Protein Atlas] location strings to GEM [Generic Enzyme Model] single-letter compartment codes.
# First match wins; unmapped locations default to cytoplasm ('c').
COMPARTMENT_MAP = {
    "Mitochondria":        "m",
    "Nucleus":             "n",
    "Nucleoplasm":         "n",
    "Nucleoli":            "n",
    "Nuclear membrane":    "n",
    "Endoplasmic reticulum": "r",
    "Golgi apparatus":     "g",
    "Lysosome":            "l",
    "Cytosol":             "c",
    "Cytoplasm":           "c",
    "Plasma membrane":     "p",
    "Cell Junctions":      "p",
    "Peroxisome":          "x",
    "Vesicles":            "v",
    "Extracellular":       "e",
}


def map_compartment(location_str: str | None) -> str:
    if not location_str:
        return "c"
    loc_lower = location_str.lower()
    for key, code in COMPARTMENT_MAP.items():
        if key.lower() in loc_lower:
            return code
    return "c"


hpa_expressed = hpa_expressed.with_columns([
    pl.col("Subcellular main location")
      .map_elements(map_compartment, return_dtype=pl.Utf8)
      .alias("compartment")
])

print(hpa_expressed.select(["Gene", "compartment"]).head(10))
print("\nCompartment distribution:")
print(hpa_expressed["compartment"].value_counts().sort("count", descending=True))

shape: (10, 2)
┌──────────┬─────────────┐
│ Gene     ┆ compartment │
│ ---      ┆ ---         │
│ str      ┆ str         │
╞══════════╪═════════════╡
│ TSPAN6   ┆ c           │
│ TNMD     ┆ null        │
│ DPM1     ┆ null        │
│ SCYL3    ┆ g           │
│ C1orf112 ┆ n           │
│ FGR      ┆ p           │
│ CFH      ┆ v           │
│ FUCA2    ┆ null        │
│ GCLC     ┆ n           │
│ NFYA     ┆ n           │
└──────────┴─────────────┘

Compartment distribution:
shape: (10, 2)
┌─────────────┬───────┐
│ compartment ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ n           ┆ 5405  │
│ null        ┆ 5236  │
│ c           ┆ 3510  │
│ p           ┆ 1141  │
│ v           ┆ 1081  │
│ m           ┆ 966   │
│ g           ┆ 746   │
│ r           ┆ 440   │
│ x           ┆ 21    │
│ l           ┆ 18    │
└─────────────┴───────┘

In [12]:

Copied!





vertex_proteins = hpa_expressed.select([
    pl.concat_str([pl.lit("prot:"), pl.col("Gene")]).alias("vertex_id"),
    pl.col("Gene").alias("gene_symbol"),
    pl.col("Uniprot").alias("uniprot_id"),
    pl.lit("protein").alias("entity_type"),
    pl.col("compartment"),
    pl.col("Subcellular main location").alias("hpa_location"),
    pl.lit(None).alias("recon3d_id"),
])

# Genes always localised to nucleus — they exist as separate vertices to
# support gene→protein coupling and TF [Transcription Factor]→target edges.
vertex_genes = hpa_expressed.select([
    pl.concat_str([pl.lit("gene:"), pl.col("Gene")]).alias("vertex_id"),
    pl.col("Gene").alias("gene_symbol"),
    pl.col("Uniprot").alias("uniprot_id"),
    pl.lit("gene").alias("entity_type"),
    pl.lit("n").alias("compartment"),
    pl.col("Subcellular main location").alias("hpa_location"),
    pl.lit(None).alias("recon3d_id"),
])

vertex_table = pl.concat([vertex_proteins, vertex_genes])
print("Vertex table shape:", vertex_table.shape)
print(vertex_table.head(5))
vertex_proteins = hpa_expressed.select([
    pl.concat_str([pl.lit("prot:"), pl.col("Gene")]).alias("vertex_id"),
    pl.col("Gene").alias("gene_symbol"),
    pl.col("Uniprot").alias("uniprot_id"),
    pl.lit("protein").alias("entity_type"),
    pl.col("compartment"),
    pl.col("Subcellular main location").alias("hpa_location"),
    pl.lit(None).alias("recon3d_id"),
])

# Genes always localised to nucleus — they exist as separate vertices to
# support gene→protein coupling and TF [Transcription Factor]→target edges.
vertex_genes = hpa_expressed.select([
    pl.concat_str([pl.lit("gene:"), pl.col("Gene")]).alias("vertex_id"),
    pl.col("Gene").alias("gene_symbol"),
    pl.col("Uniprot").alias("uniprot_id"),
    pl.lit("gene").alias("entity_type"),
    pl.lit("n").alias("compartment"),
    pl.col("Subcellular main location").alias("hpa_location"),
    pl.lit(None).alias("recon3d_id"),
])

vertex_table = pl.concat([vertex_proteins, vertex_genes])
print("Vertex table shape:", vertex_table.shape)
print(vertex_table.head(5))

Vertex table shape: (37128, 7)
shape: (5, 7)
┌───────────────┬─────────────┬────────────┬─────────────┬─────────────┬──────────────┬────────────┐
│ vertex_id     ┆ gene_symbol ┆ uniprot_id ┆ entity_type ┆ compartment ┆ hpa_location ┆ recon3d_id │
│ ---           ┆ ---         ┆ ---        ┆ ---         ┆ ---         ┆ ---          ┆ ---        │
│ str           ┆ str         ┆ str        ┆ str         ┆ str         ┆ str          ┆ null       │
╞═══════════════╪═════════════╪════════════╪═════════════╪═════════════╪══════════════╪════════════╡
│ prot:TSPAN6   ┆ TSPAN6      ┆ O43657     ┆ protein     ┆ c           ┆ Cell         ┆ null       │
│               ┆             ┆            ┆             ┆             ┆ Junctions,   ┆            │
│               ┆             ┆            ┆             ┆             ┆ Cytosol      ┆            │
│ prot:TNMD     ┆ TNMD        ┆ Q9H2S6     ┆ protein     ┆ null        ┆ null         ┆ null       │
│ prot:DPM1     ┆ DPM1        ┆ O60762     ┆ protein     ┆ null        ┆ null         ┆ null       │
│ prot:SCYL3    ┆ SCYL3       ┆ Q8IZE3     ┆ protein     ┆ g           ┆ Golgi        ┆ null       │
│               ┆             ┆            ┆             ┆             ┆ apparatus,   ┆            │
│               ┆             ┆            ┆             ┆             ┆ Cytosol      ┆            │
│ prot:C1orf112 ┆ C1orf112    ┆ Q9NSG2     ┆ protein     ┆ n           ┆ Nucleoplasm  ┆ null       │
└───────────────┴─────────────┴────────────┴─────────────┴─────────────┴──────────────┴────────────┘

In [13]:

Copied!





vertex_table = vertex_table.with_columns(
    pl.col("compartment").fill_null("c")
)
print("Null compartments remaining:", vertex_table["compartment"].null_count())
print("Compartment distribution:")
print(vertex_table["compartment"].value_counts().sort("count", descending=True))
vertex_table = vertex_table.with_columns(
    pl.col("compartment").fill_null("c")
)
print("Null compartments remaining:", vertex_table["compartment"].null_count())
print("Compartment distribution:")
print(vertex_table["compartment"].value_counts().sort("count", descending=True))

Null compartments remaining: 0
Compartment distribution:
shape: (9, 2)
┌─────────────┬───────┐
│ compartment ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ n           ┆ 23969 │
│ c           ┆ 8746  │
│ p           ┆ 1141  │
│ v           ┆ 1081  │
│ m           ┆ 966   │
│ g           ┆ 746   │
│ r           ┆ 440   │
│ x           ┆ 21    │
│ l           ┆ 18    │
└─────────────┴───────┘

5. AnnNet Graph Initialisation¶

In [14]:

Copied!

G = AnnNet(directed=True)

G.add_vertices_bulk(vertex_table.to_dicts())

print("Vertices loaded:", G.num_vertices)
print("Sample vertex attributes:")
print(G.vertex_attributes.head(5))
G = AnnNet(directed=True)

G.add_vertices_bulk(vertex_table.to_dicts())

print("Vertices loaded:", G.num_vertices)
print("Sample vertex attributes:")
print(G.vertex_attributes.head(5))

Vertices loaded: 37114
Sample vertex attributes:
shape: (5, 7)
┌───────────────┬──────────────┬────────────┬────────────┬─────────────┬─────────────┬─────────────┐
│ vertex_id     ┆ hpa_location ┆ recon3d_id ┆ uniprot_id ┆ entity_type ┆ gene_symbol ┆ compartment │
│ ---           ┆ ---          ┆ ---        ┆ ---        ┆ ---         ┆ ---         ┆ ---         │
│ str           ┆ str          ┆ null       ┆ str        ┆ str         ┆ str         ┆ str         │
╞═══════════════╪══════════════╪════════════╪════════════╪═════════════╪═════════════╪═════════════╡
│ prot:TSPAN6   ┆ Cell         ┆ null       ┆ O43657     ┆ protein     ┆ TSPAN6      ┆ c           │
│               ┆ Junctions,   ┆            ┆            ┆             ┆             ┆             │
│               ┆ Cytosol      ┆            ┆            ┆             ┆             ┆             │
│ prot:TNMD     ┆ null         ┆ null       ┆ Q9H2S6     ┆ protein     ┆ TNMD        ┆ c           │
│ prot:DPM1     ┆ null         ┆ null       ┆ O60762     ┆ protein     ┆ DPM1        ┆ c           │
│ prot:SCYL3    ┆ Golgi        ┆ null       ┆ Q8IZE3     ┆ protein     ┆ SCYL3       ┆ g           │
│               ┆ apparatus,   ┆            ┆            ┆             ┆             ┆             │
│               ┆ Cytosol      ┆            ┆            ┆             ┆             ┆             │
│ prot:C1orf112 ┆ Nucleoplasm  ┆ null       ┆ Q9NSG2     ┆ protein     ┆ C1orf112    ┆ n           │
└───────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴─────────────┘

In [15]:

Copied!





# Initialise the Kivelä multilayer structure with four biological aspects.
G.set_aspects(["type"], {"type": ["signaling", "metabolic", "complex", "regulatory"]})

for row in G.vertex_attributes.iter_rows(named=True):
    vid = row["vertex_id"]
    etype = row["entity_type"]
    if etype == "protein":
        G.add_presence(vid, ("signaling",))
        G.add_presence(vid, ("complex",))
    elif etype == "gene":
        G.add_presence(vid, ("regulatory",))

print(f"Aspects    : {G.aspects}")
print(f"Elem layers: {G.elem_layers}")
print(f"_VM size   : {len(G._VM)}")
# Initialise the Kivelä multilayer structure with four biological aspects.
G.set_aspects(["type"], {"type": ["signaling", "metabolic", "complex", "regulatory"]})

for row in G.vertex_attributes.iter_rows(named=True):
    vid = row["vertex_id"]
    etype = row["entity_type"]
    if etype == "protein":
        G.add_presence(vid, ("signaling",))
        G.add_presence(vid, ("complex",))
    elif etype == "gene":
        G.add_presence(vid, ("regulatory",))

print(f"Aspects    : {G.aspects}")
print(f"Elem layers: {G.elem_layers}")
print(f"_VM size   : {len(G._VM)}")

Aspects    : ['type']
Elem layers: {'type': ['signaling', 'metabolic', 'complex', 'regulatory']}
_VM size   : 55671

6. Signaling Layer — OmniPath Directed PPI [Protein–Protein Interaction]¶

In [16]:

Copied!





signaling_raw = op.interactions.OmniPath.get(
    genesymbols=True,
    fields=["sources", "references", "curation_effort"],
)
print("Columns:", signaling_raw.columns.tolist())
print(signaling_raw.head(3))
signaling_raw = op.interactions.OmniPath.get(
    genesymbols=True,
    fields=["sources", "references", "curation_effort"],
)
print("Columns:", signaling_raw.columns.tolist())
print(signaling_raw.head(3))

Columns: ['source', 'target', 'source_genesymbol', 'target_genesymbol', 'is_directed', 'is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition', 'sources', 'references', 'curation_effort', 'n_sources', 'n_primary_sources', 'n_references', 'references_stripped']
   source  target source_genesymbol target_genesymbol  is_directed  \
0  P0DP25  P48995             CALM3             TRPC1         True   
1  P0DP23  P48995             CALM1             TRPC1         True   
2  P0DP24  P48995             CALM2             TRPC1         True   

   is_stimulation  is_inhibition  consensus_direction  consensus_stimulation  \
0           False           True                 True                  False   
1           False           True                 True                  False   
2           False           True                 True                  False   

   consensus_inhibition sources                                 references  \
0                  True    TRIP  TRIP:11983166;TRIP:12601176;TRIP:11290752   
1                  True    TRIP  TRIP:11983166;TRIP:12601176;TRIP:11290752   
2                  True    TRIP  TRIP:11983166;TRIP:12601176;TRIP:11290752   

   curation_effort  n_sources  n_primary_sources n_references  \
0                3          1                  1            3   
1                3          1                  1            3   
2                3          1                  1            3   

          references_stripped  
0  11290752;11983166;12601176  
1  11290752;11983166;12601176  
2  11290752;11983166;12601176

In [17]:

Copied!





sig = pl.from_pandas(
    signaling_raw[[
        "source_genesymbol", "target_genesymbol",
        "is_stimulation", "is_inhibition",
        "consensus_direction", "consensus_stimulation", "consensus_inhibition",
        "sources", "references",
    ]].dropna(subset=["source_genesymbol", "target_genesymbol"])
)

sig = sig.filter(
    pl.col("source_genesymbol").is_in(hek293_genes)
    & pl.col("target_genesymbol").is_in(hek293_genes)
)

print("Signaling edges after HEK293 [Human Embryonic Kidney 293] filter:", sig.shape)
print(sig.head(3))
sig = pl.from_pandas(
    signaling_raw[[
        "source_genesymbol", "target_genesymbol",
        "is_stimulation", "is_inhibition",
        "consensus_direction", "consensus_stimulation", "consensus_inhibition",
        "sources", "references",
    ]].dropna(subset=["source_genesymbol", "target_genesymbol"])
)

sig = sig.filter(
    pl.col("source_genesymbol").is_in(hek293_genes)
    & pl.col("target_genesymbol").is_in(hek293_genes)
)

print("Signaling edges after HEK293 [Human Embryonic Kidney 293] filter:", sig.shape)
print(sig.head(3))

Signaling edges after HEK293 [Human Embryonic Kidney 293] filter: (31898, 9)
shape: (3, 9)
┌────────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬─────────┬───────────┐
│ source_gen ┆ target_ge ┆ is_stimul ┆ is_inhibi ┆ … ┆ consensus ┆ consensus ┆ sources ┆ reference │
│ esymbol    ┆ nesymbol  ┆ ation     ┆ tion      ┆   ┆ _stimulat ┆ _inhibiti ┆ ---     ┆ s         │
│ ---        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ion       ┆ on        ┆ str     ┆ ---       │
│ str        ┆ str       ┆ bool      ┆ bool      ┆   ┆ ---       ┆ ---       ┆         ┆ str       │
│            ┆           ┆           ┆           ┆   ┆ bool      ┆ bool      ┆         ┆           │
╞════════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═════════╪═══════════╡
│ CALM3      ┆ TRPC1     ┆ false     ┆ true      ┆ … ┆ false     ┆ true      ┆ TRIP    ┆ TRIP:1198 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ 3166;TRIP │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ :12601176 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ ;TR…      │
│ CALM1      ┆ TRPC1     ┆ false     ┆ true      ┆ … ┆ false     ┆ true      ┆ TRIP    ┆ TRIP:1198 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ 3166;TRIP │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ :12601176 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ ;TR…      │
│ CALM2      ┆ TRPC1     ┆ false     ┆ true      ┆ … ┆ false     ┆ true      ┆ TRIP    ┆ TRIP:1198 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ 3166;TRIP │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ :12601176 │
│            ┆           ┆           ┆           ┆   ┆           ┆           ┆         ┆ ;TR…      │
└────────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴─────────┴───────────┘

In [18]:

Copied!





sig_tuples = []
for row in sig.iter_rows(named=True):
    src  = f"prot:{row['source_genesymbol']}"
    tgt  = f"prot:{row['target_genesymbol']}"
    sign = 1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0)
    sig_tuples.append((src, tgt, sign))

eids = G.add_intra_edges_bulk(sig_tuples, ("signaling",), fast_mode=True)

sig_attrs = {
    eid: {
        "edge_type":      "signaling",
        "layer":          "signaling",
        "is_stimulation": row["is_stimulation"],
        "is_inhibition":  row["is_inhibition"],
        "consensus_dir":  row["consensus_direction"],
        "sources":        row["sources"],
    }
    for eid, row in zip(eids, sig.iter_rows(named=True))
}

G.set_edge_attrs_bulk(sig_attrs)
print(f"Signaling edges added: {len(eids)}")
print(f"Total edges: {G.num_edges}")
sig_tuples = []
for row in sig.iter_rows(named=True):
    src  = f"prot:{row['source_genesymbol']}"
    tgt  = f"prot:{row['target_genesymbol']}"
    sign = 1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0)
    sig_tuples.append((src, tgt, sign))

eids = G.add_intra_edges_bulk(sig_tuples, ("signaling",), fast_mode=True)

sig_attrs = {
    eid: {
        "edge_type":      "signaling",
        "layer":          "signaling",
        "is_stimulation": row["is_stimulation"],
        "is_inhibition":  row["is_inhibition"],
        "consensus_dir":  row["consensus_direction"],
        "sources":        row["sources"],
    }
    for eid, row in zip(eids, sig.iter_rows(named=True))
}

G.set_edge_attrs_bulk(sig_attrs)
print(f"Signaling edges added: {len(eids)}")
print(f"Total edges: {G.num_edges}")

Signaling edges added: 31898
Total edges: 31898

7. Metabolic Layer — Human-GEM [Generic Enzyme Model] Stoichiometric Hyperedges¶

Human-GEM is loaded directly via the AnnNet SBML [Systems Biology Markup Language] adapter. Each reaction becomes a directed hyperedge with stoichiometric coefficients preserved in the incidence matrix.

In [19]:

Copied!

from_sbml(str(HUMANGEM_XML), graph=G, slice="metabolic", preserve_stoichiometry=True)

print("Vertices in graph:", G.num_vertices)
print("Edges in graph   :", G.num_edges)
print("Slices           :", list(G._slices.keys()))
from_sbml(str(HUMANGEM_XML), graph=G, slice="metabolic", preserve_stoichiometry=True)

print("Vertices in graph:", G.num_vertices)
print("Edges in graph   :", G.num_edges)
print("Slices           :", list(G._slices.keys()))

Vertices in graph: 45571
Edges in graph   : 44869
Slices           : ['default', 'signaling', 'e', 'x', 'm', 'c', 'l', 'r', 'g', 'n', 'i', 'metabolic']

In [20]:

Copied!





# Register metabolite vertices in the metabolic layer presence map.
added = 0
for vid in G._slices["metabolic"]["vertices"]:
    if G.entity_types.get(vid) == "vertex" and not vid.startswith("__"):
        G.add_presence(vid, ("metabolic",))
        added += 1
print(f"Metabolite presences registered: {added}")
print(f"_VM size now: {len(G._VM)}")
# Register metabolite vertices in the metabolic layer presence map.
added = 0
for vid in G._slices["metabolic"]["vertices"]:
    if G.entity_types.get(vid) == "vertex" and not vid.startswith("__"):
        G.add_presence(vid, ("metabolic",))
        added += 1
print(f"Metabolite presences registered: {added}")
print(f"_VM size now: {len(G._VM)}")

Metabolite presences registered: 8455
_VM size now: 64126

In [21]:

Copied!





metabolic_edges = G._slices["metabolic"]["edges"]
metabolic_verts = G._slices["metabolic"]["vertices"]

sample_eid = next(iter(metabolic_edges))
hdef = G.hyperedge_definitions[sample_eid]

print("Metabolic edges   :", len(metabolic_edges))
print("Metabolic vertices:", len(metabolic_verts))
print("Total vertices    :", G.num_vertices)
print("Total edges       :", G.num_edges)
print(f"\nSample hyperedge  : {sample_eid}")
print("Head (products)   :", hdef["head"])
print("Tail (reactants)  :", hdef["tail"])

col = G.edge_to_idx[sample_eid]
mat = G._matrix.tocsc()
col_data = mat.getcol(col)
rows, _ = col_data.nonzero()
print("\nStoichiometric coefficients:")
for r in rows:
    vid = G.idx_to_entity[r]
    coeff = col_data[r, 0]
    print(f"  {vid}: {coeff}")
metabolic_edges = G._slices["metabolic"]["edges"]
metabolic_verts = G._slices["metabolic"]["vertices"]

sample_eid = next(iter(metabolic_edges))
hdef = G.hyperedge_definitions[sample_eid]

print("Metabolic edges   :", len(metabolic_edges))
print("Metabolic vertices:", len(metabolic_verts))
print("Total vertices    :", G.num_vertices)
print("Total edges       :", G.num_edges)
print(f"\nSample hyperedge  : {sample_eid}")
print("Head (products)   :", hdef["head"])
print("Tail (reactants)  :", hdef["tail"])

col = G.edge_to_idx[sample_eid]
mat = G._matrix.tocsc()
col_data = mat.getcol(col)
rows, _ = col_data.nonzero()
print("\nStoichiometric coefficients:")
for r in rows:
    vid = G.idx_to_entity[r]
    coeff = col_data[r, 0]
    print(f"  {vid}: {coeff}")

Metabolic edges   : 12971
Metabolic vertices: 8457
Total vertices    : 45571
Total edges       : 44869

Sample hyperedge  : MAR06128
Head (products)   : {'MAM02046e', 'MAM02901e', 'MAM02366c'}
Tail (reactants)  : {'MAM02366e', 'MAM02901c', 'MAM02046c'}

Stoichiometric coefficients:
  MAM02046c: -1.0
  MAM02366c: 1.0
  MAM02901c: -1.0
  MAM02046e: 1.0
  MAM02366e: -1.0
  MAM02901e: 1.0

8. Complex Layer — OmniPath Protein Complexes as Undirected Hyperedges¶

Only complexes where all subunits are HEK293-expressed are retained, ensuring the complex layer is fully cell-line-specific.

In [22]:

Copied!





complex_hyperedges = []
complex_attr_records = []
skipped = 0

for row in complexes_raw.iter_rows(named=True):
    members_raw = row["components_genesymbols"]
    if not members_raw:
        continue
    members = members_raw.split("_")
    members_expressed = [m for m in members if m in hek293_genes]

    if len(members_expressed) < len(members):
        skipped += 1
        continue

    members_vids = [f"prot:{m}" for m in members_expressed]
    eid = f"cpx:{row['name']}"

    complex_hyperedges.append({
        "edge_id": eid,
        "members": members_vids,
        "weight":  1.0,
    })
    complex_attr_records.append({
        "edge_id":       eid,
        "edge_type":     "complex",
        "name":          row["name"],
        "sources":       row["sources"],
        "stoichiometry": row["stoichiometry"],
    })

print(f"Complexes kept : {len(complex_hyperedges)}")
print(f"Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): {skipped}")
print(f"Sample: {complex_hyperedges[0]}")
complex_hyperedges = []
complex_attr_records = []
skipped = 0

for row in complexes_raw.iter_rows(named=True):
    members_raw = row["components_genesymbols"]
    if not members_raw:
        continue
    members = members_raw.split("_")
    members_expressed = [m for m in members if m in hek293_genes]

    if len(members_expressed) < len(members):
        skipped += 1
        continue

    members_vids = [f"prot:{m}" for m in members_expressed]
    eid = f"cpx:{row['name']}"

    complex_hyperedges.append({
        "edge_id": eid,
        "members": members_vids,
        "weight":  1.0,
    })
    complex_attr_records.append({
        "edge_id":       eid,
        "edge_type":     "complex",
        "name":          row["name"],
        "sources":       row["sources"],
        "stoichiometry": row["stoichiometry"],
    })

print(f"Complexes kept : {len(complex_hyperedges)}")
print(f"Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): {skipped}")
print(f"Sample: {complex_hyperedges[0]}")

Complexes kept : 11075
Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): 435
Sample: {'edge_id': 'cpx:NFY', 'members': ['prot:NFYA', 'prot:NFYB', 'prot:NFYC'], 'weight': 1.0}

In [23]:

Copied!

G.add_hyperedges_bulk(complex_hyperedges)
print("Total edges after complexes:", G.num_edges)
print("Total vertices             :", G.num_vertices)
G.add_hyperedges_bulk(complex_hyperedges)
print("Total edges after complexes:", G.num_edges)
print("Total vertices             :", G.num_vertices)

Total edges after complexes: 49325
Total vertices             : 45571

9. Regulatory Layer — DoRothEA [Discriminant Regulon Expression Analysis] TF [Transcription Factor]–Target Interactions¶

Confidence levels A and B only (experimentally supported). Edges run between gene:TF and gene:target vertices in the regulatory layer.

In [24]:

Copied!





reg = dorothea.filter(
    pl.col("source_genesymbol").is_in(hek293_genes)
    & pl.col("target_genesymbol").is_in(hek293_genes)
)
print("Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter:", reg.shape)
print("Unique TFs [Transcription Factors]:", reg["source_genesymbol"].n_unique())
reg = dorothea.filter(
    pl.col("source_genesymbol").is_in(hek293_genes)
    & pl.col("target_genesymbol").is_in(hek293_genes)
)
print("Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter:", reg.shape)
print("Unique TFs [Transcription Factors]:", reg["source_genesymbol"].n_unique())

Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter: (15146, 12)
Unique TFs [Transcription Factors]: 367

In [25]:

Copied!





reg_tuples = [
    (
        f"gene:{row['source_genesymbol']}",
        f"gene:{row['target_genesymbol']}",
        1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0),
    )
    for row in reg.iter_rows(named=True)
]

reg_eids = G.add_intra_edges_bulk(reg_tuples, ("regulatory",), fast_mode=True)

reg_attrs = {
    eid: {
        "edge_type":   "regulatory",
        "layer":       "regulatory",
        "tf":          row["source_genesymbol"],
        "target_gene": row["target_genesymbol"],
        "sources":     row["sources"],
    }
    for eid, row in zip(reg_eids, reg.iter_rows(named=True))
}

G.set_edge_attrs_bulk(reg_attrs)
print(f"Regulatory edges added: {len(reg_eids)}")
print(f"Total edges: {G.num_edges}")
reg_tuples = [
    (
        f"gene:{row['source_genesymbol']}",
        f"gene:{row['target_genesymbol']}",
        1.0 if row["is_stimulation"] else (-1.0 if row["is_inhibition"] else 0.0),
    )
    for row in reg.iter_rows(named=True)
]

reg_eids = G.add_intra_edges_bulk(reg_tuples, ("regulatory",), fast_mode=True)

reg_attrs = {
    eid: {
        "edge_type":   "regulatory",
        "layer":       "regulatory",
        "tf":          row["source_genesymbol"],
        "target_gene": row["target_genesymbol"],
        "sources":     row["sources"],
    }
    for eid, row in zip(reg_eids, reg.iter_rows(named=True))
}

G.set_edge_attrs_bulk(reg_attrs)
print(f"Regulatory edges added: {len(reg_eids)}")
print(f"Total edges: {G.num_edges}")

Regulatory edges added: 15146
Total edges: 64471

10. Cross-Layer Coupling Edges¶

Two coupling edge types link the regulatory and signaling layers:

gene:X → prot:X — translation: every expressed gene produces its protein product
prot:TF → gene:target — TF [Transcription Factor] protein activity drives target gene regulation

In [26]:

Copied!





coupling_records = []

for i, gene in enumerate(hek293_genes):
    gene_vid = f"gene:{gene}"
    prot_vid = f"prot:{gene}"
    if gene_vid in G.entity_to_idx and prot_vid in G.entity_to_idx:
        coupling_records.append({
            "edge_id":       f"coup_gp_{i:06d}",
            "source":        gene_vid,
            "target":        prot_vid,
            "directed":      True,
            "weight":        1.0,
            "edge_type":     "coupling",
            "coupling_type": "gene_to_protein",
        })

print(f"gene→protein coupling edges: {len(coupling_records)}")
G.add_edges_bulk(coupling_records)
print("Total edges:", G.num_edges)
coupling_records = []

for i, gene in enumerate(hek293_genes):
    gene_vid = f"gene:{gene}"
    prot_vid = f"prot:{gene}"
    if gene_vid in G.entity_to_idx and prot_vid in G.entity_to_idx:
        coupling_records.append({
            "edge_id":       f"coup_gp_{i:06d}",
            "source":        gene_vid,
            "target":        prot_vid,
            "directed":      True,
            "weight":        1.0,
            "edge_type":     "coupling",
            "coupling_type": "gene_to_protein",
        })

print(f"gene→protein coupling edges: {len(coupling_records)}")
G.add_edges_bulk(coupling_records)
print("Total edges:", G.num_edges)

gene→protein coupling edges: 18557
Total edges: 83028

In [27]:

Copied!





tf_coupling = []

for j, row in enumerate(reg.iter_rows(named=True)):
    tf_prot  = f"prot:{row['source_genesymbol']}"
    tgt_gene = f"gene:{row['target_genesymbol']}"
    if tf_prot in G.entity_to_idx and tgt_gene in G.entity_to_idx:
        mor = 1 if row["is_stimulation"] else (-1 if row["is_inhibition"] else 0)
        tf_coupling.append({
            "edge_id":       f"coup_tf_{j:06d}",
            "source":        tf_prot,
            "target":        tgt_gene,
            "directed":      True,
            "weight":        float(mor),
            "edge_type":     "coupling",
            "coupling_type": "tf_protein_to_gene",
        })

print(f"TF [Transcription Factor] protein→gene coupling edges: {len(tf_coupling)}")
G.add_edges_bulk(tf_coupling)
print("Total edges:", G.num_edges)
tf_coupling = []

for j, row in enumerate(reg.iter_rows(named=True)):
    tf_prot  = f"prot:{row['source_genesymbol']}"
    tgt_gene = f"gene:{row['target_genesymbol']}"
    if tf_prot in G.entity_to_idx and tgt_gene in G.entity_to_idx:
        mor = 1 if row["is_stimulation"] else (-1 if row["is_inhibition"] else 0)
        tf_coupling.append({
            "edge_id":       f"coup_tf_{j:06d}",
            "source":        tf_prot,
            "target":        tgt_gene,
            "directed":      True,
            "weight":        float(mor),
            "edge_type":     "coupling",
            "coupling_type": "tf_protein_to_gene",
        })

print(f"TF [Transcription Factor] protein→gene coupling edges: {len(tf_coupling)}")
G.add_edges_bulk(tf_coupling)
print("Total edges:", G.num_edges)

TF [Transcription Factor] protein→gene coupling edges: 15146
Total edges: 98174

11. Edge Attribute Consolidation¶

Rewrite the layer and edge_type columns on edge_attributes as a clean, null-free join — overwriting any stale values left by incremental inserts.

In [28]:

Copied!





tag_map = {}

for eid in G.layer_edge_set(("signaling",)):
    tag_map[eid] = {"layer": "signaling", "edge_type": "signaling"}

for eid in G.layer_edge_set(("regulatory",)):
    tag_map[eid] = {"layer": "regulatory", "edge_type": "regulatory"}

for eid in G._slices["metabolic"]["edges"]:
    tag_map[eid] = {"layer": "metabolic", "edge_type": "metabolic"}

for rec in complex_attr_records:
    tag_map[rec["edge_id"]] = {"layer": "complex", "edge_type": "complex"}

for rec in coupling_records + tf_coupling:
    tag_map[rec["edge_id"]] = {"layer": "coupling", "edge_type": "coupling"}

print(f"Tags built: {len(tag_map)}")

tag_df = pl.DataFrame([
    {"edge_id": eid, "layer": v["layer"], "edge_type": v["edge_type"]}
    for eid, v in tag_map.items()
])

drop_cols = [c for c in G.edge_attributes.columns if c in ("layer", "edge_type", "layer_right")]
G.edge_attributes = G.edge_attributes.drop(drop_cols).join(tag_df, on="edge_id", how="left")

print("edge_attributes shape:", G.edge_attributes.shape)
print("Layer distribution:")
print(G.edge_attributes["layer"].value_counts().sort("count", descending=True))
print("edge_type nulls:", G.edge_attributes["edge_type"].null_count())
tag_map = {}

for eid in G.layer_edge_set(("signaling",)):
    tag_map[eid] = {"layer": "signaling", "edge_type": "signaling"}

for eid in G.layer_edge_set(("regulatory",)):
    tag_map[eid] = {"layer": "regulatory", "edge_type": "regulatory"}

for eid in G._slices["metabolic"]["edges"]:
    tag_map[eid] = {"layer": "metabolic", "edge_type": "metabolic"}

for rec in complex_attr_records:
    tag_map[rec["edge_id"]] = {"layer": "complex", "edge_type": "complex"}

for rec in coupling_records + tf_coupling:
    tag_map[rec["edge_id"]] = {"layer": "coupling", "edge_type": "coupling"}

print(f"Tags built: {len(tag_map)}")

tag_df = pl.DataFrame([
    {"edge_id": eid, "layer": v["layer"], "edge_type": v["edge_type"]}
    for eid, v in tag_map.items()
])

drop_cols = [c for c in G.edge_attributes.columns if c in ("layer", "edge_type", "layer_right")]
G.edge_attributes = G.edge_attributes.drop(drop_cols).join(tag_df, on="edge_id", how="left")

print("edge_attributes shape:", G.edge_attributes.shape)
print("Layer distribution:")
print(G.edge_attributes["layer"].value_counts().sort("count", descending=True))
print("edge_type nulls:", G.edge_attributes["edge_type"].null_count())

Tags built: 98174
edge_attributes shape: (60015, 16)
Layer distribution:
shape: (3, 2)
┌────────────┬───────┐
│ layer      ┆ count │
│ ---        ┆ ---   │
│ str        ┆ u32   │
╞════════════╪═══════╡
│ signaling  ┆ 31898 │
│ regulatory ┆ 15146 │
│ metabolic  ┆ 12971 │
└────────────┴───────┘
edge_type nulls: 0

12. Organelle and Condition Slices¶

Named slices provide fast access to organelle-localised or condition-specific subgraphs. SBML [Systems Biology Markup Language] import creates metabolite-only slices; this section enriches them with protein vertices from HPA [Human Protein Atlas] compartment assignments.

In [29]:

Copied!





ORGANELLE_SLICES = {
    "m": "mitochondria",
    "n": "nucleus",
    "r": "er",
    "l": "lysosome",
    "c": "cytoplasm",
    "g": "golgi",
    "x": "peroxisome",
    "p": "plasma_membrane",
    "e": "extracellular",
}

for code, name in ORGANELLE_SLICES.items():
    prot_vids = (
        G.vertex_attributes
        .filter(
            (pl.col("compartment") == code)
            & (pl.col("entity_type") == "protein")
        )["vertex_id"]
        .to_list()
    )

    existing = G._slices.get(code, {}).get("vertices", set())
    vid_set = set(prot_vids) | existing

    slice_edges = []
    for eid, (src, tgt, _) in G.edge_definitions.items():
        if src in vid_set or tgt in vid_set:
            slice_edges.append(eid)
    for eid, hdef in G.hyperedge_definitions.items():
        members = hdef.get("head", set()) | hdef.get("tail", set()) | hdef.get("members", set())
        if members & vid_set:
            slice_edges.append(eid)

    if name not in G._slices:
        G._slices[name] = {"vertices": set(), "edges": set(), "attributes": {}}
    G._slices[name]["vertices"] = vid_set
    G._slices[name]["edges"] = set(slice_edges)

    print(f"{name:20s}: {len(vid_set):6d} vertices, {len(slice_edges):6d} edges")
ORGANELLE_SLICES = {
    "m": "mitochondria",
    "n": "nucleus",
    "r": "er",
    "l": "lysosome",
    "c": "cytoplasm",
    "g": "golgi",
    "x": "peroxisome",
    "p": "plasma_membrane",
    "e": "extracellular",
}

for code, name in ORGANELLE_SLICES.items():
    prot_vids = (
        G.vertex_attributes
        .filter(
            (pl.col("compartment") == code)
            & (pl.col("entity_type") == "protein")
        )["vertex_id"]
        .to_list()
    )

    existing = G._slices.get(code, {}).get("vertices", set())
    vid_set = set(prot_vids) | existing

    slice_edges = []
    for eid, (src, tgt, _) in G.edge_definitions.items():
        if src in vid_set or tgt in vid_set:
            slice_edges.append(eid)
    for eid, hdef in G.hyperedge_definitions.items():
        members = hdef.get("head", set()) | hdef.get("tail", set()) | hdef.get("members", set())
        if members & vid_set:
            slice_edges.append(eid)

    if name not in G._slices:
        G._slices[name] = {"vertices": set(), "edges": set(), "attributes": {}}
    G._slices[name]["vertices"] = vid_set
    G._slices[name]["edges"] = set(slice_edges)

    print(f"{name:20s}: {len(vid_set):6d} vertices, {len(slice_edges):6d} edges")

mitochondria        :   1897 vertices,   4161 edges
nucleus             :   5614 vertices,  38622 edges
er                  :   1411 vertices,   3055 edges
lysosome            :    470 vertices,    695 edges
cytoplasm           :  12084 vertices,  40946 edges
golgi               :   1107 vertices,   3611 edges
peroxisome          :    528 vertices,    732 edges
plasma_membrane     :   1141 vertices,   8727 edges
extracellular       :   1660 vertices,   4893 edges

In [30]:

Copied!





basal_edges = set(
    G.edge_attributes
    .filter(pl.col("layer") == "signaling")["edge_id"]
    .to_list()
)
basal_verts = set()
for eid in basal_edges:
    edef = G.edge_definitions.get(eid)
    if edef:
        basal_verts.add(edef[0])
        basal_verts.add(edef[1])

G._slices["basal"] = {"vertices": basal_verts, "edges": basal_edges, "attributes": {}}
print(f"Basal slice    : {len(basal_verts)} vertices, {len(basal_edges)} edges")

EGF_PROTEINS = {
    "prot:EGFR",  "prot:GRB2",    "prot:SOS1",  "prot:KRAS",  "prot:BRAF",
    "prot:MAP2K1","prot:MAPK1",   "prot:MAPK3", "prot:AKT1",  "prot:PIK3CA",
    "prot:PTEN",  "prot:MTOR",    "prot:RPS6KB1","prot:ELK1", "prot:MYC",
    "prot:JUN",   "prot:FOS",     "prot:STAT3",  "prot:SRC",   "prot:SHC1",
}

egf_edges = set()
egf_verts = set()
for eid in basal_edges:
    edef = G.edge_definitions.get(eid)
    if edef and (edef[0] in EGF_PROTEINS or edef[1] in EGF_PROTEINS):
        egf_edges.add(eid)
        egf_verts.add(edef[0])
        egf_verts.add(edef[1])

G._slices["egf_stimulated"] = {"vertices": egf_verts, "edges": egf_edges, "attributes": {}}
print(f"EGF [Epidermal Growth Factor]-stimulated : {len(egf_verts)} vertices, {len(egf_edges)} edges")
print(f"\nAll slices: {list(G._slices.keys())}")
basal_edges = set(
    G.edge_attributes
    .filter(pl.col("layer") == "signaling")["edge_id"]
    .to_list()
)
basal_verts = set()
for eid in basal_edges:
    edef = G.edge_definitions.get(eid)
    if edef:
        basal_verts.add(edef[0])
        basal_verts.add(edef[1])

G._slices["basal"] = {"vertices": basal_verts, "edges": basal_edges, "attributes": {}}
print(f"Basal slice    : {len(basal_verts)} vertices, {len(basal_edges)} edges")

EGF_PROTEINS = {
    "prot:EGFR",  "prot:GRB2",    "prot:SOS1",  "prot:KRAS",  "prot:BRAF",
    "prot:MAP2K1","prot:MAPK1",   "prot:MAPK3", "prot:AKT1",  "prot:PIK3CA",
    "prot:PTEN",  "prot:MTOR",    "prot:RPS6KB1","prot:ELK1", "prot:MYC",
    "prot:JUN",   "prot:FOS",     "prot:STAT3",  "prot:SRC",   "prot:SHC1",
}

egf_edges = set()
egf_verts = set()
for eid in basal_edges:
    edef = G.edge_definitions.get(eid)
    if edef and (edef[0] in EGF_PROTEINS or edef[1] in EGF_PROTEINS):
        egf_edges.add(eid)
        egf_verts.add(edef[0])
        egf_verts.add(edef[1])

G._slices["egf_stimulated"] = {"vertices": egf_verts, "edges": egf_edges, "attributes": {}}
print(f"EGF [Epidermal Growth Factor]-stimulated : {len(egf_verts)} vertices, {len(egf_edges)} edges")
print(f"\nAll slices: {list(G._slices.keys())}")

Basal slice    : 7451 vertices, 31898 edges
EGF [Epidermal Growth Factor]-stimulated : 1394 vertices, 2552 edges

All slices: ['default', 'signaling', 'e', 'x', 'm', 'c', 'l', 'r', 'g', 'n', 'i', 'metabolic', 'regulatory', 'mitochondria', 'nucleus', 'er', 'lysosome', 'cytoplasm', 'golgi', 'peroxisome', 'plasma_membrane', 'extracellular', 'basal', 'egf_stimulated']

13. Biological Queries¶

In [31]:

Copied!





# Query 1 — Mito-ER [Endoplasmic Reticulum] contact site edges (cross-slice intersection)
contact_edges = G._slices["mitochondria"]["edges"] & G._slices["er"]["edges"]
print(f"Mito-ER [Endoplasmic Reticulum] contact site edges: {len(contact_edges)}")

for eid in list(contact_edges)[:5]:
    hdef = G.hyperedge_definitions.get(eid)
    edef = G.edge_definitions.get(eid)
    if hdef:
        members = hdef.get("members") or (hdef.get("head", set()) | hdef.get("tail", set()))
        print(f"  hyperedge {eid}: {members}")
    elif edef:
        print(f"  edge {eid}: {edef[0]} → {edef[1]}")
# Query 1 — Mito-ER [Endoplasmic Reticulum] contact site edges (cross-slice intersection)
contact_edges = G._slices["mitochondria"]["edges"] & G._slices["er"]["edges"]
print(f"Mito-ER [Endoplasmic Reticulum] contact site edges: {len(contact_edges)}")

for eid in list(contact_edges)[:5]:
    hdef = G.hyperedge_definitions.get(eid)
    edef = G.edge_definitions.get(eid)
    if hdef:
        members = hdef.get("members") or (hdef.get("head", set()) | hdef.get("tail", set()))
        print(f"  hyperedge {eid}: {members}")
    elif edef:
        print(f"  edge {eid}: {edef[0]} → {edef[1]}")

Mito-ER [Endoplasmic Reticulum] contact site edges: 45
  hyperedge cpx:Commander complex: {'prot:DENND10', 'prot:COMMD3', 'prot:VPS35L', 'prot:COMMD8', 'prot:CCDC22', 'prot:VPS29', 'prot:CCDC93', 'prot:COMMD10', 'prot:COMMD5', 'prot:COMMD4', 'prot:COMMD6', 'prot:COMMD7', 'prot:VPS26C', 'prot:COMMD2', 'prot:COMMD1', 'prot:COMMD9'}
  hyperedge cpx:COMMD1-CCDC22-CCDC93-C16orf62 complex: {'prot:CCDC22', 'prot:CCDC93', 'prot:COMMD1', 'prot:VPS35L'}
  hyperedge cpx:TIM23-TOM22-TIM50-HSD3B2 complex: {'prot:TIMM23', 'prot:HSD3B2', 'prot:TIMM50', 'prot:TOMM22'}
  hyperedge MAR01824: {'MAM00623r', 'MAM00623m'}
  hyperedge MAR00037: {'MAM00987m', 'MAM01001m', 'MAM01011c', 'MAM00652x', 'MAM00659c', 'MAM00988c', 'MAM00641r', 'MAM00642r', 'MAM00641m', 'MAM00649r', 'MAM00652c', 'MAM00987c', 'MAM01001c', 'MAM00651c', 'MAM01001r', 'MAM00660c', 'MAM00641c', 'MAM10001e', 'MAM00432r', 'MAM00604c', 'MAM01001x', 'MAM00987x', 'MAM00642c', 'MAM01012c', 'MAM01314c', 'MAM00651x', 'MAM01794r', 'MAM00644c', 'MAM01792r', 'MAM00925c', 'MAM00642x', 'MAM00645c', 'MAM00641x', 'MAM00652m', 'MAM00651r', 'MAM00828r', 'MAM00947c', 'MAM00400r', 'MAM00747r', 'MAM00945c', 'MAM00949c', 'MAM02469c', 'MAM00651m', 'MAM00400c', 'MAM00624c', 'MAM00660r', 'MAM00927c', 'MAM00987r', 'MAM00642m', 'MAM00652r', 'MAM00649l', 'MAM00830c'}

In [32]:

Copied!





# Query 2 — Organelle-specific hub proteins (degree within each organelle slice)
organelles = ["mitochondria", "nucleus", "er", "lysosome", "cytoplasm", "golgi", "plasma_membrane"]

print("Top 5 hub proteins per organelle:\n")
for org in organelles:
    verts = G._slices[org]["vertices"]
    edges = G._slices[org]["edges"]
    degree = Counter()
    for eid in edges:
        edef = G.edge_definitions.get(eid)
        if edef:
            if edef[0] in verts and edef[0].startswith("prot:"):
                degree[edef[0]] += 1
            if edef[1] in verts and edef[1].startswith("prot:"):
                degree[edef[1]] += 1
    top5 = degree.most_common(5)
    print(f"{org}:")
    for prot, deg in top5:
        print(f"  {prot:30s} degree={deg}")
    print()
# Query 2 — Organelle-specific hub proteins (degree within each organelle slice)
organelles = ["mitochondria", "nucleus", "er", "lysosome", "cytoplasm", "golgi", "plasma_membrane"]

print("Top 5 hub proteins per organelle:\n")
for org in organelles:
    verts = G._slices[org]["vertices"]
    edges = G._slices[org]["edges"]
    degree = Counter()
    for eid in edges:
        edef = G.edge_definitions.get(eid)
        if edef:
            if edef[0] in verts and edef[0].startswith("prot:"):
                degree[edef[0]] += 1
            if edef[1] in verts and edef[1].startswith("prot:"):
                degree[edef[1]] += 1
    top5 = degree.most_common(5)
    print(f"{org}:")
    for prot, deg in top5:
        print(f"  {prot:30s} degree={deg}")
    print()

Top 5 hub proteins per organelle:

mitochondria:
  prot:CASP3                     degree=124
  prot:TRAF6                     degree=72
  prot:BAD                       degree=61
  prot:BCL2                      degree=58
  prot:CASP8                     degree=53

nucleus:
  prot:TP53                      degree=825
  prot:STAT1                     degree=785
  prot:GSK3B                     degree=688
  prot:HNF4A                     degree=622
  prot:CTCF                      degree=616

er:
  prot:PTPN1                     degree=77
  prot:FGFR3                     degree=43
  prot:VTN                       degree=27
  prot:ITGA1                     degree=26
  prot:COL1A1                    degree=26

lysosome:
  prot:RPTOR                     degree=26
  prot:RAB7A                     degree=11
  prot:TFRC                      degree=10
  prot:CLTC                      degree=9
  prot:GRN                       degree=7

cytoplasm:
  prot:PRKCA                     degree=547
  prot:MAPK14                    degree=478
  prot:MITF                      degree=458
  prot:PRKACA                    degree=445
  prot:ZNF263                    degree=313

golgi:
  prot:EGFR                      degree=254
  prot:LCK                       degree=132
  prot:MTOR                      degree=77
  prot:APP                       degree=62
  prot:RET                       degree=48

plasma_membrane:
  prot:SRC                       degree=490
  prot:FYN                       degree=199
  prot:INSR                      degree=169
  prot:CTNNB1                    degree=161
  prot:ITGB1                     degree=149

In [33]:

Copied!





# Query 3 — Signal-to-transcription bridge TFs [Transcription Factors]
# A bridge TF [Transcription Factor] is both a target of signaling AND a regulator in the regulatory layer.
sig_target_prots = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "signaling")["edge_id"].to_list():
    edef = G.edge_definitions.get(eid)
    if edef:
        sig_target_prots.add(edef[1].replace("prot:", ""))

reg_tf_genes = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "regulatory")["edge_id"].to_list():
    edef = G.edge_definitions.get(eid)
    if edef:
        reg_tf_genes.add(edef[0].replace("gene:", ""))

bridge_tfs = sig_target_prots & reg_tf_genes
print(f"Signal-to-transcription bridge TFs [Transcription Factors]: {len(bridge_tfs)}")
print("\nFirst 20 bridge TFs [Transcription Factors] (alphabetical):")
for tf in sorted(bridge_tfs)[:20]:
    print(f"  {tf}")
# Query 3 — Signal-to-transcription bridge TFs [Transcription Factors]
# A bridge TF [Transcription Factor] is both a target of signaling AND a regulator in the regulatory layer.
sig_target_prots = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "signaling")["edge_id"].to_list():
    edef = G.edge_definitions.get(eid)
    if edef:
        sig_target_prots.add(edef[1].replace("prot:", ""))

reg_tf_genes = set()
for eid in G.edge_attributes.filter(pl.col("layer") == "regulatory")["edge_id"].to_list():
    edef = G.edge_definitions.get(eid)
    if edef:
        reg_tf_genes.add(edef[0].replace("gene:", ""))

bridge_tfs = sig_target_prots & reg_tf_genes
print(f"Signal-to-transcription bridge TFs [Transcription Factors]: {len(bridge_tfs)}")
print("\nFirst 20 bridge TFs [Transcription Factors] (alphabetical):")
for tf in sorted(bridge_tfs)[:20]:
    print(f"  {tf}")

Signal-to-transcription bridge TFs [Transcription Factors]: 311

First 20 bridge TFs [Transcription Factors] (alphabetical):
  AHR
  AR
  ARID3A
  ARNT
  ASCL1
  ATF1
  ATF2
  ATF3
  ATF4
  ATF6
  ATOH1
  BACH1
  BACH2
  BATF
  BCL6
  BHLHE40
  BMAL1
  CDX1
  CDX2
  CEBPA

In [34]:

Copied!





# Query 4 — EGF [Epidermal Growth Factor]-stimulated vs basal edge comparison
egf_only   = G._slices["egf_stimulated"]["edges"] - G._slices["basal"]["edges"]
basal_only = G._slices["basal"]["edges"] - G._slices["egf_stimulated"]["edges"]
shared     = G._slices["egf_stimulated"]["edges"] & G._slices["basal"]["edges"]

print(f"EGF [Epidermal Growth Factor]-stimulated slice edges : {len(G._slices['egf_stimulated']['edges'])}")
print(f"Basal slice edges                                    : {len(G._slices['basal']['edges'])}")
print(f"Shared (always active)                               : {len(shared)}")
print(f"EGF [Epidermal Growth Factor]-only edges             : {len(egf_only)}")
print(f"Basal-only (EGF-suppressed)                          : {len(basal_only)}")

print("\nSample EGF [Epidermal Growth Factor]-stimulated edges:")
for eid in list(G._slices["egf_stimulated"]["edges"])[:8]:
    edef = G.edge_definitions.get(eid)
    if edef:
        ea_row = G.edge_attributes.filter(pl.col("edge_id") == eid)
        layer = ea_row["layer"][0] if len(ea_row) > 0 else "?"
        print(f"  {edef[0]:25s} → {edef[1]:25s}  [{layer}]")
# Query 4 — EGF [Epidermal Growth Factor]-stimulated vs basal edge comparison
egf_only   = G._slices["egf_stimulated"]["edges"] - G._slices["basal"]["edges"]
basal_only = G._slices["basal"]["edges"] - G._slices["egf_stimulated"]["edges"]
shared     = G._slices["egf_stimulated"]["edges"] & G._slices["basal"]["edges"]

print(f"EGF [Epidermal Growth Factor]-stimulated slice edges : {len(G._slices['egf_stimulated']['edges'])}")
print(f"Basal slice edges                                    : {len(G._slices['basal']['edges'])}")
print(f"Shared (always active)                               : {len(shared)}")
print(f"EGF [Epidermal Growth Factor]-only edges             : {len(egf_only)}")
print(f"Basal-only (EGF-suppressed)                          : {len(basal_only)}")

print("\nSample EGF [Epidermal Growth Factor]-stimulated edges:")
for eid in list(G._slices["egf_stimulated"]["edges"])[:8]:
    edef = G.edge_definitions.get(eid)
    if edef:
        ea_row = G.edge_attributes.filter(pl.col("edge_id") == eid)
        layer = ea_row["layer"][0] if len(ea_row) > 0 else "?"
        print(f"  {edef[0]:25s} → {edef[1]:25s}  [{layer}]")

EGF [Epidermal Growth Factor]-stimulated slice edges : 2552
Basal slice edges                                    : 31898
Shared (always active)                               : 2552
EGF [Epidermal Growth Factor]-only edges             : 0
Basal-only (EGF-suppressed)                          : 29346

Sample EGF [Epidermal Growth Factor]-stimulated edges:
  prot:MAPK1                → prot:DUSP5                 [signaling]
  prot:SRC                  → prot:ITGAL                 [signaling]
  prot:AKT1                 → prot:AKT1S1                [signaling]
  prot:CDR2                 → prot:MYC                   [signaling]
  prot:PAK1                 → prot:AKT1                  [signaling]
  prot:MAPK3                → prot:GTF2I                 [signaling]
  prot:WDR83                → prot:MAP2K1                [signaling]
  prot:EGFR                 → prot:SOX2                  [signaling]

14. Benchmark — AnnNet Hyperedges vs NetworkX [Network Analysis in Python] Binary Expansion¶

Demonstrates the memory and fidelity advantage of native hyperedge representation over the conventional approach of expanding every reaction into all pairwise binary edges.

In [35]:

Copied!





metabolic_eids = [
    eid for eid, kind in G.edge_kind.items()
    if kind == "hyper" and eid in G._slices["metabolic"]["edges"]
]

metabolic_edges_raw = [
    (list(G.hyperedge_definitions[eid].get("head", [])),
     list(G.hyperedge_definitions[eid].get("tail", [])),
     eid)
    for eid in metabolic_eids
]

binary_expansion_count = sum(len(h) * len(t) for h, t, _ in metabolic_edges_raw)
print(f"Metabolic hyperedges (AnnNet): {len(metabolic_edges_raw)}")
print(f"Binary edges needed (NX [NetworkX] expansion): {binary_expansion_count}")
print(f"Edge inflation factor: {binary_expansion_count / len(metabolic_edges_raw):.1f}x")

tracemalloc.start()
_ = [eid for eid in G._slices["metabolic"]["edges"]]
annnet_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()

tracemalloc.start()
t0 = time.perf_counter()
NX, _ = to_nx(G, hyperedge_mode="expand", slices=["metabolic"])
t_nx = time.perf_counter() - t0
nx_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()

print(f"\nNetworkX [Network Analysis in Python] (expanded): {NX.number_of_nodes()} nodes, {NX.number_of_edges()} edges")
print(f"  Build time : {t_nx:.3f}s")
print(f"  Peak memory: {nx_mem / 1e6:.1f} MB")
print(f"\nAnnNet metabolic slice memory : {annnet_mem / 1e6:.4f} MB")
print(f"Stoichiometry preserved in AnnNet    : YES")
print(f"Stoichiometry preserved in NetworkX  : NO (binary adjacency only)")
metabolic_eids = [
    eid for eid, kind in G.edge_kind.items()
    if kind == "hyper" and eid in G._slices["metabolic"]["edges"]
]

metabolic_edges_raw = [
    (list(G.hyperedge_definitions[eid].get("head", [])),
     list(G.hyperedge_definitions[eid].get("tail", [])),
     eid)
    for eid in metabolic_eids
]

binary_expansion_count = sum(len(h) * len(t) for h, t, _ in metabolic_edges_raw)
print(f"Metabolic hyperedges (AnnNet): {len(metabolic_edges_raw)}")
print(f"Binary edges needed (NX [NetworkX] expansion): {binary_expansion_count}")
print(f"Edge inflation factor: {binary_expansion_count / len(metabolic_edges_raw):.1f}x")

tracemalloc.start()
_ = [eid for eid in G._slices["metabolic"]["edges"]]
annnet_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()

tracemalloc.start()
t0 = time.perf_counter()
NX, _ = to_nx(G, hyperedge_mode="expand", slices=["metabolic"])
t_nx = time.perf_counter() - t0
nx_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()

print(f"\nNetworkX [Network Analysis in Python] (expanded): {NX.number_of_nodes()} nodes, {NX.number_of_edges()} edges")
print(f"  Build time : {t_nx:.3f}s")
print(f"  Peak memory: {nx_mem / 1e6:.1f} MB")
print(f"\nAnnNet metabolic slice memory : {annnet_mem / 1e6:.4f} MB")
print(f"Stoichiometry preserved in AnnNet    : YES")
print(f"Stoichiometry preserved in NetworkX  : NO (binary adjacency only)")

Metabolic hyperedges (AnnNet): 12971
Binary edges needed (NX [NetworkX] expansion): 79886
Edge inflation factor: 6.2x

NetworkX [Network Analysis in Python] (expanded): 45571 nodes, 276293 edges
  Build time : 11.924s
  Peak memory: 391.5 MB

AnnNet metabolic slice memory : 0.1203 MB
Stoichiometry preserved in AnnNet    : YES
Stoichiometry preserved in NetworkX  : NO (binary adjacency only)

15. Multilayer Metrics¶

In [36]:

Copied!





# Participation coefficient: fraction of a vertex's edges that span multiple layers.
# Computed manually to correctly handle hyperedges alongside binary edges.
layers = ["signaling", "metabolic", "complex", "regulatory"]
layer_edges = {
    l: set(G.edge_attributes.filter(pl.col("layer") == l)["edge_id"].to_list())
    for l in layers
}

vertex_layer_deg = defaultdict(lambda: defaultdict(int))
for layer, eids in layer_edges.items():
    for eid in eids:
        ed = G.edge_definitions.get(eid)
        hd = G.hyperedge_definitions.get(eid)
        if hd is not None:
            involved = hd.get("head", set()) | hd.get("tail", set()) | hd.get("members", set())
            for v in involved:
                if v and not v.startswith("__"):
                    vertex_layer_deg[v][layer] += 1
        elif ed is not None and ed[0] is not None:
            vertex_layer_deg[ed[0]][layer] += 1
            vertex_layer_deg[ed[1]][layer] += 1

participation = {
    v: 1.0 - sum((ld[l] / sum(ld.values())) ** 2 for l in ld)
    for v, ld in vertex_layer_deg.items()
    if sum(ld.values()) > 0
}

p_vals = np.array(list(participation.values()))
print("Participation coefficient (all 4 layers, covers hyperedges):")
print(f"  Vertices with edges : {len(p_vals)}")
print(f"  Mean P              : {p_vals.mean():.4f}")
print(f"  Max P               : {p_vals.max():.4f}")
print(f"  P>0.3 (multi-layer) : {(p_vals > 0.3).sum()}")
top10 = sorted(participation.items(), key=lambda x: -x[1])[:10]
print("  Top 10:")
for v, p in top10:
    ld = vertex_layer_deg[v]
    print(f"    {v:35s} P={p:.3f}  " + "  ".join(f"{l[:3]}={ld.get(l, 0)}" for l in layers))
# Participation coefficient: fraction of a vertex's edges that span multiple layers.
# Computed manually to correctly handle hyperedges alongside binary edges.
layers = ["signaling", "metabolic", "complex", "regulatory"]
layer_edges = {
    l: set(G.edge_attributes.filter(pl.col("layer") == l)["edge_id"].to_list())
    for l in layers
}

vertex_layer_deg = defaultdict(lambda: defaultdict(int))
for layer, eids in layer_edges.items():
    for eid in eids:
        ed = G.edge_definitions.get(eid)
        hd = G.hyperedge_definitions.get(eid)
        if hd is not None:
            involved = hd.get("head", set()) | hd.get("tail", set()) | hd.get("members", set())
            for v in involved:
                if v and not v.startswith("__"):
                    vertex_layer_deg[v][layer] += 1
        elif ed is not None and ed[0] is not None:
            vertex_layer_deg[ed[0]][layer] += 1
            vertex_layer_deg[ed[1]][layer] += 1

participation = {
    v: 1.0 - sum((ld[l] / sum(ld.values())) ** 2 for l in ld)
    for v, ld in vertex_layer_deg.items()
    if sum(ld.values()) > 0
}

p_vals = np.array(list(participation.values()))
print("Participation coefficient (all 4 layers, covers hyperedges):")
print(f"  Vertices with edges : {len(p_vals)}")
print(f"  Mean P              : {p_vals.mean():.4f}")
print(f"  Max P               : {p_vals.max():.4f}")
print(f"  P>0.3 (multi-layer) : {(p_vals > 0.3).sum()}")
top10 = sorted(participation.items(), key=lambda x: -x[1])[:10]
print("  Top 10:")
for v, p in top10:
    ld = vertex_layer_deg[v]
    print(f"    {v:35s} P={p:.3f}  " + "  ".join(f"{l[:3]}={ld.get(l, 0)}" for l in layers))

Participation coefficient (all 4 layers, covers hyperedges):
  Vertices with edges : 21104
  Mean P              : 0.0000
  Max P               : 0.0000
  P>0.3 (multi-layer) : 0
  Top 10:
    prot:AR                             P=0.000  sig=118  met=0  com=0  reg=0
    prot:CTNNB1                         P=0.000  sig=160  met=0  com=0  reg=0
    prot:PAK1                           P=0.000  sig=124  met=0  com=0  reg=0
    prot:AKT1                           P=0.000  sig=360  met=0  com=0  reg=0
    prot:F9                             P=0.000  sig=8  met=0  com=0  reg=0
    prot:LRP1                           P=0.000  sig=55  met=0  com=0  reg=0
    prot:MDM2                           P=0.000  sig=110  met=0  com=0  reg=0
    prot:IGF1R                          P=0.000  sig=125  met=0  com=0  reg=0
    prot:IGSF10                         P=0.000  sig=4  met=0  com=0  reg=0
    prot:CD200R1L                       P=0.000  sig=2  met=0  com=0  reg=0

In [37]:

Copied!





# Supra-adjacency matrix over binary layers only (signaling + regulatory).
print("Supra-adjacency (signaling + regulatory layers — binary intra edges only):")
A = G.supra_adjacency(["signaling", "regulatory"])
print(f"  Shape     : {A.shape}")
print(f"  Non-zeros : {A.nnz}")
# Supra-adjacency matrix over binary layers only (signaling + regulatory).
print("Supra-adjacency (signaling + regulatory layers — binary intra edges only):")
A = G.supra_adjacency(["signaling", "regulatory"])
print(f"  Shape     : {A.shape}")
print(f"  Non-zeros : {A.nnz}")

Supra-adjacency (signaling + regulatory layers — binary intra edges only):
  Shape     : (37114, 37114)
  Non-zeros : 49398

In [38]:

Copied!





# EGF [Epidermal Growth Factor]-stimulated subgraph component and hub analysis.
G_egf, _ = to_nx(G, slices=["egf_stimulated"], hyperedge_mode="skip")
G_egf_u = G_egf.to_undirected()

components = sorted(nx.connected_components(G_egf_u), key=len, reverse=True)
print(f"EGF [Epidermal Growth Factor] subgraph : {G_egf.number_of_nodes()} nodes, {G_egf.number_of_edges()} edges")
print(f"Components                             : {len(components)}")
print(f"Largest component                      : {len(components[0])} nodes")

largest   = G_egf_u.subgraph(components[0]).copy()
density   = nx.density(largest)
avg_deg   = sum(d for _, d in largest.degree()) / largest.number_of_nodes()
top_hubs  = sorted(largest.degree(), key=lambda x: x[1], reverse=True)[:5]

print(f"\nLargest component:")
print(f"  Density    : {density:.2e}")
print(f"  Avg degree : {avg_deg:.2f}")
print(f"  Top 5 hubs : {[(v, d) for v, d in top_hubs]}")
# EGF [Epidermal Growth Factor]-stimulated subgraph component and hub analysis.
G_egf, _ = to_nx(G, slices=["egf_stimulated"], hyperedge_mode="skip")
G_egf_u = G_egf.to_undirected()

components = sorted(nx.connected_components(G_egf_u), key=len, reverse=True)
print(f"EGF [Epidermal Growth Factor] subgraph : {G_egf.number_of_nodes()} nodes, {G_egf.number_of_edges()} edges")
print(f"Components                             : {len(components)}")
print(f"Largest component                      : {len(components[0])} nodes")

largest   = G_egf_u.subgraph(components[0]).copy()
density   = nx.density(largest)
avg_deg   = sum(d for _, d in largest.degree()) / largest.number_of_nodes()
top_hubs  = sorted(largest.degree(), key=lambda x: x[1], reverse=True)[:5]

print(f"\nLargest component:")
print(f"  Density    : {density:.2e}")
print(f"  Avg degree : {avg_deg:.2f}")
print(f"  Top 5 hubs : {[(v, d) for v, d in top_hubs]}")

EGF [Epidermal Growth Factor] subgraph : 45571 nodes, 80747 edges
Components                             : 17650
Largest component                      : 18642 nodes

Largest component:
  Density    : 4.11e-04
  Avg degree : 7.67
  Top 5 hubs : [('prot:TP53', 825), ('prot:STAT1', 785), ('gene:STAT1', 736), ('prot:GSK3B', 688), ('prot:ESR1', 637)]

16. Serialisation and Export¶

In [39]:

Copied!





out_path = DATA / "uc2_hek293.annnet"
G.write(str(out_path), overwrite=True)
print(f"Written: {out_path}  ({os.path.getsize(out_path) / 1e6:.1f} MB)")

G2 = AnnNet.read(str(out_path))
print("\nRound-trip check:")
print(f"  vertices  orig={G.number_of_vertices()}  reloaded={G2.number_of_vertices()}  "
      f"match={G.number_of_vertices() == G2.number_of_vertices()}")
print(f"  edges     orig={G.number_of_edges()}  reloaded={G2.number_of_edges()}  "
      f"match={G.number_of_edges() == G2.number_of_edges()}")
print(f"  slices    orig={len(G._slices)}  reloaded={len(G2._slices)}  "
      f"match={len(G._slices) == len(G2._slices)}")

sample_met = next(iter(layer_edges["metabolic"]))
print(f"\n  Stoichiometry preserved for {sample_met}:")
print(f"    orig     : {G.hyperedge_definitions[sample_met]}")
print(f"    reloaded : {G2.hyperedge_definitions.get(sample_met)}")
out_path = DATA / "uc2_hek293.annnet"
G.write(str(out_path), overwrite=True)
print(f"Written: {out_path}  ({os.path.getsize(out_path) / 1e6:.1f} MB)")

G2 = AnnNet.read(str(out_path))
print("\nRound-trip check:")
print(f"  vertices  orig={G.number_of_vertices()}  reloaded={G2.number_of_vertices()}  "
      f"match={G.number_of_vertices() == G2.number_of_vertices()}")
print(f"  edges     orig={G.number_of_edges()}  reloaded={G2.number_of_edges()}  "
      f"match={G.number_of_edges() == G2.number_of_edges()}")
print(f"  slices    orig={len(G._slices)}  reloaded={len(G2._slices)}  "
      f"match={len(G._slices) == len(G2._slices)}")

sample_met = next(iter(layer_edges["metabolic"]))
print(f"\n  Stoichiometry preserved for {sample_met}:")
print(f"    orig     : {G.hyperedge_definitions[sample_met]}")
print(f"    reloaded : {G2.hyperedge_definitions.get(sample_met)}")

Written: data/uc2_hek293.annnet  (7.7 MB)

Round-trip check:
  vertices  orig=45571  reloaded=45571  match=True
  edges     orig=98174  reloaded=98174  match=True
  slices    orig=24  reloaded=24  match=True

  Stoichiometry preserved for MAR06128:
    orig     : {'directed': True, 'head': {'MAM02046e', 'MAM02901e', 'MAM02366c'}, 'tail': {'MAM02366e', 'MAM02901c', 'MAM02046c'}}
    reloaded : {'directed': True, 'head': ['MAM02046e', 'MAM02366c', 'MAM02901e'], 'tail': ['MAM02046c', 'MAM02366e', 'MAM02901c']}

In [41]:

Copied!





cx2_data = to_cx2(G, export_name="UC2_HEK293_signaling", hyperedges="skip")
cx2_path = DATA / "uc2_signaling.cx2"
with open(cx2_path, "w") as f:
    json.dump(cx2_data, f)

print(f"CX2 [Cytoscape Exchange Format v2] written: {cx2_path}  ({os.path.getsize(cx2_path) / 1e6:.1f} MB)")
print(f"Aspects: {[list(a.keys())[0] for a in cx2_data]}")
cx2_data = to_cx2(G, export_name="UC2_HEK293_signaling", hyperedges="skip")
cx2_path = DATA / "uc2_signaling.cx2"
with open(cx2_path, "w") as f:
    json.dump(cx2_data, f)

print(f"CX2 [Cytoscape Exchange Format v2] written: {cx2_path}  ({os.path.getsize(cx2_path) / 1e6:.1f} MB)")
print(f"Aspects: {[list(a.keys())[0] for a in cx2_data]}")

CX2 [Cytoscape Exchange Format v2] written: data/uc2_signaling.cx2  (164.0 MB)
Aspects: ['CXVersion', 'metaData', 'attributeDeclarations', 'networkAttributes', 'nodes', 'edges', 'status']

In [42]:

Copied!





# Map entity_type to short kind labels expected by the PyG [PyTorch Geometric] adapter.
kind_map = {"protein": "prot", "gene": "gene"}
G.vertex_attributes = G.vertex_attributes.with_columns(
    pl.col("entity_type").replace_strict(kind_map, default="metab").alias("kind")
)

print("Kind value counts:", G.vertex_attributes["kind"].value_counts())

pyg_data = to_pyg(G, hyperedge_mode="reify", device="cpu")
print(f"\nHeteroData node types : {pyg_data.node_types}")
print(f"HeteroData edge types : {pyg_data.edge_types}")
for nt in pyg_data.node_types:
    print(f"  {nt}: {pyg_data[nt].num_nodes} nodes")
for et in pyg_data.edge_types:
    print(f"  {et}: {pyg_data[et].edge_index.shape[1]} edges")
# Map entity_type to short kind labels expected by the PyG [PyTorch Geometric] adapter.
kind_map = {"protein": "prot", "gene": "gene"}
G.vertex_attributes = G.vertex_attributes.with_columns(
    pl.col("entity_type").replace_strict(kind_map, default="metab").alias("kind")
)

print("Kind value counts:", G.vertex_attributes["kind"].value_counts())

pyg_data = to_pyg(G, hyperedge_mode="reify", device="cpu")
print(f"\nHeteroData node types : {pyg_data.node_types}")
print(f"HeteroData edge types : {pyg_data.edge_types}")
for nt in pyg_data.node_types:
    print(f"  {nt}: {pyg_data[nt].num_nodes} nodes")
for et in pyg_data.edge_types:
    print(f"  {et}: {pyg_data[et].edge_index.shape[1]} edges")

Kind value counts: shape: (3, 2)
┌───────┬───────┐
│ kind  ┆ count │
│ ---   ┆ ---   │
│ str   ┆ u32   │
╞═══════╪═══════╡
│ metab ┆ 8457  │
│ prot  ┆ 18564 │
│ gene  ┆ 18564 │
└───────┴───────┘

HeteroData node types : ['prot', 'gene', 'metab', 'hypernode']
HeteroData edge types : [('prot', 'edge', 'prot'), ('metab', 'member_of', 'hypernode'), ('prot', 'member_of', 'hypernode'), ('gene', 'edge', 'gene'), ('gene', 'edge', 'prot'), ('prot', 'edge', 'gene')]
  prot: 18557 nodes
  gene: 18557 nodes
  metab: 8457 nodes
  hypernode: 17427 nodes
  ('prot', 'edge', 'prot'): 31898 edges
  ('metab', 'member_of', 'hypernode'): 57102 edges
  ('prot', 'member_of', 'hypernode'): 17066 edges
  ('gene', 'edge', 'gene'): 15146 edges
  ('gene', 'edge', 'prot'): 18557 edges
  ('prot', 'edge', 'gene'): 15146 edges

17. Summary¶

In [43]:

Copied!





coup_count = G.edge_attributes.filter(pl.col("layer") == "coupling")["edge_id"].len()

rows = [
    ("Vertices",                 G.number_of_vertices(),          "prot + gene + metab"),
    ("Edges (total)",            G.number_of_edges(),             "all layers + coupling"),
    ("── Signaling",             len(layer_edges["signaling"]),   "directed, signed binary"),
    ("── Metabolic",             len(layer_edges["metabolic"]),   "directed hyperedges, stoichiometric"),
    ("── Protein complexes",     len(layer_edges["complex"]),     "undirected hyperedges"),
    ("── Regulatory (DoRothEA)", len(layer_edges["regulatory"]),  "directed TF→target, levels A+B"),
    ("── Coupling",              coup_count,                      "gene↔prot + TF bridges"),
    ("Organelle slices",         9,                               "mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell"),
    ("Condition slices",         2,                               "basal / EGF-stimulated"),
    ("Bridge TFs",               len(bridge_tfs),                 "signaling target + DoRothEA regulator"),
    ("Max participation coeff",  round(float(p_vals.max()), 3),   "prot in signaling+complex only"),
    ("NX edge inflation",        f"{binary_expansion_count / len(metabolic_edges_raw):.1f}×", "metabolic layer, hyperedges→binary"),
    ("Memory advantage vs NX",   f"{nx_mem / max(annnet_mem, 1e-9):.0f}×",                   "metabolic layer filter"),
    ("Serialized (.annnet)",     f"{os.path.getsize(out_path) / 1e6:.1f} MB",                "lossless round-trip verified"),
    ("CX2 export",               f"{os.path.getsize(cx2_path) / 1e6:.1f} MB",                "full graph, Cytoscape-ready"),
    ("PyG node types",           len(pyg_data.node_types),        "prot / gene / metab / hypernode"),
    ("PyG edge types",           len(pyg_data.edge_types),        "heterogeneous, reified hyperedges"),
]

print(f"{'Metric':<35} {'Value':>12}  Notes")
print("─" * 80)
for name, val, note in rows:
    print(f"{name:<35} {str(val):>12}  {note}")
coup_count = G.edge_attributes.filter(pl.col("layer") == "coupling")["edge_id"].len()

rows = [
    ("Vertices",                 G.number_of_vertices(),          "prot + gene + metab"),
    ("Edges (total)",            G.number_of_edges(),             "all layers + coupling"),
    ("── Signaling",             len(layer_edges["signaling"]),   "directed, signed binary"),
    ("── Metabolic",             len(layer_edges["metabolic"]),   "directed hyperedges, stoichiometric"),
    ("── Protein complexes",     len(layer_edges["complex"]),     "undirected hyperedges"),
    ("── Regulatory (DoRothEA)", len(layer_edges["regulatory"]),  "directed TF→target, levels A+B"),
    ("── Coupling",              coup_count,                      "gene↔prot + TF bridges"),
    ("Organelle slices",         9,                               "mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell"),
    ("Condition slices",         2,                               "basal / EGF-stimulated"),
    ("Bridge TFs",               len(bridge_tfs),                 "signaling target + DoRothEA regulator"),
    ("Max participation coeff",  round(float(p_vals.max()), 3),   "prot in signaling+complex only"),
    ("NX edge inflation",        f"{binary_expansion_count / len(metabolic_edges_raw):.1f}×", "metabolic layer, hyperedges→binary"),
    ("Memory advantage vs NX",   f"{nx_mem / max(annnet_mem, 1e-9):.0f}×",                   "metabolic layer filter"),
    ("Serialized (.annnet)",     f"{os.path.getsize(out_path) / 1e6:.1f} MB",                "lossless round-trip verified"),
    ("CX2 export",               f"{os.path.getsize(cx2_path) / 1e6:.1f} MB",                "full graph, Cytoscape-ready"),
    ("PyG node types",           len(pyg_data.node_types),        "prot / gene / metab / hypernode"),
    ("PyG edge types",           len(pyg_data.edge_types),        "heterogeneous, reified hyperedges"),
]

print(f"{'Metric':<35} {'Value':>12}  Notes")
print("─" * 80)
for name, val, note in rows:
    print(f"{name:<35} {str(val):>12}  {note}")

Metric                                     Value  Notes
────────────────────────────────────────────────────────────────────────────────
Vertices                                   45571  prot + gene + metab
Edges (total)                              98174  all layers + coupling
── Signaling                               31898  directed, signed binary
── Metabolic                               12971  directed hyperedges, stoichiometric
── Protein complexes                           0  undirected hyperedges
── Regulatory (DoRothEA)                   15146  directed TF→target, levels A+B
── Coupling                                    0  gene↔prot + TF bridges
Organelle slices                               9  mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell
Condition slices                               2  basal / EGF-stimulated
Bridge TFs                                   311  signaling target + DoRothEA regulator
Max participation coeff                      0.0  prot in signaling+complex only
NX edge inflation                           6.2×  metabolic layer, hyperedges→binary
Memory advantage vs NX                     3255×  metabolic layer filter
Serialized (.annnet)                      7.7 MB  lossless round-trip verified
CX2 export                              164.0 MB  full graph, Cytoscape-ready
PyG node types                                 4  prot / gene / metab / hypernode
PyG edge types                                 6  heterogeneous, reified hyperedges

In [ ]: