Use Case 2 — HEK293 Multilayer Biological Network¶
Constructs a cell-line-specific, multilayer heterogeneous graph of HEK293 proteins using:
- Signaling layer: OmniPath directed protein–protein interactions
- Metabolic layer: Human-GEM stoichiometric hyperedges (SBML [Systems Biology Markup Language])
- Complex layer: OmniPath protein complexes as undirected hyperedges
- Regulatory layer: DoRothEA TF [Transcription Factor]–target interactions (confidence levels A+B)
- Coupling edges: gene→protein translation links + TF protein→target gene bridges
Cell-line specificity is enforced throughout by filtering all interactions to proteins with protein-level expression evidence in the Human Protein Atlas (HPA [Human Protein Atlas]).
1. Imports¶
import os
import sys
import time
import json
import zipfile
import tracemalloc
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import polars as pl
import requests
import networkx as nx
import omnipath as op
sys.path.insert(0, os.path.abspath('..'))
from annnet.core.graph import AnnNet
from annnet.io.sbml import from_sbml
from annnet.io.cx2 import to_cx2
from annnet.adapters.networkx_adapter import to_nx
from annnet.adapters.pyg_adapter import to_pyg
2. Data Acquisition¶
All external files are cached locally under ./data/. Downloads are skipped if the file already exists.
DATA = Path('./data')
DATA.mkdir(exist_ok=True)
print(f'Cache dir: {DATA.resolve()}')
HPA_URL = 'https://www.proteinatlas.org/download/proteinatlas.tsv.zip'
HPA_ZIP = DATA / 'proteinatlas.tsv.zip'
HPA_TSV = DATA / 'proteinatlas.tsv'
if not HPA_TSV.exists():
print('Downloading HPA [Human Protein Atlas]... (~100 MB, takes a minute)')
r = requests.get(HPA_URL, stream=True)
with open(HPA_ZIP, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
with zipfile.ZipFile(HPA_ZIP, 'r') as z:
z.extractall(DATA)
print('Done.')
else:
print('Already cached.')
HPA_CELL_TSV = DATA / 'rna_celline.tsv'
if not HPA_CELL_TSV.exists():
print('Downloading HPA [Human Protein Atlas] cell-line RNA [Ribonucleic Acid] table...')
r = requests.get('https://www.proteinatlas.org/download/rna_celline.tsv.gz', stream=True)
with open(HPA_CELL_TSV, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
print('Done.')
else:
print('Already cached.')
HUMANGEM_XML = DATA / 'Human-GEM.xml'
if not HUMANGEM_XML.exists():
print('Downloading Human-GEM [Generic Enzyme Model]...')
r = requests.get(
'https://raw.githubusercontent.com/SysBioChalmers/Human-GEM/main/model/Human-GEM.xml',
stream=True,
allow_redirects=True,
)
print('Status:', r.status_code)
with open(HUMANGEM_XML, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024 * 1024):
f.write(chunk)
print(f'Done. Size: {HUMANGEM_XML.stat().st_size} bytes')
else:
print(f'Already cached. Size: {HUMANGEM_XML.stat().st_size} bytes')
COMPLEXES_TSV = DATA / 'omnipath_complexes.tsv'
if not COMPLEXES_TSV.exists():
r = requests.get(
'https://omnipathdb.org/complexes?databases=CORUM,ComplexPortal,hu.MAP2&fields=sources&format=tsv',
timeout=120,
)
print('Status:', r.status_code)
with open(COMPLEXES_TSV, 'w') as f:
f.write(r.text)
print('Downloaded complexes.')
else:
print('Already cached:', COMPLEXES_TSV)
complexes_raw = pl.read_csv(COMPLEXES_TSV, separator='\t', infer_schema_length=5000)
print('Shape:', complexes_raw.shape)
print('Columns:', complexes_raw.columns)
print(complexes_raw.head(3))
DOROTHEA_TSV = DATA / 'dorothea.tsv'
if not DOROTHEA_TSV.exists():
print('Downloading DoRothEA [Discriminant Regulon Expression Analysis]...')
r = requests.get(
'https://omnipathdb.org/interactions?datasets=dorothea&dorothea_levels=A,B'
'&genesymbols=yes&fields=sources,references&format=tsv',
timeout=120,
)
print('Status:', r.status_code)
with open(DOROTHEA_TSV, 'w') as f:
f.write(r.text)
print(f'Done. Size: {DOROTHEA_TSV.stat().st_size} bytes')
else:
print('Already cached.')
dorothea = pl.read_csv(DOROTHEA_TSV, separator='\t', infer_schema_length=5000)
print('Shape:', dorothea.shape)
print('Columns:', dorothea.columns)
print(dorothea.head(3))
3. HEK293 Expressed Protein Set¶
Filter the HPA [Human Protein Atlas] atlas to genes with protein-level experimental evidence. This is the conservative but reproducible criterion for "expressed in HEK293": only proteins for which mass-spectrometry or antibody-based evidence exists are retained.
hpa_raw = pl.read_csv(HPA_TSV, separator='\t', infer_schema_length=10000)
print('Columns:', hpa_raw.columns[:10])
print('Shape:', hpa_raw.shape)
# The main atlas file contains per-gene summaries. Filter to protein-level evidence
# (mass-spectrometry or antibody-validated) — these are confirmed translated proteins.
hpa_expressed = hpa_raw.filter(
pl.col('Evidence').str.contains('protein level')
| pl.col('UniProt evidence').str.contains('protein level')
).select(
['Gene', 'Uniprot', 'Subcellular main location', 'Subcellular additional location', 'Evidence']
)
print('Expressed proteins (protein-level evidence):', hpa_expressed.shape)
print(hpa_expressed.head(5))
hek293_genes = set(hpa_expressed['Gene'].to_list())
print(f'HEK293 [Human Embryonic Kidney 293] expressed gene set size: {len(hek293_genes)}')
4. Vertex Table Construction¶
Two vertex types are created per expressed gene:
prot:GENENAME— the protein product, localised to its HPA [Human Protein Atlas] subcellular compartmentgene:GENENAME— the gene locus, always assigned to the nucleus (n)
GEM [Generic Enzyme Model] single-letter compartment codes are used throughout.
# Maps HPA [Human Protein Atlas] location strings to GEM [Generic Enzyme Model] single-letter compartment codes.
# First match wins; unmapped locations default to cytoplasm ('c').
COMPARTMENT_MAP = {
'Mitochondria': 'm',
'Nucleus': 'n',
'Nucleoplasm': 'n',
'Nucleoli': 'n',
'Nuclear membrane': 'n',
'Endoplasmic reticulum': 'r',
'Golgi apparatus': 'g',
'Lysosome': 'l',
'Cytosol': 'c',
'Cytoplasm': 'c',
'Plasma membrane': 'p',
'Cell Junctions': 'p',
'Peroxisome': 'x',
'Vesicles': 'v',
'Extracellular': 'e',
}
def map_compartment(location_str: str | None) -> str:
if not location_str:
return 'c'
loc_lower = location_str.lower()
for key, code in COMPARTMENT_MAP.items():
if key.lower() in loc_lower:
return code
return 'c'
hpa_expressed = hpa_expressed.with_columns(
[
pl.col('Subcellular main location')
.map_elements(map_compartment, return_dtype=pl.Utf8)
.alias('compartment')
]
)
print(hpa_expressed.select(['Gene', 'compartment']).head(10))
print('\nCompartment distribution:')
print(hpa_expressed['compartment'].value_counts().sort('count', descending=True))
vertex_proteins = hpa_expressed.select(
[
pl.concat_str([pl.lit('prot:'), pl.col('Gene')]).alias('vertex_id'),
pl.col('Gene').alias('gene_symbol'),
pl.col('Uniprot').alias('uniprot_id'),
pl.lit('protein').alias('entity_type'),
pl.col('compartment'),
pl.col('Subcellular main location').alias('hpa_location'),
pl.lit(None).alias('recon3d_id'),
]
)
# Genes always localised to nucleus — they exist as separate vertices to
# support gene→protein coupling and TF [Transcription Factor]→target edges.
vertex_genes = hpa_expressed.select(
[
pl.concat_str([pl.lit('gene:'), pl.col('Gene')]).alias('vertex_id'),
pl.col('Gene').alias('gene_symbol'),
pl.col('Uniprot').alias('uniprot_id'),
pl.lit('gene').alias('entity_type'),
pl.lit('n').alias('compartment'),
pl.col('Subcellular main location').alias('hpa_location'),
pl.lit(None).alias('recon3d_id'),
]
)
vertex_table = pl.concat([vertex_proteins, vertex_genes])
print('Vertex table shape:', vertex_table.shape)
print(vertex_table.head(5))
vertex_table = vertex_table.with_columns(pl.col('compartment').fill_null('c'))
print('Null compartments remaining:', vertex_table['compartment'].null_count())
print('Compartment distribution:')
print(vertex_table['compartment'].value_counts().sort('count', descending=True))
5. AnnNet Graph Initialisation¶
G = AnnNet(directed=True)
G.add_vertices_bulk(vertex_table.to_dicts())
print('Vertices loaded:', G.num_vertices)
print('Sample vertex attributes:')
print(G.vertex_attributes.head(5))
# Initialise the Kivelä multilayer structure with four biological aspects.
G.set_aspects(['type'], {'type': ['signaling', 'metabolic', 'complex', 'regulatory']})
for row in G.vertex_attributes.iter_rows(named=True):
vid = row['vertex_id']
etype = row['entity_type']
if etype == 'protein':
G.add_presence(vid, ('signaling',))
G.add_presence(vid, ('complex',))
elif etype == 'gene':
G.add_presence(vid, ('regulatory',))
print(f'Aspects : {G.aspects}')
print(f'Elem layers: {G.elem_layers}')
print(f'_VM size : {len(G._VM)}')
6. Signaling Layer — OmniPath Directed PPI [Protein–Protein Interaction]¶
signaling_raw = op.interactions.OmniPath.get(
genesymbols=True,
fields=['sources', 'references', 'curation_effort'],
)
print('Columns:', signaling_raw.columns.tolist())
print(signaling_raw.head(3))
sig = pl.from_pandas(
signaling_raw[
[
'source_genesymbol',
'target_genesymbol',
'is_stimulation',
'is_inhibition',
'consensus_direction',
'consensus_stimulation',
'consensus_inhibition',
'sources',
'references',
]
].dropna(subset=['source_genesymbol', 'target_genesymbol'])
)
sig = sig.filter(
pl.col('source_genesymbol').is_in(hek293_genes)
& pl.col('target_genesymbol').is_in(hek293_genes)
)
print('Signaling edges after HEK293 [Human Embryonic Kidney 293] filter:', sig.shape)
print(sig.head(3))
sig_tuples = []
for row in sig.iter_rows(named=True):
src = f'prot:{row["source_genesymbol"]}'
tgt = f'prot:{row["target_genesymbol"]}'
sign = 1.0 if row['is_stimulation'] else (-1.0 if row['is_inhibition'] else 0.0)
sig_tuples.append((src, tgt, sign))
eids = G.add_intra_edges_bulk(sig_tuples, ('signaling',), fast_mode=True)
sig_attrs = {
eid: {
'edge_type': 'signaling',
'layer': 'signaling',
'is_stimulation': row['is_stimulation'],
'is_inhibition': row['is_inhibition'],
'consensus_dir': row['consensus_direction'],
'sources': row['sources'],
}
for eid, row in zip(eids, sig.iter_rows(named=True), strict=False)
}
G.set_edge_attrs_bulk(sig_attrs)
print(f'Signaling edges added: {len(eids)}')
print(f'Total edges: {G.num_edges}')
7. Metabolic Layer — Human-GEM [Generic Enzyme Model] Stoichiometric Hyperedges¶
Human-GEM is loaded directly via the AnnNet SBML [Systems Biology Markup Language] adapter. Each reaction becomes a directed hyperedge with stoichiometric coefficients preserved in the incidence matrix.
from_sbml(str(HUMANGEM_XML), graph=G, slice='metabolic', preserve_stoichiometry=True)
print('Vertices in graph:', G.num_vertices)
print('Edges in graph :', G.num_edges)
print('Slices :', list(G._slices.keys()))
# Register metabolite vertices in the metabolic layer presence map.
added = 0
for vid in G._slices['metabolic']['vertices']:
if G.entity_types.get(vid) == 'vertex' and not vid.startswith('__'):
G.add_presence(vid, ('metabolic',))
added += 1
print(f'Metabolite presences registered: {added}')
print(f'_VM size now: {len(G._VM)}')
metabolic_edges = G._slices['metabolic']['edges']
metabolic_verts = G._slices['metabolic']['vertices']
sample_eid = next(iter(metabolic_edges))
hdef = G.hyperedge_definitions[sample_eid]
print('Metabolic edges :', len(metabolic_edges))
print('Metabolic vertices:', len(metabolic_verts))
print('Total vertices :', G.num_vertices)
print('Total edges :', G.num_edges)
print(f'\nSample hyperedge : {sample_eid}')
print('Head (products) :', hdef['head'])
print('Tail (reactants) :', hdef['tail'])
col = G.edge_to_idx[sample_eid]
mat = G._matrix.tocsc()
col_data = mat.getcol(col)
rows, _ = col_data.nonzero()
print('\nStoichiometric coefficients:')
for r in rows:
vid = G.idx_to_entity[r]
coeff = col_data[r, 0]
print(f' {vid}: {coeff}')
8. Complex Layer — OmniPath Protein Complexes as Undirected Hyperedges¶
Only complexes where all subunits are HEK293-expressed are retained, ensuring the complex layer is fully cell-line-specific.
complex_hyperedges = []
complex_attr_records = []
skipped = 0
for row in complexes_raw.iter_rows(named=True):
members_raw = row['components_genesymbols']
if not members_raw:
continue
members = members_raw.split('_')
members_expressed = [m for m in members if m in hek293_genes]
if len(members_expressed) < len(members):
skipped += 1
continue
members_vids = [f'prot:{m}' for m in members_expressed]
eid = f'cpx:{row["name"]}'
complex_hyperedges.append(
{
'edge_id': eid,
'members': members_vids,
'weight': 1.0,
}
)
complex_attr_records.append(
{
'edge_id': eid,
'edge_type': 'complex',
'name': row['name'],
'sources': row['sources'],
'stoichiometry': row['stoichiometry'],
}
)
print(f'Complexes kept : {len(complex_hyperedges)}')
print(f'Complexes skipped (members not in HEK293 [Human Embryonic Kidney 293]): {skipped}')
print(f'Sample: {complex_hyperedges[0]}')
G.add_hyperedges_bulk(complex_hyperedges)
print('Total edges after complexes:', G.num_edges)
print('Total vertices :', G.num_vertices)
9. Regulatory Layer — DoRothEA [Discriminant Regulon Expression Analysis] TF [Transcription Factor]–Target Interactions¶
Confidence levels A and B only (experimentally supported). Edges run between
gene:TF and gene:target vertices in the regulatory layer.
reg = dorothea.filter(
pl.col('source_genesymbol').is_in(hek293_genes)
& pl.col('target_genesymbol').is_in(hek293_genes)
)
print('Regulatory edges after HEK293 [Human Embryonic Kidney 293] filter:', reg.shape)
print('Unique TFs [Transcription Factors]:', reg['source_genesymbol'].n_unique())
reg_tuples = [
(
f'gene:{row["source_genesymbol"]}',
f'gene:{row["target_genesymbol"]}',
1.0 if row['is_stimulation'] else (-1.0 if row['is_inhibition'] else 0.0),
)
for row in reg.iter_rows(named=True)
]
reg_eids = G.add_intra_edges_bulk(reg_tuples, ('regulatory',), fast_mode=True)
reg_attrs = {
eid: {
'edge_type': 'regulatory',
'layer': 'regulatory',
'tf': row['source_genesymbol'],
'target_gene': row['target_genesymbol'],
'sources': row['sources'],
}
for eid, row in zip(reg_eids, reg.iter_rows(named=True), strict=False)
}
G.set_edge_attrs_bulk(reg_attrs)
print(f'Regulatory edges added: {len(reg_eids)}')
print(f'Total edges: {G.num_edges}')
10. Cross-Layer Coupling Edges¶
Two coupling edge types link the regulatory and signaling layers:
gene:X → prot:X— translation: every expressed gene produces its protein productprot:TF → gene:target— TF [Transcription Factor] protein activity drives target gene regulation
coupling_records = []
for i, gene in enumerate(hek293_genes):
gene_vid = f'gene:{gene}'
prot_vid = f'prot:{gene}'
if gene_vid in G.entity_to_idx and prot_vid in G.entity_to_idx:
coupling_records.append(
{
'edge_id': f'coup_gp_{i:06d}',
'source': gene_vid,
'target': prot_vid,
'directed': True,
'weight': 1.0,
'edge_type': 'coupling',
'coupling_type': 'gene_to_protein',
}
)
print(f'gene→protein coupling edges: {len(coupling_records)}')
G.add_edges_bulk(coupling_records)
print('Total edges:', G.num_edges)
tf_coupling = []
for j, row in enumerate(reg.iter_rows(named=True)):
tf_prot = f'prot:{row["source_genesymbol"]}'
tgt_gene = f'gene:{row["target_genesymbol"]}'
if tf_prot in G.entity_to_idx and tgt_gene in G.entity_to_idx:
mor = 1 if row['is_stimulation'] else (-1 if row['is_inhibition'] else 0)
tf_coupling.append(
{
'edge_id': f'coup_tf_{j:06d}',
'source': tf_prot,
'target': tgt_gene,
'directed': True,
'weight': float(mor),
'edge_type': 'coupling',
'coupling_type': 'tf_protein_to_gene',
}
)
print(f'TF [Transcription Factor] protein→gene coupling edges: {len(tf_coupling)}')
G.add_edges_bulk(tf_coupling)
print('Total edges:', G.num_edges)
11. Edge Attribute Consolidation¶
Rewrite the layer and edge_type columns on edge_attributes as a clean,
null-free join — overwriting any stale values left by incremental inserts.
tag_map = {}
for eid in G.layer_edge_set(('signaling',)):
tag_map[eid] = {'layer': 'signaling', 'edge_type': 'signaling'}
for eid in G.layer_edge_set(('regulatory',)):
tag_map[eid] = {'layer': 'regulatory', 'edge_type': 'regulatory'}
for eid in G._slices['metabolic']['edges']:
tag_map[eid] = {'layer': 'metabolic', 'edge_type': 'metabolic'}
for rec in complex_attr_records:
tag_map[rec['edge_id']] = {'layer': 'complex', 'edge_type': 'complex'}
for rec in coupling_records + tf_coupling:
tag_map[rec['edge_id']] = {'layer': 'coupling', 'edge_type': 'coupling'}
print(f'Tags built: {len(tag_map)}')
tag_df = pl.DataFrame(
[
{'edge_id': eid, 'layer': v['layer'], 'edge_type': v['edge_type']}
for eid, v in tag_map.items()
]
)
drop_cols = [c for c in G.edge_attributes.columns if c in ('layer', 'edge_type', 'layer_right')]
G.edge_attributes = G.edge_attributes.drop(drop_cols).join(tag_df, on='edge_id', how='left')
print('edge_attributes shape:', G.edge_attributes.shape)
print('Layer distribution:')
print(G.edge_attributes['layer'].value_counts().sort('count', descending=True))
print('edge_type nulls:', G.edge_attributes['edge_type'].null_count())
12. Organelle and Condition Slices¶
Named slices provide fast access to organelle-localised or condition-specific subgraphs. SBML [Systems Biology Markup Language] import creates metabolite-only slices; this section enriches them with protein vertices from HPA [Human Protein Atlas] compartment assignments.
ORGANELLE_SLICES = {
'm': 'mitochondria',
'n': 'nucleus',
'r': 'er',
'l': 'lysosome',
'c': 'cytoplasm',
'g': 'golgi',
'x': 'peroxisome',
'p': 'plasma_membrane',
'e': 'extracellular',
}
for code, name in ORGANELLE_SLICES.items():
prot_vids = G.vertex_attributes.filter(
(pl.col('compartment') == code) & (pl.col('entity_type') == 'protein')
)['vertex_id'].to_list()
existing = G._slices.get(code, {}).get('vertices', set())
vid_set = set(prot_vids) | existing
slice_edges = []
for eid, (src, tgt, _) in G.edge_definitions.items():
if src in vid_set or tgt in vid_set:
slice_edges.append(eid)
for eid, hdef in G.hyperedge_definitions.items():
members = hdef.get('head', set()) | hdef.get('tail', set()) | hdef.get('members', set())
if members & vid_set:
slice_edges.append(eid)
if name not in G._slices:
G._slices[name] = {'vertices': set(), 'edges': set(), 'attributes': {}}
G._slices[name]['vertices'] = vid_set
G._slices[name]['edges'] = set(slice_edges)
print(f'{name:20s}: {len(vid_set):6d} vertices, {len(slice_edges):6d} edges')
basal_edges = set(G.edge_attributes.filter(pl.col('layer') == 'signaling')['edge_id'].to_list())
basal_verts = set()
for eid in basal_edges:
edef = G.edge_definitions.get(eid)
if edef:
basal_verts.add(edef[0])
basal_verts.add(edef[1])
G._slices['basal'] = {'vertices': basal_verts, 'edges': basal_edges, 'attributes': {}}
print(f'Basal slice : {len(basal_verts)} vertices, {len(basal_edges)} edges')
EGF_PROTEINS = {
'prot:EGFR',
'prot:GRB2',
'prot:SOS1',
'prot:KRAS',
'prot:BRAF',
'prot:MAP2K1',
'prot:MAPK1',
'prot:MAPK3',
'prot:AKT1',
'prot:PIK3CA',
'prot:PTEN',
'prot:MTOR',
'prot:RPS6KB1',
'prot:ELK1',
'prot:MYC',
'prot:JUN',
'prot:FOS',
'prot:STAT3',
'prot:SRC',
'prot:SHC1',
}
egf_edges = set()
egf_verts = set()
for eid in basal_edges:
edef = G.edge_definitions.get(eid)
if edef and (edef[0] in EGF_PROTEINS or edef[1] in EGF_PROTEINS):
egf_edges.add(eid)
egf_verts.add(edef[0])
egf_verts.add(edef[1])
G._slices['egf_stimulated'] = {'vertices': egf_verts, 'edges': egf_edges, 'attributes': {}}
print(
f'EGF [Epidermal Growth Factor]-stimulated : {len(egf_verts)} vertices, {len(egf_edges)} edges'
)
print(f'\nAll slices: {list(G._slices.keys())}')
13. Biological Queries¶
# Query 1 — Mito-ER [Endoplasmic Reticulum] contact site edges (cross-slice intersection)
contact_edges = G._slices['mitochondria']['edges'] & G._slices['er']['edges']
print(f'Mito-ER [Endoplasmic Reticulum] contact site edges: {len(contact_edges)}')
for eid in list(contact_edges)[:5]:
hdef = G.hyperedge_definitions.get(eid)
edef = G.edge_definitions.get(eid)
if hdef:
members = hdef.get('members') or (hdef.get('head', set()) | hdef.get('tail', set()))
print(f' hyperedge {eid}: {members}')
elif edef:
print(f' edge {eid}: {edef[0]} → {edef[1]}')
# Query 2 — Organelle-specific hub proteins (degree within each organelle slice)
organelles = ['mitochondria', 'nucleus', 'er', 'lysosome', 'cytoplasm', 'golgi', 'plasma_membrane']
print('Top 5 hub proteins per organelle:\n')
for org in organelles:
verts = G._slices[org]['vertices']
edges = G._slices[org]['edges']
degree = Counter()
for eid in edges:
edef = G.edge_definitions.get(eid)
if edef:
if edef[0] in verts and edef[0].startswith('prot:'):
degree[edef[0]] += 1
if edef[1] in verts and edef[1].startswith('prot:'):
degree[edef[1]] += 1
top5 = degree.most_common(5)
print(f'{org}:')
for prot, deg in top5:
print(f' {prot:30s} degree={deg}')
print()
# Query 3 — Signal-to-transcription bridge TFs [Transcription Factors]
# A bridge TF [Transcription Factor] is both a target of signaling AND a regulator in the regulatory layer.
sig_target_prots = set()
for eid in G.edge_attributes.filter(pl.col('layer') == 'signaling')['edge_id'].to_list():
edef = G.edge_definitions.get(eid)
if edef:
sig_target_prots.add(edef[1].replace('prot:', ''))
reg_tf_genes = set()
for eid in G.edge_attributes.filter(pl.col('layer') == 'regulatory')['edge_id'].to_list():
edef = G.edge_definitions.get(eid)
if edef:
reg_tf_genes.add(edef[0].replace('gene:', ''))
bridge_tfs = sig_target_prots & reg_tf_genes
print(f'Signal-to-transcription bridge TFs [Transcription Factors]: {len(bridge_tfs)}')
print('\nFirst 20 bridge TFs [Transcription Factors] (alphabetical):')
for tf in sorted(bridge_tfs)[:20]:
print(f' {tf}')
# Query 4 — EGF [Epidermal Growth Factor]-stimulated vs basal edge comparison
egf_only = G._slices['egf_stimulated']['edges'] - G._slices['basal']['edges']
basal_only = G._slices['basal']['edges'] - G._slices['egf_stimulated']['edges']
shared = G._slices['egf_stimulated']['edges'] & G._slices['basal']['edges']
print(
f'EGF [Epidermal Growth Factor]-stimulated slice edges : {len(G._slices["egf_stimulated"]["edges"])}'
)
print(f'Basal slice edges : {len(G._slices["basal"]["edges"])}')
print(f'Shared (always active) : {len(shared)}')
print(f'EGF [Epidermal Growth Factor]-only edges : {len(egf_only)}')
print(f'Basal-only (EGF-suppressed) : {len(basal_only)}')
print('\nSample EGF [Epidermal Growth Factor]-stimulated edges:')
for eid in list(G._slices['egf_stimulated']['edges'])[:8]:
edef = G.edge_definitions.get(eid)
if edef:
ea_row = G.edge_attributes.filter(pl.col('edge_id') == eid)
layer = ea_row['layer'][0] if len(ea_row) > 0 else '?'
print(f' {edef[0]:25s} → {edef[1]:25s} [{layer}]')
14. Benchmark — AnnNet Hyperedges vs NetworkX [Network Analysis in Python] Binary Expansion¶
Demonstrates the memory and fidelity advantage of native hyperedge representation over the conventional approach of expanding every reaction into all pairwise binary edges.
metabolic_eids = [
eid
for eid, kind in G.edge_kind.items()
if kind == 'hyper' and eid in G._slices['metabolic']['edges']
]
metabolic_edges_raw = [
(
list(G.hyperedge_definitions[eid].get('head', [])),
list(G.hyperedge_definitions[eid].get('tail', [])),
eid,
)
for eid in metabolic_eids
]
binary_expansion_count = sum(len(h) * len(t) for h, t, _ in metabolic_edges_raw)
print(f'Metabolic hyperedges (AnnNet): {len(metabolic_edges_raw)}')
print(f'Binary edges needed (NX [NetworkX] expansion): {binary_expansion_count}')
print(f'Edge inflation factor: {binary_expansion_count / len(metabolic_edges_raw):.1f}x')
tracemalloc.start()
_ = list(G._slices['metabolic']['edges'])
annnet_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()
tracemalloc.start()
t0 = time.perf_counter()
NX, _ = to_nx(G, hyperedge_mode='expand', slices=['metabolic'])
t_nx = time.perf_counter() - t0
nx_mem = tracemalloc.get_traced_memory()[1]
tracemalloc.stop()
print(
f'\nNetworkX [Network Analysis in Python] (expanded): {NX.number_of_nodes()} nodes, {NX.ne} edges'
)
print(f' Build time : {t_nx:.3f}s')
print(f' Peak memory: {nx_mem / 1e6:.1f} MB')
print(f'\nAnnNet metabolic slice memory : {annnet_mem / 1e6:.4f} MB')
print('Stoichiometry preserved in AnnNet : YES')
print('Stoichiometry preserved in NetworkX : NO (binary adjacency only)')
15. Multilayer Metrics¶
# Participation coefficient: fraction of a vertex's edges that span multiple layers.
# Computed manually to correctly handle hyperedges alongside binary edges.
layers = ['signaling', 'metabolic', 'complex', 'regulatory']
layer_edges = {
l: set(G.edge_attributes.filter(pl.col('layer') == l)['edge_id'].to_list()) for l in layers
}
vertex_layer_deg = defaultdict(lambda: defaultdict(int))
for layer, eids in layer_edges.items():
for eid in eids:
ed = G.edge_definitions.get(eid)
hd = G.hyperedge_definitions.get(eid)
if hd is not None:
involved = hd.get('head', set()) | hd.get('tail', set()) | hd.get('members', set())
for v in involved:
if v and not v.startswith('__'):
vertex_layer_deg[v][layer] += 1
elif ed is not None and ed[0] is not None:
vertex_layer_deg[ed[0]][layer] += 1
vertex_layer_deg[ed[1]][layer] += 1
participation = {
v: 1.0 - sum((ld[l] / sum(ld.values())) ** 2 for l in ld)
for v, ld in vertex_layer_deg.items()
if sum(ld.values()) > 0
}
p_vals = np.array(list(participation.values()))
print('Participation coefficient (all 4 layers, covers hyperedges):')
print(f' Vertices with edges : {len(p_vals)}')
print(f' Mean P : {p_vals.mean():.4f}')
print(f' Max P : {p_vals.max():.4f}')
print(f' P>0.3 (multi-layer) : {(p_vals > 0.3).sum()}')
top10 = sorted(participation.items(), key=lambda x: -x[1])[:10]
print(' Top 10:')
for v, p in top10:
ld = vertex_layer_deg[v]
print(f' {v:35s} P={p:.3f} ' + ' '.join(f'{l[:3]}={ld.get(l, 0)}' for l in layers))
# Supra-adjacency matrix over binary layers only (signaling + regulatory).
print('Supra-adjacency (signaling + regulatory layers — binary intra edges only):')
A = G.supra_adjacency(['signaling', 'regulatory'])
print(f' Shape : {A.shape}')
print(f' Non-zeros : {A.nnz}')
# EGF [Epidermal Growth Factor]-stimulated subgraph component and hub analysis.
G_egf, _ = to_nx(G, slices=['egf_stimulated'], hyperedge_mode='skip')
G_egf_u = G_egf.to_undirected()
components = sorted(nx.connected_components(G_egf_u), key=len, reverse=True)
print(f'EGF [Epidermal Growth Factor] subgraph : {G_egf.number_of_nodes()} nodes, {G_egf.ne} edges')
print(f'Components : {len(components)}')
print(f'Largest component : {len(components[0])} nodes')
largest = G_egf_u.subgraph(components[0]).copy()
density = nx.density(largest)
avg_deg = sum(d for _, d in largest.degree()) / largest.number_of_nodes()
top_hubs = sorted(largest.degree(), key=lambda x: x[1], reverse=True)[:5]
print('\nLargest component:')
print(f' Density : {density:.2e}')
print(f' Avg degree : {avg_deg:.2f}')
print(f' Top 5 hubs : {[(v, d) for v, d in top_hubs]}')
16. Serialisation and Export¶
out_path = DATA / 'uc2_hek293.annnet'
G.write(str(out_path), overwrite=True)
print(f'Written: {out_path} ({os.path.getsize(out_path) / 1e6:.1f} MB)')
G2 = AnnNet.read(str(out_path))
print('\nRound-trip check:')
print(f' vertices orig={G.nv} reloaded={G2.nv} match={G.nv == G2.nv}')
print(f' edges orig={G.ne} reloaded={G2.ne} match={G.ne == G2.ne}')
print(
f' slices orig={len(G._slices)} reloaded={len(G2._slices)} '
f'match={len(G._slices) == len(G2._slices)}'
)
sample_met = next(iter(layer_edges['metabolic']))
print(f'\n Stoichiometry preserved for {sample_met}:')
print(f' orig : {G.hyperedge_definitions[sample_met]}')
print(f' reloaded : {G2.hyperedge_definitions.get(sample_met)}')
cx2_data = to_cx2(G, export_name='UC2_HEK293_signaling', hyperedges='skip')
cx2_path = DATA / 'uc2_signaling.cx2'
with open(cx2_path, 'w') as f:
json.dump(cx2_data, f)
print(
f'CX2 [Cytoscape Exchange Format v2] written: {cx2_path} ({os.path.getsize(cx2_path) / 1e6:.1f} MB)'
)
print(f'Aspects: {[list(a.keys())[0] for a in cx2_data]}')
# Map entity_type to short kind labels expected by the PyG [PyTorch Geometric] adapter.
kind_map = {'protein': 'prot', 'gene': 'gene'}
G.vertex_attributes = G.vertex_attributes.with_columns(
pl.col('entity_type').replace_strict(kind_map, default='metab').alias('kind')
)
print('Kind value counts:', G.vertex_attributes['kind'].value_counts())
pyg_data = to_pyg(G, hyperedge_mode='reify', device='cpu')
print(f'\nHeteroData node types : {pyg_data.node_types}')
print(f'HeteroData edge types : {pyg_data.edge_types}')
for nt in pyg_data.node_types:
print(f' {nt}: {pyg_data[nt].num_nodes} nodes')
for et in pyg_data.edge_types:
print(f' {et}: {pyg_data[et].edge_index.shape[1]} edges')
17. Summary¶
coup_count = G.edge_attributes.filter(pl.col('layer') == 'coupling')['edge_id'].len()
rows = [
('Vertices', G.nv, 'prot + gene + metab'),
('Edges (total)', G.ne, 'all layers + coupling'),
('── Signaling', len(layer_edges['signaling']), 'directed, signed binary'),
('── Metabolic', len(layer_edges['metabolic']), 'directed hyperedges, stoichiometric'),
('── Protein complexes', len(layer_edges['complex']), 'undirected hyperedges'),
('── Regulatory (DoRothEA)', len(layer_edges['regulatory']), 'directed TF→target, levels A+B'),
('── Coupling', coup_count, 'gene↔prot + TF bridges'),
('Organelle slices', 9, 'mito/nucleus/ER/lysosome/cyto/golgi/perox/PM/extracell'),
('Condition slices', 2, 'basal / EGF-stimulated'),
('Bridge TFs', len(bridge_tfs), 'signaling target + DoRothEA regulator'),
('Max participation coeff', round(float(p_vals.max()), 3), 'prot in signaling+complex only'),
(
'NX edge inflation',
f'{binary_expansion_count / len(metabolic_edges_raw):.1f}×',
'metabolic layer, hyperedges→binary',
),
('Memory advantage vs NX', f'{nx_mem / max(annnet_mem, 1e-9):.0f}×', 'metabolic layer filter'),
(
'Serialized (.annnet)',
f'{os.path.getsize(out_path) / 1e6:.1f} MB',
'lossless round-trip verified',
),
('CX2 export', f'{os.path.getsize(cx2_path) / 1e6:.1f} MB', 'full graph, Cytoscape-ready'),
('PyG node types', len(pyg_data.node_types), 'prot / gene / metab / hypernode'),
('PyG edge types', len(pyg_data.edge_types), 'heterogeneous, reified hyperedges'),
]
print(f'{"Metric":<35} {"Value":>12} Notes')
print('─' * 80)
for name, val, note in rows:
print(f'{name:<35} {str(val):>12} {note}')