Multi-Condition Causal Signaling
In [1]:
Copied!
"""
UC1: Multi-Condition Causal Signaling with AnnNet as the Hub
Dependencies:
pip install annnet cptac decoupler omnipath corneto torch-geometric polars narwhals
"""
import time
import polars as pl
import pandas as pd
import numpy as np
import cptac
import decoupler as dc
import corneto as cn
from corneto.methods.signalling.carnival import multi_carnival
import sys
import os
sys.path.insert(0, os.path.abspath(".."))
from annnet.core.graph import AnnNet
from annnet import AnnNet
from annnet.io.read_omnipath import read_omnipath
"""
UC1: Multi-Condition Causal Signaling with AnnNet as the Hub
Dependencies:
pip install annnet cptac decoupler omnipath corneto torch-geometric polars narwhals
"""
import time
import polars as pl
import pandas as pd
import numpy as np
import cptac
import decoupler as dc
import corneto as cn
from corneto.methods.signalling.carnival import multi_carnival
import sys
import os
sys.path.insert(0, os.path.abspath(".."))
from annnet.core.graph import AnnNet
from annnet import AnnNet
from annnet.io.read_omnipath import read_omnipath
/home/l1boll/miniconda3/envs/myenv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
In [ ]:
Copied!
# =============================================================================
# STEP 1 — Load OmniPath into AnnNet
# All metadata into vertex_attributes / edge_attributes.
# =============================================================================
# Load the core signed directed PPI + kinase-substrate network.
# read_omnipath handles fetch, cache (~114MB one-time), and bulk loading.
G = read_omnipath(
    dataset="omnipath",
    query={"organism": "human", "genesymbols": True},
    # Vertices are keyed by gene symbol (not UniProt ID) so CPTAC expression
    # rows can later be matched to graph vertices by symbol.
    source_col="source_genesymbol",
    target_col="target_genesymbol",
    # Evidence columns kept as edge_attributes; used downstream for the
    # "trusted edges" filter and the CORNETO sign assignment.
    edge_attr_cols=[
        "is_stimulation",
        "is_inhibition",
        "consensus_direction",
        "consensus_stimulation",
        "consensus_inhibition",
        "n_sources",
        "n_references",
        "sources",
    ],
    slice="omnipath_pkn",
    # Annotation resources pulled from the OmniPath annotations endpoint and
    # attached as vertex attributes (cached locally after first fetch).
    vertex_annotation_sources=[
        "HGNC",
        "CancerGeneCensus",
        "SignaLink_function",
        "SignaLink_pathway",
        "UniProt_location",
        "IntOGen",
        "kinase.com",
        "Phosphatome",
    ],
    load_vertex_annotations=True,
)
# =============================================================================
# STEP 1 — Load OmniPath into AnnNet
# All metadata into vertex_attributes / edge_attributes.
# =============================================================================
# Load the core signed directed PPI + kinase-substrate network.
# read_omnipath handles fetch, cache (~114MB one-time), and bulk loading.
G = read_omnipath(
dataset="omnipath",
query={"organism": "human", "genesymbols": True},
source_col="source_genesymbol",
target_col="target_genesymbol",
edge_attr_cols=[
"is_stimulation",
"is_inhibition",
"consensus_direction",
"consensus_stimulation",
"consensus_inhibition",
"n_sources",
"n_references",
"sources",
],
slice="omnipath_pkn",
vertex_annotation_sources=[
"HGNC",
"CancerGeneCensus",
"SignaLink_function",
"SignaLink_pathway",
"UniProt_location",
"IntOGen",
"kinase.com",
"Phosphatome",
],
load_vertex_annotations=True,
)
2026-03-09 13:48:08 | [INFO] Downloading data from `https://omnipathdb.org/queries/enzsub?format=json` 2026-03-09 13:48:08 | [INFO] Downloading data from `https://omnipathdb.org/queries/interactions?format=json` 2026-03-09 13:48:08 | [INFO] Downloading data from `https://omnipathdb.org/queries/complexes?format=json` 2026-03-09 13:48:09 | [INFO] Downloading data from `https://omnipathdb.org/queries/annotations?format=json` 2026-03-09 13:48:09 | [INFO] Downloading data from `https://omnipathdb.org/queries/intercell?format=json` 2026-03-09 13:48:09 | [INFO] Downloading data from `https://omnipathdb.org/about?format=text`
[timing] fetch/receive df: 4.420s
[timing] column resolution: 0.0115s
source='source_genesymbol' target='target_genesymbol' directed='is_directed'
edge_attr_cols (8): ['is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition', 'n_sources', 'n_references', 'sources']
[timing] AnnNet init: 0.001s (pre-sized n=85217 e=85217)
[timing] _to_dicts: 0.309s (85217 rows)
[timing] bulk list build: 0.264s (85217 edges)
[timing] add_edges_bulk: 2.344s
vertices=8791 edges=85217
[vertex annotations] loading from cache: /home/l1boll/.cache/annnet/omnipath_annotations.tsv.gz
In [3]:
Copied!
print(G.shape)
print(G.vertex_attributes.head(2))
print(G.edge_attributes.head(2))
print(G.shape)
print(G.vertex_attributes.head(2))
print(G.edge_attributes.head(2))
(8791, 85217) shape: (2, 32) ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐ │ vertex_id ┆ CancerGen ┆ CancerGen ┆ kinase.co ┆ … ┆ CancerGen ┆ CancerGen ┆ IntOGen:c ┆ IntOGen: │ │ --- ┆ eCensus:g ┆ eCensus:h ┆ m:group ┆ ┆ eCensus:t ┆ eCensus:s ┆ urated ┆ oncodriv │ │ str ┆ ermline ┆ allmark ┆ --- ┆ ┆ issue_typ ┆ omatic ┆ --- ┆ e_role_p │ │ ┆ --- ┆ --- ┆ str ┆ ┆ e ┆ --- ┆ str ┆ rob │ │ ┆ str ┆ str ┆ ┆ ┆ --- ┆ str ┆ ┆ --- │ │ ┆ ┆ ┆ ┆ ┆ str ┆ ┆ ┆ str │ ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡ │ CALM3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │ │ TRPC1 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │ └───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘ shape: (2, 9) ┌─────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬─────────┐ │ edge_id ┆ is_stimula ┆ is_inhibit ┆ consensus_ ┆ … ┆ consensus ┆ n_sources ┆ n_referen ┆ sources │ │ --- ┆ tion ┆ ion ┆ direction ┆ ┆ _inhibiti ┆ --- ┆ ces ┆ --- │ │ str ┆ --- ┆ --- ┆ --- ┆ ┆ on ┆ i64 ┆ --- ┆ str │ │ ┆ bool ┆ bool ┆ bool ┆ ┆ --- ┆ ┆ i64 ┆ │ │ ┆ ┆ ┆ ┆ ┆ bool ┆ ┆ ┆ │ ╞═════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═════════╡ │ edge_0 ┆ false ┆ true ┆ true ┆ … ┆ true ┆ 1 ┆ 3 ┆ TRIP │ │ edge_1 ┆ false ┆ true ┆ true ┆ … ┆ true ┆ 1 ┆ 3 ┆ TRIP │ └─────────┴────────────┴────────────┴────────────┴───┴───────────┴───────────┴───────────┴─────────┘
In [4]:
Copied!
# AnnNet value prop: one Polars filter that would be a messy join in other
# network packages. Keep edges with >=3 independent sources and a consensus
# direction call.
trusted_edges = (
    G.edge_attributes
    .filter(
        # A boolean column is already a mask; `== True` was redundant and has
        # the same null-propagation behavior (null rows are dropped by filter).
        (pl.col("n_sources") >= 3) &
        pl.col("consensus_direction")
    )
    .select("edge_id")
    .to_series()
    .to_list()
)
print(f"Trusted edges (>=3 sources, consensus direction): {len(trusted_edges)}")
# AnnNet value prop: one Polars filter that would be a messy join in other network packages.
trusted_edges = (
G.edge_attributes
.filter(
(pl.col("n_sources") >= 3) &
(pl.col("consensus_direction") == True)
)
.select("edge_id")
.to_series()
.to_list()
)
print(f"Trusted edges (>=3 sources, consensus direction): {len(trusted_edges)}")
Trusted edges (>=3 sources, consensus direction): 12923
In [5]:
Copied!
# Load kinase-substrate network separately for kinase activity inference.
# We load it into a plain DataFrame for decoupler.
import omnipath as op
kin_df = op.interactions.KinaseExtra.get(genesymbols=True)
kin_net = kin_df[["source_genesymbol", "target_genesymbol", "is_stimulation"]].copy()
kin_net.columns = ["source", "target", "weight"]
# Encode interaction sign as a numeric weight for decoupler: +1 stimulation,
# -1 inhibition. NOTE(review): fillna(1) treats edges with an UNKNOWN sign as
# activating — confirm this default is intended rather than dropping them.
kin_net["weight"] = kin_net["weight"].map({True: 1, False: -1}).fillna(1)
# Load kinase-substrate network separately for kinase activity inference.
# We load it into a plain DataFrame for decoupler.
import omnipath as op
kin_df = op.interactions.KinaseExtra.get(genesymbols=True)
kin_net = kin_df[["source_genesymbol", "target_genesymbol", "is_stimulation"]].copy()
kin_net.columns = ["source", "target", "weight"]
kin_net["weight"] = kin_net["weight"].map({True: 1, False: -1}).fillna(1)
In [6]:
Copied!
# CollecTRI for TF regulons from decoupler
collectri = dc.op.collectri()
# CollecTRI for TF regulons from decoupler
collectri = dc.op.collectri()
In [7]:
Copied!
# =============================================================================
# STEP 2 — Load CPTAC LUAD transcriptomics, infer TF and kinase activities
# =============================================================================
luad = cptac.Luad()
# "bcm" selects the Baylor College of Medicine processing of the RNA data.
rna = luad.get_transcriptomics("bcm")
# Drop all-NaN columns, then transpose so the frame is genes x samples
# (presumably cptac returns samples x genes — confirmed by the later use of
# rna.columns as patient ids).
rna = rna.dropna(axis=1, how="all").transpose()
# =============================================================================
# STEP 2 — Load CPTAC LUAD transcriptomics, infer TF and kinase activities
# =============================================================================
luad = cptac.Luad()
rna = luad.get_transcriptomics("bcm")
rna = rna.dropna(axis=1, how="all").transpose()
In [8]:
Copied!
# =============================================================================
# STEP 2.1 — Define AnnNet (Kivelä) layers: one aspect (patient)
# Expression + decoupler outputs will live as vertex–layer attributes (u, (patient,))
# =============================================================================
# Ensure CPTAC sample IDs are flat strings (CPTAC can use MultiIndex sometimes)
if hasattr(rna.columns, "levels"):  # pandas MultiIndex
    rna.columns = rna.columns.get_level_values(0)
rna.columns = rna.columns.astype(str)
# rna is currently genes x patients
patients = rna.columns.tolist()
# Configure AnnNet multilayer structure: 1 aspect ("patient"), layers = CPTAC sample ids
G.set_aspects(["patient"], {"patient": patients})
print(f"Configured aspects: {G.aspects}")
print(f" Number of patient layers: {len(G.elem_layers['patient'])}")
# =============================================================================
# STEP 2.1 — Define AnnNet (Kivelä) layers: one aspect (patient)
# Expression + decoupler outputs will live as vertex–layer attributes (u, (patient,))
# =============================================================================
# Ensure CPTAC sample IDs are flat strings (CPTAC can use MultiIndex sometimes)
if hasattr(rna.columns, "levels"): # pandas MultiIndex
rna.columns = rna.columns.get_level_values(0)
rna.columns = rna.columns.astype(str)
# rna is currently genes x patients
patients = rna.columns.tolist()
# Configure AnnNet multilayer structure: 1 aspect ("patient"), layers = CPTAC sample ids
G.set_aspects(["patient"], {"patient": patients})
print(f"Configured aspects: {G.aspects}")
print(f" Number of patient layers: {len(G.elem_layers['patient'])}")
Configured aspects: ['patient'] Number of patient layers: 213
In [9]:
Copied!
# =============================================================================
# STEP 2.2 — Add vertex presence + write expression as vertex–layer attributes
# Contract: (gene, (patient,)) -> {"expr": float}
# =============================================================================
# CPTAC transcriptomics rows are MultiIndex -> extract gene symbols
if isinstance(rna.index, pd.MultiIndex):
    genes = rna.index.get_level_values(0).astype(str)
    rna = rna.copy()
    rna.index = genes
else:
    genes = rna.index.astype(str)
# Keep only genes that exist as vertices in the PKN graph
genes_in_graph = [
    g for g in genes
    if (g in G.entity_to_idx and G.entity_types.get(g) == "vertex")
]
print(f"Genes in CPTAC rna: {len(genes)}")
print(f"Genes present as vertices in AnnNet: {len(genes_in_graph)}")
# Collect all vertex–layer attrs into a dict, then write once (avoids
# per-call DataFrame reconstruction which is O(n²) for large data)
_vla: dict = {}
for p in patients:
    aa = (p,)  # aspect-assignment tuple: single "patient" aspect
    # NOTE(review): rna.loc with a label list returns duplicated rows if gene
    # symbols repeat in the index — later duplicates overwrite earlier ones in
    # _vla. Confirm that last-wins is acceptable here.
    col = rna.loc[genes_in_graph, p]
    for g, v in col.items():
        if not G.has_presence(g, aa):
            G.add_presence(g, aa)
        _vla[(g, aa)] = {"expr": float(v)}
# NOTE(review): this assigns what looks like a private backing attribute.
# Confirm `_vertex_layer_attrs` really is a property with a setter that
# rebuilds the attrs DataFrame; otherwise this silently bypasses the API.
G._vertex_layer_attrs = _vla  # single DataFrame build via property setter
print("Expression injection done (vertex–layer attrs).")
# Sanity check
_test_gene = genes_in_graph[0]
_test_patient = patients[0]
print(
    "Example vertex–layer attrs:",
    _test_gene,
    _test_patient,
    G.get_vertex_layer_attrs(_test_gene, (_test_patient,))
)
# =============================================================================
# STEP 2.2 — Add vertex presence + write expression as vertex–layer attributes
# Contract: (gene, (patient,)) -> {"expr": float}
# =============================================================================
# CPTAC transcriptomics rows are MultiIndex -> extract gene symbols
if isinstance(rna.index, pd.MultiIndex):
genes = rna.index.get_level_values(0).astype(str)
rna = rna.copy()
rna.index = genes
else:
genes = rna.index.astype(str)
# Keep only genes that exist as vertices in the PKN graph
genes_in_graph = [
g for g in genes
if (g in G.entity_to_idx and G.entity_types.get(g) == "vertex")
]
print(f"Genes in CPTAC rna: {len(genes)}")
print(f"Genes present as vertices in AnnNet: {len(genes_in_graph)}")
# Collect all vertex–layer attrs into a dict, then write once (avoids
# per-call DataFrame reconstruction which is O(n²) for large data)
_vla: dict = {}
for p in patients:
aa = (p,)
col = rna.loc[genes_in_graph, p]
for g, v in col.items():
if not G.has_presence(g, aa):
G.add_presence(g, aa)
_vla[(g, aa)] = {"expr": float(v)}
G._vertex_layer_attrs = _vla # single DataFrame build via property setter
print("Expression injection done (vertex–layer attrs).")
# Sanity check
_test_gene = genes_in_graph[0]
_test_patient = patients[0]
print(
"Example vertex–layer attrs:",
_test_gene,
_test_patient,
G.get_vertex_layer_attrs(_test_gene, (_test_patient,))
)
Genes in CPTAC rna: 59286
Genes present as vertices in AnnNet: 7715
Expression injection done (vertex–layer attrs).
Example vertex–layer attrs: A1BG 11LU013 {'expr': 5.6}
In [10]:
Copied!
# =============================================================================
# STEP 2.3 — Prepare decoupler input (genes x samples) from the same rna table
# We'll run decoupler next, then write outputs back as vertex–layer attrs:
# (TF, patient) -> tf_activity
# (kinase, patient) -> kinase_activity
# =============================================================================
expr = rna.loc[genes_in_graph, patients].copy()
# decoupler likes string indices/columns
expr.index = expr.index.astype(str)
expr.columns = expr.columns.astype(str)
print("expr shape (genes x patients):", expr.shape)
# =============================================================================
# STEP 2.3 — Prepare decoupler input (genes x samples) from the same rna table
# We'll run decoupler next, then write outputs back as vertex–layer attrs:
# (TF, patient) -> tf_activity
# (kinase, patient) -> kinase_activity
# =============================================================================
expr = rna.loc[genes_in_graph, patients].copy()
# decoupler likes string indices/columns
expr.index = expr.index.astype(str)
expr.columns = expr.columns.astype(str)
print("expr shape (genes x patients):", expr.shape)
expr shape (genes x patients): (7715, 213)
In [11]:
Copied!
# Full transcriptome for TF inference
expr_full = rna.copy() # genes x patients (≈59k genes)
# PKN-restricted for kinase inference + CORNETO
expr_pkn = expr.loc[genes_in_graph, patients]
# Full transcriptome for TF inference
expr_full = rna.copy() # genes x patients (≈59k genes)
# PKN-restricted for kinase inference + CORNETO
expr_pkn = expr.loc[genes_in_graph, patients]
In [12]:
Copied!
expr_genes = set(expr_full.index)
collectri_targets = set(collectri["target"])
print("Expr genes:", len(expr_genes))
print("CollecTRI targets:", len(collectri_targets))
print("Intersection:", len(expr_genes & collectri_targets))
expr_genes = set(expr_full.index)
collectri_targets = set(collectri["target"])
print("Expr genes:", len(expr_genes))
print("CollecTRI targets:", len(collectri_targets))
print("Intersection:", len(expr_genes & collectri_targets))
Expr genes: 59286 CollecTRI targets: 6675 Intersection: 6604
In [13]:
Copied!
shared_genes = sorted(expr_genes & collectri_targets)
print("Shared genes used for TF inference:", len(shared_genes))
expr_tf = expr_full.loc[shared_genes]
shared_genes = sorted(expr_genes & collectri_targets)
print("Shared genes used for TF inference:", len(shared_genes))
expr_tf = expr_full.loc[shared_genes]
Shared genes used for TF inference: 6604
In [14]:
Copied!
# =============================================================================
# STEP 2.4 — TF activity inference (decoupler ULM)
# =============================================================================
# ULM expects samples x features. Returns two samples x TFs frames:
# enrichment scores (tf_es) and matching p-values (tf_pv).
tf_es, tf_pv = dc.mt.ulm(
    expr_tf.T,  # samples x genes
    net=collectri,
)
print("TF activity matrix shape:", tf_es.shape)
print("TF p-value matrix shape:", tf_pv.shape)
tf_es.head()
# =============================================================================
# STEP 2.4 — TF activity inference (decoupler ULM)
# =============================================================================
tf_es, tf_pv = dc.mt.ulm(
expr_tf.T, # samples x genes
net=collectri,
)
print("TF activity matrix shape:", tf_es.shape)
print("TF p-value matrix shape:", tf_pv.shape)
tf_es.head()
TF activity matrix shape: (213, 772) TF p-value matrix shape: (213, 772)
Out[14]:
| ABL1 | AEBP1 | AHR | AHRR | AIP | AIRE | AP1 | APEX1 | AR | ARID1A | ... | ZNF384 | ZNF395 | ZNF410 | ZNF423 | ZNF436 | ZNF699 | ZNF76 | ZNF804A | ZNF91 | ZXDC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11LU013 | 1.133472 | 0.708240 | -1.962136 | -2.333035 | 0.678352 | -1.332731 | 0.405117 | 0.669775 | 4.633401 | 0.567397 | ... | -0.754275 | 0.572304 | 1.852227 | -0.528300 | 0.577286 | -0.827495 | 0.098637 | -1.153709 | -1.710633 | 0.234676 |
| 11LU016 | 1.480053 | 0.706964 | -1.858925 | -2.039485 | -0.152183 | -1.364615 | 1.361173 | 0.950786 | 4.077365 | 0.666269 | ... | -0.295253 | 0.896182 | 2.152773 | -0.859025 | 0.541056 | -0.925776 | 0.091121 | -1.190906 | -1.481072 | 0.646525 |
| 11LU022 | 1.190274 | 0.293049 | -1.498655 | -1.970668 | 0.174403 | -1.569903 | 1.169726 | 0.601941 | 3.965616 | 0.804321 | ... | -0.867206 | 0.777887 | 2.837829 | -0.538930 | 0.472557 | -1.292777 | -0.045054 | -1.452778 | -1.562590 | 0.711311 |
| 11LU035 | 1.344234 | 0.525223 | -1.857996 | -1.563781 | 0.610280 | -1.380070 | 1.659655 | 0.738020 | 4.329121 | 0.891667 | ... | -1.016767 | 0.547489 | 2.241772 | -0.257387 | 0.345918 | -1.193952 | 0.089076 | -1.342837 | -1.469566 | 0.337983 |
| C3L-00001 | 1.061214 | -0.070905 | -1.886939 | -2.456549 | 0.723482 | -1.807795 | -0.194376 | 0.423190 | 3.802744 | 1.021300 | ... | -0.832999 | 0.637368 | 1.775025 | -0.451123 | 0.297039 | -1.391086 | 0.111673 | -1.944245 | -1.735636 | 0.446377 |
5 rows × 772 columns
In [15]:
Copied!
# =============================================================================
# STEP 2.5 — Kinase activity inference (decoupler ULM)
# =============================================================================
# Deduplicate kinase network/dataframe (required by decoupler)
kin_net = kin_net.drop_duplicates(subset=["source", "target"])
# NOTE(review): this infers kinase activity from substrate TRANSCRIPT levels,
# which is a weak proxy; step 3.1 later re-derives kinase activity from
# phosphoproteomics. Confirm this transcript-based version is still needed.
kin_es, kin_pv = dc.mt.ulm(
    expr_pkn.T,  # samples x genes
    net=kin_net,
)
print("Kinase activity matrix shape:", kin_es.shape)
print("Kinase p-value matrix shape:", kin_pv.shape)
kin_es.head()
# =============================================================================
# STEP 2.5 — Kinase activity inference (decoupler ULM)
# =============================================================================
# Deduplicate kinase network/dataframe (required by decoupler)
kin_net = kin_net.drop_duplicates(subset=["source", "target"])
kin_es, kin_pv = dc.mt.ulm(
expr_pkn.T, # samples x genes
net=kin_net,
)
print("Kinase activity matrix shape:", kin_es.shape)
print("Kinase p-value matrix shape:", kin_pv.shape)
kin_es.head()
Kinase activity matrix shape: (213, 621) Kinase p-value matrix shape: (213, 621)
Out[15]:
| A0A0S2Z3I6 | A0A173G4P4 | A9UF07 | ABL1 | ABL2 | ACP1 | ACVR1 | ACVR1B | ACVR2A | ACVR2B | ... | VRK3 | WEE1 | WNK1 | WNK3 | WNK4 | WNT3A | WNT5A | YES1 | YWHAG | ZAP70 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11LU013 | 1.263516 | -2.013876 | -2.234202 | 1.756996 | 0.054817 | -1.592046 | -1.069990 | 0.592178 | 0.519227 | 0.361554 | ... | -1.500442 | 1.234711 | 0.588950 | 0.268160 | -0.710863 | -1.328525 | -1.939888 | -0.511714 | -0.094613 | -0.602269 |
| 11LU016 | 1.650377 | -2.227240 | -2.094616 | 1.362521 | -0.133711 | -1.285073 | -1.078553 | 0.750032 | 0.889098 | 0.660859 | ... | -1.718357 | 1.287584 | 0.782859 | 0.210761 | -0.727134 | -1.153274 | -1.596066 | -0.879446 | 0.441479 | -0.931106 |
| 11LU022 | 1.217549 | -1.824825 | -1.969072 | 1.427724 | -0.091816 | -1.359380 | -1.056250 | 0.947175 | 0.618070 | 0.583511 | ... | -1.653995 | 1.178478 | 0.890627 | 0.221997 | -0.686181 | -1.236057 | -1.288252 | -0.932426 | 0.326997 | -0.859030 |
| 11LU035 | 1.586950 | -2.288885 | -2.186818 | 1.387084 | 0.111139 | -1.071856 | -0.849802 | 1.404323 | 1.140538 | 1.391097 | ... | -1.870351 | 0.884177 | 0.665510 | -0.402899 | -0.720063 | -1.147660 | -1.364607 | -1.112730 | 0.358034 | -0.596497 |
| C3L-00001 | 1.510546 | -2.111909 | -1.855225 | 1.608509 | -0.197432 | -1.101459 | -0.916394 | 0.967513 | 0.668357 | 0.537464 | ... | -1.795972 | 1.298341 | 0.592515 | 0.451061 | -1.143321 | -0.951475 | -1.457891 | -0.890744 | 0.484124 | -0.391161 |
5 rows × 621 columns
In [16]:
Copied!
# =============================================================================
# STEP 2.6a — Store TF activities as vertex–layer attributes
# =============================================================================
# Restrict to TFs that exist as vertices in the PKN graph.
tfs_in_graph = [
    tf for tf in tf_es.columns
    if tf in G.entity_to_idx and G.entity_types.get(tf) == "vertex"
]
print(f"TFs inferred: {tf_es.shape[1]}")
print(f"TFs present in AnnNet: {len(tfs_in_graph)}")
# NOTE(review): unlike step 2.2 (bulk dict write), this uses per-call setter
# writes — presumably so existing "expr" values on the same (vertex, layer)
# keys are merged rather than replaced; confirm the setter merges. This is
# O(patients x TFs) calls, which the 2.2 comment flagged as slow at scale.
for patient in tf_es.index:
    aa = (patient,)
    row = tf_es.loc[patient, tfs_in_graph]
    for tf, val in row.items():
        if not G.has_presence(tf, aa):
            G.add_presence(tf, aa)
        G.set_vertex_layer_attrs(tf, aa, tf_activity=float(val))
print("TF activity written to AnnNet.")
# =============================================================================
# STEP 2.6a — Store TF activities as vertex–layer attributes
# =============================================================================
tfs_in_graph = [
tf for tf in tf_es.columns
if tf in G.entity_to_idx and G.entity_types.get(tf) == "vertex"
]
print(f"TFs inferred: {tf_es.shape[1]}")
print(f"TFs present in AnnNet: {len(tfs_in_graph)}")
for patient in tf_es.index:
aa = (patient,)
row = tf_es.loc[patient, tfs_in_graph]
for tf, val in row.items():
if not G.has_presence(tf, aa):
G.add_presence(tf, aa)
G.set_vertex_layer_attrs(tf, aa, tf_activity=float(val))
print("TF activity written to AnnNet.")
TFs inferred: 772 TFs present in AnnNet: 607 TF activity written to AnnNet.
In [20]:
Copied!
# =============================================================================
# STEP 2.6b — Store kinase activities as vertex–layer attributes
# =============================================================================
# Restrict to kinases that exist as vertices in the PKN graph.
kinases_in_graph = [
    k for k in kin_es.columns
    if k in G.entity_to_idx and G.entity_types.get(k) == "vertex"
]
print(f"Kinases inferred: {kin_es.shape[1]}")
print(f"Kinases present in AnnNet: {len(kinases_in_graph)}")
# NOTE(review): same per-call write pattern as step 2.6a — presumably merges
# with existing attrs on the same keys; confirm, and consider a bulk merge
# API if patient/kinase counts grow.
for patient in kin_es.index:
    aa = (patient,)
    row = kin_es.loc[patient, kinases_in_graph]
    for k, val in row.items():
        if not G.has_presence(k, aa):
            G.add_presence(k, aa)
        G.set_vertex_layer_attrs(k, aa, kinase_activity=float(val))
print("Kinase activity written to AnnNet.")
# =============================================================================
# STEP 2.6b — Store kinase activities as vertex–layer attributes
# =============================================================================
kinases_in_graph = [
k for k in kin_es.columns
if k in G.entity_to_idx and G.entity_types.get(k) == "vertex"
]
print(f"Kinases inferred: {kin_es.shape[1]}")
print(f"Kinases present in AnnNet: {len(kinases_in_graph)}")
for patient in kin_es.index:
aa = (patient,)
row = kin_es.loc[patient, kinases_in_graph]
for k, val in row.items():
if not G.has_presence(k, aa):
G.add_presence(k, aa)
G.set_vertex_layer_attrs(k, aa, kinase_activity=float(val))
print("Kinase activity written to AnnNet.")
Kinases inferred: 621 Kinases present in AnnNet: 601 Kinase activity written to AnnNet.
In [21]:
Copied!
# =============================================================================
# STEP 3 — CORNETO
# =============================================================================
# --- 3.0 Differential profiles (tumor vs matched normal) -------------------
def compute_diff(input_df):
    """Log2 tumor/normal ratio for every sample with a matched ".N" normal.

    Parameters
    ----------
    input_df : pd.DataFrame
        Features x samples. Normal samples carry a ".N" suffix on the base
        sample id (e.g. "C3L-00001.N" is the matched normal of "C3L-00001").

    Returns
    -------
    pd.DataFrame
        Features x matched-tumor-samples of log2(tumor / normal).
        Ratios that are zero, negative, or non-finite become NaN.
    """
    cols = list(input_df.columns)
    # Map base sample id -> its ".N" normal column (strip the 2-char suffix).
    normals = {s[:-2]: s for s in cols if s.endswith(".N")}
    tumors = [s for s in cols if not s.endswith(".N")]
    matched = {t: normals[t] for t in tumors if t in normals}
    print(f" Matched tumor/normal pairs: {len(matched)}")
    diffs = [input_df[t] / input_df[n] for t, n in matched.items()]
    diff_df = pd.concat(diffs, axis=1)
    diff_df.columns = list(matched.keys())
    # Mask non-positive and non-finite ratios BEFORE log2. Previously only
    # inf and exact 0 were replaced, so negative ratios reached np.log2 and
    # emitted "invalid value encountered in log2" RuntimeWarnings; the
    # numeric result (NaN) is unchanged by this fix.
    diff_df = diff_df.where(np.isfinite(diff_df) & (diff_df > 0))
    return np.log2(diff_df)
rna = luad.get_transcriptomics("bcm")
pp = luad.get_phosphoproteomics("umich")
print("Computing RNA differential profiles...")
rna_diff = compute_diff(rna.T)
# Collapse duplicate gene symbols (index level 0) by mean.
rna_diff = rna_diff.groupby(level=0).mean()
print("Computing phospho differential profiles...")
pp_diff = compute_diff(pp.T)
# Collapse duplicate (gene, site) rows, then flatten the MultiIndex into
# "GENE_SITE"-style string ids.
pp_diff = pp_diff.groupby(level=[0, 1]).mean()
pp_diff.index = pp_diff.index.map("_".join)
# --- 3.1 Kinase activity from phosphoproteomics ----------------------------
# op.interactions.post_translational doesn't exist in all versions — use direct URL
psiteplus = pd.read_csv(
    "https://omnipathdb.org/enz_sub?genesymbols=1",
    sep="\t"
)
psiteplus_kin = (
    psiteplus[psiteplus["modification"] == "phosphorylation"]
    [["enzyme_genesymbol", "substrate_genesymbol", "residue_type", "residue_offset"]]
    .copy()
)
# Build "SUBSTRATE_S123"-style target ids to match pp_diff's index format.
# NOTE(review): if residue_offset was parsed as float, astype(str) yields
# "123.0" and nothing will match pp_diff — confirm the column dtype is integer.
psiteplus_kin["target"] = (
    psiteplus_kin["substrate_genesymbol"] + "_" +
    psiteplus_kin["residue_type"] +
    psiteplus_kin["residue_offset"].astype(str)
)
psiteplus_kin = (
    psiteplus_kin[["enzyme_genesymbol", "target"]]
    .rename(columns={"enzyme_genesymbol": "source"})
    .drop_duplicates()
    .assign(weight=1)  # unsigned regulon: all edges treated as activating
)
# NOTE(review): fillna(0) imputes missing log-ratios as "no change", which
# biases ULM scores toward zero for sparsely measured sites — consider
# dropping mostly-NaN sites instead.
pp_dc_input = pp_diff.fillna(0)
kin_es_new, _ = dc.mt.ulm(pp_dc_input.T, net=psiteplus_kin)
print(f"Kinase activity from phospho: {kin_es_new.shape}")
# =============================================================================
# STEP 3 — CORNETO
# =============================================================================
# --- 3.0 Differential profiles (tumor vs matched normal) -------------------
def compute_diff(input_df):
cols = list(input_df.columns)
normals = {s[:-2]: s for s in cols if s.endswith(".N")}
tumors = [s for s in cols if not s.endswith(".N")]
matched = {t: normals[t] for t in tumors if t in normals}
print(f" Matched tumor/normal pairs: {len(matched)}")
diffs = [input_df[t] / input_df[n] for t, n in matched.items()]
diff_df = pd.concat(diffs, axis=1)
diff_df.columns = list(matched.keys())
diff_df = diff_df.replace([np.inf, -np.inf], np.nan).replace(0, np.nan)
return np.log2(diff_df)
rna = luad.get_transcriptomics("bcm")
pp = luad.get_phosphoproteomics("umich")
print("Computing RNA differential profiles...")
rna_diff = compute_diff(rna.T)
rna_diff = rna_diff.groupby(level=0).mean()
print("Computing phospho differential profiles...")
pp_diff = compute_diff(pp.T)
pp_diff = pp_diff.groupby(level=[0, 1]).mean()
pp_diff.index = pp_diff.index.map("_".join)
# --- 3.1 Kinase activity from phosphoproteomics ----------------------------
# op.interactions.post_translational doesn't exist in all versions — use direct URL
psiteplus = pd.read_csv(
"https://omnipathdb.org/enz_sub?genesymbols=1",
sep="\t"
)
psiteplus_kin = (
psiteplus[psiteplus["modification"] == "phosphorylation"]
[["enzyme_genesymbol", "substrate_genesymbol", "residue_type", "residue_offset"]]
.copy()
)
psiteplus_kin["target"] = (
psiteplus_kin["substrate_genesymbol"] + "_" +
psiteplus_kin["residue_type"] +
psiteplus_kin["residue_offset"].astype(str)
)
psiteplus_kin = (
psiteplus_kin[["enzyme_genesymbol", "target"]]
.rename(columns={"enzyme_genesymbol": "source"})
.drop_duplicates()
.assign(weight=1)
)
pp_dc_input = pp_diff.fillna(0)
kin_es_new, _ = dc.mt.ulm(pp_dc_input.T, net=psiteplus_kin)
print(f"Kinase activity from phospho: {kin_es_new.shape}")
Computing RNA differential profiles... Matched tumor/normal pairs: 102 Computing phospho differential profiles... Matched tumor/normal pairs: 102
/home/l1boll/miniconda3/envs/myenv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:395: RuntimeWarning: invalid value encountered in log2 result = func(self.values, **kwargs)
Kinase activity from phospho: (102, 321)
In [22]:
Copied!
# --- 3.2 TF activity from RNA differential ----------------------------------
rna_dc_input = rna_diff.fillna(0)
tf_es_new, _ = dc.mt.ulm(rna_dc_input.T, net=collectri)
print(f"TF activity from RNA diff: {tf_es_new.shape}")
# --- 3.3 Patient selection: top mutational burden (top 10%) -----------------
soms = luad.get_somatic_mutation("harmonized")
# Protein-altering mutation classes counted toward burden.
int_muts = ["Missense_Mutation", "Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins"]
# Binary gene x patient mutation indicator matrix.
mut_mat = (
    soms[soms.Mutation.isin(int_muts)][["Gene", "Tumor_Sample_Barcode"]]
    .drop_duplicates()
    .assign(mutated=1)
    .pivot(index="Gene", columns="Tumor_Sample_Barcode", values="mutated")
    .fillna(0)
)
# Strip the tumor suffix so barcodes match the activity-matrix sample ids.
# Fix: anchor the pattern at end-of-string — the previous literal
# replace("_T", "") would also delete "_T" occurring mid-barcode.
mut_mat.columns = mut_mat.columns.str.replace(r"_T$", "", regex=True)
# Rank patients present in all three matrices by total mutation count and
# keep the top decile.
common = set(tf_es_new.index) & set(kin_es_new.index) & set(mut_mat.columns)
mut_sum = mut_mat[list(common)].sum(axis=0).sort_values(ascending=False)
top_patients = mut_sum[mut_sum >= mut_sum.quantile(0.9)].index.tolist()
print(f"Top mutational burden patients: {len(top_patients)}")
# --- 3.2 TF activity from RNA differential ----------------------------------
rna_dc_input = rna_diff.fillna(0)
tf_es_new, _ = dc.mt.ulm(rna_dc_input.T, net=collectri)
print(f"TF activity from RNA diff: {tf_es_new.shape}")
# --- 3.3 Patient selection: top mutational burden (top 10%) -----------------
soms = luad.get_somatic_mutation("harmonized")
int_muts = ["Missense_Mutation", "Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins"]
mut_mat = (
soms[soms.Mutation.isin(int_muts)][["Gene", "Tumor_Sample_Barcode"]]
.drop_duplicates()
.assign(mutated=1)
.pivot(index="Gene", columns="Tumor_Sample_Barcode", values="mutated")
.fillna(0)
)
mut_mat.columns = mut_mat.columns.str.replace("_T", "")
common = set(tf_es_new.index) & set(kin_es_new.index) & set(mut_mat.columns)
mut_sum = mut_mat[list(common)].sum(axis=0).sort_values(ascending=False)
top_patients = mut_sum[mut_sum >= mut_sum.quantile(0.9)].index.tolist()
print(f"Top mutational burden patients: {len(top_patients)}")
TF activity from RNA diff: (102, 770) Top mutational burden patients: 10
In [23]:
Copied!
# --- 3.4 PKN pre-filtered to kinases -> (kinases∪TFs) ------------
kinase_set = set(kin_es_new.columns)
tf_set = set(tf_es_new.columns)
# Rebuild a (source, target, edge_id) table from AnnNet's edge definitions
# and join it onto the edge-attribute table to recover endpoints per edge.
pkn_raw = (
    G.edge_attributes.to_pandas()
    .merge(
        pd.DataFrame(
            [(src, tgt, eid) for eid, (src, tgt, _) in G.edge_definitions.items()
             if src is not None and tgt is not None],
            columns=["source", "target", "edge_id"]
        ),
        on="edge_id"
    )
)
# Keep only kinase -> (kinase or TF) edges for the CARNIVAL search space.
pkn_filt = pkn_raw[
    pkn_raw["source"].isin(kinase_set) &
    pkn_raw["target"].isin(kinase_set | tf_set)
].copy()
# Signed interaction: +1 stimulation, -1 inhibition, 0 = no consensus (dropped).
# NOTE(review): if an edge is flagged both stimulating AND inhibiting, the
# stimulation branch wins here — confirm that precedence is intended.
pkn_filt["sign"] = np.where(pkn_filt["consensus_stimulation"], 1,
                            np.where(pkn_filt["consensus_inhibition"], -1, 0))
pkn_filt = pkn_filt[pkn_filt["sign"] != 0]
# CORNETO graph from (source, sign, target) triples.
C_filt = cn.Graph.from_tuples(
    pkn_filt[["source", "sign", "target"]].apply(tuple, axis=1).values
)
print(f"Pre-filtered PKN: {C_filt.num_vertices} nodes, {C_filt.num_edges} edges")
# --- 3.4 PKN pre-filtered to kinases -> (kinases∪TFs) ------------
kinase_set = set(kin_es_new.columns)
tf_set = set(tf_es_new.columns)
pkn_raw = (
G.edge_attributes.to_pandas()
.merge(
pd.DataFrame(
[(src, tgt, eid) for eid, (src, tgt, _) in G.edge_definitions.items()
if src is not None and tgt is not None],
columns=["source", "target", "edge_id"]
),
on="edge_id"
)
)
pkn_filt = pkn_raw[
pkn_raw["source"].isin(kinase_set) &
pkn_raw["target"].isin(kinase_set | tf_set)
].copy()
pkn_filt["sign"] = np.where(pkn_filt["consensus_stimulation"], 1,
np.where(pkn_filt["consensus_inhibition"], -1, 0))
pkn_filt = pkn_filt[pkn_filt["sign"] != 0]
C_filt = cn.Graph.from_tuples(
pkn_filt[["source", "sign", "target"]].apply(tuple, axis=1).values
)
print(f"Pre-filtered PKN: {C_filt.num_vertices} nodes, {C_filt.num_edges} edges")
Pre-filtered PKN: 600 nodes, 2605 edges
In [24]:
Copied!
# --- 3.5 per-patient CORNETO loop, top 10% cohort ---
import time
# Per-patient selection thresholds for CARNIVAL inputs/outputs.
KIN_TH = 1.5      # min |kinase activity| for a kinase to act as a perturbation input
TF_TH = 1.5       # min |TF activity| for a TF to act as a measured output
TOP_K = 10        # max kinase inputs per patient
TOP_TF_POS = 8    # max upregulated TF outputs
TOP_TF_NEG = 7    # max downregulated TF outputs
patient_data = {}
for p in top_patients:
    if p not in kin_es_new.index or p not in tf_es_new.index:
        continue
    kin_row = kin_es_new.loc[p]
    tf_row = tf_es_new.loc[p]
    # Top-|activity| kinases above threshold; the activity's sign becomes the
    # perturbation direction (+1/-1).
    kin_top = kin_row[kin_row.abs() >= KIN_TH].abs().nlargest(TOP_K).index.tolist()
    tf_pos = tf_row[tf_row >= TF_TH].nlargest(TOP_TF_POS).index.tolist()
    tf_neg = tf_row[tf_row <= -TF_TH].nsmallest(TOP_TF_NEG).index.tolist()
    inputs = {k: float(np.sign(kin_row[k])) for k in kin_top}
    outputs = {tf: float(tf_row[tf]) for tf in tf_pos + tf_neg}
    if inputs and outputs:
        patient_data[p] = {"input": inputs, "output": outputs}
print(f"Patients with valid data: {len(patient_data)}")
results = {}
t_start = time.time()
# NOTE(review): multi_carnival is invoked with a single-condition dict per
# patient. Solving all patients jointly in ONE multi_carnival call would let
# conditions share the network-sparsity penalty — confirm whether the
# independent-per-patient formulation is deliberate.
for idx, (p, data) in enumerate(patient_data.items()):
    t0 = time.time()
    P, G_p, summary = multi_carnival(C_filt, {p: data}, lambd=0)
    sol = P.solve(solver="scip")
    elapsed = time.time() - t0
    print(f"[{idx+1}/{len(patient_data)}] {p}: status={sol.status} | "
          f"obj={sol.value:.1f} | {elapsed:.1f}s")
    results[p] = {"P": P, "G": G_p, "sol": sol}
print(f"\nDone in {time.time()-t_start:.1f}s")
# --- 3.5 per-patient CORNETO loop, top 10% cohort ---
# NOTE: `time` is already imported in the top-of-notebook imports cell;
# the duplicate mid-notebook import was removed.
KIN_TH = 1.5     # min |kinase activity| for a kinase to count as an input
TF_TH = 1.5      # min |TF activity| for a TF to count as an output
TOP_K = 10       # max kinase inputs per patient
TOP_TF_POS = 8   # max up-regulated TF outputs per patient
TOP_TF_NEG = 7   # max down-regulated TF outputs per patient

# Build the CARNIVAL {"input": ..., "output": ...} dictionaries per patient.
patient_data = {}
for p in top_patients:
    # Skip patients missing from either activity matrix.
    if p not in kin_es_new.index or p not in tf_es_new.index:
        continue
    kin_row = kin_es_new.loc[p]
    tf_row = tf_es_new.loc[p]
    # Strongest kinases by |activity| become signed perturbation inputs.
    kin_top = kin_row[kin_row.abs() >= KIN_TH].abs().nlargest(TOP_K).index.tolist()
    tf_pos = tf_row[tf_row >= TF_TH].nlargest(TOP_TF_POS).index.tolist()
    tf_neg = tf_row[tf_row <= -TF_TH].nsmallest(TOP_TF_NEG).index.tolist()
    inputs = {k: float(np.sign(kin_row[k])) for k in kin_top}
    outputs = {tf: float(tf_row[tf]) for tf in tf_pos + tf_neg}
    # A patient needs at least one input and one output to be solvable.
    if inputs and outputs:
        patient_data[p] = {"input": inputs, "output": outputs}
print(f"Patients with valid data: {len(patient_data)}")

# Solve one CARNIVAL problem per patient on the pre-filtered PKN.
results = {}
t_start = time.time()
for idx, (p, data) in enumerate(patient_data.items()):
    t0 = time.time()
    # lambd=0: no sparsity regularization — keep all consistent paths.
    P, G_p, summary = multi_carnival(C_filt, {p: data}, lambd=0)
    sol = P.solve(solver="scip")
    elapsed = time.time() - t0
    print(f"[{idx+1}/{len(patient_data)}] {p}: status={sol.status} | "
          f"obj={sol.value:.1f} | {elapsed:.1f}s")
    results[p] = {"P": P, "G": G_p, "sol": sol}
print(f"\nDone in {time.time()-t_start:.1f}s")
Patients with valid data: 10 [1/10] C3L-00144: status=optimal | obj=0.0 | 4.4s [2/10] C3N-00175: status=optimal | obj=0.0 | 3.1s [3/10] C3N-00560: status=optimal | obj=0.0 | 3.0s [4/10] C3N-02089: status=optimal | obj=0.0 | 1.8s [5/10] C3N-00580: status=optimal | obj=0.0 | 3.3s [6/10] C3L-00095: status=optimal | obj=0.0 | 1.6s [7/10] C3N-01030: status=optimal | obj=0.0 | 3.9s [8/10] C3N-01489: status=optimal | obj=0.0 | 1.8s [9/10] C3L-00080: status=optimal | obj=0.0 | 3.4s [10/10] C3L-00279: status=optimal | obj=0.0 | 3.2s Done in 29.5s
In [25]:
Copied!
# =============================================================================
# STEP 3.6 — Extract CORNETO results into AnnNet vertex-layer attributes
# =============================================================================
# For each patient, write each active node's inferred signal into that
# patient's layer as a `corneto_signal` vertex-layer attribute.
for p, res in results.items():
    aa = (p,)
    P = res["P"]
    G_p = res["G"]
    vvals = P.expr.vertex_value.value  # (n_nodes, 1) or (n_nodes,)
    # Normalize to 1-D — consistent with STEP 3.7, which handles both shapes;
    # the original `vvals[i, 0]` indexing would fail on a 1-D array.
    if vvals.ndim == 2:
        vvals = vvals[:, 0]
    nodes = list(G_p.V)
    n_written = 0
    for i, node in enumerate(nodes):
        # Skip CORNETO's internal helper vertices (prefixed with "_").
        if node.startswith("_"):
            continue
        val = float(vvals[i])
        # Only record clearly active nodes (|signal| >= 0.5).
        if abs(val) < 0.5:
            continue
        if not G.has_vertex(node):
            continue
        if not G.has_presence(node, aa):
            G.add_presence(node, aa)
        G.set_vertex_layer_attrs(node, aa, corneto_signal=val)
        n_written += 1
    print(f" {p}: {n_written} nodes with corneto_signal")
print("Extraction done.")
# =============================================================================
# STEP 3.6 — Extract CORNETO results into AnnNet vertex-layer attributes
# =============================================================================
# For each patient, write each active node's inferred signal into that
# patient's layer as a `corneto_signal` vertex-layer attribute.
for p, res in results.items():
    aa = (p,)
    P = res["P"]
    G_p = res["G"]
    vvals = P.expr.vertex_value.value  # (n_nodes, 1) or (n_nodes,)
    # Normalize to 1-D — consistent with STEP 3.7, which handles both shapes;
    # the original `vvals[i, 0]` indexing would fail on a 1-D array.
    if vvals.ndim == 2:
        vvals = vvals[:, 0]
    nodes = list(G_p.V)
    n_written = 0
    for i, node in enumerate(nodes):
        # Skip CORNETO's internal helper vertices (prefixed with "_").
        if node.startswith("_"):
            continue
        val = float(vvals[i])
        # Only record clearly active nodes (|signal| >= 0.5).
        if abs(val) < 0.5:
            continue
        if not G.has_vertex(node):
            continue
        if not G.has_presence(node, aa):
            G.add_presence(node, aa)
        G.set_vertex_layer_attrs(node, aa, corneto_signal=val)
        n_written += 1
    print(f" {p}: {n_written} nodes with corneto_signal")
print("Extraction done.")
C3L-00144: 67 nodes with corneto_signal C3N-00175: 24 nodes with corneto_signal C3N-00560: 63 nodes with corneto_signal C3N-02089: 42 nodes with corneto_signal C3N-00580: 40 nodes with corneto_signal C3L-00095: 62 nodes with corneto_signal C3N-01030: 34 nodes with corneto_signal C3N-01489: 44 nodes with corneto_signal C3L-00080: 42 nodes with corneto_signal C3L-00279: 36 nodes with corneto_signal Extraction done.
In [26]:
Copied!
# =============================================================================
# STEP 3.7 — Extract CORNETO results into AnnNet as intra-layer edges + node attrs
# Each patient layer is their active inferred causal subgraph
# =============================================================================
for p, res in results.items():
    aa = (p,)
    P = res["P"]
    G_p = res["G"]
    vvals = P.expr.vertex_value.value  # (n_nodes, 1) or (n_nodes,)
    evals = P.expr.edge_value.value    # (n_edges, 1) or (n_edges,)
    nodes = list(G_p.V)
    edges = list(G_p.E)  # list of (frozenset_src, frozenset_tgt)
    # Flatten to 1-D regardless of solver output shape.
    if vvals.ndim == 2:
        vvals = vvals[:, 0]
    if evals.ndim == 2:
        evals = evals[:, 0]
    # O(1) node -> index lookup; replaces the repeated O(n) `nodes.index(...)`
    # and `src in nodes` scans the original ran inside the edge loop.
    node_idx = {node: i for i, node in enumerate(nodes)}
    # 1 — node signals as vertex-layer attrs (keep for cross-layer queries)
    active_nodes = set()
    for i, node in enumerate(nodes):
        # Skip CORNETO's internal helper vertices (prefixed with "_").
        if node.startswith("_"):
            continue
        val = float(vvals[i])
        if abs(val) < 0.5:  # only clearly active nodes
            continue
        if not G.has_vertex(node):
            continue
        if not G.has_presence(node, aa):
            G.add_presence(node, aa)
        G.set_vertex_layer_attrs(node, aa, corneto_signal=val)
        active_nodes.add(node)
    # 2 — active edges as intra-layer edges (the actual Kivelä representation)
    n_edges_added = 0
    for j, (src_fs, tgt_fs) in enumerate(edges):
        val = float(evals[j])
        if abs(val) < 0.5:  # only edges the solver switched on
            continue
        # flow graph frozensets — extract single node
        src_list = [n for n in src_fs if not str(n).startswith("_")]
        tgt_list = [n for n in tgt_fs if not str(n).startswith("_")]
        if not src_list or not tgt_list:
            continue
        src, tgt = src_list[0], tgt_list[0]
        if not G.has_vertex(src) or not G.has_vertex(tgt):
            continue
        # ensure presence before adding intra-layer edge
        if not G.has_presence(src, aa):
            G.add_presence(src, aa)
            G.set_vertex_layer_attrs(src, aa, corneto_signal=float(vvals[node_idx[src]]) if src in node_idx else 0.0)
        if not G.has_presence(tgt, aa):
            G.add_presence(tgt, aa)
            G.set_vertex_layer_attrs(tgt, aa, corneto_signal=float(vvals[node_idx[tgt]]) if tgt in node_idx else 0.0)
        G.add_intra_edge_nl(src, tgt, aa, weight=val)
        n_edges_added += 1
    print(f" {p}: {len(active_nodes)} nodes, {n_edges_added} intra-layer edges")
print("Extraction done.")
# =============================================================================
# STEP 3.7 — Extract CORNETO results into AnnNet as intra-layer edges + node attrs
# Each patient layer is their active inferred causal subgraph
# =============================================================================
for p, res in results.items():
    aa = (p,)
    P = res["P"]
    G_p = res["G"]
    vvals = P.expr.vertex_value.value  # (n_nodes, 1) or (n_nodes,)
    evals = P.expr.edge_value.value    # (n_edges, 1) or (n_edges,)
    nodes = list(G_p.V)
    edges = list(G_p.E)  # list of (frozenset_src, frozenset_tgt)
    # Flatten to 1-D regardless of solver output shape.
    if vvals.ndim == 2:
        vvals = vvals[:, 0]
    if evals.ndim == 2:
        evals = evals[:, 0]
    # O(1) node -> index lookup; replaces the repeated O(n) `nodes.index(...)`
    # and `src in nodes` scans the original ran inside the edge loop.
    node_idx = {node: i for i, node in enumerate(nodes)}
    # 1 — node signals as vertex-layer attrs (keep for cross-layer queries)
    active_nodes = set()
    for i, node in enumerate(nodes):
        # Skip CORNETO's internal helper vertices (prefixed with "_").
        if node.startswith("_"):
            continue
        val = float(vvals[i])
        if abs(val) < 0.5:  # only clearly active nodes
            continue
        if not G.has_vertex(node):
            continue
        if not G.has_presence(node, aa):
            G.add_presence(node, aa)
        G.set_vertex_layer_attrs(node, aa, corneto_signal=val)
        active_nodes.add(node)
    # 2 — active edges as intra-layer edges (the actual Kivelä representation)
    n_edges_added = 0
    for j, (src_fs, tgt_fs) in enumerate(edges):
        val = float(evals[j])
        if abs(val) < 0.5:  # only edges the solver switched on
            continue
        # flow graph frozensets — extract single node
        src_list = [n for n in src_fs if not str(n).startswith("_")]
        tgt_list = [n for n in tgt_fs if not str(n).startswith("_")]
        if not src_list or not tgt_list:
            continue
        src, tgt = src_list[0], tgt_list[0]
        if not G.has_vertex(src) or not G.has_vertex(tgt):
            continue
        # ensure presence before adding intra-layer edge
        if not G.has_presence(src, aa):
            G.add_presence(src, aa)
            G.set_vertex_layer_attrs(src, aa, corneto_signal=float(vvals[node_idx[src]]) if src in node_idx else 0.0)
        if not G.has_presence(tgt, aa):
            G.add_presence(tgt, aa)
            G.set_vertex_layer_attrs(tgt, aa, corneto_signal=float(vvals[node_idx[tgt]]) if tgt in node_idx else 0.0)
        G.add_intra_edge_nl(src, tgt, aa, weight=val)
        n_edges_added += 1
    print(f" {p}: {len(active_nodes)} nodes, {n_edges_added} intra-layer edges")
print("Extraction done.")
C3L-00144: 67 nodes, 61 intra-layer edges C3N-00175: 24 nodes, 17 intra-layer edges C3N-00560: 63 nodes, 55 intra-layer edges C3N-02089: 42 nodes, 33 intra-layer edges C3N-00580: 40 nodes, 35 intra-layer edges C3L-00095: 62 nodes, 56 intra-layer edges C3N-01030: 34 nodes, 26 intra-layer edges C3N-01489: 44 nodes, 37 intra-layer edges C3L-00080: 42 nodes, 32 intra-layer edges C3L-00279: 36 nodes, 33 intra-layer edges Extraction done.
In [27]:
Copied!
# =============================================================================
# STEP 3.8 — Consensus layer: nodes active in >= 50% of patients
# =============================================================================
from collections import Counter

min_freq = 0.5       # fraction of patients a node must be active in
n_pts = len(results)

# Count, per node, in how many patient layers it carries a non-trivial signal,
# and cache those signals so each (node, patient) attribute lookup happens
# once (the original re-fetched attrs twice more per node when averaging).
signal_counter = Counter()
node_signals = {}
for p in results:
    aa = (p,)
    for node in G.vertices():
        sig = G.get_vertex_layer_attrs(node, aa).get("corneto_signal", 0.0)
        if abs(sig) >= 0.5:
            signal_counter[node] += 1
            node_signals.setdefault(node, []).append(sig)

consensus_nodes = [n for n, cnt in signal_counter.items() if cnt >= min_freq * n_pts]

# Register the consensus layer alongside the patient layers.
if "consensus" not in G.elem_layers.get("patient", []):
    G.elem_layers["patient"].append("consensus")
consensus_aa = ("consensus",)

for node in consensus_nodes:
    if not G.has_presence(node, consensus_aa):
        G.add_presence(node, consensus_aa)
    # Mean signal over the patients in which the node was active.
    sigs = node_signals[node]
    G.set_vertex_layer_attrs(node, consensus_aa,
                             corneto_signal=float(np.mean(sigs)),
                             patient_frequency=signal_counter[node] / n_pts)
print(f"Consensus layer: {len(consensus_nodes)} nodes active in >={min_freq*100:.0f}% of patients")
print("Sample consensus nodes:", consensus_nodes[:10])
# =============================================================================
# STEP 3.8 — Consensus layer: nodes active in >= 50% of patients
# =============================================================================
from collections import Counter

min_freq = 0.5       # fraction of patients a node must be active in
n_pts = len(results)

# Count, per node, in how many patient layers it carries a non-trivial signal,
# and cache those signals so each (node, patient) attribute lookup happens
# once (the original re-fetched attrs twice more per node when averaging).
signal_counter = Counter()
node_signals = {}
for p in results:
    aa = (p,)
    for node in G.vertices():
        sig = G.get_vertex_layer_attrs(node, aa).get("corneto_signal", 0.0)
        if abs(sig) >= 0.5:
            signal_counter[node] += 1
            node_signals.setdefault(node, []).append(sig)

consensus_nodes = [n for n, cnt in signal_counter.items() if cnt >= min_freq * n_pts]

# Register the consensus layer alongside the patient layers.
if "consensus" not in G.elem_layers.get("patient", []):
    G.elem_layers["patient"].append("consensus")
consensus_aa = ("consensus",)

for node in consensus_nodes:
    if not G.has_presence(node, consensus_aa):
        G.add_presence(node, consensus_aa)
    # Mean signal over the patients in which the node was active.
    sigs = node_signals[node]
    G.set_vertex_layer_attrs(node, consensus_aa,
                             corneto_signal=float(np.mean(sigs)),
                             patient_frequency=signal_counter[node] / n_pts)
print(f"Consensus layer: {len(consensus_nodes)} nodes active in >={min_freq*100:.0f}% of patients")
print("Sample consensus nodes:", consensus_nodes[:10])
Consensus layer: 33 nodes active in >=50% of patients Sample consensus nodes: ['SRC', 'FYN', 'PRKACA', 'EGFR', 'LYN', 'MAPK14', 'JAK2', 'MAPK3', 'MAPK1', 'AKT1']
In [29]:
Copied!
# =============================================================================
# STEP 4 — Cross-layer query: AnnNet
# Consensus-active stimulatory edges with both endpoints signaling
# =============================================================================
consensus_set = set(consensus_nodes)
edge_attr = G.edge_attributes.to_pandas().set_index("edge_id")

# One record per PKN edge whose endpoints are both consensus-active.
rows = []
for eid, (src, tgt, _) in G.edge_definitions.items():
    if src not in consensus_set or tgt not in consensus_set:
        continue
    if eid not in edge_attr.index:
        continue
    row = edge_attr.loc[eid]
    src_attr = G.get_vertex_layer_attrs(src, consensus_aa)
    tgt_attr = G.get_vertex_layer_attrs(tgt, consensus_aa)
    # Literature support count; treat a missing/NaN value as 0.
    n_sources_raw = row.get("n_sources", 0)
    n_sources = 0 if pd.isna(n_sources_raw) else int(n_sources_raw)
    rows.append({
        "source": src,
        "target": tgt,
        "edge_sign": 1 if row.get("consensus_stimulation", False) else -1,
        "n_sources": n_sources,
        "src_signal": src_attr.get("corneto_signal", 0.0),
        "tgt_signal": tgt_attr.get("corneto_signal", 0.0),
        "src_freq": src_attr.get("patient_frequency", 0.0),
        "tgt_freq": tgt_attr.get("patient_frequency", 0.0),
    })

# Keep well-supported edges whose endpoints are active in >=50% of patients.
well_supported = pl.col("n_sources") >= 3
frequent_endpoints = (pl.col("src_freq") >= 0.5) & (pl.col("tgt_freq") >= 0.5)
query_result = (
    pl.DataFrame(rows)
    .filter(well_supported & frequent_endpoints)
    .sort("n_sources", descending=True)
)
print(f"High-confidence consensus edges: {len(query_result)}")
print(query_result.head(20))
# =============================================================================
# STEP 4 — Cross-layer query: AnnNet
# Consensus-active stimulatory edges with both endpoints signaling
# =============================================================================
consensus_set = set(consensus_nodes)
edge_attr = G.edge_attributes.to_pandas().set_index("edge_id")

# One record per PKN edge whose endpoints are both consensus-active.
rows = []
for eid, (src, tgt, _) in G.edge_definitions.items():
    if src not in consensus_set or tgt not in consensus_set:
        continue
    if eid not in edge_attr.index:
        continue
    row = edge_attr.loc[eid]
    src_attr = G.get_vertex_layer_attrs(src, consensus_aa)
    tgt_attr = G.get_vertex_layer_attrs(tgt, consensus_aa)
    # Literature support count; treat a missing/NaN value as 0.
    n_sources_raw = row.get("n_sources", 0)
    n_sources = 0 if pd.isna(n_sources_raw) else int(n_sources_raw)
    rows.append({
        "source": src,
        "target": tgt,
        "edge_sign": 1 if row.get("consensus_stimulation", False) else -1,
        "n_sources": n_sources,
        "src_signal": src_attr.get("corneto_signal", 0.0),
        "tgt_signal": tgt_attr.get("corneto_signal", 0.0),
        "src_freq": src_attr.get("patient_frequency", 0.0),
        "tgt_freq": tgt_attr.get("patient_frequency", 0.0),
    })

# Keep well-supported edges whose endpoints are active in >=50% of patients.
well_supported = pl.col("n_sources") >= 3
frequent_endpoints = (pl.col("src_freq") >= 0.5) & (pl.col("tgt_freq") >= 0.5)
query_result = (
    pl.DataFrame(rows)
    .filter(well_supported & frequent_endpoints)
    .sort("n_sources", descending=True)
)
print(f"High-confidence consensus edges: {len(query_result)}")
print(query_result.head(20))
High-confidence consensus edges: 181 shape: (20, 8) ┌─────────┬────────┬───────────┬───────────┬────────────┬────────────┬──────────┬──────────┐ │ source ┆ target ┆ edge_sign ┆ n_sources ┆ src_signal ┆ tgt_signal ┆ src_freq ┆ tgt_freq │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ i64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════════╪════════╪═══════════╪═══════════╪════════════╪════════════╪══════════╪══════════╡ │ SRC ┆ EGFR ┆ 1 ┆ 40 ┆ -0.333333 ┆ -0.714286 ┆ 0.6 ┆ 0.7 │ │ AKT1 ┆ GSK3B ┆ -1 ┆ 37 ┆ 0.6 ┆ 0.333333 ┆ 0.5 ┆ 0.9 │ │ PDPK1 ┆ AKT1 ┆ 1 ┆ 32 ┆ 0.142857 ┆ 0.6 ┆ 0.7 ┆ 0.5 │ │ GSK3B ┆ MYC ┆ -1 ┆ 30 ┆ 0.333333 ┆ 1.0 ┆ 0.9 ┆ 0.5 │ │ SRC ┆ ABL1 ┆ 1 ┆ 30 ┆ -0.333333 ┆ -0.666667 ┆ 0.6 ┆ 0.6 │ │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ │ CSK ┆ LYN ┆ -1 ┆ 24 ┆ 0.333333 ┆ -0.714286 ┆ 0.6 ┆ 0.7 │ │ CSNK2A1 ┆ CDK1 ┆ -1 ┆ 23 ┆ 0.666667 ┆ -0.142857 ┆ 0.6 ┆ 0.7 │ │ ERBB2 ┆ EGFR ┆ -1 ┆ 23 ┆ -0.2 ┆ -0.714286 ┆ 0.5 ┆ 0.7 │ │ CSNK2A1 ┆ JUN ┆ -1 ┆ 23 ┆ 0.666667 ┆ -1.0 ┆ 0.6 ┆ 0.6 │ │ JAK2 ┆ EGFR ┆ 1 ┆ 23 ┆ -0.666667 ┆ -0.714286 ┆ 0.6 ┆ 0.7 │ └─────────┴────────┴───────────┴───────────┴────────────┴────────────┴──────────┴──────────┘
In [30]:
Copied!
# =============================================================================
# STEP 5 — PyG export using AnnNet's native adapter
# =============================================================================
from annnet.adapters.pyg_adapter import to_pyg
# Export the graph as a heterogeneous PyG data object (it exposes
# node_types / edge_types, printed below). The feature dicts select which
# AnnNet attributes become node/edge feature tensors.
pyg_data = to_pyg(
G,
node_features={
"default": ["corneto_signal", "patient_frequency", "tf_activity", "kinase_activity"]
},
edge_features={
("default", "edge", "default"): ["consensus_stimulation", "consensus_inhibition", "n_sources"]
},
slice_id=None,  # NOTE(review): presumably None means "all slices" — confirm in adapter docs
hyperedge_mode="skip",  # "skip" presumably omits hyperedges — confirm in adapter docs
device="cpu",
)
print(f"Node types : {pyg_data.node_types}")
print(f"Edge types : {pyg_data.edge_types}")
# =============================================================================
# STEP 5 — PyG export using AnnNet's native adapter
# =============================================================================
from annnet.adapters.pyg_adapter import to_pyg
# Export the graph as a heterogeneous PyG data object (it exposes
# node_types / edge_types, printed below). The feature dicts select which
# AnnNet attributes become node/edge feature tensors.
pyg_data = to_pyg(
G,
node_features={
"default": ["corneto_signal", "patient_frequency", "tf_activity", "kinase_activity"]
},
edge_features={
("default", "edge", "default"): ["consensus_stimulation", "consensus_inhibition", "n_sources"]
},
slice_id=None,  # NOTE(review): presumably None means "all slices" — confirm in adapter docs
hyperedge_mode="skip",  # "skip" presumably omits hyperedges — confirm in adapter docs
device="cpu",
)
print(f"Node types : {pyg_data.node_types}")
print(f"Edge types : {pyg_data.edge_types}")
Node types : ['default']
Edge types : [('default', 'edge', 'default')]
In [86]:
Copied!
# =============================================================================
# STEP 7 — Save AnnNet (containing all information from workflow) as .annnet
# =============================================================================
# Persist the hub graph — base network, per-patient layers, consensus layer,
# and all attributes — to one .annnet archive, replacing any existing file.
G.write("OP-dR-CN.annnet", overwrite=True)
# =============================================================================
# STEP 7 — Save AnnNet (containing all information from workflow) as .annnet
# =============================================================================
# Persist the hub graph — base network, per-patient layers, consensus layer,
# and all attributes — to one .annnet archive, replacing any existing file.
G.write("OP-dR-CN.annnet", overwrite=True)
In [ ]:
Copied!