Single sample intracellular signalling network inference#

In this notebook we showcase how to use the advanced CARNIVAL implementation available in CORNETO. This implementation extends the capabilities of the original CARNIVAL method by enabling advanced modelling and injection of knowledge for hypothesis generation. We will use a dataset consisting of 6 samples of hepatic stellate cells (HSC) where three of them were activated by the cytokine Transforming growth factor (TGF-β).

In the first part, we will show how to estimate Transcription Factor activities from gene expression data, following the Decoupler tutorial for functional analysis. Then, we will use the CARNIVAL method available in CORNETO to infer a network from TFs to receptors, assuming that we don’t really know which treatment was used.

# --- Saezlab tools ---
# https://decoupler-py.readthedocs.io/
import gzip
import os
import shutil
import tempfile
import urllib.request

import decoupler as dc
import numpy as np

# https://omnipathdb.org/
import omnipath as op

# Additional packages
import pandas as pd

# --- Additional libs ---
# Pydeseq for differential expression analysis
from pydeseq2.dds import DefaultInference, DeseqDataSet
from pydeseq2.ds import DeseqStats

# https://saezlab.github.io/
import corneto as cn
max_time = 300
seed = 0
# We need to download the dataset, available at GEO GSE151251
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE151251&format=file&file=GSE151251%5FHSCs%5FCtrl%2Evs%2EHSCs%5FTGFb%2Ecounts%2Etsv%2Egz"

adata = None
with tempfile.TemporaryDirectory() as tmpdirname:
    # Path for the gzipped file in the temp folder
    gz_file_path = os.path.join(tmpdirname, "counts.txt.gz")

    # Download the file
    with urllib.request.urlopen(url) as response:
        with open(gz_file_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)

    # Decompress the file
    decompressed_file_path = gz_file_path[:-3]  # Removing '.gz' extension
    with gzip.open(gz_file_path, "rb") as f_in:
        with open(decompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    adata = pd.read_csv(decompressed_file_path, index_col=2, sep="\t").iloc[:, 5:].T

adata
GeneName DDX11L1 WASH7P MIR6859-1 MIR1302-11 MIR1302-9 FAM138A OR4G4P OR4G11P OR4F5 RP11-34P13.7 ... MT-ND4 MT-TH MT-TS2 MT-TL2 MT-ND5 MT-ND6 MT-TE MT-CYB MT-TT MT-TP
25_HSCs-Ctrl1 0 9 10 1 0 0 0 0 0 33 ... 93192 342 476 493 54466 17184 1302 54099 258 475
26_HSCs-Ctrl2 0 12 14 0 0 0 0 0 0 66 ... 114914 355 388 436 64698 21106 1492 62679 253 396
27_HSCs-Ctrl3 0 14 10 0 0 0 0 0 0 52 ... 155365 377 438 480 85650 31860 2033 89559 282 448
31_HSCs-TGFb1 0 11 16 0 0 0 0 0 0 54 ... 110866 373 441 481 60325 19496 1447 66283 172 341
32_HSCs-TGFb2 0 5 8 0 0 0 0 0 0 44 ... 45488 239 331 343 27442 9054 624 27535 96 216
33_HSCs-TGFb3 0 12 5 0 0 0 0 0 0 32 ... 70704 344 453 497 45443 13796 1077 43415 192 243

6 rows × 64253 columns

Data preprocessing#

We will use AnnData and PyDeseq2 to pre-process the data and compute differential expression between control and tretament

from anndata import AnnData

adata = AnnData(adata, dtype=np.float32)
adata.var_names_make_unique()
adata
AnnData object with n_obs × n_vars = 6 × 64253
# Process treatment information
adata.obs["condition"] = [
    "control" if "-Ctrl" in sample_id else "treatment" for sample_id in adata.obs.index
]

# Process sample information
adata.obs["sample_id"] = [sample_id.split("_")[0] for sample_id in adata.obs.index]

# Visualize metadata
adata.obs
condition sample_id
25_HSCs-Ctrl1 control 25
26_HSCs-Ctrl2 control 26
27_HSCs-Ctrl3 control 27
31_HSCs-TGFb1 treatment 31
32_HSCs-TGFb2 treatment 32
33_HSCs-TGFb3 treatment 33
# Obtain genes that pass the thresholds
genes = dc.filter_by_expr(
    adata, group="condition", min_count=10, min_total_count=15, large_n=1, min_prop=1
)

# Filter by these genes
adata = adata[:, genes].copy()
adata
AnnData object with n_obs × n_vars = 6 × 19713
    obs: 'condition', 'sample_id'
# Estimation of differential expression

inference = DefaultInference()
dds = DeseqDataSet(
    adata=adata,
    design_factors="condition",
    refit_cooks=True,
    inference=inference,
)
# Compute LFCs
dds.deseq2()
stat_res = DeseqStats(
    dds, contrast=["condition", "treatment", "control"], inference=inference
)

stat_res.summary()
Log2 fold change & Wald test p-value: condition treatment vs control
                  baseMean  log2FoldChange     lfcSE      stat    pvalue  \
GeneName                                                                   
WASH7P           10.349784       -0.011129  0.651922 -0.017071  0.986380   
MIR6859-1        10.114621        0.000625  0.657581  0.000950  0.999242   
RP11-34P13.7     45.731312        0.078209  0.324512  0.241005  0.809551   
RP11-34P13.8     29.498379       -0.065178  0.393711 -0.165549  0.868512   
CICP27          106.032659        0.150594  0.223024  0.675239  0.499524   
...                    ...             ...       ...       ...       ...   
MT-ND6        17914.984474       -0.435304  0.278796 -1.561372  0.118436   
MT-TE          1281.293477       -0.332495  0.288073 -1.154204  0.248416   
MT-CYB        54955.449372       -0.313285  0.286900 -1.091966  0.274848   
MT-TT           204.692221       -0.485882  0.220606 -2.202488  0.027631   
MT-TP           345.049755       -0.460675  0.161701 -2.848936  0.004387   

                  padj  
GeneName                
WASH7P        0.991409  
MIR6859-1     0.999520  
RP11-34P13.7  0.877381  
RP11-34P13.8  0.917121  
CICP27        0.637374  
...                ...  
MT-ND6        0.211001  
MT-TE         0.380057  
MT-CYB        0.411271  
MT-TT         0.061644  
MT-TP         0.012217  

[19713 rows x 6 columns]
results_df = stat_res.results_df
results_df
baseMean log2FoldChange lfcSE stat pvalue padj
GeneName
WASH7P 10.349784 -0.011129 0.651922 -0.017071 0.986380 0.991409
MIR6859-1 10.114621 0.000625 0.657581 0.000950 0.999242 0.999520
RP11-34P13.7 45.731312 0.078209 0.324512 0.241005 0.809551 0.877381
RP11-34P13.8 29.498379 -0.065178 0.393711 -0.165549 0.868512 0.917121
CICP27 106.032659 0.150594 0.223024 0.675239 0.499524 0.637374
... ... ... ... ... ... ...
MT-ND6 17914.984474 -0.435304 0.278796 -1.561372 0.118436 0.211001
MT-TE 1281.293477 -0.332495 0.288073 -1.154204 0.248416 0.380057
MT-CYB 54955.449372 -0.313285 0.286900 -1.091966 0.274848 0.411271
MT-TT 204.692221 -0.485882 0.220606 -2.202488 0.027631 0.061644
MT-TP 345.049755 -0.460675 0.161701 -2.848936 0.004387 0.012217

19713 rows × 6 columns

Prior knowledge with Decoupler and Omnipath#

# Retrieve CollecTRI gene regulatory network (through Omnipath)
collectri = dc.get_collectri(organism="human", split_complexes=False)
collectri
source target weight PMID
0 MYC TERT 1 10022128;10491298;10606235;10637317;10723141;1...
1 SPI1 BGLAP 1 10022617
2 SMAD3 JUN 1 10022869;12374795
3 SMAD4 JUN 1 10022869;12374795
4 STAT5A IL2 1 10022878;11435608;17182565;17911616;22854263;2...
... ... ... ... ...
43173 NFKB hsa-miR-143-3p 1 19472311
43174 AP1 hsa-miR-206 1 19721712
43175 NFKB hsa-miR-21-5p 1 20813833;22387281
43176 NFKB hsa-miR-224-5p 1 23474441;23988648
43177 AP1 hsa-miR-144 1 23546882

43178 rows × 4 columns

mat = results_df[["stat"]].T.rename(index={"stat": "treatment.vs.control"})
mat
GeneName WASH7P MIR6859-1 RP11-34P13.7 RP11-34P13.8 CICP27 FO538757.2 AP006222.2 RP4-669L17.10 MTND1P23 MTND2P28 ... MT-ND4 MT-TH MT-TS2 MT-TL2 MT-ND5 MT-ND6 MT-TE MT-CYB MT-TT MT-TP
treatment.vs.control -0.017071 0.00095 0.241005 -0.165549 0.675239 -1.645412 2.041302 -0.376843 -1.994386 -0.498507 ... -1.435973 0.754913 1.139002 1.167032 -1.242582 -1.561372 -1.154204 -1.091966 -2.202488 -2.848936

1 rows × 19713 columns

tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri, verbose=True)
tf_acts
Running ulm on mat with 1 samples and 19713 targets for 655 sources.
ABL1 AHR AIRE AP1 APEX1 AR ARID1A ARID3A ARID3B ARID4A ... ZNF362 ZNF382 ZNF384 ZNF395 ZNF436 ZNF699 ZNF76 ZNF804A ZNF91 ZXDC
treatment.vs.control -2.181499 -1.556123 -1.824668 -1.983679 -2.673171 0.235021 -3.93546 0.870591 1.649556 -0.612672 ... -0.063138 2.123082 1.887668 -1.141122 1.571889 1.277879 -0.116911 1.92635 0.91651 -2.744753

1 rows × 655 columns

dc.plot_barplot(
    acts=tf_acts, contrast="treatment.vs.control", top=25, vertical=True, figsize=(3, 6)
)
../_images/297769ff221b87e13d46919d8d6a609c022c57c38530005c3ac497dcf9136770.png
# We obtain ligand-receptor interactions from Omnipath, and we keep only the receptors
# This is our list of a prior potential receptors from which we will infer the network
unique_receptors = set(
    op.interactions.LigRecExtra.get(genesymbols=True)[
        "target_genesymbol"
    ].values.tolist()
)
len(unique_receptors)
1201
df_de_receptors = results_df.loc[results_df.index.intersection(unique_receptors)]
df_de_receptors = df_de_receptors.sort_values(by="stat", ascending=False)
df_de_receptors.plot.scatter(x="log2FoldChange", y="stat")
<Axes: xlabel='log2FoldChange', ylabel='stat'>
../_images/467f3749370c5b00baf85c44e9c8eeb8f707ec2a16818bd817c8dcaa51397428.png
# We will take the top 20 receptors that increased the expression after treatment
df_top_receptors = df_de_receptors.head(20)
df_top_receptors
baseMean log2FoldChange lfcSE stat pvalue padj
GeneName
CDH6 28262.296983 3.019806 0.077417 39.007075 0.000000e+00 0.000000e+00
CRLF1 7919.672343 3.341644 0.087997 37.974317 0.000000e+00 0.000000e+00
FZD8 2719.781498 3.843015 0.102696 37.421427 1.751855e-306 1.817595e-303
CDH2 7111.507547 2.809461 0.081929 34.291273 1.058818e-257 6.138964e-255
DYSF 441.449511 4.593250 0.182207 25.208911 3.198723e-140 5.680759e-138
INHBA 24329.359641 2.598613 0.104321 24.909787 5.828479e-137 9.655193e-135
CELSR1 492.324435 3.517315 0.154495 22.766506 9.849205e-115 1.303070e-112
PDGFC 5481.167075 1.894857 0.083811 22.608707 3.558087e-113 4.676039e-111
VDR 1450.304353 2.555007 0.116782 21.878467 4.166147e-106 4.977410e-104
HHIP 379.064034 6.005841 0.287374 20.899036 5.463696e-97 5.439689e-95
IGF1 361.189076 6.462656 0.311795 20.727232 1.967869e-95 1.910966e-93
NPTN 12640.971164 1.418676 0.079355 17.877480 1.766413e-71 1.095010e-69
EFNB2 3320.076805 1.560331 0.087847 17.761861 1.395299e-70 8.385834e-69
ITGB3 6530.314159 1.364767 0.078772 17.325469 3.022208e-67 1.673505e-65
TGFB1 8948.243694 1.335271 0.077618 17.203057 2.518840e-66 1.367876e-64
IL21R 259.221468 3.227903 0.192097 16.803541 2.298997e-63 1.180212e-61
ITGA1 26223.832504 1.310351 0.079830 16.414270 1.511887e-60 7.251541e-59
FAP 7231.157483 1.750901 0.115188 15.200426 3.513274e-52 1.413412e-50
EGF 144.262044 3.733301 0.251634 14.836246 8.540295e-50 3.194589e-48
ITGA11 12471.033668 3.673199 0.250314 14.674378 9.407359e-49 3.390261e-47

Inferring intracellular signalling network with CARNIVAL and CORNETO#

CORNETO is a unified framework for knowledge-driven network inference. It includes a very flexible implementation of CARNIVAL that expands its original capabilities. We will see how to use it under different assumptions to extract a network from a prior knowledge network and a set of potential receptors + our estimated TFs

cn.info()
Installed version:v1.0.0.dev1 (latest: v1.0.0.dev0)
Available backends:CVXPY v1.5.3
Default backend (corneto.opt):CVXPY
Installed solvers:GUROBI, SCIP, SCIPY
Graphviz version:v0.20.3
Installed path:C:\Users\pablo\Documents\work\projects\corneto\corneto
Repository:https://github.com/saezlab/corneto
# We get only interactions from SIGNOR http://signor.uniroma2.it/
pkn = op.interactions.OmniPath.get(databases=["SIGNOR"], genesymbols=True)
pkn = pkn[pkn.consensus_direction == True]
pkn.head()
source target source_genesymbol target_genesymbol is_directed is_stimulation is_inhibition consensus_direction consensus_stimulation consensus_inhibition curation_effort references sources n_sources n_primary_sources n_references references_stripped
0 Q13976 Q13507 PRKG1 TRPC3 True False True True False True 9 HPRD:14983059;KEA:14983059;ProtMapper:14983059... HPRD;HPRD_KEA;HPRD_MIMP;KEA;MIMP;PhosphoPoint;... 15 8 2 14983059;16331690
1 Q13976 Q9HCX4 PRKG1 TRPC7 True True False True True False 3 SIGNOR:21402151;TRIP:21402151;iPTMnet:21402151 SIGNOR;TRIP;iPTMnet 3 3 1 21402151
2 Q13438 Q9HBA0 OS9 TRPV4 True True True True True True 3 HPRD:17932042;SIGNOR:17932042;TRIP:17932042 HPRD;SIGNOR;TRIP 3 3 1 17932042
3 P18031 Q9H1D0 PTPN1 TRPV6 True False True True False True 11 DEPOD:15894168;DEPOD:17197020;HPRD:15894168;In... DEPOD;HPRD;IntAct;Lit-BM-17;SIGNOR;SPIKE_LC;TRIP 7 6 2 15894168;17197020
4 P63244 Q9BX84 RACK1 TRPM6 True False True True False True 2 SIGNOR:18258429;TRIP:18258429 SIGNOR;TRIP 2 2 1 18258429
pkn["interaction"] = pkn["is_stimulation"].astype(int) - pkn["is_inhibition"].astype(
    int
)
sel_pkn = pkn[["source_genesymbol", "interaction", "target_genesymbol"]]
sel_pkn
source_genesymbol interaction target_genesymbol
0 PRKG1 -1 TRPC3
1 PRKG1 1 TRPC7
2 OS9 0 TRPV4
3 PTPN1 -1 TRPV6
4 RACK1 -1 TRPM6
... ... ... ...
61579 DTNBP1 1 WASF2
61580 CDK1 1 KMT5A
61581 PIM2 1 CDKN1A
61582 AKT1 1 WNK1
61583 PRKCA 1 CSPG4

60903 rows × 3 columns

# We create the CORNETO graph by importing the edges and interaction
G = cn.Graph.from_sif_tuples(
    [(r[0], r[1], r[2]) for _, r in sel_pkn.iterrows() if r[1] != 0]
)
G.shape
(5436, 60020)
# As measurements, we take the estimated TFs, we will filter out TFs with p-val > 0.001
significant_tfs = (
    tf_acts[tf_pvals <= 0.001]
    .T.dropna()
    .sort_values(by="treatment.vs.control", ascending=False)
)
significant_tfs
treatment.vs.control
SRF 7.421123
MYOCD 7.372481
SMAD3 7.190940
BTG2 6.206934
TCF21 6.183177
... ...
SPI1 -5.427345
NR4A3 -5.749818
HIF3A -5.882730
NR1H4 -6.081834
IRF1 -9.395861

84 rows × 1 columns

# We keep only the ones in the PKN graph
measurements = significant_tfs.loc[significant_tfs.index.intersection(G.V)].to_dict()[
    "treatment.vs.control"
]
measurements
{'SRF': 7.4211225509643555,
 'MYOCD': 7.372481346130371,
 'SMAD3': 7.190939903259277,
 'BTG2': 6.206934452056885,
 'SMAD2': 5.634183883666992,
 'JUNB': 5.632753849029541,
 'HMGA2': 4.937854766845703,
 'POU3F1': 4.921204090118408,
 'RORA': 4.765202045440674,
 'SMAD4': 4.588079452514648,
 'MEF2A': 4.287934303283691,
 'FOSL1': 3.938856363296509,
 'SOX4': 3.9252400398254395,
 'NCOA1': 3.8250725269317627,
 'SFPQ': 3.7440223693847656,
 'FOSB': 3.7315196990966797,
 'SP7': 3.6977312564849854,
 'HBP1': 3.6794629096984863,
 'CREB3': 3.632990837097168,
 'ASXL1': 3.574924945831299,
 'MEIS2': 3.4930355548858643,
 'TAL1': 3.354771852493286,
 'HOXC8': 3.347809314727783,
 'DLX5': 3.3378002643585205,
 'DLX2': -3.2941677570343018,
 'CDX2': -3.2988994121551514,
 'MAFA': -3.310776472091675,
 'STAT5A': -3.430389165878296,
 'RXRB': -3.4593443870544434,
 'MSX2': -3.5386831760406494,
 'SMAD6': -3.5516138076782227,
 'CEBPA': -3.59158992767334,
 'PLAGL1': -3.6208605766296387,
 'RELA': -3.63592267036438,
 'TP53': -3.6599860191345215,
 'NKX2-1': -3.6630992889404297,
 'NFKB1': -3.842646598815918,
 'CEBPB': -3.861898422241211,
 'WWTR1': -3.9853055477142334,
 'SMAD5': -4.020087242126465,
 'VHL': -4.030272483825684,
 'CIITA': -4.151106834411621,
 'NFKBIB': -4.1750593185424805,
 'REL': -4.184780120849609,
 'PITX3': -4.216534614562988,
 'PITX1': -4.358892440795898,
 'MECP2': -4.400153160095215,
 'TGIF1': -4.501400470733643,
 'MECOM': -4.5638203620910645,
 'IRF3': -4.733184814453125,
 'RELB': -4.848578453063965,
 'NFE2L2': -4.937533378601074,
 'STAT1': -5.215514659881592,
 'HOXC6': -5.283187389373779,
 'IRF2': -5.291377544403076,
 'KLF11': -5.293704509735107,
 'SMAD7': -5.3725266456604,
 'SPI1': -5.427344799041748,
 'NR4A3': -5.749818325042725,
 'HIF3A': -5.882729530334473,
 'NR1H4': -6.081834316253662,
 'IRF1': -9.39586067199707}
# We will infer the direction, so for the inputs, we use a value of 0 (=unknown direction)
inputs = {k: 0 for k in df_top_receptors.index.intersection(G.V).values}
inputs
{'CDH6': 0,
 'CRLF1': 0,
 'FZD8': 0,
 'CDH2': 0,
 'INHBA': 0,
 'VDR': 0,
 'IGF1': 0,
 'EFNB2': 0,
 'ITGB3': 0,
 'TGFB1': 0,
 'IL21R': 0,
 'EGF': 0,
 'ITGA11': 0}
# We prune the network from inputs (receptors) to TFs to improve the performance.
# The pruning step removes only parts of the network that cannot be reached from the pre-selected receptors
from corneto.methods.carnival import preprocess_graph

Gp, inputs_p, measurements_p = preprocess_graph(G, inputs, measurements)
Gp.shape
(958, 3279)
vertices = Gp.V
from corneto.methods.carnival import milp_carnival

# We create a CARNIVAL problem
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0.2)

# The CARNIVAL problem contains useful variables that we will estimate from the data
P.expr
{'vertex_inhibited': Variable((958,), vertex_inhibited, boolean=True),
 'edge_activating': Variable((3279,), edge_activating, boolean=True),
 'edge_inhibiting': Variable((3279,), edge_inhibiting, boolean=True),
 'vertex_activated': Variable((958,), vertex_activated, boolean=True),
 'vertex_position': Variable((958,), vertex_position),
 'vertex_values': Expression(AFFINE, UNKNOWN, (958,)),
 'edge_values': Expression(AFFINE, UNKNOWN, (3279,))}
# We first run CARNIVAL using all selected receptors.
# We find the value for the variables using the solver GUROBI
# (needs to be installed with a valid license, academic licenses are free)
# TimeLimit and Seed are GUROBI specific parameters
P.solve(solver="GUROBI", Seed=seed)

# We extract the selected edges
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))

# We plot generating custom drawing attributes
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/4af22eae9cd362af318cac2cbde595e7fe1ef2125236e5758fe3212993e72366.svg
# Print the values of the objectives:
# - First objective is error (non-fited TFs).
# - Second objective is number of interactions
for o in P.objectives:
    print(o.value)
10.373496770858765
88.0
# The CARNIVAL problem can be manipulated for hypothesis exploration.
# For example, we will say that only 1 of the provided receptors has to be selected, at most.
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0.2)
idx_receptors = [vertices.index(k) for k in inputs_p.keys()]
protein_selected = P.expr.vertex_activated + P.expr.vertex_inhibited
P += sum(protein_selected[idx_receptors]) == 1
P.solve(solver="GUROBI", TimeLimit=max_time, Seed=seed);
# We extract the edges that have some signal (edge_values != 0)
# We select the sub-graph from the processed PKN
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))

# We plot generating custom drawing attributes
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/991db37fbfd790ab984ac086ea3470f4113f4ec84675612ed5c3d4ed59cedeaa.svg
# Print the values of the objectives:
# - First objective is error (non-fited TFs).
# - Second objective is number of interactions
for o in P.objectives:
    print(o.value)
10.373496770858765
88.0

Adding more knowledge#

We can add more prior knowledge to the CARNIVAL problem. For example, we are going to penalise genes that are lowly abundant, according to the average basal gene expression levels.

df_lowly_abundant_genes = results_df[
    (results_df.baseMean <= results_df.baseMean.quantile(0.25))
    & (results_df.padj >= 0.05)
]
df_lowly_abundant_genes.sort_values(by="padj")
baseMean log2FoldChange lfcSE stat pvalue padj
GeneName
TRIM17 53.443434 -0.689125 0.300338 -2.294499 0.021762 0.050210
NLGN3 32.581601 -0.895798 0.390573 -2.293545 0.021817 0.050307
RP11-1109F11.3 11.672715 -1.587070 0.692036 -2.293336 0.021829 0.050323
RP11-661A12.5 15.285895 -1.297808 0.566496 -2.290938 0.021967 0.050602
RP11-77K12.7 25.489581 -0.991542 0.432922 -2.290350 0.022001 0.050661
... ... ... ... ... ... ...
GDPD1 74.892252 0.000326 0.262746 0.001241 0.999010 0.999415
RP11-452K12.7 49.986985 -0.000259 0.302890 -0.000855 0.999317 0.999520
MIR6859-1 10.114621 0.000625 0.657581 0.000950 0.999242 0.999520
RP11-463C8.4 48.479410 -0.000280 0.305891 -0.000916 0.999269 0.999520
AIF1L 11.870434 -0.000351 0.619997 -0.000566 0.999549 0.999599

3750 rows × 6 columns

lowly_abundant_genes = set(Gp.V).intersection(df_lowly_abundant_genes.index.tolist())
lowly_abundant_genes, len(lowly_abundant_genes)
({'CARD11',
  'DCC',
  'DLX2',
  'ERBB3',
  'FERMT3',
  'GHR',
  'HOXC8',
  'INPP5D',
  'ITGB4',
  'MAF',
  'MSX1',
  'NOXO1',
  'PARD6A',
  'PLEKHG6',
  'PSTPIP1',
  'PTPN22',
  'RPS6KA5',
  'TEC',
  'TIAM1'},
 19)
# Now we will add a penalty to avoid selecting lowly expressed genes
vertices = Gp.V
penalties = np.zeros(Gp.num_vertices)
penalties[[vertices.index(v) for v in lowly_abundant_genes.intersection(vertices)]] = (
    0.01
)
penalties
array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  ])
# Create the carnival problem
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0.2)

# We penalize the inclusion of lowly expressed genes:
# protein_selected is just the sum of the binary variables vertex activated and vertex inhibited, defined in carnival.
# these variables are mutually exclusive, so the sum is at most 1, meaning that the vertex was selected, either activated (+1) or inhibited (-1)
protein_selected = P.expr.vertex_activated + P.expr.vertex_inhibited

# We multiply the vector variable of selected proteins and the penalties to
# sum the total cost: sum (v1 * penalty1 + v2 * penalty2, + v3 ...) and we add this as an objective
penalty_vertices = protein_selected @ penalties
P.add_objectives(penalty_vertices)

# Select only 1 receptor
protein_selected = P.expr.vertex_activated + P.expr.vertex_inhibited
# P += sum(protein_selected[idx_receptors]) == 1

P.solve(solver="GUROBI", Seed=seed, TimeLimit=max_time);
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/104dc7f4c9243fa4d2c8b3e2c9ed7ece8949129ea3f056196ba13e50f4c319e8.svg
# Print the values of the objectives:
for o in P.objectives:
    print(o.value)
10.373496770858765
88.0
0.0
# Now we are going to force that only TGFB1 (active)
Gp, inputs_p, measurements_p = preprocess_graph(G, {"TGFB1": 1}, measurements)
Gp.shape
(949, 3261)
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0.2)
P.solve(solver="GUROBI", Seed=seed, TimeLimit=max_time);
# Same error but +2 edges were included to explain the TFs
for o in P.objectives:
    print(o.value)
10.373496770858765
90.0
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/df88918a82e5b31f2635c86ac349701721e65c5347f934b905f72d174b3b3ef7.svg
# Biasing the networks towards activatory interactions

# Create the carnival problem with Beta = 0 (to not equally penalise all interactions)
# Gp, inputs_p, measurements_p = preprocess_graph(G, {"TGFB1": 0}, measurements)
Gp, inputs_p, measurements_p = preprocess_graph(G, inputs, measurements)
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0)

# Bias towards activations, by penalizing only inhibitions
P.add_objectives(P.expr.edge_inhibiting.sum(), weights=0.2)

# Much smaller penalty for activations, since we only want to penalise them
# slightly to avoid having spurious interactions. However, inhibitions are
# penalised 20x more than activations (weight 0.2 vs 0.01)
P.add_objectives(P.expr.edge_activating.sum(), weights=0.01)

P.solve(solver="GUROBI", TimeLimit=max_time, Seed=seed);
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/e4057e9d3d708f40c1c7c2936317dbc8df1f6b6113d32bb8b4b8435d5334fad3.svg
# 48 inhibitory interactions and 44 activations
for o in P.objectives:
    print(o.value)
10.373496770858765
48.0
44.0

We have explored different approaches and assumptions to recover a signalling network from TF activities and a list of potential receptors. Although changes in gene expression for signalling proteins are not always predictive of signalling cascades—hence the use of CARNIVAL as a footprint-based method to bridge the gaps between receptors and TFs—we can introduce a slight bias in the network towards signalling proteins whose gene expression has changed significantly after treatment. These changes, while indirect, may still provide valuable cues. In a manner similar to how we penalised low-abundance genes, we will now prioritize the inclusion of upregulated genes.

tf_acts.columns
Index(['ABL1', 'AHR', 'AIRE', 'AP1', 'APEX1', 'AR', 'ARID1A', 'ARID3A',
       'ARID3B', 'ARID4A',
       ...
       'ZNF362', 'ZNF382', 'ZNF384', 'ZNF395', 'ZNF436', 'ZNF699', 'ZNF76',
       'ZNF804A', 'ZNF91', 'ZXDC'],
      dtype='object', length=655)
# Since the problem minimises error, we change sign. Upregulated genes get negative score
# so if they are selected, the error decreases. The opposite for downregulated genes
df_gene_scores = -results_df.loc[
    results_df.index.intersection(Gp.V).difference(tf_acts.columns)
].stat
dict_scores = df_gene_scores.to_dict()

df_gene_scores.sort_values()
NUAK1      -40.581591
CDH6       -39.007075
FZD8       -37.421427
CDH2       -34.291273
STK38L     -30.552887
              ...    
IL6R        21.833188
MAP3K5      21.883284
CASP1       23.060970
IL1R1       24.058471
TNFRSF1B    25.812839
Name: stat, Length: 627, dtype: float64
# Now we will add a penalty to avoid selecting lowly expressed genes
vertices = Gp.V
scores = np.array([dict_scores.get(v, 0) for v in vertices])
np.sum(np.abs(scores))
3045.9775260752576
# Create the carnival problem
P = milp_carnival(Gp, inputs_p, measurements_p, beta_weight=0.2)
vertex_selected = P.expr.vertex_activated + P.expr.vertex_inhibited
total_score = vertex_selected @ scores
P.add_objectives(total_score, weights=1e-3)
P.solve(solver="GUROBI", Seed=seed, TimeLimit=max_time);
G_sol = Gp.edge_subgraph(np.flatnonzero(P.expr.edge_values.value))
values = P.expr.vertex_values.value
vertex_values = {v: values[i] for i, v in enumerate(Gp.V)}
vertex_sol_values = [vertex_values[v] for v in G_sol.V]
G_sol.plot(
    custom_vertex_attr=cn.pl.create_graphviz_vertex_attributes(
        G_sol.V, vertex_sol_values
    )
)
../_images/6c2c91d3ff88e924d96ca5c537f20f493e0543f6146b0ae4a1f538b2e4efbb30.svg
for o in P.objectives:
    print(o.value)
10.373496770858765
88.0
-245.81810302737185