Source code for lipyd.metabolite

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2018 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

from __future__ import generator_stop

from future.utils import iteritems
from past.builtins import xrange, range

import itertools
import collections
import operator
import functools
import copy

import numpy as np

import lipyd.formula as formula
import lipyd.lipproc as lipproc


class AbstractMetaboliteComponent(formula.Formula):
    """
    One building block of a molecule.

    It can be the basis of a core carrying further substituents, or the
    basis of a substituent which provides a custom set of options or a
    homolog series with different carbon counts and unsaturations on the
    aliphatic chain.
    """

    def __init__(
            self,
            core = 0.0,
            charge = 0,
            isotope = 0,
            name = 'Unknown',
            getname = lambda parent: parent.name,
            **kwargs
        ):
        """
        Initializes the underlying `Formula` and stores the naming details.

        :param core:
            The core (unchanged part) can be provided in 3 ways:
            as a `float` which is considered to be an exact mass;
            as a formula, e.g. `C2H5OH`;
            as a `dict` of atom counts, e.g. `{'C': 2, 'H': 6, 'O': 1}`.
        :param charge:
            Passed on to `formula.Formula.__init__`.
        :param isotope:
            Passed on to `formula.Formula.__init__`.
        :param str name:
            Stored as the `name` attribute.
        :param getname:
            Callable stored as the `getname` attribute; the default
            returns the `name` of the object passed to it.
        :param kwargs:
            Passed on to `formula.Formula.__init__`.

        :raises ValueError:
            If no mass is available after the `Formula` initialization
            and `core` is not a `float` either.
        """

        # only an exact `float` counts as mass; anything else is handed
        # over to `Formula` as formula or atom counts
        core_is_mass = type(core) is float

        formula.Formula.__init__(
            self,
            None if core_is_mass else core,
            charge,
            isotope,
            **kwargs
        )

        if not self.has_mass():

            if not core_is_mass:

                raise ValueError(
                    'Please provide either formula or atom counts or mass.'
                )

            self.mass = core

        self.name = name
        self.getname = getname
class AbstractMetabolite(AbstractMetaboliteComponent):
    """
    A metabolite with an unchanged core and a set of variable substituents.

    Iterating an instance yields one molecule for each combination of the
    substituents (see `subsproduct`).
    """

    def __init__(
            self,
            core = 0.0,
            charge = 0,
            isotope = 0,
            name = 'Unknown',
            hg = None,
            getname = lambda parent, subs: '%s(%s)' % (
                parent.name,
                '/'.join(
                    lipproc.cu_str(s.c, s.u)
                    for s in subs
                    if hasattr(s, 'c') and hasattr(s, 'u')
                )
            ),
            subs = None,
            sum_only = False,
            **kwargs
        ):
        """
        Represents a metabolite with an unchanged core and a set of
        variable substituents.

        :param core:
            Core of the molecule, passed to `AbstractMetaboliteComponent`.
        :param charge:
            Passed to `AbstractMetaboliteComponent`.
        :param isotope:
            Passed to `AbstractMetaboliteComponent`.
        :param str name:
            Passed to `AbstractMetaboliteComponent`.
        :param hg:
            Stored as the `hg` attribute; `iterlines` uses it to build
            names and records (presumably the headgroup -- confirm
            against `lipproc`).
        :param getname:
            Callable accepting the metabolite and a tuple of substituents,
            returning the name of one instance. By default the name of
            the metabolite followed by the carbon count/unsaturation
            strings of all substituents having `c` and `u` attributes.
        :param subs:
            Iterable of substituents; each element is processed by
            `get_substituent`. `None` means no substituents.
        :param bool sum_only:
            Do not iterate all aliphatic chains independently but
            consider only sum of chain lengths and unsaturations.
        :param kwargs:
            Passed to `AbstractMetaboliteComponent`.
        """

        AbstractMetaboliteComponent.__init__(
            self,
            core,
            charge = charge,
            isotope = isotope,
            name = name,
            getname = getname,
            **kwargs
        )

        self.subs = subs or ()
        self.subs = tuple(self.get_substituent(s) for s in self.subs)
        self.sum_only = sum_only
        # backup of the real ranges of the substituent carrying the
        # summed ranges, see `itersum` and `_restore_sub0`
        self.sub0 = None
        self.hg = hg

    def __iter__(self):
        """
        Yields the molecule instances produced by `subsproduct`.
        """

        for subs, inst in self.subsproduct():

            yield inst

    @staticmethod
    def get_substituent(sub):
        """
        Creates a `Formula` object if the substituent is not already an
        instance of `Formula` or `Substituent`.

        String-like objects (anything with a `lower` attribute) and
        `float` values are passed to `Formula` as a single argument,
        `dict` objects as keyword arguments; anything else is returned
        unchanged.
        """

        return (
            formula.Formula(sub)
                if hasattr(sub, 'lower') or type(sub) is float else
            formula.Formula(**sub)
                if type(sub) is dict else
            sub
        )

    def itersubs(self):
        """
        Iterates all combinations of all substituents.
        Yields tuples of substituents.
        """

        # make sure no fake summed ranges are left on any substituent
        self._restore_sub0()

        for subs in itertools.product(*self.subs):

            yield subs

    def subsproduct(self):
        """
        Iterates instances and substituents in parallel.
        Yields tuples of two elements. First element is a tuple of all
        molecule parts (substituents). Second element is the actual
        instance, i.e. the whole molecule.

        The name and the instance of the current iteration are also kept
        in the `inst_name` and `inst` attributes.
        """

        iterator = self.itersum() if self.sum_only else self.itersubs()

        for subs in iterator:

            self.inst_name = self.getname(self, subs)
            # the whole molecule is the sum of the core and all parts
            self.inst = functools.reduce(
                operator.add,
                itertools.chain((self,), (s for s in subs))
            )
            self.inst.name = self.inst_name

            yield subs, self.inst

    def iterlines(self):
        """
        Iterates standard lines.

        Yields tuples of the mass of the instance and a
        `lipproc.LipidRecord`.
        """

        for subs, inst in self.subsproduct():

            # NOTE(review): the returned value is not used below; the call
            # is kept because its behaviour is not visible from here
            full_name = inst.getname()

            chains = tuple(
                s.attrs.chain
                for s in subs
                if hasattr(s.attrs, 'chain')
            )

            chainsum = lipproc.sum_chains(chains)

            # without `hg` no name is generated
            name = (
                (lipproc.summary_str(self.hg, chainsum),)
                    if self.hg and self.sum_only else
                (lipproc.full_str(self.hg, chains),)
                    if self.hg and not self.sum_only else
                ()
            )

            lab = lipproc.LipidLabel(
                db_id = None,
                db = 'lipyd.lipid',
                names = name,
                formula = inst.formula,
            )

            rec = lipproc.LipidRecord(
                lab = lab,
                hg = self.hg,
                chainsum = chainsum if chainsum.c else None,
                chains = () if self.sum_only else chains,
            )

            yield inst.mass, rec

    def itersum(self):
        """
        Iterates by considering only the sum of chain lengths and
        unsaturations.

        All possible totals are temporarily assigned to the first
        substituent with variable aliphatic chain, while the others are
        iterated with `cores_only = True`. With at most one such
        substituent this is the same as `itersubs`.
        """

        self._restore_sub0()

        chains = [
            (i, s.chlens, s.unsats)
            for i, s in enumerate(self.subs)
            if self.has_variable_aliphatic_chain(s)
        ]

        if len(chains) <= 1:

            for subs in self.itersubs():

                yield subs

            return

        # the minimum of every chain except the first one is subtracted
        # from the totals before they are assigned to the first one
        min_chlens = sum(min(c[1]) for c in chains[1:])
        min_unsats = sum(min(c[2]) for c in chains[1:])

        sum_chlens = list(set(
            sum(cc) - min_chlens
            for cc in itertools.product(*(c[1] for c in chains))
        ))
        sum_unsats = list(set(
            sum(uu) - min_unsats
            for uu in itertools.product(*(c[2] for c in chains))
        ))

        # backup of the real values, then set the fake ones
        self.sub0 = chains[0]
        isub0 = self.sub0[0]
        sub0 = self.subs[isub0]
        sub0.chlens = sum_chlens
        sub0.unsats = sum_unsats

        try:

            # `itertools.product` completely consumes its input iterables
            # when it is created, hence the fake values are needed only
            # until this call returns
            combinations = itertools.product(*(
                s.__iter__()
                if i == isub0 else
                # other substituents iterated by their cores only
                s.__iter__(cores_only = True)
                for i, s in enumerate(self.subs)
            ))

        finally:

            # restore the real values even if building the product failed
            # or the consumer never exhausts this generator
            self._restore_sub0()

        for subs in combinations:

            yield subs

    def _restore_sub0(self):
        """
        For iterating with considering only total carbon count and
        unsaturation across all aliphatic chains, we pretend that all
        extra carbons and unsaturations are added to the first substituent
        with variable aliphatic chain, e.g. it may have C54 while in
        reality it has max C18 and the other 36 carbons are in the other
        chains. After setting these fake numbers on the first substituent
        we need to restore the real numbers otherwise it could mess up
        things later.
        """

        if self.sub0:

            isub0, chlens, unsats = self.sub0
            self.subs[isub0].chlens = chlens
            self.subs[isub0].unsats = unsats
            # drop the backup, otherwise later calls would keep
            # overwriting the ranges with these stale values
            self.sub0 = None

    def has_variable_aliphatic_chain(self, sub):
        """
        Tells if the substituent really has more than one possible
        chain length or unsaturation variation, i.e. whether it has
        a true `variable_aliphatic_chain` attribute.
        """

        return (
            hasattr(sub, 'variable_aliphatic_chain') and
            sub.variable_aliphatic_chain
        )
class AbstractSubstituent(AbstractMetaboliteComponent):
    """
    A set of distinct substituent parts in a molecule or a homolog series
    spanning across a range of aliphatic chain lengths and unsaturated
    bonds.
    """

    def __init__(
            self,
            cores = 0.0,
            c = (14, 20),
            u = (0, 1),
            counts = None,
            charges = 0,
            isotopes = 0,
            names = 'Unknown',
            chain_attr = None,
            chain_type = None,
            getname = lambda parent: '%u:%u' % (parent.c, parent.u),
            c_u_diff = lambda c, u: c > u + 1 or (c <= 1 and u == 0),
            prefix = '',
            even = False,
            valence = 1,
            **kwargs
        ):
        """
        Represents a set of distinct substituent parts in a molecule or
        a homolog series spanning across a range of aliphatic chain
        length and unsaturated bonds.

        :param list cores:
            List of core variations. Same kind of definitions are
            possible like at `formula.Formula`: formula as `str`,
            `dict` of atoms or exact mass as `float`. A single value
            is treated as a list of one element.
        :param tuple c:
            Tuple of 2 integers: range of chain lengths (both ends
            included). A single integer or a list of explicit values
            is also accepted, see `set_chlens`.
        :param tuple u:
            Tuple of 2 integers: range of unsaturations (both ends
            included). A single integer or a list of explicit values
            is also accepted, see `set_unsats`.
        :param dict counts:
            Dictionary with extra atom counts. If you have one or more
            extra oxygen, nitrogen, phosphorous or any other atoms in
            the compound you can include here. Alternatively you can
            also include them in the core.
            Also accounts for the valences of the aliphatic chain not
            occupied by hydrogens. E.g. for a fatty acyl you need to
            remove 3 hydrogens as 3 valences are occupied by the oxygens.
            If you have a secondary amine you need to remove one more
            hydrogen. Otherwise, as this data structure has no
            information about constitution, we could not guess the
            number of hydrogens. Similarly, for oxo groups removal of
            2 hydrogens necessary.
        :param list charges:
            List of integers: charges for each core variation.
            A single value applies to all cores.
        :param list isotopes:
            List of integers: extra neutrons for each core variation.
            A single value applies to all cores.
        :param list names:
            Names for each core variation. A single value applies to
            all cores.
        :param chain_attr:
            Passed to `lipproc.Chain` as `attr`, see `get_chain`.
        :param chain_type:
            Passed to `lipproc.Chain` as `typ`, see `get_chain`.
        :param getname:
            Method (callable) to create a name from the chain length
            and the unsaturation.
        :param c_u_diff:
            Method (callable) to decide if the chain length and
            unsaturation are compatible. E.g. if chain length is only
            C2, an unsaturation of 4 double bonds is not possible.
            By default, chain length must be greater by at least 2 than
            unsaturation, it means at acyl chains we avoid to assume
            double bond right next to the carboxyl group which is
            clearly impossible. Chains not longer than one carbon
            without unsaturation are accepted, too.
        :param str prefix:
            Returned by `get_prefix`.
        :param bool even:
            If true only even chain lengths are considered.
        :param int valence:
            Total valence of the substituent. Number of bonds
            connecting to other parts of the molecule.
        :param kwargs:
            Passed to `AbstractMetaboliteComponent`.
        """

        self.prefix = prefix
        self.c_u_diff = c_u_diff
        self.even = even
        self.valence = valence
        self.getname = getname

        self.cores = (
            cores if type(cores) in {list, set, tuple} else [cores]
        )

        # scalars are repeated to have one value for each core
        self.set_attr(charges, 'charges')
        self.set_attr(names, 'names')
        self.set_attr(isotopes, 'isotopes')

        self.counts = collections.defaultdict(int)
        self.counts.update(counts or {})

        AbstractMetaboliteComponent.__init__(
            self,
            self.cores[0],
            charge = self.charges[0],
            isotope = self.isotopes[0],
            name = self.names[0],
            getname = getname,
            **kwargs
        )

        # range of possible lengths and unsats
        self.set_chlens(c)
        self.set_unsats(u)
        # current value of length and unsat
        self.c = self.chlens[0]
        self.u = self.unsats[0]

        self.total = len(self.chlens) * len(self.unsats)

        # the only case considered not variable is the single
        # chain length of zero
        self.variable_aliphatic_chain = (
            not (
                len(self.chlens) == 1 and
                self.chlens[0] == 0
            )
        )

        self.chain_attr = chain_attr
        self.chain_type = chain_type

    def __iter__(self, cores_only = False):
        """
        Yields one object for each core, chain length and unsaturation
        where `c_u_diff` accepts the chain length and the unsaturation.

        :param bool cores_only:
            For each core consider only the first chain length and stop
            at the first accepted unsaturation.
        """

        for i in range(len(self.cores)):

            self.update_core(i)

            for c in self.chlens:

                self.c = c

                for u in self.unsats:

                    self.u = u

                    if not self.c_u_diff(self.c, self.u):

                        continue

                    # implicit hydrogens
                    h = c * 2 + 2 - self.valence - 2 * u

                    p = self.get_prefix()

                    new_counts = self.counts.copy()
                    new_counts['C'] += c
                    new_counts['H'] += h

                    new_attrs = copy.deepcopy(self.attrs)

                    # `attrs` might contain methods which are called
                    # with the present instance passed and their returned
                    # value will be the attribute value of the
                    # iteration products
                    for k, v in iteritems(new_attrs.__dict__):

                        if hasattr(v, '__call__'):

                            setattr(new_attrs, k, v(self))

                    if self.chain_type and self.chain_attr and c > 0:

                        new_attrs.chain = self.get_chain()

                    new = self + formula.Formula(**new_counts)
                    new.attrs = new_attrs
                    new.c = c
                    new.u = u
                    # the prefix is bound as a default value: a plain
                    # closure would look up `p` at call time, making all
                    # products yielded so far return the latest prefix
                    new.get_prefix = lambda p = p: p
                    new.variable_aliphatic_chain = (
                        self.variable_aliphatic_chain
                    )
                    new.name = self.getname(self)

                    yield new

                    if cores_only:

                        break

                # NOTE(review): with `cores_only` only the first chain
                # length is visited, even if none of the unsaturations
                # was accepted for it
                if cores_only:

                    break

    def set_attr(self, val, name):
        """
        Sets the attribute `name` to `val` if that is a `list`, `set` or
        `tuple`, otherwise to a tuple repeating `val` once for each core.
        """

        setattr(
            self,
            name,
            val
            if type(val) in {list, set, tuple} else
            (val,) * len(self.cores)
        )

    def set_chlens(self, c):
        """
        Sets the list of possible chain lengths (`chlens`).

        :param c:
            A single `int`, a collection of explicit values or a tuple
            of 2 integers, the latter defining a range with both ends
            included. Only in case of range the odd values are omitted
            if `even` is true.
        """

        if type(c) is int:

            c = [c]

        # only tuples of 2 elements are interpreted as range
        if type(c) is tuple and len(c) != 2:

            c = list(c)

        self.chlens = (
            list(c)
            if type(c) in {list, set, range} else
            [
                i
                for i in range(c[0], c[1] + 1)
                if not self.even or i % 2 == 0
            ]
        )

    def set_unsats(self, u):
        """
        Sets the list of possible unsaturations (`unsats`).

        :param u:
            A single `int`, a collection of explicit values or a tuple
            of 2 integers, the latter defining a range with both ends
            included.
        """

        if type(u) is int:

            u = [u]

        # only tuples of 2 elements are interpreted as range
        if type(u) is tuple and len(u) != 2:

            u = list(u)

        self.unsats = (
            list(u)
            if type(u) in {list, set, range} else
            list(range(u[0], u[1] + 1))
        )

    def update_core(self, i = 0):
        """
        Makes the `i`-th core variation the current one: sets the mass or
        the formula, the charge, the isotope and the name, then resets
        the atom counts and recalculates the mass.

        :param int i:
            Index of the core variation.

        :raises IndexError:
            If the core can not be retrieved by index `i`.
        :raises ValueError:
            If the core is neither `float`, `dict` nor string-like.
        """

        try:

            new = self.cores[i]

        except (IndexError, TypeError):

            # parentheses are necessary here: `%` binds stronger than `+`
            raise IndexError(
                'Number of cores is less than %u' % (i + 1)
            )

        if type(new) is float:

            self.mass = new
            self.formula = ''

        elif type(new) is dict:

            self.formula_from_dict(new)

        elif hasattr(new, 'lower'):

            self.formula = new

        else:

            raise ValueError('Wrong mass or formula: `%s`' % str(new))

        self.charge = (
            self.charges[i]
            if type(self.charges) in {list, set, tuple} else
            self.charges
        )
        self.isotope = (
            self.isotopes[i]
            if type(self.isotopes) in {list, set, tuple} else
            self.isotopes
        )
        self.name = (
            self.names[i]
            if type(self.names) in {list, set, tuple} else
            self.names
        )

        self.reset_atoms()
        self.calc_mass()

    def get_prefix(self):
        """
        Returns the `prefix` attribute.
        """

        return self.prefix

    def get_chain(self):
        """
        Returns a `lipproc.Chain` created from the current chain length
        and unsaturation and the chain type and attributes.
        """

        return lipproc.Chain(
            c = self.c,
            u = self.u,
            typ = self.chain_type,
            attr = self.chain_attr,
        )