Source code for lipyd.metabolite

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2018 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

from __future__ import generator_stop

from future.utils import iteritems
from past.builtins import xrange, range

import itertools
import collections
import operator
import functools
import copy

import numpy as np

import lipyd.formula as formula
import lipyd.lipproc as lipproc


class AbstractMetaboliteComponent(formula.Formula):
    
    def __init__(
            self,
            core = 0.0,
            charge = 0,
            isotope = 0,
            name = 'Unknown',
            getname = lambda parent: parent.name,
            **kwargs
        ):
        """
        A single component of a molecule. Serves as the common base both
        for cores which carry substituents and for the substituents
        themselves (which may offer a set of alternatives or a homolog
        series of aliphatic chains).
        
        :param core: The constant part of the component, given either as
            a `float` (taken as an exact mass),
            a formula string, e.g. `C2H5OH`, or
            a `dict` of atom counts, e.g. `{'C': 2, 'H': 6, 'O': 1}`.
        :param charge: Forwarded to `formula.Formula.__init__`.
        :param isotope: Forwarded to `formula.Formula.__init__`.
        :param str name: Stored on the instance as `name`.
        :param getname: Callable stored on the instance as `getname`;
            the default returns the `name` of the object passed to it.
        :param **kwargs: Forwarded to `formula.Formula.__init__`.
        :raises ValueError: If, after the parent constructor ran,
            `has_mass()` is false and `core` is not a `float`.
        """
        
        # an exact mass is not handed over to the parent constructor,
        # it is assigned directly below if still needed
        mass_given = type(core) is float
        
        formula.Formula.__init__(
            self,
            None if mass_given else core,
            charge,
            isotope,
            **kwargs
        )
        
        if not self.has_mass():
            
            if not mass_given:
                
                raise ValueError(
                    'Please provide either formula or atom counts or mass.'
                )
            
            self.mass = core
        
        self.name = name
        self.getname = getname


class AbstractMetabolite(AbstractMetaboliteComponent):
    
    def __init__(self,
            core = 0.0,
            charge = 0,
            isotope = 0,
            name = 'Unknown',
            hg = None,
            getname = lambda parent, subs:
                '%s(%s)' % (
                    parent.name,
                    '/'.join(
                        lipproc.cu_str(s.c, s.u)
                        for s in subs
                        if hasattr(s, 'c') and hasattr(s, 'u')
                    )
                ),
            subs = None,
            sum_only = False,
            **kwargs
        ):
        """
        Represents a metabolite with an unchanged core and a set of variable
        substituents.
        
        :param core: Definition of the core, see
            `AbstractMetaboliteComponent`.
        :param charge: Passed to `AbstractMetaboliteComponent`.
        :param isotope: Passed to `AbstractMetaboliteComponent`.
        :param str name: Name of the metabolite (without substituents).
        :param hg: Headgroup; stored as `hg` and handed over to the
            `lipproc` name and record constructors in `iterlines`.
        :param getname: Callable receiving this object and the tuple of
            the current substituents, returning the name of the
            instance. The default appends to `name` the
            `lipproc.cu_str` of every substituent which has both `c`
            and `u` attributes, joined by `/`, in parentheses.
        :param subs: Iterable of substituents. Each element is converted
            by `get_substituent`. `None` means no substituents.
        :param bool sum_only: Do not iterate all aliphatic chains
            independently but consider only sum of chain lengths and
            unsaturations.
        """
        
        AbstractMetaboliteComponent.__init__(
            self,
            core,
            charge = charge,
            isotope = isotope,
            name = name,
            getname = getname,
            **kwargs
        )
        
        self.subs = tuple(self.get_substituent(s) for s in (subs or ()))
        self.sum_only = sum_only
        # set by `itersum` while the first variable substituent carries
        # the summed chain lengths and unsaturations: a tuple of
        # (index in `subs`, original `chlens`, original `unsats`)
        self.sub0 = None
        self.hg = hg
    
    def __iter__(self):
        """
        Iterates the instances (whole molecules) yielded by
        `subsproduct`, omitting the tuples of substituents.
        """
        
        for _, inst in self.subsproduct():
            
            yield inst
    
    @staticmethod
    def get_substituent(sub):
        """
        Creates a `Formula` object if the substituent is not
        already an instance of `Formula` or `Substituent`.
        
        :param sub: A string-like object (anything having a `lower`
            attribute) or a `float` is passed to `formula.Formula` as
            a single argument; a `dict` is unpacked as keyword arguments
            for `formula.Formula`; anything else is returned unchanged.
        """
        
        if hasattr(sub, 'lower') or type(sub) is float:
            
            return formula.Formula(sub)
        
        if type(sub) is dict:
            
            return formula.Formula(**sub)
        
        return sub
    
    def itersubs(self):
        """
        Iterates all combinations of all substituents.
        Yields tuples of substituents.
        """
        
        # make sure no fake values left behind by `itersum`
        self._restore_sub0()
        
        yield from itertools.product(*self.subs)
    
    def subsproduct(self):
        """
        Iterates instances and substituents in parallel.
        Yields tuples of two elements.
        First element is a tuple of all molecule parts (substituents).
        Second element is the actual instance, i.e. the whole molecule.
        
        The most recent name and instance are also kept in the
        `inst_name` and `inst` attributes of this object.
        """
        
        iterator = self.itersum() if self.sum_only else self.itersubs()
        
        for subs in iterator:
            
            self.inst_name = self.getname(self, subs)
            
            # core + substituent 1 + substituent 2 + ...
            self.inst = functools.reduce(operator.add, subs, self)
            self.inst.name = self.inst_name
            
            yield subs, self.inst
    
    def iterlines(self):
        """
        Iterates standard lines.
        
        Yields tuples of the mass of the instance and a
        `lipproc.LipidRecord`. The record is labeled by the summary
        notation if `sum_only` is set, otherwise by the full notation;
        without headgroup (`hg`) it carries no name at all.
        """
        
        for subs, inst in self.subsproduct():
            
            # NOTE(review): the original called `inst.getname()` here
            # without arguments and discarded the result; all `getname`
            # callables defined in this module require arguments, so
            # the call has been removed.
            
            chains = tuple(
                s.attrs.chain for s in subs if hasattr(s.attrs, 'chain')
            )
            chainsum = lipproc.sum_chains(chains)
            
            if not self.hg:
                
                names = ()
                
            elif self.sum_only:
                
                names = (lipproc.summary_str(self.hg, chainsum),)
                
            else:
                
                names = (lipproc.full_str(self.hg, chains),)
            
            lab = lipproc.LipidLabel(
                db_id   = None,
                db      = 'lipyd.lipid',
                names   = names,
                formula = inst.formula,
            )
            rec = lipproc.LipidRecord(
                lab = lab,
                hg  = self.hg,
                chainsum = chainsum if chainsum.c else None,
                chains = () if self.sum_only else chains,
            )
            
            yield inst.mass, rec
    
    def itersum(self):
        """
        Iterates by considering only the sum of chain lengths and
        unsaturations.
        
        If at most one substituent has variable aliphatic chain this is
        the same as `itersubs`. Otherwise the first such substituent
        temporarily receives all possible total chain lengths and
        unsaturations (less the minimum values of the other variable
        chains), while the other substituents are iterated by their
        cores only. The original values of the first substituent are
        restored when the iteration ends for any reason, including
        the consumer abandoning the generator.
        """
        
        self._restore_sub0()
        
        chains = [
            (i, s.chlens, s.unsats)
            for i, s in enumerate(self.subs)
            if self.has_variable_aliphatic_chain(s)
        ]
        
        if len(chains) <= 1:
            
            yield from self.itersubs()
            
            return
        
        # the other chains contribute at least their minimum as they
        # are iterated by their first valid chain length and unsaturation
        others = chains[1:]
        min_chlens = sum(min(chlens) for _, chlens, _ in others)
        min_unsats = sum(min(unsats) for _, _, unsats in others)
        
        sum_chlens = list({
            sum(cc) - min_chlens
            for cc in itertools.product(*(c[1] for c in chains))
        })
        sum_unsats = list({
            sum(uu) - min_unsats
            for uu in itertools.product(*(c[2] for c in chains))
        })
        
        self.sub0 = chains[0]
        isub0 = self.sub0[0]
        sub0 = self.subs[isub0]
        
        try:
            
            sub0.chlens = sum_chlens
            sub0.unsats = sum_unsats
            
            for subs in itertools.product(*(
                    s.__iter__()
                    if i == isub0 else
                    # other substituents iterated by their cores only
                    s.__iter__(cores_only = True)
                    for i, s in enumerate(self.subs)
                )):
                
                yield subs
            
        finally:
            
            # restore the real values whatever happened
            self._restore_sub0()
    
    def _restore_sub0(self):
        """
        For iterating with considering only total carbon count and
        unsaturation across all aliphatic chains, we pretend that
        all extra carbons and unsaturations are added to the first
        substituent with variable aliphatic chain, e.g. it may
        have C54 while in reality it has max C18 and the other 36
        carbons are in the other chains. After setting these fake
        numbers on the first substituent we need to restore the
        real numbers otherwise it could mess up things later.
        """
        
        if self.sub0:
            
            isub0, chlens, unsats = self.sub0
            self.subs[isub0].chlens = chlens
            self.subs[isub0].unsats = unsats
    
    def has_variable_aliphatic_chain(self, sub):
        """
        Tells if the substituent really has more than one possible
        chain length or unsaturation variation.
        
        Returns the `variable_aliphatic_chain` attribute of the
        substituent or `False` if it has no such attribute.
        """
        
        return getattr(sub, 'variable_aliphatic_chain', False)


class AbstractSubstituent(AbstractMetaboliteComponent):
    
    def __init__(
            self,
            cores = 0.0,
            c = (14, 20),
            u = (0, 1),
            counts = None,
            charges = 0,
            isotopes = 0,
            names = 'Unknown',
            chain_attr = None,
            chain_type = None,
            getname = lambda parent: '%u:%u' % (parent.c, parent.u),
            c_u_diff = lambda c, u: c > u + 1 or (c <= 1 and u == 0),
            prefix = '',
            even = False,
            valence = 1,
            **kwargs
        ):
        """
        Represents a set of distinct substituent parts in a molecule or
        a homolog series spanning across a range of aliphatic chain
        length and unsaturated bonds.
        
        :param list cores:
            List of core variations. Same kind of definitions
            are possible like at `formula.Formula`: formula as `str`, `dict`
            of atoms or exact mass as `float`. A single value is wrapped
            into a list.
        :param tuple c:
            Chain lengths. A tuple of 2 integers is a closed range;
            a single integer or any other collection is taken as the
            explicit list of values (see `set_chlens`).
        :param tuple u:
            Unsaturations, same conventions as for `c`
            (see `set_unsats`).
        :param dict counts:
            Dictionary with extra atom counts.
            If you have one or more extra oxygen, nitrogen, phosphorous or
            any other atoms in the compound you can include here.
            Alternatively you can also include them in the core.
            Also accounts for the valences of the aliphatic chain
            not occupied by hydrogens. E.g. for a fatty acyl you need to
            remove 3 hydrogens as 3 valences are occupied by the oxygens.
            If you have a secondary amine you need to remove one more
            hydrogen. Otherwise, as this data structure has no information
            about constitution, we could not guess the number of hydrogens.
            Similarly, for oxo groups removal of 2 hydrogens necessary.
        :param list charges:
            List of integers: charges for each core
            variation. A single value applies to all cores.
        :param list isotopes:
            List of integers: extra neutrons for each core
            variation. A single value applies to all cores.
        :param list names:
            Names for each core variation. A single value applies
            to all cores.
        :param chain_attr:
            Passed as `attr` to `lipproc.Chain` by `get_chain`.
        :param chain_type:
            Passed as `typ` to `lipproc.Chain` by `get_chain`.
        :param getname:
            Method (callable) to create a name from the chain
            length and the unsaturation. Called with this object.
        :param c_u_diff:
            Method (callable) to decide if the chain length and
            unsaturation are compatible. E.g. if chain length is only C2, an
            unsaturation of 4 double bonds is not possible. The default
            accepts a combination if the chain length is greater than
            the unsaturation plus one, or if the chain length is at most
            one and the unsaturation is zero.
        :param str prefix:
            Returned by `get_prefix`.
        :param bool even:
            If true only even chain lengths are considered. Applies only
            if `c` is given as a range (tuple of 2 integers).
        :param int valence:
            Total valence of the substituent. Number of bonds connecting
            to other parts of the molecule.
        :param **kwargs:
            Passed to `AbstractMetaboliteComponent`.
        """
        
        self.prefix   = prefix
        self.c_u_diff = c_u_diff
        self.even     = even
        self.valence  = valence
        self.getname  = getname
        self.cores    = (
            cores if type(cores) in {list, set, tuple} else [cores]
        )
        # these rely on `self.cores` being already set
        self.set_attr(charges, 'charges')
        self.set_attr(names, 'names')
        self.set_attr(isotopes, 'isotopes')
        self.counts   = collections.defaultdict(lambda: 0)
        self.counts.update(counts or {})
        
        AbstractMetaboliteComponent.__init__(
            self,
            self.cores[0],
            charge = self.charges[0],
            isotope = self.isotopes[0],
            name = self.names[0],
            getname = getname,
            **kwargs
        )
        
        # range of possible lengths and unsats
        self.set_chlens(c)
        self.set_unsats(u)
        
        # current value of length and unsat
        self.c = self.chlens[0]
        self.u = self.unsats[0]
        
        self.total = len(self.chlens) * len(self.unsats)
        # the only case considered not variable
        # is a single chain length of zero
        self.variable_aliphatic_chain = (
            not (
                len(self.chlens) == 1 and
                self.chlens[0] == 0
            )
        )
        
        self.chain_attr = chain_attr
        self.chain_type = chain_type
    
    def __iter__(self, cores_only = False):
        """
        Iterates the cores, chain lengths and unsaturations; for each
        combination accepted by `c_u_diff` yields a new object created
        by adding the aliphatic chain and the extra atom counts to
        the current core.
        
        The `c`, `u` and the core related attributes of this object
        are updated along the iteration.
        
        :param bool cores_only: Yield at most one item for each core.
            Only the first chain length is examined and the iteration
            of the core stops after its first accepted unsaturation.
        """
        
        for i in range(len(self.cores)):
            
            self.update_core(i)
            
            for c in self.chlens:
                
                self.c = c
                
                for u in self.unsats:
                    
                    self.u = u
                    
                    if not self.c_u_diff(self.c, self.u):
                        
                        continue
                    
                    # implicit hydrogens
                    h = c * 2 + 2 - self.valence - 2 * u
                    p = self.get_prefix()
                    new_counts = self.counts.copy()
                    new_counts['C'] += c
                    new_counts['H'] += h
                    new_attrs = copy.deepcopy(self.attrs)
                    # `attrs` might contain methods which are called
                    # with the present instance passed and their returned
                    # value will be the attribute value of the
                    # iteration products
                    for k, v in iteritems(new_attrs.__dict__):
                        if hasattr(v, '__call__'):
                            setattr(new_attrs, k, v(self))
                    
                    if self.chain_type and self.chain_attr and c > 0:
                        
                        new_attrs.chain = self.get_chain()
                    
                    new = self + formula.Formula(**new_counts)
                    
                    new.attrs = new_attrs
                    new.c = c
                    new.u = u
                    # bound as default argument: a plain closure would
                    # see `p` rebound by the subsequent iterations
                    new.get_prefix = lambda p = p: p
                    new.variable_aliphatic_chain = (
                        self.variable_aliphatic_chain
                    )
                    new.name = self.getname(self)
                    
                    yield new
                    
                    if cores_only:
                        
                        break
                
                if cores_only:
                    
                    break
    
    def set_attr(self, val, name):
        """
        Sets the attribute `name` to `val` if it is a `list`, `set` or
        `tuple`, otherwise to a tuple repeating `val` as many times as
        the number of cores.
        """
        
        if type(val) not in {list, set, tuple}:
            
            val = (val,) * len(self.cores)
        
        setattr(self, name, val)
    
    def set_chlens(self, c):
        """
        Sets the list of chain lengths (`chlens`).
        
        :param c: A single integer; or a tuple of 2 integers which is
            a closed range, filtered for even numbers if `even` is set;
            or a collection of explicit values, used as it is.
        """
        
        if type(c) is int: c = [c]
        # only tuples of 2 elements considered to be ranges
        if type(c) is tuple and len(c) != 2: c = list(c)
        
        if type(c) in {list, set, range}:
            
            self.chlens = list(c)
            
        else:
            
            self.chlens = [
                i for i in
                range(c[0], c[1] + 1)
                if not self.even or i % 2 == 0
            ]
    
    def set_unsats(self, u):
        """
        Sets the list of unsaturations (`unsats`).
        
        :param u: A single integer; or a tuple of 2 integers which is
            a closed range; or a collection of explicit values,
            used as it is.
        """
        
        if type(u) is int: u = [u]
        # only tuples of 2 elements considered to be ranges
        if type(u) is tuple and len(u) != 2: u = list(u)
        
        if type(u) in {list, set, range}:
            
            self.unsats = list(u)
            
        else:
            
            self.unsats = list(range(u[0], u[1] + 1))
    
    def update_core(self, i = 0):
        """
        Makes the `i`th core the current one: sets the mass or formula,
        the charge, isotope and name, then calls `reset_atoms` and
        `calc_mass`.
        
        :param int i: Index of the core in `cores`.
        :raises IndexError: If there is no core with index `i`.
        :raises ValueError: If the core is neither `float`, `dict` nor
            string-like.
        """
        
        try:
            new = self.cores[i]
        
        except IndexError:
            raise IndexError(
                'Number of cores is less than %u' % (i + 1)
            ) from None
        
        if type(new) is float:
            
            self.mass = new
            self.formula = ''
            
        elif type(new) is dict:
            
            self.formula_from_dict(new)
            
        elif hasattr(new, 'lower'):
            
            self.formula = new
            
        else:
            
            raise ValueError('Wrong mass or formula: `%s`' % str(new))
        
        self.charge  = self._core_value(self.charges, i)
        self.isotope = self._core_value(self.isotopes, i)
        self.name    = self._core_value(self.names, i)
        
        self.reset_atoms()
        self.calc_mass()
    
    @staticmethod
    def _core_value(values, i):
        """
        Returns the `i`th element if `values` is a `list`, `set` or
        `tuple`, otherwise `values` itself.
        """
        
        return values[i] if type(values) in {list, set, tuple} else values
    
    def get_prefix(self):
        """
        Returns the `prefix` attribute.
        """
        
        return self.prefix
    
    def get_chain(self):
        """
        Returns a `lipproc.Chain` of the current chain length (`c`)
        and unsaturation (`u`), with the chain type and attributes
        of this substituent.
        """
        
        return lipproc.Chain(
            c = self.c,
            u = self.u,
            typ = self.chain_type,
            attr = self.chain_attr,
        )