# Source code for lipyd.lipproc

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2018 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

import re
import collections
import itertools
import functools


# Lipid families: family label -> set of main headgroup class names.
# `match_constraint` looks up a constraint's `family` here to decide
# whether the main class of a record belongs to that family.
FAMILIES = {
    'SL':  {'Sph', 'Cer', 'SM'},
    'GL':  {'MAG', 'DAG', 'TAG'},
    'GPL': {
        # the original literal listed 'PE' twice; the duplicate stood
        # in the place of 'PS'
        'PC', 'PE', 'PS', 'PG', 'BMP',
        'PA', 'PI', 'PIP', 'PIP2', 'PIP3'
    },
}
# the combined family is derived, so it can not drift from its parts
FAMILIES['GLGPL'] = FAMILIES['GL'] | FAMILIES['GPL']


# Headgroup subclass labels which are written as a prefix before the main
# class name (e.g. Lyso-PC); `get_attributes` turns every other subclass
# label into a postfix after the main class name.
SUBCLS_PRE = {'Hex', 'Hex2', 'SHex', 'SHex2', 'Lyso'}

# Attributes of a single chain, e.g. d or t Ceramide, 2OH fatty acyl, etc:
#   sph   -- sphingoid base prefix string, empty by default
#   ether -- ether flag, False by default
#   oh    -- tuple of hydroxyl labels, empty by default
ChainAttr = collections.namedtuple('ChainAttr', ('sph', 'ether', 'oh'))
# all three fields are optional
ChainAttr.__new__.__defaults__ = ('', False, ())


class Chain(collections.namedtuple(
        'ChainBase',
        ['c', 'u', 'typ', 'attr', 'iso']
    )):
    """
    One aliphatic chain of a lipid.

    Fields
    ------
    c :
        The carbon count.
    u :
        The unsaturation.
    typ :
        Chain type label, ``'FA'`` by default.
    attr :
        A ``ChainAttr`` carrying the sphingoid base prefix, the ether
        flag and the hydroxyl labels of the chain.
    iso :
        Isomer information, a sequence of strings; empty by default.
    """

    def __new__(cls, c, u, typ = 'FA', attr = ChainAttr(), iso = ()):
        """
        Creates the chain. If the sphingoid base prefix is `d` and the
        unsaturation is zero the prefix is replaced by `DH`.
        """

        if u == 0 and getattr(attr, 'sph', None) == 'd':

            attr = ChainAttr(sph = 'DH', ether = attr.ether, oh = attr.oh)

        return super(Chain, cls).__new__(
            cls, c, u, typ = typ, attr = attr, iso = iso
        )

    def __str__(self, iso = False):
        """
        String representation of the chain, e.g. `O-16:0`, `d18:1` or
        `16:0-2OH`.

        :param bool iso:
            Include the isomer information in parentheses after the
            unsaturation. Without this flag the isomer details are
            omitted even if the chain carries them.
        """

        # isomer information, only on explicit request
        isomers = '(%s)' % ','.join(self.iso) if iso and self.iso else ''
        # postfix for hydroxylated fatty acyls e.g. PC(32:1-2OH)
        hydroxy = '-%s' % '-'.join(self.attr.oh) if self.attr.oh else ''

        return '%s%s%u:%u%s%s' % (
            # ether prefix
            'O-' if self.attr.ether else '',
            # sphingoid base prefix e.g. dCer(d38:1)
            self.attr.sph,
            # the carbon count
            self.c,
            # the unsaturation
            self.u,
            isomers,
            hydroxy
        )

    def isomer_str(self):
        """
        String representation including the isomer information.
        """

        return self.__str__(iso = True)

    def __add__(self, other):
        """
        Adding two chains yields their summary, see `sum_chains`.
        """

        return sum_chains((self, other))


class ChainSummary(Chain):
    """
    Summary of several chains: `c` and `u` hold totals while `typ` and
    `attr` are tuples with one element for each chain. `iso` is always
    `None`.
    """

    def __new__(cls, c, u, typ = (), attr = (), iso = None):
        """
        Creates the summary. If the total unsaturation is zero and the
        first chain has the sphingoid base prefix `d`, this prefix is
        replaced by `DH`. The `iso` argument is accepted only for
        signature compatibility with `Chain` and is ignored.
        """

        if u == 0 and attr and attr[0].sph == 'd':

            # first chain, sphingosine base
            first = ChainAttr(
                sph = 'DH',
                ether = attr[0].ether,
                oh = attr[0].oh
            )
            # all other chains are kept as they are
            attr = (first,) + attr[1:]

        return super(ChainSummary, cls).__new__(
            cls, c, u, typ = typ, attr = attr, iso = None
        )

    def __str__(self, iso = False):
        """
        Summary string e.g. `d38:1` or `O-32:1-2OH`.

        :param bool iso:
            Ignored, a summary has no isomer information. Accepted so the
            inherited `isomer_str` method works instead of raising
            `TypeError`.
        """

        # hydroxyl labels of all chains, skipping chains without any
        hydroxy = '-'.join('-'.join(a.oh) for a in self.attr if a.oh)

        return '%s%s%u:%u%s' % (
            # ether prefix
            'O-' if any(a.ether for a in self.attr) else '',
            # sphingoid base prefix e.g. dCer(d38:1)
            # of course max one of these is not empty string
            ''.join(a.sph for a in self.attr),
            # the carbon count
            self.c,
            # the unsaturation
            self.u,
            # postfix for hydroxylated fatty acyls e.g. PC(32:1-2OH)
            '-%s' % hydroxy if any(a.oh for a in self.attr) else ''
        )

    def __len__(self):
        """
        Number of chains summarized (length of `typ`), not the number of
        tuple fields. An empty summary is hence falsy.
        """

        return len(self.typ)


# Headgroup of a lipid: `main` is the main class name, `sub` holds the
# subclass labels. Only `sub` has a default, an empty tuple.
Headgroup = collections.namedtuple('Headgroup', ('main', 'sub'))
Headgroup.__new__.__defaults__ = ((),)


# Database label of a lipid record: identifier, database name, names
# and formula.
LipidLabel = collections.namedtuple(
    'LipidLabel',
    ['db_id', 'db', 'names', 'formula']
)
# names are empty tuple, formula is None by default;
# the attribute must be spelled `__defaults__`, assigning to any other
# name only creates an unused attribute on the function
LipidLabel.__new__.__defaults__ = ((), None)


class LipidRecord(collections.namedtuple(
        'LipidRecordBase',
        ['lab', 'hg', 'chainsum', 'chains']
    )):
    """
    A lipid database record.

    Fields
    ------
    lab :
        Label of the record.
    hg :
        The headgroup, an object with `main` and `sub` attributes.
    chainsum :
        Summary of the chains.
    chains :
        The individual chains.
    """

    def __new__(cls, lab, hg, chainsum, chains):
        """
        Creates the record; all four fields are mandatory.
        """

        return super(LipidRecord, cls).__new__(
            cls,
            lab = lab,
            hg = hg,
            chainsum = chainsum,
            chains = chains
        )

    def full_str(self):
        """
        Name with each chain listed, without isomer information.
        """

        return full_str(self.hg, self.chains, iso = False)

    def summary_str(self):
        """
        Name with the summary of the chains.
        """

        return summary_str(self.hg, self.chainsum)

    def subclass_str(self):
        """
        Subclass level name of the headgroup.
        """

        return subclass_str(self.hg, self.chainsum)

def empty_chain():
    """
    Returns an empty Chain object which might serve as a dummy object.

    The chain has zero carbon count, zero unsaturation and default
    attributes.
    """

    return Chain(c = 0, u = 0, attr = ChainAttr())


def empty_chainsum():
    """
    Returns an empty ChainSummary object.

    The summary has zero carbon count and unsaturation and contains
    no chain types or attributes.
    """

    return ChainSummary(c = 0, u = 0, typ = (), attr = ())


def str2hg(hgstr):
    """
    From a headgroup string representation creates a Headgroup object.

    The string is split at dashes: the last piece becomes the main
    class, all preceding pieces the subclass labels, e.g. `Lyso-PC`
    results main `PC` and sub `('Lyso',)`.

    :param str hgstr:
        Headgroup string with the elements separated by `-`.
    """

    labels = hgstr.split('-')
    # str.split always returns at least one element
    main = labels.pop()

    return Headgroup(main = main, sub = tuple(labels))


def sum_chains(chains):
    """
    From a list of chains creates a ChainSummary object.

    Carbon counts and unsaturations are summed, while the chain types
    and attributes are collected into tuples in the order of the chains.

    :param iterable chains:
        Chain objects. An empty or `None` value results an empty
        summary.
    """

    # materialize once, so generators are not exhausted by the first sum
    chains = tuple(chains) if chains else ()

    if not chains:

        return empty_chainsum()

    return ChainSummary(
        c = sum(chain.c for chain in chains),
        u = sum(chain.u for chain in chains),
        attr = tuple(chain.attr for chain in chains),
        typ = tuple(chain.typ for chain in chains)
    )


def collapse_attrs(chains):
    """
    Combine the attributes of arbitrary number of chains.

    The attributes are merged pairwise by `combine_attrs`, from left
    to right.

    :param iterable chains:
        Chain objects; if empty, default `ChainAttr` is returned
        instead of raising `TypeError` from `reduce`.
    """

    attrs = [chain.attr for chain in chains]

    if not attrs:

        return ChainAttr()

    return functools.reduce(combine_attrs, attrs)


def combine_attrs(a1, a2):
    """
    Combines the attributes of 2 chains.

    :param ChainAttr a1:
        Attributes of the first chain.
    :param ChainAttr a2:
        Attributes of the second chain.

    Returns a new `ChainAttr`: the first non-empty sphingoid base
    prefix, the ether flag if set on either of the chains, and the
    hydroxyl labels of both, those of `a1` first.
    """

    hydroxyls = tuple(itertools.chain(a1.oh, a2.oh))

    return ChainAttr(
        sph = a1.sph or a2.sph,
        ether = a1.ether or a2.ether,
        oh = hydroxyls
    )

def summary_str(hg, chainsum):
    """
    Creates a summary string representation from the headgroup name and
    a summary Chain object.

    :param Headgroup hg:
        The headgroup.
    :param ChainSummary chainsum:
        Summary of the chains; if `None` or its carbon count is zero
        the chain part is omitted from the string.
    """

    # only the subclass pre- and postfix are part of the summary name
    subcls_pre, _, _, subcls_post, _ = get_attributes(hg, chainsum)

    if chainsum is not None and chainsum.c > 0:

        chains_part = '(%s)' % chainsum.__str__()

    else:

        chains_part = ''

    return '%s%s%s%s' % (
        # subclass attributes like *PE*-Cer, *Lyso*-PC
        subcls_pre,
        # main class of headgroup e.g. Cer, PS
        hg.main,
        # subclass attributes like 1-O-phosphate group of Cer1P, Sph1P, etc
        subcls_post,
        # chains summary
        chains_part
    )


def full_str(hg, chains, iso = False):
    """
    From a Headgroup and a tuple of Chain objects returns a string
    representation where each chain is listed, separated by `/`.

    :param Headgroup hg:
        The headgroup.
    :param tuple chains:
        Chain objects; if empty the chain part is omitted.
    :param bool iso:
        Passed to the `__str__` method of each chain.
    """

    # only the subclass pre- and postfix are part of the full name
    subcls_pre, _, _, subcls_post, _ = (
        get_attributes(hg, sum_chains(chains))
    )

    if chains:

        chains_part = '(%s)' % '/'.join(
            chain.__str__(iso = iso) for chain in chains
        )

    else:

        chains_part = ''

    return '%s%s%s%s' % (
        # subclass attributes like *PE*-Cer, *Lyso*-PC
        subcls_pre,
        # main class of headgroup e.g. Cer, PS
        hg.main,
        # subclass attributes like 1-O-phosphate group of Cer1P, Sph1P, etc
        subcls_post,
        # chains
        chains_part
    )

def subclass_str(hg, chainsum = None):
    """
    From Headgroup and summary Chain object creates a subclass level
    headgroup string.

    :param Headgroup hg:
        The headgroup.
    :param ChainSummary chainsum:
        Summary of the chains, optional. Provides the sphingoid base
        prefix, the ether flag and the hydroxyl postfix.
    """

    subcls_pre, sphingo_prefix, is_ether, subcls_post, hydroxy = (
        get_attributes(hg, chainsum)
    )

    elements = (
        # subclass attributes like *PE*-Cer, *Lyso*-PC
        subcls_pre,
        # prefix of sphingoid base subclass: d, t, k, DH
        sphingo_prefix,
        # main class of headgroup e.g. Cer, PS
        hg.main,
        # postfix of ether lipids e.g. PC-O, PE-O
        '-O' if is_ether else '',
        # subclass attributes like 1-O-phosphate group of Cer1P, Sph1P, etc
        subcls_post,
        # postfix of hydroxylated fatty acyl e.g. Cer-2OH
        hydroxy,
    )

    return '%s%s%s%s%s%s' % elements


def get_attributes(hg, chainsum = None):
    """
    Processes a Headgroup and a summary Chain object and returns the
    name pre- and postfix string elements.

    :param Headgroup hg:
        The headgroup.
    :param ChainSummary chainsum:
        Summary of the chains; if `None` or empty, an empty summary
        is used.

    Returns a tuple of 5 elements:
    the subclass prefix with trailing dash (labels in `SUBCLS_PRE`),
    the sphingoid base prefix,
    the ether flag as boolean,
    the subclass postfix with leading dash (all other labels)
    and the hydroxyl postfix with leading dash.
    Each string is empty if it does not apply.
    """

    chainsum = chainsum or empty_chainsum()
    attrs = chainsum.attr

    # chains without hydroxyl groups must be skipped, otherwise their
    # empty strings result stray dashes like `--2OH`
    hydroxy = '-'.join('-'.join(a.oh) for a in attrs if a.oh)
    hydroxy = '-%s' % hydroxy if hydroxy else ''

    subcls_pre   = '-'.join(i for i in hg.sub if i in SUBCLS_PRE)
    subcls_pre   = '%s-' % subcls_pre if subcls_pre else ''
    subcls_post  = '-'.join(i for i in hg.sub if i not in SUBCLS_PRE)
    subcls_post  = '-%s' % subcls_post if subcls_post else ''

    sphingo_prefix = ''.join(a.sph for a in attrs)
    ether_prefix = any(a.ether for a in attrs)

    return subcls_pre, sphingo_prefix, ether_prefix, subcls_post, hydroxy


def match_constraint(rec, constr):
    """
    Matches an MS2 fragment constraint (fragment.FragConstraint)
    against a lipid record lipproc.LipidRecord.

    Returns a tuple of a boolean (whether the headgroup of the record
    complies with the constraint) and a set with the indices of the
    chains which comply with the chain related part of the constraint.
    If the headgroup does not match the set is empty.

    The default attribute values of the `FragConstraint` object always
    correspond to bypass all filters here. Defining a specific value
    may limit the records complying with the constraint.

    Args
    ----
    :param LipidRecord rec:
        A lipid database record object.
    :param fragment.FragConstraint constr:
        An MS2 fragment constraint object. Its `hg`, `family`, `sub`,
        `chaintype`, `sph` and `oh` attributes are used here.
    """

    hg_main = rec.hg.main

    # the main class either matches directly, or no main class is
    # required and the family is either not restricted or contains
    # the main class of the record
    hg_ok = (
        constr.hg == hg_main or (
            constr.hg is None and (
                (
                    constr.family in FAMILIES and
                    hg_main in FAMILIES[constr.family]
                ) or
                constr.family is None
            )
        )
    )

    if not hg_ok:

        return False, set()

    # the subclass labels must be the same regardless of their order
    if constr.sub is not None and set(constr.sub) != set(rec.hg.sub):

        return False, set()

    chainsum = rec.chainsum if rec.chainsum else sum_chains(rec.chains)
    # an integer constrains only the number of OH groups,
    # anything else is compared as a set of hydroxyl labels
    oh_is_count = isinstance(constr.oh, int)
    chains = set()

    for i, (attr, rec_chaintype) in enumerate(
        zip(chainsum.attr, chainsum.typ)
    ):

        if (
            constr.chaintype is not None and
            constr.chaintype != rec_chaintype
        ):

            continue

        if constr.sph is not None and constr.sph != attr.sph:

            continue

        oh_ok = (
            len(attr.oh) == constr.oh
            if oh_is_count else
            set(attr.oh) == set(constr.oh)
        )

        if oh_ok:

            chains.add(i)

    return True, chains


def match_constraints(rec, constraints):
    """
    Matches all fragment constraints in the iterable `constraints`
    against all chains in MS1 record `rec`.

    Returns a boolean (whether the fragment can possibly originate from
    the molecular species in the record) and a set of chain positions
    which can be the source of the fragment if the fragment is from an
    aliphatic chain moiety.

    The boolean is `True` if at least one of the constraints matches,
    the chain positions are the union over all constraints. Without
    any constraint the result is `False` and an empty set.

    Args
    ----
    :param LipidRecord rec:
        An MS1 database record object.
    :param iterable constraints:
        A number of `fragment.FragConstraint` objects.
    """

    match = False
    chains = set()

    for constr in constraints:

        match_, chains_ = match_constraint(rec, constr)
        # a single matching constraint is sufficient; the outcome must
        # not depend on which constraint happens to be the last one
        match = match or match_
        chains.update(chains_)

    return match, chains

def cu_str(c, u):
    """
    Carbon count and unsaturation separated by colon, e.g. `18:1`.

    :param int c:
        The carbon count.
    :param int u:
        The unsaturation.
    """

    return '%u:%u' % (c, u)

def charge_str(charge):
    """
    Sign of a charge as string: `+` for positive, `-` for negative
    and empty string for zero.

    :param int charge:
        The charge; only its sign is considered.
    """

    if charge == 0:

        return ''

    return '-' if charge < 0 else '+'

# Regex capturing the chain summary enclosed in parentheses, e.g. `(d38:1)`.
# Groups: 1 = prefix, 2 = carbon count, 3 = unsaturation,
# 4 = hydroxyl label (`None` if absent).
rechainsum = re.compile(
    r'\('
    # prefix: zero to two characters of P, O, d, t, D, H
    # (d, t, DH, O-, P-), followed by an optional dash
    r'([POdtDH]{0,2})-?'
    # carbon count and unsaturation, 1-2 digits each
    r'([0-9]{1,2}):([0-9]{1,2})'
    # optional OH, introduced by a dash or an opening parenthesis,
    # with up to two digits before `OH`
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'\)'
)

# Captures the data of 1-4 aliphatic chains listed within one pair of
# parentheses, optionally separated by `/` or `_`.
# Four groups for each chain, 16 altogether: prefix, carbon count,
# unsaturation and hydroxyl label. Only the carbon count and unsaturation
# of the first chain are mandatory, for chains 2-4 every element may be
# empty.
rechain = re.compile(
    r'\('
    # chain 1: prefix, carbon count, unsaturation, optional OH
    r'([POdtDH]{0,2})-?'
    r'([0-9]{1,2}):([0-9]{1,2})'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 2: same elements, all optional
    r'([POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 3: same elements, all optional
    r'([POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 4: same elements, all optional
    r'([POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'\)'
)

# Captures 1-4 aliphatic chains with
# conformational isomeric information.
# Five groups for each chain, 20 altogether: prefix (optionally preceded
# by a number and a dash), carbon count, unsaturation, isomer details
# (digits, `E`, `Z` and commas) and hydroxyl label.
# The enclosing parentheses are optional here.
# NOTE(review): the isomer part is not uniform across the chains: for
# chain 1 both parentheses are optional, for chains 2 and 3 the closing one
# is mandatory, and for chain 4 the isomer string may be empty -- confirm
# whether these differences are intended.
rechainiso = re.compile(
    r'\(?'
    # chain 1: prefix, carbon count, unsaturation, isomers, OH
    r'((?:[0-9]+-)?'
    r'[POdtDH]{0,2})-?'
    r'([0-9]{1,2}):([0-9]{1,2})'
    r'(?:\(?([0-9EZ,]{2,})\)?)?'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 2: carbon count and unsaturation are optional from here
    r'((?:[0-9]+-)?'
    r'[POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:\(?([0-9EZ,]{2,})\))?'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 3
    r'((?:[0-9]+-)?'
    r'[POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:\(?([0-9EZ,]{2,})\))?'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'[/_]?'
    # chain 4
    r'((?:[0-9]+-)?'
    r'[POdtDH]{0,2})-?'
    r'([0-9]{0,2}):?([0-9]{0,2})'
    r'(?:\(?([0-9EZ,]*)\)?)?'
    r'(?:[-\(]([0-9]{0,2}OH)\)?)?'
    r'\)?'
)

# Matches the substring `methyl` or `ethyl`.
reme = re.compile(r'methyl|ethyl')

# Presumably matches names with two parenthesized substituents in the form
# `1-(...)-2-(...)` -- TODO confirm the intended input.
# Groups: 1 = first position label (`1` or `1,2-di`),
# 2 = optional methyl/ethyl label at the beginning of the first substituent,
# 3 = second position label (characters `2`, `3`, `,`; optional `-di`),
# 4 = optional methyl/ethyl label at the beginning of the second substituent.
# NOTE(review): in `[0-9]{0,2}[-]?methyl|ethyl` the digits and the dash
# belong only to the `methyl` alternative; and the range `A-z` also covers
# the characters `[`, `\`, `]`, `^`, `_` and the backtick, not only
# letters -- verify whether these are intended.
rebr = re.compile(
    r'(1(?:,2-di)?)-\(((?:[0-9]{0,2}[-]?methyl|ethyl)?)[A-z0-9-]+\)'
    r'-([2,3]{1,3}(?:-di)?)-'
    r'\(((?:[0-9]{0,2}[-]?methyl|ethyl)?)[A-z0-9-]+\)'
)