Source code for lipyd.name

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2018 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

from __future__ import print_function
from past.builtins import xrange, range, reduce

from future.utils import iteritems

import sys
import imp
import re
import itertools

import lipyd.settings as settings
import lipyd.lipproc as lipproc


class LipidNameProcessor(object):
    """
    Processes lipid names used in databases.

    Converts names to the standard used in this module and extracts
    carbon count and unsaturation information and other features.
    """

    def __init__(
            self,
            database = 'swisslipids',
            with_alcohols = True,
            with_coa = True,
            iso = False
        ):
        """
        Stores the options, then builds the greek fatty acid name tables
        and reads the lipid class annotations.

        :param str database:
            Default database whose keyword sets are used for headgroup
            identification; stored lowercase.
        :param bool with_alcohols:
            Recognise fatty alcohols in `process_fa_name`.
        :param bool with_coa:
            Recognise fatty acyl-CoAs in `process_fa_name`.
        :param bool iso:
            Process isomer details of the chains.
        """

        self.database = database.lower()
        self.with_alcohols = with_alcohols
        self.with_coa = with_coa
        self.iso = iso
        self.lipnamesf = settings.get('lipnamesf')
        self.adducts_constraints = settings.get('adducts_constraints')

        self.gen_fa_greek()
        self.read_lipid_names()

    def reload(self, children = False):
        """
        Reloads the module which defines this class and rebinds this
        instance to the freshly loaded class object.

        :param bool children:
            Accepted for interface compatibility; not used here.
        """

        modname = self.__class__.__module__
        mod = __import__(modname, fromlist = [modname.split('.')[0]])
        imp.reload(mod)
        new = getattr(mod, self.__class__.__name__)
        setattr(self, '__class__', new)

    def read_lipid_names(self, add_fa = True):
        """
        Reads annotations for lipid classes:
        - full names
        - short notations
        - database keywords (to process long names from SwissLipids
          and LipidMaps)
        - most abundant adducts

        The input file is given by the ``lipnamesf`` attribute. It is read
        as a tab separated table: lines starting with ``#`` and blank
        lines are skipped, and the first remaining line is taken as the
        header. The result is stored in the ``lipnames`` attribute, a dict
        keyed by ``(column 1 split by ';', column 0, column 2 split by ';')``.

        :param bool add_fa:
            Accepted for interface compatibility; not used here.
        """

        result = {}

        with open(self.lipnamesf, 'r') as fp:

            header_pending = True

            for line in fp:

                # blank lines carry no record and must not be mistaken
                # for the header row
                if not line.strip() or line[0] == '#':

                    continue

                if header_pending:

                    header_pending = False
                    continue

                fields = line.strip().split('\t')

                lip = (
                    tuple(fields[1].split(';')) if fields[1] else (),
                    fields[0],
                    tuple(fields[2].split(';')) if fields[2] else (),
                )

                # adducts are kept only if adduct constraints are enabled
                # and the table does not say 'ND'
                use_adducts = self.adducts_constraints

                result[lip] = {
                    'full_name': fields[4],
                    'swl': self.process_db_keywords(fields[5]),
                    'lmp': self.process_db_keywords(fields[6]),
                    'pos_adduct': (
                        fields[7]
                        if fields[7] != 'ND' and use_adducts else
                        None
                    ),
                    'neg_adduct': (
                        fields[8]
                        if fields[8] != 'ND' and use_adducts else
                        None
                    ),
                    'chains': (
                        tuple(fields[3].split(';')) if fields[3] else ()
                    ),
                }

        self.lipnames = result

    @staticmethod
    def process_db_keywords(kwdstr):
        """
        Parses a keyword specification string.

        Alternative keyword sets are separated by ``|``, keywords within
        a set by ``;``. Keywords starting with ``!`` are negative (the
        ``!`` is removed), all other non-empty keywords are positive.

        :return:
            List of dicts, one for each keyword set, with keys ``'neg'``
            and ``'pos'``, each holding a list of keywords.
        """

        result = []

        for kwdset in kwdstr.split('|'):

            kwds = [kwd for kwd in kwdset.split(';') if len(kwd)]

            result.append({
                'neg': [kwd[1:] for kwd in kwds if kwd[0] == '!'],
                'pos': [kwd for kwd in kwds if kwd[0] != '!'],
            })

        return result

    @staticmethod
    def has_sph_prefix(match):
        """
        Tells if the first group of a chain regex match is one of the
        sphingoid base prefixes `t`, `d`, `k` or `DH`.
        """

        return match[0] in {'t', 'd', 'k', 'DH'}

    @staticmethod
    def is_ether(match):
        """
        Tells if the first group of a chain regex match is one of the
        ether prefixes `O` or `P`.
        """

        return match[0] in {'O', 'P'}

    @classmethod
    def attr_proc(cls, match, u = None):
        """
        Creates a `lipproc.ChainAttr` from the groups of one chain match.

        :param tuple match:
            Regex groups of one chain: the first element is the prefix,
            the last one the hydroxyl annotation.
        :param int u:
            Unsaturation of the chain; a `d` prefix with zero
            unsaturation is converted to `DH`.
        """

        sph = match[0] if cls.has_sph_prefix(match) else ''

        if u == 0 and sph == 'd':

            sph = 'DH'

        return lipproc.ChainAttr(
            sph = sph,
            ether = cls.is_ether(match),
            oh = (match[-1],) if match[-1] else ()
        )

    @staticmethod
    def get_type(
            i,
            sphingo = False,
            ether = False,
            types = None,
            chainsexp = None
        ):
        """
        Selects the type of the chain at index `i`.

        Order of precedence: explicit `types`; `Sph` for the first chain
        of a sphingolipid; `FAL` for ether chains; the expected type from
        `chainsexp` if it has an element at this index; `FA` otherwise.
        """

        # TODO: this does not feel perfect
        # some better heuristic should replace

        if types:

            return types[i]

        if sphingo and i == 0:

            return 'Sph'

        if ether:

            return 'FAL'

        if chainsexp and len(chainsexp) > i:

            return chainsexp[i]

        return 'FA'

    def attrs_types(self, cc1, chainsexp):
        """
        Creates chain attributes and chain types for a summary notation
        (total carbon count) based on the expected chain types.

        :param tuple cc1:
            Regex groups of the total carbon count match.
        :param tuple chainsexp:
            Expected chain types.

        :return:
            Tuple of a tuple of `lipproc.ChainAttr` objects and the
            tuple of chain types. If the prefix marks an ether and no
            `FAL` is expected, the first `FA` is turned into `FAL`.
        """

        sph = cc1[0] if self.has_sph_prefix(cc1) else ''
        ether = self.is_ether(cc1)
        oh = (cc1[-1],) if cc1[-1] else ()
        hasfal = 'FAL' in chainsexp

        # index of the first fatty acyl or alcohol chain, if any
        fa1 = next(
            (i for i, c in enumerate(chainsexp) if c in {'FA', 'FAL'}),
            None
        )

        chainsexp = tuple(
            'FAL' if not hasfal and ether and i == fa1 else c
            for i, c in enumerate(chainsexp)
        )

        last = len(chainsexp) - 1

        attrs = tuple(
            lipproc.ChainAttr(
                sph = sph if c == 'Sph' else '',
                ether = ether and c == 'FAL',
                # here have no idea which chain carries OHs
                # hence we add them to the last chain
                oh = oh if i == last else ()
            )
            for i, c in enumerate(chainsexp)
        )

        return attrs, chainsexp

    def carbon_counts(
            self,
            name,
            ccexp = 2,
            chainsexp = None,
            sphingo = False,
            iso = False,
            types = None
        ):
        """
        Processes carbon and unsaturation counts from name.

        Args
        ----
        :param str name:
            Lipid name.
        :param int ccexp:
            Expected number of fatty acyl or other residues containing
            aliphatic chain. E.g. for DAG this should be 2 and for TAG 3
            as the latter has 3 fatty acyls.
        :param tuple chainsexp:
            The type of the expected chains, e.g. `('Sph', 'FA')` for one
            sphingosine and a fatty acyl. This matters only if the chain
            types can not be inferred from the processed names.
        :param bool sphingo:
            Is this a sphingolipid, is the first chain a sphingosine base?
        :param bool iso:
            Process conformation isomer details for example 18:2(9E,11Z).
        :param tuple types:
            Explicit types for each chains.

        :return:
            Tuple of the chain summary and the chains. The chains are a
            tuple of `lipproc.Chain` objects only if exactly `ccexp`
            chains with non-zero carbon count have been found, otherwise
            an empty list. The summary is the sum of the chains if those
            are available, else it is built from the total carbon count
            notation, or it is `None` if neither matched.
        """

        # number of groups in one regex match unit
        _g = 5 if iso else 4
        # index of the isomer group within one unit
        _i = 3
        rechain = lipproc.rechainiso if iso else lipproc.rechain

        # regex finds the total carbon count
        cc1 = lipproc.rechainsum.search(name)
        cc1 = cc1.groups() if cc1 else cc1
        # regex finds 1-4 fatty acids
        cc2 = rechain.search(name)
        cc2 = cc2.groups() if cc2 else cc2

        chains = []

        if ccexp and cc2:

            for i in xrange(len(cc2) // _g):

                unit = cc2[i * _g:i * _g + _g]

                if not (unit[1] and unit[2]):

                    continue

                c = int(unit[1])
                u = int(unit[2])
                attr = self.attr_proc(unit, u)
                # a sphingoid base prefix on any chain makes it sphingo
                sphingo = sphingo or bool(attr.sph)

                chains.append(lipproc.Chain(
                    c = c,
                    u = u,
                    attr = attr,
                    typ = self.get_type(
                        i, sphingo, attr.ether, types, chainsexp
                    ),
                    iso = (
                        tuple(unit[_i].split(','))
                        if iso and unit[_i] else
                        ()
                    )
                ))

        # chains of zero carbon count are only placeholders
        chains = tuple(c for c in chains if c.c)
        # accept the chains only if their number is as expected
        chains = [] if len(chains) != ccexp else chains

        if chains:

            chainsum = lipproc.sum_chains(chains)

        elif cc1:

            # the total carbon count
            c = int(cc1[1])
            u = int(cc1[2])

            if not chainsexp:

                attrs = (self.attr_proc(cc1, u),)
                types = ()

            else:

                attrs, types = self.attrs_types(cc1, chainsexp)

            chainsum = lipproc.ChainSummary(
                c = c,
                u = u,
                attr = attrs,
                typ = types,
            )

        else:

            chainsum = None

        return chainsum, chains

    def isomeric_carbon_counts(
            self,
            name,
            ccexp = 2,
            sphingo = False,
            types = None
        ):
        """
        Calls `carbon_counts` with `iso=True`.
        """

        return self.carbon_counts(
            name,
            ccexp = ccexp,
            sphingo = sphingo,
            iso = True,
            types = types
        )

    def headgroup_from_lipid_name(self, names, database = None):
        """
        For one database record attempts to identify the lipid class
        by looking up keywords.
        Calls greek name identification, greek fatty acid names are
        identified as 'FA'.

        A keyword set matches if it has at least one positive keyword,
        all of its positive keywords occur in the names and none of its
        negative keywords does.

        :param list names:
            One or more names of the record; a single string is also
            accepted.
        :param str database:
            `lipidmaps` selects the LipidMaps keywords, anything else
            the SwissLipids ones. Defaults to the `database` attribute.

        Returns tuple of `lipproc.Headgroup` object and expected chain
        types, or `(None, None)` if nothing matched.
        """

        if hasattr(names, 'lower'):

            # a single string would be joined character by character
            names = (names,)

        database = database or self.database
        names = '|'.join(names)
        db = 'lmp' if database.lower() == 'lipidmaps' else 'swl'

        for lipclass, spec in iteritems(self.lipnames):

            for kwset in spec[db]:

                if (
                    kwset['pos'] and
                    all(kw in names for kw in kwset['pos']) and
                    not any(kw in names for kw in kwset['neg'])
                ):

                    return (
                        lipproc.Headgroup(
                            main = lipclass[1], # main class, e.g. Cer
                            sub = lipclass[0]   # subclass, e.g. Hex
                        ),
                        spec['chains']
                    )

        fa_name = self.process_fa_name(names)

        if fa_name:

            return (lipproc.Headgroup(main = fa_name), (fa_name,))

        return None, None

    def process_fa_name(self, name):
        """
        Identifies fatty acids based on their greek name.

        :return:
            `FA`, `FAL` or `FACoA`, or `None` if the name is none of
            these. Alcohols and acyl-CoAs are considered only if the
            `with_alcohols` and `with_coa` attributes are set.
        """

        if (
            name in self.fa_greek or
            'FA' in name or
            'atty acid' in name
        ):

            return 'FA'

        if self.with_alcohols and (
            name in self.fal_greek or
            'atty alcohol' in name
        ):

            return 'FAL'

        # TODO: capture carbon counts of acyl-CoAs
        if self.with_coa and (
            name in self.facoa_greek or
            'oyl-CoA' in name
        ):

            return 'FACoA'

        return None

    def fa_greek_cc(self, name):
        """
        Looks up carbon count and unsaturation of a greek fatty acid,
        fatty alcohol or fatty acyl-CoA name.

        The second hyphen separated part of the name is tried first
        (the name itself if it has no hyphen), then everything after the
        first hyphen and finally the whole name. The latter two are
        necessary because acyl-CoA names themselves contain a hyphen.

        :return:
            Tuple of chain summary and a list with one `lipproc.Chain`,
            or `(None, None)` if the name is unknown or can not be
            processed.
        """

        chainsum, chains = None, None

        try:

            if '-' in name:

                candidates = (
                    name.split('-')[1],
                    name.split('-', 1)[1],
                    name,
                )

            else:

                candidates = (name,)

            # isomer details are taken from a leading parenthesised part
            iso = (
                tuple(name.split(')')[0][1:].split(','))
                if self.iso and '(' in name else
                ()
            )

            for name1 in candidates:

                for typ in ('FA', 'FAL', 'FACoA'):

                    greek = getattr(self, '%s_greek' % typ.lower())

                    if name1 not in greek:

                        continue

                    cc1 = greek[name1]

                    chains = [lipproc.Chain(
                        c = cc1[0],
                        u = cc1[1],
                        typ = typ,
                        iso = iso
                    )]
                    chainsum = lipproc.sum_chains(chains)

                    return chainsum, chains

        except Exception:

            # best effort: a name which can not be processed is simply
            # not recognised
            pass

        return chainsum, chains

    def test_branched(self, name):
        """
        Tells if a lipid might contain branched aliphatic chains.

        NOTE(review): this relies on a compiled regex in the `reme`
        attribute, which is not set by any method of this class as seen
        here -- confirm where it is supposed to be defined.
        """

        return bool(self.reme.search(name))

    def process(self, names, database = None):
        """
        The main method of this class. Processes a lipid name string
        and returns a standard name, prefix, carbon counts and
        unsaturations.

        Args
        ----
        :param list names:
            One or more names to process. Single result will be returned
            and names will be attempted to be processed one after the
            other until processing is successful.
        :param str database:
            Database the names come from; defaults to the `database`
            attribute.

        :return:
            Tuple of headgroup, chain summary and chains; any of these
            is `None` (or empty) if it could not be determined.
        """

        if hasattr(names, 'lower'):

            # ok, if one passes a string let us still process it
            names = (names,)

        # normalised the same way as in `headgroup_from_lipid_name`
        database = (database or self.database).lower()

        chainsum, chains = None, None

        hg, chainsexp = self.headgroup_from_lipid_name(
            names, database = database
        )

        # try greek fatty acyl carbon counts:
        if not hg and self.iso and database == 'swisslipids':

            for name0 in names:

                fa_greek = name0.split('-')

                if len(fa_greek) > 1:

                    fa_name = self.process_fa_name(fa_greek[1])

                    if fa_name:

                        hg = lipproc.Headgroup(main = fa_name)
                        chainsexp = (fa_name,)

                        break

        lyso = hg and 'Lyso' in hg.sub

        # how many aliphatic chains this molecule has
        ccexp = (
            2
                if not hg else
            1
                if hg.main in {'FA', 'MAG'} or lyso else
            3
                if hg.main == 'TAG' else
            2
        )

        for n in names:

            _chainsum, _chains = self.carbon_counts(
                n,
                ccexp = ccexp,
                chainsexp = chainsexp,
                iso = self.iso
            )

            chains = chains or _chains
            chainsum = chainsum or _chainsum

            # chains with isomer details override the ones without
            if self.iso and _chains and any(c.iso for c in _chains):

                chains = _chains

            if (
                chainsum and
                chains and (
                    not self.iso or
                    any(c.iso for c in chains)
                )
            ):

                break

        if hg and hg.main in {'FA', 'FAL', 'FACoA'} and not chainsum:

            for name0 in names:

                chainsum, chains = self.fa_greek_cc(name0)

                if chainsum:

                    break

        return hg, chainsum, chains

    @staticmethod
    def _greek_names(stems, infixes, endings):
        """
        Combines carbon count stems, unsaturation infixes and endings
        into a dict of names with `(carbon count, unsaturation)` values.
        """

        names = {}

        for cc, uns, end in itertools.product(
            stems.items(),
            infixes.items(),
            endings.items()
        ):

            # saturated endings take no unsaturation infix
            if len(uns[0]) and end[1] == 0:

                continue

            names['%s%s%s' % (cc[0], uns[0], end[0])] = (
                cc[1],
                uns[1] * end[1]
            )

        return names

    def gen_fa_greek(self):
        """
        Generates a list of greek fatty acid names with their
        carbon counts and unsaturations.

        Fills the `fa_greek`, `fal_greek` and `facoa_greek` attributes
        (fatty acids, fatty alcohols and fatty acyl-CoAs, respectively),
        each a dict with names as keys and tuples of carbon count and
        unsaturation as values.
        """

        fa_greek_parts = {
            'cc': {
                'hex': 6,
                'hept': 7,
                'oct': 8,
                'non': 9,
                'dec': 10,
                'undec': 11,
                'dodec': 12,
                'tridec': 13,
                'tetradec': 14,
                'pentadec': 15,
                'hexadec': 16,
                'heptadec': 17,
                'octadec': 18,
                'nonadec': 19,
                'eicos': 20,
                'icos': 20,
                'heneicos': 21,
                'docos': 22,
                'tricos': 23,
                'tetracos': 24,
                'pentacos': 25,
                'hexacos': 26,
                'heptacos': 27,
                'octacos': 28,
                'nonacos': 29,
                'triacont': 30
            },
            'uns': {
                '': 1,
                'adi': 2,
                'atri': 3,
                'atetra': 4,
                'apenta': 5,
                'ahexa': 6,
                'ahepta': 7,
                'aocta': 8
            },
            'end': {
                'enoate': 1,
                'anoate': 0,
                'enoic acid': 1,
                'anoic acid': 0
            }
        }

        fal_greek_end = {
            'anol': 0,
            'enol': 1
        }

        facoa_greek_end = {
            'anoyl-CoA': 0,
            'enoyl-CoA': 1
        }

        stems = fa_greek_parts['cc']
        infixes = fa_greek_parts['uns']

        self.fa_greek = self._greek_names(
            stems, infixes, fa_greek_parts['end']
        )
        self.fal_greek = self._greek_names(
            stems, infixes, fal_greek_end
        )
        self.facoa_greek = self._greek_names(
            stems, infixes, facoa_greek_end
        )