Source code for lipyd.results

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2017 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

import itertools
import os
import re

import numpy as np
import pandas as pd


[docs]class Results(object):
    
    def __init__(self):
        
        pass

[docs]    def read_manual(self):
        """
        Reads adequate columns from manually annotated tables.
        Provides data in `Screening().manual` attribute.
        """
        
        reclass = re.compile(r'(^[IV]*\.?[0-9]?).*')
        
        def read_line(l, protein, mode):
            if len(l[17]) and len(l[12]) and len(l[13]):
                return \
                    [
                        float(l[13]), # m/z corrected
                        reclass.match(l[17]).groups()[0], # result class
                        l[14], # SwissLipids name
                        l[15], # main headgroup class
                        int(float(l[12])), # intensity
                        float(l[2]), # m/z original
                        protein, # protein name
                        mode, # ion mode
                        float(l[5]), # RT mean
                        float(l[6]) if l[6] != 'NA' else np.nan, # RT MS2 closest
                        float(l[4].split('-')[0].strip()), # RT lower
                        float(l[4].split('-')[1].strip())  # RT greater
                    ]
        
        def read_table(tbl, protein, mode):
            return \
                list(
                    filter(
                        lambda l:
                            l is not None,
                        map(
                            lambda l:
                                read_line(l, protein, mode),
                            tbl
                        ),
                    )
                )
        
        data = {}
        
        fnames = \
            list(
                filter(
                    lambda f:
                        f.endswith(self.manualend),
                    os.listdir(self.manualdir)
                )
            )
        
        for f in fnames:
            protein = f.split('_')[0]
            xlsname = os.path.join(self.manualdir, f)
            tblneg = self.read_xls(xlsname,
                                   sheet = '%s_negative_best' % protein)
            tblpos = self.read_xls(xlsname,
                                   sheet = '%s_positive_best' % protein)
            data[protein] = {}
            data[protein]['neg'] = read_table(tblneg[1:], protein, 'neg')
            data[protein]['pos'] = read_table(tblpos[1:], protein, 'pos')
        
        self.manual = data
    
[docs]    def read_manual2(self, fname = 'All_results_v04.xlsx'):
        """
        Reads manually annotated results from Marco's final table.
        """
        result = {}
        reclass = re.compile(r'(^[IV]*\.?[0-9]?).*')
        
        tbl = self.read_xls(fname, sheet = 'final data')
        
        for l in tbl:
            
            protein, mode = l[1].split('_')
            mode = mode[:3]
            
            if protein not in result:
                result[protein] = {}
            
            if mode not in result[protein]:
                result[protein][mode] = []
            
            if l[2].strip()[:4] == 'Qual':
                continue
            
            result[protein][mode].append([
                float(l[15]), # m/z corrected
                reclass.match(l[19]).groups()[0], # result class
                l[16].replace(u'−', '-'), # SwissLipids name
                l[17].replace(u'−', '-'), # main headgroup class
                int(float(l[14])), # intensity
                float(l[4]), # m/z original
                protein, # protein name
                mode, # ion mode
                float(l[7]), # RT mean
                float(l[8]) if l[8] != 'NA' else np.nan, # RT MS2 closest
                float(l[6].split('-')[0].strip()), # RT lower
                float(l[6].split('-')[1].strip())  # RT greater
            ])
        
        self.manual = result
    
[docs]    def manual_df(self, screen = 'A', only_swl_col = False):
        """
        Creates a pandas dataframe from manual results.
        """
        
        shgs = {
            'Monoalkylmonoacylglycerol-O': 'DAG-O',
            'hydroquinone?': 'HQ',
            'Ganglioside GM1': 'GM1',
            'Ganglioside GM2': 'GM2',
            'Ganglioside GM3': 'GM3',
            'Ganglioside GM4': 'GM4',
            'Ganglioside GA1': 'GA1',
            'Lysophosphatidylethanolamine': 'LysoPE',
            'alpha-tocopherol metabolite': 'VE',
            'alpha-tocopherol': 'VE',
            r'Retinol {calculated as -H2O adduct '\
                r'is not in applied database}': 'VA',
            'docosapentaenoate': 'PUFA',
            'octacosatetraenoate': 'PUFA',
            'octacosapentaenoate': 'PUFA',
            'octadecatetraenoate': 'PUFA',
            'octatriacontatetraenoate': 'PUFA',
            'tetracosapentaenoate': 'PUFA',
            'hexacosatetraenoate': 'PUFA',
            'hexacosanoate': 'PUFA',
            'dotriacontapentaenoate': 'PUFA',
            'Sterol ester': 'SE',
            'nothing': 'NA',
            'unknown': 'NA',
            'Monoalkylglycerol-O': 'MAG-O',
            'Monoalkyldiacylglycerol-O': 'TAG-O',
            'Dihexosyldiacylglycerol': 'HexDAG',
            'Monohexosyldiacylglycerol': 'HexDAG',
            'Monoalkylmonoacylglycerol-O': 'DAG-O',
            'Monoalkyldiacylglycerol-O': 'TAG-O',
            'Monoalkylmonoacylglycerol': 'DAG-O',
            '24-Hydroxy-19-norgeminivitamin D3': 'VD',
            'NP40': 'P40',
            'Cer1P': 'CerP',
            'Phosphatidylcholine': 'PC',
            'Phosphatidylethanolamine': 'PE',
            'Phosphatidylcholine-O': 'PC-O',
            'Phosphatidylethanolamine-O': 'PE-O',
            'Dihexosyl ceramide': 'Hex2Cer',
            'Sulfodihexosyl ceramide': 'SHex2Cer',
            'BMP / PG': 'PG/BMP',
            'BMP/PG': 'PG/BMP',
            'Lyso-O-PE': 'LysoPE-O',
            'Lyso-O-PC': 'LysoPC-O',
            'Lyso-O-PG': 'LysoPG-O',
            'Lyso-O-PS': 'LysoPS-O',
            'LysoO-PE': 'LysoPE-O',
            'LysoO-PG': 'LysoPG-O',
            'LysoO-PC': 'LysoPC-O',
            'LysoO-PS': 'LysoPS-O',
            'lysoPG': 'LysoPG',
            'Hex-Cer': 'HexCer'
        }
        
        shgs2 = {
            'Ganglioside': 'GM',
            'Vit.A1': 'VA',
            'Vit. E metabolite': 'VE',
            'SulfohexCer': 'SHexCer',
            'SulfoHexCer': 'SHexCer',
            'Sulfo HexCer': 'SHexCer',
            'SulfodihexCer': 'SHex2Cer',
            'DiHexCer-OH': 'Hex2CerOH',
            'DiHexCer': 'Hex2Cer',
            'PI2xP': 'PIP2',
            'MAMAG': 'DAG-O'
        }
        
        uhgs = {
            'Hex2Cer': 'Hex2Cer',
            'Hex2Cer-OH': 'Hex2CerOH',
            'HexCer-OH': 'HexCerOH',
            'GM3': 'GM',
            'Detergent': 'P40',
            'Hex-Cer': 'HexCer'
        }
        
        def get_names(swl_names, manual_names):
            """
            Extracts the lipid names and carbon counts
            from the SwissLipids IDs field.
            """
            
            something   = []
            nothing     = []
            
            # fixing typos and inconsequent naming:
            manual_name_1 = manual_names.strip().split('(')[0]
            manual_name_1_l = manual_name_1.lower().strip()
            
            if (manual_name_1_l == 'ambiguous' or
                manual_name_1_l == 'ambigous'):
                
                manual_name_1 = 'ambiguous'
            
            if manual_name_1_l == 'unknown' or manual_name_1_l == 'unkown':
                manual_name_1 = 'NA'
            
            if not len(manual_name_1_l):
                manual_name_1 = 'NA'
            
            # :done
            
            # testing if there is PG/BMP ambiguity
            pg_bmp = not bool(set(['BMP', 'PG']) -
                              set(x.split('(')[0].strip()
                                  for x in manual_names.split(','))) or (
                    not bool(set(['BMP', 'PG']) - set(
                        itertools.chain(*[[
                            self.headgroup_from_lipid_name(['S', None, xx])[0]
                                for xx in x.split(';')
                            ] for x in swl_names.split(r'///')])
                        )))
            
            only_pg_bmp = not bool(set(x.split('(')[0].strip()
                                       for x in manual_names.split(',')) -
                                   set(['BMP', 'PG'])) or (
                    not bool(set(
                        itertools.chain(*[[
                            self.headgroup_from_lipid_name(['S', None, xx])[0]
                                for xx in x.split(';')
                            ] for x in swl_names.split(r'///')])
                        ) - set(['BMP', 'PG', None])))
            
            for lips in (
                swl_names.split(r'///')
                if (
                    only_swl_col or
                    manual_name_1 == 'NA' or
                    manual_name_1 == 'ambiguous' or
                    manual_name_1 == 'adduct'
                ) else # at Enric we use the manual names column:
                manual_names.split(',')):
                
                lips = lips.strip()
                
                # matching the adduct type
                add = self.readd.match(lips)
                
                if add is not None:
                    add = add.groups()[0]
                else:
                    add = ''
                
                for lip in lips.split(';'):
                    
                    lip = lip.strip()
                    
                    if lip and lip[0] == '(':
                        lip = lip[1:]
                    
                    if 'lyso' in lip.lower():
                        lyso = 'Lyso'
                        manual_name_1 = manual_name_1.replace('yso-', 'yso')
                    else:
                        lyso = ''
                    
                    swl_parsed = self.headgroup_from_lipid_name(['S', None, lip])[0]
                    
                    if pg_bmp:
                        if lips[:2] == 'PG' or swl_parsed == 'PG':
                            # at PG we replace with PG/BMP
                            lips = lips.replace('PG', 'PG/BMP')
                            fullhg = 'PG/BMP'
                            swl_parsed = 'PG/BMP'
                            if only_pg_bmp:
                                manual_name_1 = 'PG/BMP'
                        elif lips[:3] == 'BMP' or swl_parsed == 'BMP':
                            # at BMP we skip
                            continue
                    
                    if swl_parsed is None:
                        swl_parsed = lip
                        if ']' in swl_parsed:
                            swl_parsed = swl_parsed.split(']')[1]
                        swl_parsed = swl_parsed.split('(')[0].strip()
                        if ':' in swl_parsed:
                            swl_parsed = swl_parsed.split(':')[1].strip()
                    
                    if 'nothing' in swl_parsed:
                        swl_parsed = 'NA'
                    
                    # regex finds the total carbon count
                    cc1 = self.recount1.findall(lip)
                    
                    # special case if the fatty acid
                    # name is greek name
                    if swl_parsed in fa_greek:
                        cc1 = [('', fa_greek[swl_parsed][0],
                                    fa_greek[swl_parsed][1])]
                        swl_parsed = 'FA'
                    
                    # a full headgroup name:
                    fullhg = '%s%s%s' % (
                        lyso if not swl_parsed.startswith('Lyso') else '',
                        swl_parsed,
                        '%s' % ('-O' if len(cc1) and cc1[0][0] == 'O' else '')
                    )
                    
                    # regex finds 2-3 fatty acids
                    cc2s = self.recount3.findall(lip)
                    
                    # the total carbon count
                    ccpart = (
                        [cc1[0][0], int(cc1[0][1]), int(cc1[0][2])]
                        if len(cc1) else
                        ['', np.nan, np.nan]
                    )
                    
                    # carbon counts of fatty acids
                    if len(cc2s) and (
                        any(map(lambda cc2: cc2[4], cc2s)) or
                        swl_parsed == 'FA' or
                        lyso
                    ):
                        
                        faccparts = (
                            list(
                                map(
                                    lambda cc2:
                                        [
                                            # FA1
                                            cc2[0],
                                            int(cc2[1]),
                                            int(cc2[2]),
                                            # FA2
                                            cc2[3],
                                            int(cc2[4]) if cc2[4] else np.nan,
                                            int(cc2[5]) if cc2[5] else np.nan,
                                            # FA3
                                            cc2[6],
                                            int(cc2[7]) if cc2[7] else np.nan,
                                            int(cc2[8]) if cc2[8] else np.nan
                                        ],
                                    filter(
                                        lambda cc2:
                                            # if this is a Lyso species
                                            # or single fatty acid
                                            # we have only one cc:unsat
                                            # otherwise we must have at least 2
                                            cc2[4] or swl_parsed == 'FA' or lyso,
                                        cc2s
                                    )
                                )
                            )
                        )
                        
                    else:
                        faccparts = [
                            [
                                '', np.nan, np.nan,
                                '', np.nan, np.nan,
                                '', np.nan, np.nan
                            ]
                        ]
                    
                    for faccpart in faccparts:
                        
                        if cc2s and not cc1:
                            ccpart = [
                                faccpart[0],
                                np.nansum([faccpart[1], faccpart[4], faccpart[7]]),
                                np.nansum([faccpart[2], faccpart[5], faccpart[8]])
                            ]
                        
                        res = []
                        res.append(swl_parsed)
                        res.append(lyso)
                        res.extend(ccpart)
                        res.extend(faccpart)
                        res.append(fullhg)
                        res.append(manual_name_1)
                        
                        if res[14] == 'NA':
                            nothing.append(res)
                        else:
                            something.append(res)
            
            return something or nothing
        
        if not hasattr(self, 'manual') or self.manual is None:
            self.read_manual2()
        
        if not hasattr(self, 'lipnames') or self.lipnames is None:
            self.read_lipid_names()
        
        result = []
        
        def get_uhg(cnt, counts):
            if cnt[-1] not in ['ambiguous', 'adduct']:
                uhg = cnt[-1]
            else:
                uhg = cnt[-2]
            
            if uhg in uhgs:
                uhg = uhgs[uhg]
            
            uhg = uhg.replace('-O-', '-O')
            
            if (uhg == 'NA' and (len(counts) == 1 or
                                 len(set(c[14] for c in counts)) == 1)):
                uhg = cnt[14]
            
            return uhg
        
        for protein, d in iteritems(self.manual):
            
            for mode, tbl in iteritems(d):
                
                for i, l in enumerate(tbl):
                    
                    counts = get_names(l[2], l[3])
                    
                    if l[3].strip() in shgs2:
                        l[3] = shgs2[l[3].strip()]
                    
                    res = [protein, mode, i, l[0], l[5], l[4], l[1], l[3]] + \
                        l[8:12] # 12 cols: protein -- rtup
                    
                    #this_feature_hgs = set([get_uhg(cnt, counts) for cnt in counts])
                    #if not (set(['PG', 'BMP']) - this_feature_hgs):
                    #    pass
                    
                    for cnt in counts:
                        res1 = res[:]
                        
                        if cnt[1] == 'O':
                            cnt[0] = '%s-O' % cnt[0]
                        
                        if cnt[0].strip() in shgs:
                            cnt[0] = shgs[cnt[0].strip()]
                        
                        cnt[-1] = cnt[-1].strip()
                        
                        for hgi in [-1, -2]:
                            
                            if cnt[hgi] in shgs:
                                cnt[hgi] = shgs[cnt[hgi]]
                            if cnt[hgi] in shgs2:
                                cnt[hgi] = shgs2[cnt[hgi]]
                        
                        uhg = get_uhg(cnt, counts)
                        
                        cnt.append(uhg)
                        
                        if not np.isnan(cnt[3]) and not np.isnan(cnt[4]):
                            cnt.append('%s(%u:%u)' % (
                                cnt[16] if cnt[16] != 'NA' else cnt[14],
                                cnt[3],
                                cnt[4]
                            ))
                            cnt.append('%u:%u' % (cnt[3], cnt[4]))
                        else:
                            cnt.append('NA')
                            cnt.append('NA')
                        
                        facc = []
                        for i in [6, 9, 12]:
                            if not np.isnan(cnt[i]) and not np.isnan(cnt[i+1]):
                                facc.append((cnt[i], cnt[i+1]))
                        
                        facc = '/'.join(map(lambda cc: '%u:%u' % cc,
                                            sorted(facc)))
                        
                        if len(facc):
                            cnt.append('%s(%s)' % (
                                cnt[16] if cnt[16] != 'NA' else cnt[14],
                                facc)
                            )
                            cnt.append(facc)
                        else:
                            cnt.append('NA')
                            cnt.append('NA')
                        
                        
                        res1.extend(cnt)
                        res1.append(screen)
                        
                        result.append(res1)
        
        self.pmanual = pd.DataFrame(result, columns = self.df_header)
    
[docs]    def auto_df(self, screen_name = 'E'):
        """
        Compiles a data frame in the same format as `manual_df`
        just from the programmatic results.
        """
        
        result = []
        
        for protein, d in iteritems(self.valids):
            
            for mode, tbl in iteritems(d):
                
                ii = 0
                
                for i, incl in enumerate(tbl['good']):
                    
                    if not incl:
                        
                        continue
                    
                    mz = tbl['mz'][i]
                    oi = tbl['i'][i]
                    idlevel = tbl['idlevel'][oi]
                    intensity = round(tbl['aaa'][i])
                    rt = tbl['rt'][i,:]
                    rtmean = np.mean(tbl['rt'][i,:])
                    rtms2 = tbl['ms2rt'][i]
                    
                    cids = tbl['cid'][oi]
                    
                    clm = None
                    if not cids:
                        clm = 'unknown'
                        cids = ['unknown']
                    elif len(cids) > 1:
                        clm = 'ambiguous'
                    
                    for lip in cids:
                        
                        res = []
                        
                        res.append(protein)
                        res.append(mode)
                        res.append(ii)
                        res.append(mz)
                        res.append(mz)
                        res.append(intensity)
                        res.append(idlevel)
                        
                        cl = lip.split('(')[0] if clm is None else clm
                        hg = 'NA' if clm == 'unknown' else lip.split('(')[0]
                        lyso = 'Lyso' if 'lyso' in lip.lower() else ''
                        hg = 'NA' if clm == 'unknown' else lip.split('(')[0]
                        pref = 'O' if '-O' in hg else ''
                        if 'Cer' in hg:
                            pref = 'd' if 'CerOH' not in hg else 't'
                        
                        if pref =='O':
                            hg = '%s-O' % hg
                        
                        hg0 = hg.replace(
                            '-O', '').replace(
                            'CerOH', 'Cer').replace(
                            'Cer1P', 'CerP')
                        
                        res.append(hg)
                        res.append(rtmean)
                        res.append(rtms2)
                        res.append(rt[0])
                        res.append(rt[1])
                        res.append(hg0)
                        res.append(lyso)
                        # res.append(pref)
                        
                        cc  = self.recount3.findall(lip)
                        
                        if cc:
                            
                            sumcc  = sum(map(lambda cci:
                                            int(cc[0][cci]) if cc[0][cci] else 0,
                                        [1, 4, 7]))
                            sumuns = sum(map(lambda uni:
                                            int(cc[0][uni]) if cc[0][uni] else 0,
                                        [2, 5, 8]))
                            
                            res.extend([pref, sumcc, sumuns])
                            
                            hgcc = '%s(%u:%u)' % (hg, sumcc, sumuns)
                            sumccuns = '%u:%u' % (sumcc, sumuns)
                            fa = '%s:%s' % (cc[0][1], cc[0][2])
                            if cc[0][4]:
                                fa = '%s/%s:%s' % (fa, cc[0][4], cc[0][5])
                            if cc[0][7]:
                                fa = '%s/%s:%s' % (fa, cc[0][7], cc[0][8])
                            
                        else:
                            res.extend(['', np.nan, np.nan])
                            
                            hgcc = 'NA'
                            sumccuns = 'NA'
                            fa = 'NA'
                        
                        cc2 = self.recount2.findall(lip)
                        
                        if (
                            cc and
                            cc[0][1] and
                            cc[0][2] and (
                                cc[0][4] or hg == 'FA' or lyso
                            )):
                            
                            res.extend([cc[0][0],
                                int(cc[0][1]),
                                int(cc[0][2]),
                                cc[0][3],
                                int(cc[0][4]) if cc[0][4] else np.nan,
                                int(cc[0][5]) if cc[0][5] else np.nan,
                                cc[0][6],
                                int(cc[0][7]) if cc[0][7] else np.nan,
                                int(cc[0][8]) if cc[0][8] else np.nan
                            ])
                            
                        else:
                            
                            res.extend(['', np.nan, np.nan,
                                        '', np.nan, np.nan,
                                        '', np.nan, np.nan])
                        
                        res.append(hg)
                        res.append(hg if clm is None else clm)
                        res.append('NA' if clm == 'unknown' else hg)
                        res.append(hgcc)
                        res.append(sumccuns)
                        res.append('%s(%s)' % (hg, fa))
                        res.append(fa)
                        
                        res.append(screen_name)
                        
                        result.append(res)
                        
                        ii += 1
        
        self.pauto = pd.DataFrame(result, columns = self.df_header)
    
[docs]    def headgroups_cross_screening(self, label1 = 'Screen1',
                                   label2 = 'Screen2',
                                   idlevels = set(['I']),
                                   outfile = 'headgroups_%s_%s.tab'):
        """
        Does a quick comparison at headgroup/protein level
        between 2 screenings.
        """
        
        self.manual_df()
        
        result = []
        
        for protein, d in iteritems(self.valids):
            
            for mode, tbl in iteritems(d):
                
                s1_hg = set(
                    map(
                        lambda hg:
                            hg.replace('-O', ''),
                        self.pmanual[
                            np.logical_and(
                                np.logical_and(
                                    self.pmanual.protein == protein,
                                    self.pmanual.ionm    == mode
                                ),
                                self.pmanual.cls.isin(idlevels)
                            )
                        ]['uhgroup']
                    )
                )
                
                s2_hg = (
                    set(
                        reduce(
                            lambda h1, h2:
                                h1 | h2,
                            map(
                                lambda i:
                                    set(
                                        map(
                                            lambda hgfa:
                                                hgfa.split('(')[0],
                                            tbl['cid'][i[1]]
                                        )
                                    ),
                                filter(
                                    lambda i:
                                        (
                                            tbl['slobb'][i[0]] and
                                            tbl['idlevel'][i[1]] in idlevels
                                        ),
                                    enumerate(tbl['i'])
                                )
                            ),
                            set([])
                        )
                    )
                )
                
                for hg12 in (s2_hg & s1_hg):
                    result.append(['%s-%s' % (protein, mode), hg12,
                                   '%s_%s' % (label1, label2)])
                
                for hg1 in (s1_hg - s2_hg):
                    result.append(['%s-%s' % (protein, mode), hg1, label1])
                
                for hg2 in (s2_hg - s1_hg):
                    result.append(['%s-%s' % (protein, mode), hg2, label2])
        
        outfile = outfile % (label1, label2)
        
        hdr = ['protein_mode', 'hg', 'found_in']
        
        self.cross_screen_hg = result
        
        with open(outfile, 'w') as fp:
            
            fp.write('%s\n' % '\t'.join(hdr))
            
            fp.write('\n'.join(map(lambda row: '\t'.join(row), result)))
    
[docs]    def bubble_altair(self,
                      classes = ['I', 'II'],
                      subtitle = '',
                      main_title = ''):
        
        smodes = {'pos': '+', 'neg': '-'}
        # select the classes
        data = self.pmanual[self.pmanual.cls.isin(classes)]
        
        nrows = 0
        ncols = len(data.ionm.unique())
        subplot_titles = []
        
        allhgs = sorted(data.headgroup.unique())
        unsat = np.arange(min(data.unsat), max(data.unsat) + 1)
        carb = np.arange(min(data.carb), max(data.carb) + 1)
        inte = (min(data.intensity), max(data.intensity))
        traces = []
        
        xlim = [min(unsat), max(unsat)]
        ylim = [min(carb), max(carb)]
        
        data = data.sort_values(by = ['protein', 'ionm'])
        
        a = altair.Chart(data).mark_point().encode(
            row = 'protein',
            column = 'ionm',
            size = 'Intensity:average(intensity)',
            x = altair.X('unsat', axis = altair.Axis(title = 'Unsaturated count')),
            y = altair.Y('carb', axis = altair.Axis(title = 'Carbon count'))
        )
        
        return a
    
[docs]    @staticmethod
    def export_df(df, fname, **kwargs):
        """
        Exports the results from a `pandas.DataFrame` to csv.
        """
        
        if 'sep' not in kwargs:
            kwargs['sep'] = '\t'
        if 'na_rep' not in kwargs:
            kwargs['na_rep'] = 'NaN'
        if 'index' not in kwargs:
            kwargs['index'] = False
        
        df.to_csv(fname, **kwargs)
    
[docs]    def bubble_plotly(self,
                     classes = ['I', 'II'],
                     subtitle = '',
                     main_title = ''):
        
        smodes = {'pos': '+', 'neg': '-'}
        # select the classes
        data = self.pmanual[self.pmanual.cls.isin(classes)]
        
        nrows = 0
        ncols = len(data.ionm.unique())
        subplot_titles = []
        
        allhgs = sorted(data.headgroup.unique())
        unsat = np.arange(min(data.unsat), max(data.unsat) + 1)
        carb = np.arange(min(data.carb), max(data.carb) + 1)
        inte = (min(data.intensity), max(data.intensity))
        traces = []
        
        xlim = [min(unsat), max(unsat)]
        ylim = [min(carb), max(carb)]
        
        for protein in sorted(data.protein.unique()):
            nrows += 1
            
            for mode in sorted(data.ionm.unique()):
                
                subplot_titles.append('%s%s%s' % (
                    protein,
                     smodes[mode],
                     ', %s' % subtitle if len(subtitle) else '')
                )
                
                this_data = \
                    data[(data.protein == protein) & (data.ionm == mode)]
                
                vals = this_data.groupby(['carb', 'unsat'])['intensity'].sum()
                #print(list(iteritems(vals)))
                x = list(map(lambda i: i[0][1], iteritems(vals)))
                y = list(map(lambda i: i[0][0], iteritems(vals)))
                s = list(map(lambda i: i[1] / float(inte[1]), iteritems(vals)))
                
                #print(protein, x, y, s)
                
                traces.append(go.Scatter(x = x, y = y,
                                         mode = 'markers',
                                         marker = dict(size = s, sizemode = 'area', sizeref = 0.0001),
                                         name = '%s%s' % (protein, smodes[mode]), fill = '#333333', showlegend = False,
                                         xaxis = dict(range = xlim),
                                         yaxis = dict(range = ylim))
                                    )
        
        fig = plotly.tools.make_subplots(rows=nrows,
                                         cols=ncols,
                                         print_grid = False,
                                         subplot_titles=subplot_titles
                                        )
        
        for i, trace in enumerate(traces):
            fig.append_trace(trace, row = int(np.floor(i / ncols) + 1), col = (i % ncols) + 1)
        
        fig['layout'].update(height = nrows * 500, width = 600, title = main_title,
                             xaxis = dict(range = xlim), yaxis = dict(range = ylim))
        
        pl.iplot(fig, show_link = False)
    
[docs]    def piecharts_plotly(self, by_class = True, main_title = 'Lipid classes by protein', result_classes = {'I'}):
        """
        Plots piecharts of detected lipids for each protein based on manually
        annotated results.
        Uses plotly, output accessible in Jupyter notebook.
        """
        
        def get_names(r, by_class = True):
            
            counts = []
            
            for lips in r[2].split('///'):
                
                for lip in lips.split(';'):
                    
                    if 'nothing' in lip or not len(lip.strip()):
                        continue
                    
                    cl = self.headgroup_from_lipid_name(['S', None, lip])[0]
                    
                    if cl is None:
                        cl = lip.split('(')[0].strip()
                        if ':' in cl:
                            cl = cl.split(':')[1].strip()
                    
                    if by_class:
                        cc = ''
                    else:
                        cc = self.recount2.findall(lip)
                        
                        if not len(cc):
                            cc = self.recount1.findall(lip)
                        
                        cc = cc[0] if len(cc) else '?'
                    
                    counts.append('%s(%s)' % (cl, cc) if len(cc) else cl)
            
            return counts
        
        if not hasattr(self, 'manual') or self.manual is None:
            self.read_manual()
        
        if not hasattr(self, 'lipnames') or self.lipnames is None:
            self.read_lipid_names()
        
        main_title = '%s (class %s)' % (main_title, ', '.join(sorted(list(result_classes))))
        
        modes = {'pos': 'positive', 'neg': 'negative'}
        smodes = {'pos': '+', 'neg': '-'}
        nrows = int(np.ceil(len(self.manual) / 2.0))
        height = 500 * nrows
        param = {
            'data': [],
            'layout': {
                'title': main_title,
                'annotations': [],
                'autosize': False,
                'width': 600,
                'height': height
            }
        }
        
        traces = []
        #fig = plotly.tools.make_subplots(rows=nrows, cols=2, print_grid = False)
                          #subplot_titles=('First Subplot','Second Subplot', 'Third Subplot'))
        
        n = 0
        for protein in sorted(self.manual.keys()):
            
            for mode in ['neg', 'pos']:
                
                
                this_data = {}
                this_anno = {'font': {'size': 10}, 'showarrow': False}
                lab_val = {}
                
                for r in self.manual[protein][mode]:
                    
                    if r[1].strip() in result_classes:
                        
                        label = '/'.join(get_names(r, by_class = by_class))
                        
                        if label not in lab_val:
                            lab_val[label] = 0.0
                        
                        lab_val[label] += r[4]
                    
                this_data['labels'], this_data['values'] = \
                    zip(*sorted(lab_val.items(), key = lambda i: i[0])) \
                        if len(lab_val) else (['None'], [1])
                this_data['name'] = '%s %s, \nsum of intensities' % (protein, modes[mode])
                this_data['type'] = 'pie'
                this_data['hole'] = 0.4
                this_data['hoverinfo'] = 'label+percent+name'
                this_data['domain'] = {
                    'x': [
                        n % 2 / 2.0,
                        n % 2 / 2.0 + 0.5
                    ],
                    'y': [
                        1.0 - (0.48 / nrows * np.floor(n / 2) + 0.003),
                        1.0 - (0.48 / nrows * (np.floor(n / 2) + 1) - 0.003)
                    ]
                }
                this_pie = go.Pie(**this_data)
                traces.append(this_pie)
                
                # print('%s: n = %u,   %s' % (protein, n, str(this_data['domain'])))
                
                # fig.append_trace(this_pie, int(n % 2 + 1), int(np.floor(n / 2.0) + 1))
                this_anno['text'] = '%s [%s]' % (protein, smodes[mode])
                this_anno['x'] = n % 2 / 2.0 + 0.25
                this_anno['y'] = 1.0 - (0.48 / nrows * np.floor(n / 2) + 0.48 / nrows / 2.0)
                this_anno['xanchor'] = 'center'
                this_anno['yanchor'] = 'middle'
                
                param['data'].append(this_data)
                param['layout']['annotations'].append(this_anno)
                
                n += 1
        
        layout = go.Layout(annotations = param['layout']['annotations'],
                        height = height, title = main_title,
                        #width = 600, autosize = False
                        )
        fig = go.Figure(data = traces, layout = layout)
        # print(param)
        #fig['layout'].update(showlegend = True, title = 'Lipid classes by protein')
        pl.iplot(fig, show_link = False)