# Source code for lipyd.settings

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2017 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

from future.utils import iteritems

import os
import copy
import collections

import lipyd.common as common
import lipyd.lipproc as lipproc

# Built-in default value of every setting, keyed by the setting name.
# At import time these are copied onto the `defaults` object and used by
# `reset_all()` to build the module-level `settings` namespace.
_defaults = {
    # The absolute root directory.
    # This should not be necessary, why is it here?
    'path_root': '/',
    # The base directory for all files and directories listed below.
    'basedir': os.getcwd(),
    # If None, it will be the same as ``basedir``.
    'data_basedir': None,
    # List of the shared directories containing most of the data.
    # Due to their large size these are kept on a separate partition.
    # Lists are joined by `os.path.join`.
    # E.g. two shared folders can be defined as
    # `[['my', 'huge', 'partition'], ['another', 'huge', 'disk']]`.
    'datadirs': [['share']],
    # The overview table of Enric's screening.
    # We read the protein containing and the
    # highest fractions from here.
    'fractionsf': 'LTPsceenprogres_v07d.xlsx',
    # List of potentially problematic fractions.
    'wrongfracsf': 'wrong_fractions.csv',
    # File with offsets of fractions from SEC in ml.
    # This we used at Antonella, but at Enric, these values
    # are different for each protein, and contained by the
    # individual SEC profile files.
    'ppfracf': 'fractions.csv',
    # Directory with the SEC absorbance profiles for each protein.
    'ppsecdir': 'SEC_profiles',
    # Whether the first section in the SEC profile filenames is the
    # protein name (True) or the second one (False).
    'sec_filenames_protein_first': True,
    # This is a typo in some of the PEAK table headers.
    # If this causes trouble just turn it off.
    'fix_fraction_name_ab_typo': True,
    # Directory with the SDS PAGE protein quantities for some
    # proteins.
    'gelprofdir': 'gel_profiles',
    # The directory containing the standards in mzML format.
    # These are used for calculation of recalibration.
    'stddir': 'Standards_mzML format',
    # Directory with manually processed files. These are originally
    # output of this module, then have been manually processed, and
    # can be read again and compared/further analysed.
    'manualdir': 'Processed_files',
    # Ending of processed files.
    'manualend': 'final.xlsx',
    # For recalibration we read the dates of runs from here.
    'seqfile': 'Sequence_list_LTP_screen_2015.csv',
    # This is an output file to export the absorbance based
    # protein quantities for all proteins and fractions.
    'pptablef': 'proteins_by_fraction.csv',
    # Defines abbreviations of each lipid name and the keywords
    # to identify these in SwissLipids and LipidMaps.
    'lipnamesf': 'lipid_names_v2.csv',
    # Literature curated data about known binding properties of LTPs.
    'bindpropf': 'binding_properties.csv',
    # Lipid class properties and database IDs.
    'lipipropf': 'lipid_properties.csv',
    # The file with recalibration values from Marco.
    'recalfile': 'Recalibration_values_LTP.csv',
    # If the recalibration is performed by this module, we read the
    # expected values of the standards from this file.
    'metabsf': 'Metabolites.xlsx',
    # Simple list with lipid binding domains, names and UniProt
    # IDs of all lipid transfer proteins.
    'ltplistf': 'ltplist.csv',
    # Tolerate numpy warnings, or raise them as errors.
    # Useful for debugging.
    'tolerate_numpy_warnings': True,
    # URLs of external databases.
    # SwissLipids: calculated masses of hundreds of thousands
    # of lipid species.
    'swisslipids_url': 'http://www.swisslipids.org/php/'\
        'export.php?action=get&file=lipids.csv',
    # LipidMaps: experimentally verified masses of a few tens of
    # thousands of lipids.
    'lipidmaps_url': 'http://www.lipidmaps.org/resources/downloads/'\
        'LMSDFDownload12Dec17.tar.gz',
    # The filename to use after extracting the archive above.
    'lipidmaps_fname': 'LMSDFDownload12Dec17/'\
        'LMSDFDownload12Dec17FinalAll.sdf',
    # ComPPI: protein subcellular localization data.
    'comppi_url': 'http://comppi.linkgroup.hu/downloads',
    # Gene Ontology annotations (GOA).
    'goa_url': 'ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/%s/'\
        'goa_%s.gaf.gz',
    # Gene Ontology (QuickGO).
    'quickgo_url': 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&'\
        'limit=-1%s&termUse=%s&tax=%u&col=proteinID,goID,goName,aspect',
    # PubChem webservice URL to query molecule properties,
    # see details here:
    # https://pubchem.ncbi.nlm.nih.gov/help.html#Glossary
    'pubchem_url': ('https://pubchem.ncbi.nlm.nih.gov/'
        'rest/pug/compound/cid/%s/property/'
        'TPSA,' # polar surface area in square angstroms
        'XLogP,' # water octanol partitioning coefficient
        'Complexity,' # Bertz/Hendrickson/Ihlenfeldt formula
        'HBondDonorCount,' # O, N, P, S with hydrogen
        'HBondAcceptorCount,' # O, N, P, S with lone pair
        'HeavyAtomCount,' # non-H atom count
        'RotatableBondCount/XML'), # rotatable bonds
    # Manually curated data by Charlotte about localization
    # and membrane composition.
    'localizationf': 'subcellular_localisation_and_binding.xlsx',
    'membranesf': 'membranes_lipid_composition.xlsx',
    # Original MS2 fragment lists manually compiled by Marco.
    # One for positive and one for negative mode.
    'pfragmentsfile': 'lipid_fragments_positive_mode_v11d.txt',
    'nfragmentsfile': 'lipid_fragments_negative_mode_v11d.txt',
    # This is a file to output protein profiles to.
    # Serves only the purpose of checking for errors.
    'pptable_file': 'protein_profiles.txt',
    # Cache files. At certain points this module saves and reloads
    # data from pickles, so it doesn't need to reprocess everything
    # every time. Although if something changed, or something
    # goes wrong, you might need to delete these to ensure
    # everything is processed by the most recent code.
    'featurescache': 'features.pickle',
    'pprofcache': 'pprofiles_raw.pickle',
    'abscache': 'absorbances.pickle',
    'flimcache': 'fraclims.pickle',
    # The directory with all MS2 MGF files. If not set, the MGF files
    # will be searched under the directory of each protein.
    'ms2dir': 'MGFfiles',
    # Directory with manually processed `golden standards`
    # from Marco.
    'marco_dir': 'marco',
    # Table with manually set protein ratios among
    # many other columns.
    'manual_ppratios_xls': 'Proteins_Overview_05.xlsx',
    # Columns to read from the manual ppratios XLS file.
    # These were different at Antonella and Enric, so we
    # need to set them here.
    #'manual_ppratios_xls_cols': [2, 4, 8, 9], # 03b
    'manual_ppratios_xls_cols': [3, 6, 10, 11], # 05
    'auxcache': 'save.pickle',
    'stdcachefile': 'calibrations.pickle',
    'validscache': 'valids.pickle',
    # Write a log about MS2 processing to this file.
    'ms2log': 'ms2identities.log',
    # Use the items only of these levels from SwissLipids.
    # Useful to avoid flooding data with redundant subspecies.
    'swl_levels': ['Species'],
    # The source of recalibration: either from Marco, or calculated
    # by this software by processing raw data from the standards.
    'recal_source': 'marco',
    # At Antonella, these proteins come before our fractions, or
    # maybe in A9, so the further fractions we can consider void
    # and we can use their average to reduce the background of the
    # absorbances of the same fractions of other proteins.
    'background_proteins': set(['BNIPL', 'OSBP', 'SEC14L1']),
    # The minimum total intensities (average areas) of MS1 features
    # in negative and positive mode. Below these values features are
    # not highlighted and not included on the `best` sheet in the
    # output tables unless they have MS2 data.
    'aa_threshold': {
        'neg': 30000.0,
        'pos': 150000.0
    },
    'fr_offsets': [0.0],          # at Enric this is [0.0],
                                    # at Antonella [0.010, 0.045]
    'abs_cols': [1], # this can be [1, 3, 5] if we want to read
                        # also the 260 and 215 nm UV profiles...
    'fraclims_from_sec': True, # read the fraction limits from
                                # the same files as SEC profiles
                                # or simply from `fractions.csv`
    'pp_do_correction': False, # bypass corrections
    'pcont_fracs_from_abs': False, # determine protein containing
                                    # fractions from the absorbances
                                    # or read from separate file
    # Allow features to have missing values in first or last
    # protein containing fractions.
    'permit_profile_end_nan': True,
    # This is the maximum extent we can broaden the band
    # when selecting the features with intensity ratios
    # closest to the protein ratio. This is used as a
    # ratio-like limit, e.g. if the number here is 0.25,
    # and the protein ratio is 1.0, then the lower limit
    # will be 1.0 * 0.25 = 0.25 and the upper
    # 1.0 / 0.25 * 1.0 = 4.0.
    'peak_ratio_score_max_bandwidth': 0.25,
    'peak_ratio_score_optimal_bandwidth': 0.5,
    # This is the preferred number of features to
    # calculate the mean and SD in protein ratio score.
    # It means we try to select this number of features
    # with intensity ratio closest to the protein ratio,
    # and use their mean and SD to estimate the fit of
    # other features. The lower the number the closer
    # features will be selected, but it should be large
    # enough to make the SD a meaningful metric. Similarly
    # we can set a minimum population, below this it
    # really does not make sense to calculate an SD.
    'peak_ratio_score_optimal_population': 10,
    'peak_ratio_score_still_good_population': 7,
    'peak_ratio_score_minimal_population': 4,
    # Use the adaptive method at the peak ratio score
    # calculation. This means to iteratively broaden the
    # band at selecting the features with intensity ratios
    # closest to the protein ratio, either until we have
    # the optimal population of features, or we reach the
    # maximum bandwidth. If no features are found this way,
    # a warning message will be displayed.
    'adaptive_peak_ratio_score': False,
    # Read externally determined protein peak ratios from file;
    # these were provided by Marco and used for Antonella's
    # data analysis.
    'use_manual_ppratios': False,
    # omg, I don't remember what it is
    'use_last_ratio': False,
    # For calculation of peak ratio score, take all intensity ratios
    # within this range of tolerance around the protein ratio.
    # E.g. if the protein ratio is 0.6, then the lower threshold
    # will be 0.6 * 0.5 = 0.3, and the upper 1 / 0.5 * 0.6 = 1.2;
    # Then the standard deviation of all intensity ratios within
    # this range is calculated from both positive and negative
    # features, and for each feature and for each pair of fractions
    # the difference between its intensity ratio divided by the SD
    # results the `peak ratio score`. This is calculated for all
    # pairs of protein containing fractions, and we take their
    # mean if more than 2 fractions are available.
    'peak_ratio_range': 0.25,
    # The threshold below which the peak ratio scores are considered
    # good. We set a cut-off e.g. for highlighting with green in the
    # output tables, but otherwise it is a continuous measure
    # of goodness.
    'peak_ratio_score_threshold': 1.0,
    # Constant fraction layout in Antonella's screening;
    # not used at Enric, is kind of "deprecated".
    'fixed_fraction_layout': False,
    'fracs': ['A9', 'A10', 'A11', 'A12', 'B1'],
    'all_fracs': ['A5', 'A6', 'A7', 'A8', 'A9',
                    'A10', 'A11', 'A12', 'B1'],
    # Constant fraction layout in Antonella's screening;
    # not used at Enric, is kind of "deprecated".
    # Uppercase version with leading zeros.
    'fracsU': ['A09', 'A10', 'A11', 'A12', 'B01'],
    'pp_minratio': 3,
    # At Antonella's screening a fraction which is considered
    # void, so adjusting the zero of the absorbance profiles
    # to this fraction;
    # see method ``pp_baseline_correction()``.
    'basefrac': 'A5',
    # The tolerance when looking up MS1 m/z values in databases
    # (ppm).
    'ms1_tolerance': 20,
    # The tolerance when looking up MS2 fragment masses (ppm).
    'ms2_tolerance': 100,
    # Tolerance at matching MS1 peaks against precursors in
    # mgf files with MS2 spectra.
    'precursor_match_tolerance': 50,
    # The tolerance at identifying features in standards (ppm).
    'std_tolerance': 20,
    # MS2 precursors must have their charges determined.
    'ms2_precursor_charge': None,
    # Use only fragments from Marco's lists (note: the series
    # are still programmatically generated and their m/z values
    # calculated automatically), or use an extended fragment list
    # constructed by Denes (note: this might contain more false
    # positives, i.e. fragments which do not occur, or are iso-
    # baric with others, resulting unnecessary noise; but some-
    # times these annotations still might help to have a clue
    # what is there in cases where otherwise you would see only
    # unknown fragments).
    'only_marcos_fragments': True,
    # Expect only certain adducts at certain lipid categories
    # or assume any lipid may form any type of adduct.
    'adducts_constraints': False,
    # In output tables construct the lipid names programmatically
    # or copy the ones from the database.
    'marco_lipnames_from_db': True,
    # Use the average area values from the `PEAK` software
    # output, or recalculate them here using the intensity
    # values of features across all fractions.
    'use_original_average_area': True,
    # Overwrite UV absorbance based protein quantities with
    # those manually acquired from SDS PAGE when these are
    # available.
    'use_gel_profiles': True,
    # Use slope_filter (by Enric) for the final
    # selection of profiles.
    'slope_profile_selection': True,
    # In slope_filter, if there are 2 highest fractions
    # with approximately equal protein content,
    # we use this range of tolerance to filter
    # the intensity ratios between them. E.g. if this number is
    # 0.75, ratios between 0.75 and 1.33 will be accepted.
    'slope_equal_fractions_ratio': 0.75,
    # When we have only the descending parts of the profiles
    # in the measured fractions, we set this additional
    # constraint to avoid including everything descending
    # and select those which still fit the protein better.
    'slope_descending_slope_diff_tolerance': 0.8,
    # Do not use the fractions marked as wrong
    # at comparing protein and intensity profiles.
    'filter_wrong_fractions': True,
    # The MS2 retention time values must be within the RT
    # range of the feature detected in MS1, otherwise
    # will be dropped.
    'ms2_rt_within_range': False,
    # Consider the MS2 scans from only those fractions
    # containing the protein, or from all available fractions.
    'ms2_only_protein_fractions' : False,
    # Above this threshold we consider the MS2 spectrum to not
    # belong to the protein and highlight with red in the
    # output tables.
    'deltart_threshold': 0.5,
    # Don't know what it is for
    'uniprots': None,
    # Example files.
    'mgf_neg_examples': 'neg_examples.mgf',
    'mgf_pos_examples': 'pos_examples.mgf',
    'peaks_example': 'peaks_example.csv',
    'peaks_gltpd1_invitro': 'peaks_example_gltpd1_invitro_50.csv',
    'peaks_gltpd1_invivo': 'peaks_example_gltpd1_invivo_50.csv',
    'sec_gltpd1_invitro': 'SEC_GLTPD1_invitro.asc',
    'sec_gltpd1_invivo': 'SEC_GLTPD1_invivo.xls',
    'sec_xls_example': 'SEC_xls_example.xls',
    'sec_unicorn_example': 'SEC_asc_unicorn_example.asc',
    # An MFQL example file.
    'mfql_example': 'Neg_bovine_heart_PE.mfql',
    # Logarithm base for matching chain fragment intensities;
    # this determines the tolerance and represents fold difference.
    'chain_fragment_instensity_ratios_logbase': 1.5,
    # If no expected ratios are provided, still require the ratios
    # to be even at glycero(phospho)lipids?
    'even_chain_fragment_intensity_ratios_gl_gpl': True,
    # If no expected ratios are provided, still require the ratios
    # to be even at sphingolipids?
    'even_chain_fragment_intensity_ratios_sl': False,
    # At MS2 identification, add chain details
    # (fragment rank, intensity and type) to the MS2Identity object;
    # turning this off makes MS2 spectra analysis faster.
    'ms2_scan_chain_details': True,
    # Method names to convert from adduct to exact masses,
    # keyed by charge, ion mode and adduct type.
    'ad2ex': {
        1: {
            'pos': {
                '[M+H]+': 'remove_h',
                '[M+NH4]+': 'remove_nh4',
                '[M+Na]+': 'remove_na',
                '[M-H2O+H]+': 'add_oh',
            },
            'neg': {
                '[M-H]-': 'add_h',
                '[M+HCOO]-': 'remove_fo',
            }
        },
        2: {
            'pos': {},
            'neg': {
                '[M-2H]2-': 'add_2h'
            }
        },
        3: {
            'pos': {},
            'neg': {
                '[M-3H]3-': 'add_3h'
            }
        }
    },
    # Method names to convert from exact to adduct masses,
    # keyed by charge, ion mode and adduct type.
    'ex2ad': {
        1: {
            'pos': {
                '[M+H]+': 'add_h',
                '[M+NH4]+': 'add_nh4',
                '[M+Na]+': 'add_na',
                '[M-H2O+H]+': 'remove_oh',
            },
            'neg': {
                '[M-H]-': 'remove_h',
                '[M+HCOO]-': 'add_fo',
            }
        },
        2: {
            'pos': {},
            'neg': {
                '[M-2H]2-': 'remove_2h'
            }
        },
        3: {
            'pos': {},
            'neg': {
                '[M-3H]3-': 'remove_3h'
            }
        }
    },
    # Metrics to use for determining similarity of protein
    # and intensity profiles; these were attempts, and are not
    # used any more, i.e. are "deprecated".
    'metrics': [
        ('Kendall\'s tau', 'ktv', False),
        ('Spearman corr.', 'spv', False),
        ('Pearson corr.', 'pev', False),
        ('Euclidean dist.', 'euv', True),
        ('Robust corr.', 'rcv', False),
        ('Goodman-Kruskal\'s gamma', 'gkv', False),
        ('Difference', 'dfv', True)
    ],
    # Adducts used by default, keyed by ion mode and charge.
    'adducts_default': {
        'neg': {
            1: {
                '[M-H]-',
                '[M+HCOO]-',
            },
        },
        'pos': {
            1: {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
            },
        }
    },
    # Additional constraints for adduct lookups at various species,
    # e.g. by default `[M-H2O+H]+` is not used but at Vitamin A we use it:
    'adduct_constraints': {
        'pos': {
            lipproc.Headgroup(main = 'VA'): {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
                '[M-H2O+H]+',
            },
            lipproc.Headgroup(main = 'Cer', sub = ('Hex',)): {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
                '[M-H2O+H]+',
            },
            lipproc.Headgroup(main = 'Cer', sub = ('Hex2',)): {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
                '[M-H2O+H]+',
            },
            lipproc.Headgroup(main = 'Cer', sub = ('SHex',)): {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
                '[M-H2O+H]+',
            },
            lipproc.Headgroup(main = 'Cer', sub = ('SHex2',)): {
                '[M+H]+',
                '[M+NH4]+',
                '[M+Na]+',
                '[M-H2O+H]+',
            },
        },
        'neg': {
        }
    }
}

# Setting names whose values, judging by the variable name, are paths
# relative to `basedir`. Not consulted by any code visible in this
# module -- TODO confirm where it is used.
in_basedir = [
    'fractionsf', 'ppfracf', 'seqfile',
    'pptablef', 'lipnamesf', 'bindpropf',
    'metabsf', 'featurescache', 'auxcache',
    'stdcachefile', 'validscache', 'marco_dir',
    'abscache', 'pptable_file', 'recalfile',
    'manual_ppratios_xls', 'manualdir', 'ltplistf',
    'flimcache', 'ppsecdir', 'gelprofdir',
]

# Setting names whose default value `reset_all()` prefixes with
# `<common.ROOT>/data`.
# NOTE(review): 'mgf_example' matches no key of `_defaults`, so it has
# no effect in `reset_all()`; kept for backward compatibility.
in_datadir = {
    'pfragmentsfile', 'nfragmentsfile',
    'lipnamesf',
    'mgf_example', 'peaks_example', 'mfql_example',
    'sec_xls_example', 'sec_unicorn_example',
    'peaks_gltpd1_invitro', 'peaks_gltpd1_invivo',
    'sec_gltpd1_invitro', 'sec_gltpd1_invivo',
}

# Setting names whose default value `reset_all()` prefixes with
# `<common.ROOT>/data/ms2_examples`.
in_mgfdir = {
    'mgf_neg_examples',
    'mgf_pos_examples',
}

def reset_all():
    """
    Rebuilds the module-level ``settings`` object from the defaults.

    A fresh ``Settings`` namedtuple class is created with one field per
    key of ``_defaults`` and each default value is assigned to it as a
    class attribute. Values of keys listed in ``in_datadir`` are prefixed
    with ``<common.ROOT>/data``, those listed in ``in_mgfdir`` with
    ``<common.ROOT>/data/ms2_examples``. Any value set earlier is lost
    because the global ``settings`` name is rebound to the new object.

    The values are not copied, hence mutable defaults are shared between
    ``defaults`` and ``settings``.
    """

    namespace = collections.namedtuple('Settings', list(_defaults))

    for key in _defaults:

        value = getattr(defaults, key)

        if key in in_datadir:

            value = os.path.join(common.ROOT, 'data', value)

        if key in in_mgfdir:

            value = os.path.join(common.ROOT, 'data', 'ms2_examples', value)

        setattr(namespace, key, value)

    # the namedtuple class itself (not an instance) serves as the
    # mutable container of the settings
    globals()['settings'] = namespace
def setup(**kwargs):
    """
    Sets the value of one or more settings.

    :param **kwargs:
        Setting names and their new values; each one is assigned as an
        attribute of the module-level ``settings`` object. Names are not
        validated, unknown names simply create new attributes.
    """

    for name, new_value in kwargs.items():

        setattr(settings, name, new_value)
def get(param):
    """
    Retrieves the current value of a setting.

    :param str param:
        Name of the setting.

    :return:
        The attribute ``param`` of the module-level ``settings`` object
        or ``None`` if it has no such attribute.
    """

    return getattr(settings, param, None)
def get_default(param):
    """
    Retrieves the default value of a setting.

    :param str param:
        Name of the setting.

    :return:
        The attribute ``param`` of the module-level ``defaults`` object
        or ``None`` if it has no such attribute.
    """

    return getattr(defaults, param, None)
def reset(param):
    """
    Restores one setting to its default value.

    :param str param:
        Name of the setting to reset. If no default exists under this
        name, ``get_default`` returns ``None`` and the setting is set
        to ``None``.

    NOTE(review): unlike ``reset_all``, the default is assigned as is,
    without the data directory prefix applied to the keys listed in
    ``in_datadir`` and ``in_mgfdir``.
    """

    # `setup` accepts keyword arguments only: passing the name and the
    # value positionally raised TypeError, so the pair is handed over
    # as an unpacked dict.
    setup(**{param: get_default(param)})
# Object carrying the built-in default of each setting as an attribute.
defaults = common._const()

for k, v in _defaults.items():

    setattr(defaults, k, v)

# Create the module-level `settings` object from the defaults at import.
reset_all()