Source code for lipyd.sample

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `lipyd` python module
#
#  Copyright (c) 2015-2018 - EMBL
#
#  File author(s): Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GNU GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: http://www.ebi.ac.uk/~denes
#

from future.utils import iteritems
from past.builtins import xrange, range


import os
import re
import imp
import warnings
import itertools
import operator
import numpy as np


import lipyd.common as common
basestring = common.basestring
from lipyd import reader
import lipyd.reader.peaks
import lipyd.moldb as moldb
import lipyd.ms2 as ms2
import lipyd.mgf as mgf
import lipyd.settings as settings
import lipyd.progress as progress
import lipyd.sampleattrs as sampleattrs
import lipyd.feature as feature


remgf  = re.compile(r'(\w+)_(pos|neg)_([A-Z])([0-9]{1,2})\.mgf')
remgf2 = re.compile(r'(\w+)_([A-Z])([0-9]{1,2})_(pos|neg)\.mgf')


[docs]class SampleReader(object): reader_classes = { 'peaks': reader.peaks.PeaksReader, } def __init__( self, input_type, ionmode = None, **kwargs ): """ Reads data from files and creates ``Sample``, ``SampleSet`` and ``FeatureAttributes`` objects. :param str input_type: Format of the input files. Possible values: ``peaks``, ``mzml``. Reader for ``mzml`` not yet available. :param str ionmode: Ion mode of the experiment: ``pos`` or ``neg``. By deafult is ``None`` because the reading is possible without the reader being aware of the ion mode. However it is most convenient to provide it here. :param **kwargs: Arguments for the reader. Depends on the input format, please refer to classes in ``lipyd.reader`` modules. """ if input_type not in self.reader_classes: raise RuntimeError('Unknown input type: %s' % input_type) self.ionmode = ionmode self.reader_class = self.reader_classes[input_type] self.reader_args = kwargs self.read()
[docs] def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist = [modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new)
[docs] def read(self): self.reader = self.reader_class(**self.reader_args)
[docs] def get_attributes(self): """ Returns ``lipyd.sample.FeatureAttributes`` object. This object contains variables describing series of features across all samples. E.g. Quality, significance, mean RT, centroid m/z, etc """ attrs = self.reader.get_attributes() attrs['ionmode'] = self.ionmode return FeatureAttributes(**attrs)
[docs] def get_samples(self, bind = True, sample_args = None): """ Yields ``lipyd.sample.Sample`` objects for each sample read. To extract all data from ``PeaksReader`` the ``get_sampleset`` method is more convenient. :param bool bind: Bind samples to each other. This way they will sort together, i.e. if any of them is sorted all the others follow the same order. """ feature_attrs = self.get_attributes() sorter = ( (feature_attrs.sorter or FeatureIdx(len(self.reader.mzs))) if bind else None ) for _sample_args in self.reader.get_samples(): if not bind: feature_attrs = self.get_attributes() _sample_args['sorter'] = feature_attrs.sorter _sample_args['feature_attrs'] = feature_attrs _sample_args['ionmode'] = self.ionmode if sample_args: _sample_args.update(sample_args) yield Sample(**_sample_args)
[docs] def get_sampleset(self, sampleset_args = None): """ Returns a ``SampleSet`` and a ``FeatureAttributes`` object. """ _sampleset_args = self.reader.get_sampleset() feature_attrs = self.get_attributes() _sampleset_args['feature_attrs'] = feature_attrs _sampleset_args['sorter'] = feature_attrs.sorter _sampleset_args['ionmode'] = self.ionmode or self.reader.ionmode if sampleset_args: _sampleset_args.update(sampleset_args) return SampleSet(**_sampleset_args)
[docs]class FeatureBase(object): def __init__(self, sorter = None, **kwargs): """ Serves as a base class for various classes handling arrays of features. Some of its predecessors represent one sample others more than one, or maybe data or annotations but all represent a series of features detected in one LC MS/MS run or more than one runs aligned with each other. """ self.var = self.var if hasattr(self, 'var') else set() self.sorted_by = None for attr, data in iteritems(kwargs): self._add_var(data, attr) self.sorter = ( FeatureIdx(len(self)) if ( sorter is None and self.__class__.__name__ != 'FeatureIdx' and len(self) is not None ) else sorter ) if self.sorter is not None: self.sorter.register(self)
[docs] def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist = [modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new)
def _add_var(self, data, attr): """ Registers a variable (array of data). If an array added this way it will be always sorted the same way as all the other arrays in order to keep the data of features together. This method should not be called by users it is to be called automatically at creation of the object. """ setattr(self, attr, data) if data is None: self.missing.add(attr) else: self.var.add(attr)
[docs] def sort_all( self, by = None, desc = False, resort = False, propagate = True, return_isort = False, indices = (), ): """ Sorts all data arrays according to an index array or values in one of the data arrays. :param np.ndarray,str by: Either an array of indices or the attribute name of any data array of the object. :param bool propagate: Whether to propagate the sorting to bound objects. This is to avoid loops of sorting. :param bool return_isort: Return the argsort vector. A vector of indices which can be used to apply the same sorting on other arrays. """ isort = None if isinstance(by, basestring): if ( (by == self.sorted_by and not resort) or not hasattr(self, by) or getattr(self, by) is None ): return None byarray = getattr(self, by) if len(byarray.shape) > 1: if len(indices) < len(byarray.shape) - 1: warnings.warn( 'You requested sort by array of %u dimensions but ' 'selected only %u indices.\n' 'Selecting 0 for all other axes by default.' % ( len(byarray.shape), len(indices), ) ) indices = tuple( itertools.chain( indices, ((0,) * (len(byarray.shape) - len(indices))) ) ) if len(indices) > len(byarray.shape) - 1: warnings.warn( 'You requested sort by array of %u dimensions but' 'selected %u indices.\n' 'Dropping indices from the end.' % ( len(byarray.shape), len(indices), ) ) indices = indices[ :(len(byarray.shape) - len(indices) - 1) ] for i in indices: byarray = byarray[:,i] isort = byarray.argsort() if desc: isort = isort[::-1] self.sorted_by = by if isinstance(by, np.ndarray): if len(by) != len(self): raise RuntimeError( 'Can not sort object of length %u by ' 'index array of length %u.' (len(self), len(by)) ) isort = by self.sorted_by = None if isort is not None: for var in self.var: setattr(self, var, getattr(self, var)[isort]) if propagate and self.sorter is not None: self.sorter.sort_all(by = isort, origin = id(self))
def __len__(self): return ( getattr(self, next(self.var.__iter__())).shape[0] if self.var else 0 )
[docs] def filter(self, idx, negative = False, propagate = True): """ Filters features at indices in ``idx``. :param list,np.array idx: A list or array of inidices along axis 0. :param bool negative: Drop the features at indices instead of keeping them and removing all others. :param bool propagate: Perform filtering on all object bound to the same sorter. This is to avoid propagation in loops. """ selection = self._idx_to_selection(idx, negative = negative) self._filter(selection, propagate = propagate)
def _idx_to_selection(self, idx, negative = False): if isinstance(idx, list): idx = np.array(idx) selection = np.in1d(np.arange(len(self)), idx) if negative: selection = np.logical_not(selection) return selection def _filter(self, selection, propagate = True): for var in self.var: setattr(self, var, getattr(self, var)[selection]) if propagate: self.sorter._filter(selection = selection, origin = id(self))
[docs] def filter_selection(self, selection, negative = False): """ Takes a ``FeatureSelection`` object and applies it as a filter i.e. removes features where it is ``False``. """ if negative: self._filter(selection.negative) else: self._filter(selection.selection)
[docs] def threshold_filter(self, var, threshold, op = operator.gt): """ Filters the features by any numeric attribute by simply applying a threshold. If the attribute does not exist silently does nothing. :param str var: Name of the attribute. :param float threshold: Value of the threshold. :param operator op: The operator to use. By default ``operator.gt`` (greater than), which means the features having attribute value greatre than the threshold will be kept and all others removed. """ selection = self.threshold_select(var, threshold = threshold, op = op) self._filter(selection)
[docs] def threshold_select(self, var, threshold, op = operator.gt): """ Returns boolean vector by applying an operator on a variable and a threshold. """ if var not in self.var: raise RuntimeError('No such variable: %s' % var) return self._threshold_select( getattr(self, var), threshold = threshold, op = op, )
@staticmethod def _threshold_select(var, threshold, op = operator.gt): return op(var, threshold)
[docs] def feature_data(self, i, variables = None): """ For a feature returns the requested attributes. :param list variables: Names of the variables. """ result = {} for var in varialbes: result[var] = getattr(self, var)[i] return result
[docs]class FeatureAttributes(FeatureBase): def __init__( self, ionmode = None, attrs = None, **kwargs ): """ Features are m/z values accompanied by other attributes. Features might have been detected in one sample (one LC MS/MS run), or across multiple runs (``SampleSet``). ``FeatureAttributes`` handles the attributes for either a ``Sample`` or a ``SampleSet``. """ FeatureBase.__init__(self, **kwargs) self.attrs = attrs or {} self.ionmode = ionmode def __len__(self): obj = self if self.var else self.sorter return len(getattr(obj, next(iter(obj.var))))
[docs] def charges(self): """ Returns a set of ion charges observed in the sample(set). """ return sorted(set(self.charge))
[docs]class Sample(FeatureBase): ms2_collection_methods = { 'mgf': 'collect_mgf', 'mzml': 'collect_mzml', } def __init__( self, mzs, ionmode = None, intensities = None, rts = None, attrs = None, feature_attrs = None, sorter = None, ms2_format = 'mgf', ms2_param = None, silent = False, sample_id = None, sample_id_proc = None, sample_id_proc_method = None, sample_id_proc_names = None, attr_args = None, **kwargs, ): """ Represents one LC MS/MS run. Has at least a vector of m/z's, optionally vector of intensites, retention times and other metadata. :param str,tuple,callable sample_id: An identifier for the sample or a callable method which takes the sample attributes as an argument and returns such an ID. If ``None`` the default method will be applied which attempts to use the fraction ID from the attributes. If fails it uses a random string as a unique identifier for the sample. :param str ms2_format: Format of the MS2 data. At the moment ``mgf`` is accepted. :param dict ms2_param: - ``mgf_files``: A list of MGF file paths. - ``mgf_match_method`` (callable): A method which finds the MGF files. It accepts 2 arguments: the directory path with the MGF files and the attributes used to match the file names. - ``mgfdir`` (path): Path to the directory containing the MGF files. - ``mgf_charge``: If a value provided here only the scans with this precursor charge will be processed from the MGF files. If ``None``, charge won't be considered.' """ self.var = set() self.missing = set() if mzs is None: raise RuntimeError('Sample object must have at least m/z values.') self._add_var(mzs, 'mzs') FeatureBase.__init__( self, intensities = intensities, rts = rts, sorter = sorter, **kwargs, ) self.silent = silent self.ionmode = ionmode self._set_feature_attrs(feature_attrs) self.ms2_format = ms2_format self.ms2_param = ms2_param or {} attr_args = attr_args or { 'sample_id': sample_id, 'attrs': attrs, 'proc': sample_id_proc, 'proc_method': sample_id_proc_method, 'proc_names': sample_id_proc_names, 'length': length, } self._set_attrs(**attr_args)
[docs] def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new)
def _set_attrs(self, **kwargs): self.attrs = sampleattrs.SampleAttrs(**kwargs) def _set_feature_attrs(self, feature_attrs = None): if feature_attrs is None: feature_attrs = FeatureAttributes(sorter = self.sorter) self.feattrs = feature_attrs def _add_var(self, data, attr): # mzs is the first variable so it is added without # checking its length. All others must be the same # length as mzs. if attr != 'mzs' and data is not None and len(data) != len(self): raise RuntimeError( 'Sample object: length of `%s` (%u) is not the same ' 'as number of features in the sample (%u).' % ( attr, len(data), len(self), ) ) FeatureBase._add_var(self, data, attr)
[docs] def sort_all( self, by = 'mzs', desc = False, resort = False, propagate = True, return_isort = False, ): """ Sorts all data arrays according to an index array or values in one of the data arrays. :param np.ndarray,str by: Either an array of indices or the attribute name of any data array of the object. :param bool desc: Sort descending. Applies only if ``by`` is the attribute name of a data array. :param bool resort: Sort even if the object is already sorted according to the value stored in ``sorted_by``. :param bool propagate: Whether to propagate the sorting to bound objects. This is to avoid loops of sorting. :param bool return_isort: Return the argsort vector. A vector of indices which can be used to apply the same sorting on other arrays. """ isort = FeatureBase.sort_all( self, by = by, desc = desc, resort = resort, propagate = propagate, return_isort = return_isort, ) if return_isort: return isort
def __len__(self): """ Returns number of MS1 ions (m/z's) detected in the sample. """ return len(self.mzs)
[docs] def apply( self, method, name, variables = None, feature_attrs = None, **kwargs ): """ Applies a method on each of the features and creates a new variable from the results. :param callable method: The method to apply. :param str name: The name for the new variable. :param list variables: List of variables to be passed to the method. :param list feature_attrs: List of variables in the ``FeatureAttributes`` object. :param **kwargs: Passed to method. """ new = self._apply( method = method, variables = variables, feature_attrs = feature_attrs, **kwargs, ) self.feattrs._add_var(new, name)
def _apply( self, method, variables = None, feature_attrs = None, **kwargs ): result = [] for i in xrange(len(self)): args = self.feature_data( i, variables = variables, feature_attrs = feature_attrs, ) result_i = method(**args, **kwargs) result.append(result_i) np.vstack()
[docs] def select( self, method = None, variables = None, feature_atts = None, threshold = None, negative = False, op = operator.gt, **kwargs ): """ Selects a subset of the features. Returns ``FeatureSelection`` object. If ``method`` is callable, applies it to the variables in ``variables`` and ``feature_attrs`` and selects based on ``threshold`` and ``negative``. If ``method`` is ``None`` and a variable name provided in ``variables`` or ``feature_attrs`` then uses this variable to provide a selection. :param callable method: Method to be applied to obtain a new variable which will be the basis of the selection. :param list,str variables: Eiter a list of variables (to be passed to ``method``) or one variable name which will be the basis of the selection. :param list,str feature_attrs: Like ``variables`` but refers to variables in the ``FeatureAttributes`` object. :param float threshold: A threshold value for the filter. :param bool negative: Apply negative selection (selection vector is ``False`` where) ``method`` returns ``True``. :param operator op: Operator to apply between variable and threshold. """ if callable(method): var = self._apply( method, variables = variables, feature_attrs = feature_attrs, **kwargs ) elif isinstance(variables, basestring): var = getattr(self, var) elif isinstance(feature_attrs, basestring): var = getattr(self.feattrs, var) selection = self._threshold_select( var = var, threshold = threshold, op = op, ) return FeatureSelection(self, selections)
[docs] def feature_data(self, i, variables = None, feature_attrs = None): """ For a feature returns the requested attributes. :param list variables: Names of the variables. :param list feature_attrs: Names of variables in the ``FeatureAttributes`` object. """ result = {} result.update(FeatureBase.feature_data(i, variables)) result.update(self.feattrs.feature_data(i, feature_attrs)) return result
# # Methods for processing #
[docs] def process( self, basic_filters = True, database_lookup = True, ms2_analysis = True, basic_filters_args = None, database_lookup_args = None, ms2_analysis_args = None, ): """ This method implements the workflow of processing the samples. At the moment it does trivial filtering (see ``basic_filters``), performs database lookups (see ``database_lookup``) and analyses the MS2 data (see ``ms2_analysis``). """ if basic_filters: self.basic_filters(**(basic_filters_args or {})) if database_lookup: self.database_lookup(**(database_lookup_args or {})) if ms2_analysis: self.ms2_analysis(**(ms2_analysis_args or {}))
[docs] def quality_filter(self, threshold = .2): """ If the features has an attribute named `quality` it removes the ones with quality below the threshold. """ self.feattrs.threshold_filter('quality', threshold = threshold)
[docs] def rt_filter(self, threshold = 1.): """ Removes the features with mean retention time below the threshold. Works only if the features has an attribute named `rt_means`. """ self.feattrs.threshold_filter('rt_means', threshold = threshold)
[docs] def charge_filter(self, charge = 1): """ Keeps only the features with the preferred charge. Does not care about sign. """ self.feattrs.threshold_filter( 'charge', threshold = abs(charge), op = lambda vc, c: np.abs(vc) == c, )
[docs] def intensity_filter(self, threshold = 10000.): """ Removes the features with total intensities across samples lower than the threshold. Works by the ``total_intensities`` attribute. """ self.feattrs.threshold_filter( 'total_intensities', threshold = threshold, )
[docs] def basic_filters( self, quality_min = .2, charge = 1, rt_min = 1., intensity_min = 10000. ): """ Performs 4 trivial filtering steps which discard a large number of features at the beginning of the analysis. It removes the features with quality lower than 0.2, non single charge, retention time lower than 1 minute or total intensity lower than 10,000. Different threshold values can be provided to this method. """ self.quality_filter(threshold = quality_min) self.charge_filter(charge = charge) self.intensity_filter(threshold = intensity_min) self.rt_filter(threshold = rt_min)
[docs] def database_lookup( self, database_args = None, reinit_db = False, adduct_constraints = True, charge = None, tolerance = None, ): """ Performs database lookups of all m/z's. Creates an array variable named ``records`` in the ``FeatureAttributes`` object (``feattrs``) with the records retrieved from the database. """ if not hasattr(moldb, 'db') or reinit_db: moldb.init_db(**(database_args or {})) # we add an array variable with the recodrs resulted # in the lookup self.feattrs._add_var( moldb.adduct_lookup_many( self.mzs, ionmode = self.ionmode, adduct_constraints = adduct_constraints, charge = charge, tolerance = tolerance, ), 'records', )
[docs] def set_ms2_sources(self, attrs = None): """ Collects the MS2 scans belonging to this sample. The collected resources will be in the ``ms2_source`` attribute. """ self.ms2_source = [] ms2_source = self.collect_ms2(attrs = attrs) if self.sample_id in ms2_source: self.ms2_source = ms2_source[self.sample_id]
[docs] def collect_ms2(self, sample_id = None, attrs = None): """ Collects the MS2 scans belonging to this sample. The collected resources returned as ``dict`` which can be passed to ``ms2.MS2Feature``. """ attrs = attrs or self.attrs sample_id = attrs.sample_id if self.ms2_format is None: raise RuntimeError( '`ms2_format` is necessary to run MS2 analysis.' ) if self.ms2_format in self.ms2_collection_methods: method = self.ms2_collection_methods[self.ms2_format] return getattr(self, method)( sample_id = sample_id, attrs = attrs.attrs, )
[docs] def collect_mgf(self, sample_id = None, attrs = None): """ Collects MGF files containing the MS2 spectra. :param dict attrs: A ``dict`` with attributes necessary to find the appropriate MGF files. """ # if list of filenames provided we simply use those if 'mgf_files' in self.ms2_param: mgf_files = self.ms2_param['mgf_files'] # if directory provided create paths if 'mgfdir' in self.ms2_param: mgf_files = [ os.path.join(self.ms2_param['mgfdir'], fname) for fname in mgf_files ] # otherwise we need to collect the files else: # check if directory with MGF files provided mgfdir = ( self.ms2_param['mgfdir'] if 'mgfdir' in self.ms2_param else # fall back to this default 'mgf' ) # if dir does not exist we can do nothing if not os.path.isdir(mgfdir): raise FileNotFoundError( 'Please provide a directory with MGF files.' ) # ok, now we can find the MGF files by matching against # the sample attributes attrs = attrs or self.attrs sample_id = attrs.sample_id # see if a matching method provided mgf_match_method = ( self.ms2_param['mgf_match_method'] if 'mgf_match_method' in self.ms2_param else # otherwise we use our default self._default_mgf_match_method ) mgf_files = [] for fname in os.listdir(mgfdir): mgf_path = os.path.join(mgfdir, fname) mgf_matches = mgf_match_method(mgf_path, attrs.attrs) if mgf_matches: mgf_files.append(mgf_path) # check if mgf_charge provided in params mgf_charge = ( self.ms2_param['mgf_charge'] if 'mgf_charge' in self.ms2_param else None ) # dict with one key and list of files probably only one element: # do like this to be able to directly pass to ms2.MS2Feature return { sample_id: [ mgf.MgfReader(fname, charge = mgf_charge) for fname in mgf_files ] }
@staticmethod def _default_mgf_match_method(path, attrs): """ The default method for matching names of MGF files against attributes. """ match = remgf.search(path) match2 = remgf2.search(path) if match: main, ionmode, fracrow, fraccol = match.groups() if match2: main, fracrow, fraccol, ionmode = match2.groups() return ( (match or match2) and attrs['label']['main'].lower() in main.lower() and (fracrow, int(fraccol)) == attrs['label']['fraction'] and ionmode == attrs['label']['ionmode'] )
[docs] def collect_mzml(self, attrs = None): raise NotImplementedError
[docs] def ms2_analysis(self, resources = None): """ Runs MS2 identification methods on all features. """ if resources is None: if isinstance(self.ms2_source, list): resources = {self.sample_id: self.ms2_source} elif isinstance(self.ms2_source, dict): resources = self.ms2_source else: raise RuntimeError( 'No resources provided for MS2 identification.' ) ms2_identities = [] if not self.silent: prg = progress.Progress(len(self), 'Analysing MS2 spectra', 1) for i in xrange(len(self)): if not self.silent: prg.step() # MS2 identifications: ms2_fe = ms2.MS2Feature( mz = self.mzs[i], ionmode = self.ionmode, resources = resources, rt = tuple(self.feattrs.rt_ranges[i]), ms1_records = self.feattrs.records[i], ) ms2_fe.main() ms2_identities.append(ms2_fe.identities) if not self.silent: prg.terminate() ms2_identities = np.array(ms2_identities) self.feattrs._add_var(ms2_identities, 'ms2_identities')
[docs] def ms2_identify(self): self.set_ms2_sources() self.ms2_analysis()
# # Methods for exporting the results #
[docs] def get_database_records(self, i, database = None, adduct = None): """ Yields database records for one feature, optionally only records of certain adduct or database. """ if adduct: records = self.feattrs.records[i][adduct][1] else: records = itertools.chain( *(add[1] for add in self.feattrs.records[i].values()) ) for rec in records: if not database or database == rec.lab.db: yield rec
[docs] def table( self, variables = None, headers = None, ): """ Returns results as a header and a table as list of lists. """ self.feattrs.sort_all('total_intensities', desc = True) hdr = [ 'm/z', 'average area', 'RT range', 'quality', 'significance', 'database lookups', 'MS2 top', 'MS2 all', ] if variables: for i, var in enumerate(variables): hdr.append(var if not headers else headers[i]) yield hdr for i in xrange(len(self)): ms2_id = self.feattrs.ms2_identities[i] ms2_max_score = ( max( ( ms2i.score_pct for ms2iii in ms2_id for ms2ii in ms2iii.values() for ms2i in ms2ii ), default = 0 ) ) ms2_best = ';'.join( ms2i.full_str() for ms2iii in ms2_id for ms2ii in ms2iii.values() for ms2i in ms2ii if ms2i.score_pct == ms2_max_score ) ms2_all = ';'.join( ms2i.full_str() for ms2iii in ms2_id for ms2ii in ms2iii.values() for ms2i in ms2ii if ms2i.score_pct > 0.0 ) line = [ '%08f' % self.mzs[i], '%u' % self.feattrs.total_intensities[i], '%.02f - %.02f' % tuple(self.feattrs.rt_ranges[i]), '%.02f' % self.feattrs.quality[i], '%.02f' % self.feattrs.significance[i], moldb.records_string( records = self.feattrs.records[i], show_ppm = True, show_adduct = True, show_db = True, ), ms2_best, ms2_all, ] if variables: for var in variables: line.append(str(getattr(self.feattrs, var)[i])) yield line
[docs] def export_table(self, fname, **kwargs): table = self.table(**kwargs) hdr = next(table) with open(fname, 'w') as fp: _ = fp.write('\t'.join(hdr)) for row in table: _ = fp.write('\n') _ = fp.write('\t'.join(row))
[docs]class FeatureIdx(FeatureBase): def __init__(self, length): """ Helps the sorting of features across multiple samples with keeping track of feature IDs. """ FeatureBase.__init__(self) self.var = {'_original', '_current'} self._original = np.arange(length) self._current = np.arange(length) self.clients = {} def _sort(self, argsort): """ Sorts the two index arrays by an index array. Do not call this because it does sync with clients. """ if len(argsort) != len(self): raise RuntimeError( 'FeatureIdx: can not sort by index array of different length.' ) self._current = self._current[argsort] self._original = self._current.argsort()
[docs] def current(self, o): """ Tells the current index for the original index ``o``. :param int o: An original index. """ return self._original[o]
[docs] def original(self, c): """ Tells the original index for the current index ``c``. :param int c: An index in the current ordering. """ return self._current[c]
def __len__(self): return len(self._original)
[docs] def acurrent(self, ao): """ For a vector of original indices ``ao`` returns a vector of corresponding current indices. :param int ao: Vector of original indices. """ return self._original[ao]
[docs] def aoriginal(self, co): """ For a vector of current indices ``co`` returns a vector of corresponding original indices. :param int co: Vector of current indices. """ return self._current[ac]
[docs] def convert(self, other): """ Converts from the ordering of an other ``FeatureIdx`` instance to the ordering of this one. """ raise NotImplementedError
[docs] def register(self, sortable): """ Binds a sortable object of the same length to this one hence all sorting operations will be applied also to this object. Sortables clients assumed to share the same original indices. It means should have the same ordering at time of registration. """ if sortable.var and len(sortable) != len(self): raise RuntimeError( 'FeatureIdx: Objects of unequal length can not be co-sorted.' ) self.clients[id(sortable)] = sortable
[docs] def unregister(self, id_sortable): """ Removes an object from the list of clients and also makes it forget about this object to be its sorter. Simply it makes the this object and the client independent. :param int,sortable id_sortable: Either the ``id`` of a client or the object itself. """ if not isinstance(id_sortable, int): id_sortable = id(id_sortable) if id_sortable in self.clients: self.clients[id_sortable].sorter = None del self.clients[id_sortable]
[docs] def sort_all(self, by = None, origin = None, ): """ Applies custom sort to the object if ``by`` is an index array. If ``by`` is ``None``, it sorts by the original indices, which means the restoration of the original order. It calls ``sort_all`` by the same argsort on all registered clients hence propagating the sorting. But omits the client if the sort request originates from it in order to avoid infinite sorting loops. Clients don't propagate further the sorting request also to avoid loops. In summary, this method is called either from outside and then it applies a sorting to all object of the system (if called without ``by`` restores the original order); or called by the ``sort_all`` method of one of the clients propagating the sorting to this object and to all other clients. """ if by is None: by = self._current.argsort() self._sort(by) for id_, client in iteritems(self.clients): if id_ != origin: client.sort_all(by = by, propagate = False)
def _filter(self, selection, origin = None): """ Applies filtering to all registered clients. """ FeatureBase._filter(self, selection, propagate = False) for id_, client in iteritems(self.clients): if id_ != origin: client._filter(selection, propagate = False)
[docs]class SampleSet(Sample, sampleattrs.SampleSorter): def __init__( self, mzs, ionmode = None, intensities = None, rts = None, attrs = None, feature_attrs = None, sorter = None, ms2_format = 'mgf', ms2_param = None, sample_ids = None, sample_id_proc = None, sample_id_proc_method = None, sample_id_proc_names = None, sample_data = None, ): """ This class represents an ordered set of samples. It means all arrays have one more dimension than ``Sample``, though the arrays can be multidimensional, i.e. in one data array more than one value can belong to a feature in a sample. It inherits from ``Sample``, see details in docs of ``Sample``. In ``ms2_param`` in addition to the parameters used in ``Sample`` you can provide ``ms2_use_samples`` and ``ms2_use_samples_method``, a data and a function, respectively which will be used to select only certain samples for MS2 analysis. The method should accept sample attributes as first argument and the ``ms2_use_samples`` or ``None`` will be passed as second. It should return ``bool``. By deafult MS2 scans from all samples are used. :param list,callable sample_ids: Either a list of sample identifiers with the same length as number of samples or a method which generates sample identifiers from sample attributes. :param callable sample_id_processor: A method to process sample IDs if ``sample_ids`` provided. :param list sample_data: Other ``SampleSet`` or ``feature.SampleData`` objects which should be co-sorted with this object. """ centr_mzs = ( feature_attrs.centr_mzs.copy() if ( feature_attrs is not None and hasattr(feature_attrs, 'centr_mzs') ) else mzs if len(mzs.shape) == 1 else mzs.mean(axis = tuple(range(1, len(mzs.shape)))) ) length = self._guess_numof_samples( attrs = attrs, variables = (mzs, intensities, rts) ) attr_args = { 'sample_id': sample_ids, 'attrs': attrs, 'proc': sample_id_proc, 'proc_method': sample_id_proc_method, 'proc_names': sample_id_proc_names, 'length': length, } Sample.__init__( self, mzs = centr_mzs, ionmode = ionmode, intensities = intensities, mzs_by_sample = mzs, rts = rts, attrs = attrs, feature_attrs = feature_attrs, sorter = sorter, ms2_format = ms2_format, ms2_param = ms2_param, sample_id = sample_ids, sample_id_proc = sample_id_proc, sample_id_proc_method = sample_id_proc_method, sample_id_proc_names = sample_id_proc_names, attr_args = attr_args, ) sampleattrs.SampleSorter.__init__( self, sample_data = sample_data, sample_axis = 1, )
[docs] @classmethod def combine_samples(cls, attrs, samples, **kwargs): """ Initializes the object by combining a series of ``Sample`` objects. All samples must be of the same length and have the same ordering. It means the corresponding elements must belong to the same feature. The ``FeatureAttributes`` and the ``sorter`` (``FeatureIdx``) will be used from the first sample. """ if len(set(len(s) for s in samples)) > 1: # TODO: call aligner for different length samples # TODO: check if at least sorters show the same ordering raise RuntimeError( 'SampleSet: it\'s possible combine only equal length samples ' 'into a set. Also samples assumed to have the same ordering.' ) mzs = np.hstack([s.mzs for s in samples]) var = {} for var_name in sample[0].var: var[var_name] = np.hstack([getattr(s, var_name) for s in samples]) sample_attrs = [s.attrs for s in samples] # TODO: combine FeatureAttributes feature_attrs = samples[0].feattrs sorter = samples[0].sorter return cls.__init__( mzs = mzs, attrs = sample_attrs, feature_attrs = feature_attrs, sorter = sorter, **var, **kwargs, )
@property def numof_samples(self): return self._guess_numof_samples( self.attrs, (getattr(self, var) for var in self.var), ) def _guess_numof_samples(self, attrs = None, variables = None): if attrs: # if we have attributes number of samples should be obvious return len(attrs) else: # otherwise try to guess from shape of the data arrays for var in variables: if hasattr(var, 'shape') and len(var.shape) > 1: return var.shape[1] # this is the default return 1 _set_attrs = sampleattrs.SampleSorter._set_attrs
[docs] def get_sample_attrs(self, i): """ Returns the sample attributes (dict of metadata) of the ``i``th sample in the set. """ return self.attrs[i]
[docs] def get_sample(self, i): """ Returns the ``i``th sample as a ``Sample`` object. """ var = {} for var_name in self.var: var[var_name] = getattr(self, var_name)[:,i] sample_attrs = self.get_sample_attrs(i) return Sample( mzs = self.mzs_by_sample[:,i], attrs = sample_attrs, feature_attrs = self.feattrs, sorter = self.sorter, **var, )
def _get_sample_id(self, i): if callable(self.sample_ids): sample_id = self.sample_ids else: sample_id = self.sample_ids[i] return Sample._get_sample_id( sample_id = sample_id, attrs = self.attrs[i], )
[docs] def collect_ms2(self, attrs = None): """ Collects the MS2 resources for each sample a similar way as the same method does in ``Sample``. """ ms2_source = {} attrs = attrs or self.attrs for i, sample_attrs in enumerate(attrs): if 'ms2_use_samples_method' in self.ms2_param: select_samples = self.ms2_param['ms2_use_samples_method'] ms2_use_samples = ( self.ms2_param['ms2_use_samples'] if 'ms2_use_samples' in self.ms2_param else None ) if not select_samples(sample_attrs, ms2_use_samples): continue sample_id = self.get_sample_id(i) # adding MS2 sources for this sample to the dict ms2_source.update(Sample.collect_ms2( self, sample_id = sample_id, attrs = sample_attrs )) return ms2_source
[docs] def set_ms2_sources(self, attrs = None): """ Collects the MS2 scans for all samples. The collected resources will be in the ``ms2_source`` attribute. """ self.ms2_source = self.collect_ms2(attrs = attrs)
[docs] def peak_size_filter(self, backg, prot, threshold = 2): def ps(a0, a1): if np.all(np.isnan(a1)) or np.nanmax(a1) == 0: return 0.0 if ( (np.all(np.isnan(a0)) or np.nanmax(a0) == 0) and np.nanmax(a1) > 0 ): return np.inf return np.nanmin(a1) / np.nanmax(a0) backg = set((f[0], int(f[1:])) for f in backg) prot = set((f[0], int(f[1:])) for f in prot) ibackg = np.array([ i for i, attr in enumerate(self.attrs) if attr['label']['fraction'] in backg ]) iprot = np.array([ i for i, attr in enumerate(self.attrs) if attr['label']['fraction'] in prot ]) peaksize = np.array([ ps(self.intensities[i,ibackg], self.intensities[i,iprot]) for i in xrange(len(self)) ]) self.feattrs._add_var(peaksize, 'peaksize') self.feattrs.threshold_filter('peaksize', threshold = threshold)
[docs] def get_selection(self, selection): """ Returns a ``SampleSelection`` object which is a binary selection of some of the samples in the set. :param list,numpy.ndarray selection: Either a list of sample IDs to be selected or a boolean array with the length corresponding to the number of samples. """ return sampleattrs.SampleSelection( selection = selection, samples = self, )
[docs] def get_sample_data(self, **kwargs): """ Returns a ``SampleData`` object which stores any data about the samples. :param numpy.ndarray **kwargs: One or more arrays with same their first dimension in agreement with the number of samples. """ return sampleattrs.SampleData( samples = self, **kwargs, )
[docs]class FeatureSelection(FeatureBase): def __init__(self, samples, selection): self.samples = samples FeatureBase.__init__( self, sorter = self.samples.sorter, selection = selection, ) @property def negative(self): return np.logical_not(self.selection)