Source code for models

from typing import Callable, Mapping, Sequence
from warnings import warn

import numpy as np
import pandas as pd


[docs]class Gene: """Stores gene's identifier and description (multiton). At a time there can be only one gene with given identifier, i.e. after the first initialization, all subsequent attempts to initialize a gene with the same identifier will return exactly the same object. This is so called multiton pattern. Example: >>> x = Gene('TP53') >>> y = Gene('TP53') >>> assert x is y # passes, there is only one gene """ instances = {} def __new__(cls, name, *args, **kwargs): if name not in cls.instances: cls.instances[name] = super(Gene, cls).__new__(cls, *args, **kwargs) return cls.instances[name] def __init__(self, name, description=None): self.name = name self.description = description
[docs]class Sample: """Sample contains expression values for genes.""" def __init__(self, name, data: Mapping[Gene, float]): self.name = name self.data = data @property def genes(self): return self.data.keys()
[docs] @classmethod def from_names(cls, name, data: Mapping[str, float]): """Create a sample from a gene_name: value mapping. Args: name: name of sample data: mapping (e.g. dict) where keys represent gene names """ return cls(name, {Gene(gene_name): value for gene_name, value in data.items()})
[docs] @classmethod def from_array(cls, name, panda_series: pd.Series, descriptions=False): """Create a sample from pd.Series or equivalent. Args: name: name of the sample panda_series: series object where columns represent values of genes and names are either gene identifiers of tuples: ``(gene_identifier, description)`` descriptions: are descriptions present in names of the series object? """ gene_maker = Gene if descriptions: gene_maker = lambda data: Gene(*data) return cls(name, { gene_maker(key): value for key, value in panda_series.to_dict().items() })
[docs] def as_array(self): """ Returns: one-dimensional labeled array with Gene objects as labels """ return pd.Series(self.data)
def __eq__(self, other): return self.name == other.name and self.data == other.data def __repr__(self): return f'<Sample "{self.name}" with {len(self.data)} genes>'
def first_line(file_object): line = None while not line: line = file_object.readline() # return to the beginning file_object.seek(0) return line # TODO class variable with set of genes + method(s) for checking data integrity
[docs]class Phenotype: """Phenotype is a collection of samples of common origin or characteristic. An example phenotype can be: (Breast_cancer_sample_1, Breast_cancer_sample_2) named "Breast cancer". The common origin/characteristics for "Breast cancer" phenotype could be "a breast tumour", though samples had been collected from two donors. Another example are controls: (Control_sample_1, Control_sample_2) named "Control". The common characteristic for these samples is that both are controls. """ def __init__(self, name, samples=None): self.samples = samples or [] self.name = name
[docs] def as_array(self): """ Returns: :class:`pandas.DataFrame` object with data for all samples. """ return {s.name: pd.DataFrame(s) for s in self.samples}
def __add__(self, other): return Phenotype(self.name, self.samples + other.samples)
[docs] @classmethod def from_file( cls, name, file_object, columns_selector: Callable[[Sequence[int]], Sequence[int]]=None, samples=None, delimiter: str='\t', index_col: int=0, use_header=True, reverse_selection=False, prefix=None, header_line=0, description_column=None ): """Create a phenotype (collection of samples) from csv/tsv file. Args: name: a name of the phenotype (or group of samples) which will identify it (like "Tumour_1" or "Control_in_20_degrees") file_object: a file (containing gene expression) of the following structure: - names of samples separated by a tab in the first row, - gene symbol/name followed by gene expression values for every sample in remaining rows; an additional column "description" is allowed between genes column and sample columns, though it has to be explicitly declared with `description_column` argument. columns_selector: a function which will select (and return) a subset of provided column identifiers (do not use with `samples`) samples: a list of names of samples to extract from the file (do not use with `columns_selector`) reverse_selection: if you want to use all columns but the selected ones (or all samples but the selected) set this to True delimiter: the delimiter of the columns index_col: column to use as the gene names use_header: does the file have a header? prefix: prefix for custom samples naming schema header_line: number of non-empty line with sample names description_column: is column with description of present in the file (on the second position, after gene identifiers)? """ if file_object.tell() != 0: warn(f'Passed file object: {file_object} was read before.') raise Exception() line = first_line(file_object) header_items = [item.strip() for item in line.split('\t')] gene_columns = [index_col] if description_column: description_column = 1 gene_columns.append(description_column) else: if any('description' == name.lower() for name in header_items): warn( 'First line of your file contains "description" column, ' 'but you did not provide "--description_column" argument.' ) # a reasonable assumption is that the columns with samples # start after columns with gene symbol and gene description column_shift = max(gene_columns) + 1 if columns_selector: # sniff how many columns do we have in the file columns_count = line.count(delimiter) all_sample_columns = list(range(column_shift, columns_count + column_shift)) # generate identifiers (numbers) for all columns # and take the requested subset columns = columns_selector(all_sample_columns) if reverse_selection: columns = list(columns) columns = [c for c in all_sample_columns if c not in columns] # https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100 columns = gene_columns + list(columns) else: columns = None if not use_header: if samples: raise ValueError( 'To select samples by their name, you need a file with ' 'samples names in the header. If you use such file, ' 'please set `use_header=True`, otherwise skip `samples` ' 'in your arguments.' ) if header_line: warn( '`header_line` has no effect when ' '`use_header` is set to `False`' ) # we could leave it to pandas, but it shows an ugly, # not very helpful message. It is better to show the # user where exactly the problem occurs. if samples: available_samples = [ name for name in header_items[column_shift:] ] lacking_samples = set(samples) - set(available_samples) if lacking_samples: raise ValueError( f'Samples {lacking_samples} are not available in {file_object.name} file.\n' f'Following samples were found: {", ".join(available_samples)}.' ) if index_col: # TODO https://github.com/pandas-dev/pandas/issues/9098 warn( 'Using "samples" with "index_col" 0 may cause an ' 'unexpected behaviour due to an upstream issue in ' 'pandas package (pandas-dev/pandas/issues/9098) ' 'for pandas in versions older than 0.21.' ) additional_column_names = [ header_items[index] for index in gene_columns ] # https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100 samples = additional_column_names + list(samples) # just to reassure that the pointer is on the beginning if file_object.tell() != 0: warn('Passed file object was read before.') if samples and columns: warn( 'Please, provide either columns or samples, ' 'not both. We will use columns this time.' ) data = pd.read_table( file_object, delimiter=delimiter, # None - do not use, 0 - use first row header=header_line if use_header else None, index_col=gene_columns, usecols=columns or samples, prefix=f'{prefix}_' if prefix else '' ) descriptions = description_column is not None samples = [ Sample.from_array(sample_name, sample_data, descriptions=descriptions) for sample_name, sample_data in data.items() ] return cls(name, samples)
[docs] @classmethod def from_gsea_file(cls): """Stub: if we need to handle very specific files, for various analysis methods, we can extend Phenotype with class methods like from_gsea_file.""" pass
# TODO class variable with set of genes + method(s) for checking data integrity # TODO unify file reading with argument_parser class Experiment: def __init__(self, case: Phenotype, control: Phenotype): self.control = control self.case = case def get_all(self): return self.control + self.case # TODO: are there many ways to compute fold-change? def get_fold_change(self, sample_from_case, use_log=False): assert sample_from_case in self.case.samples # TODO: implement inline calc_fold_change(sample_from_case, self.control, use_log=use_log) """ def fold_change(case, base, log2=False): fold_changes = case.copy() for (idx, row) in base.iterrows(): fold_changes.loc[[idx]] /= (np.mean(row) or 0.01) # TODO for now arbitrary value 0.01 when 0's are found if log2: fold_changes = np.log2(fold_changes) # TODO Runtime Warning when 0's are encountered return fold_changes """ class Study: def __init__(self, cases: Sequence[Phenotype], control: Phenotype): for case in cases: self.experiments = Experiment(case, control)