# Source code for models

from typing import Callable, Mapping, Sequence
from warnings import warn

import numpy as np
import pandas as pd


class Gene:
    """Stores gene's identifier and description (multiton).

    At a time there can be only one gene with given identifier,
    i.e. after the first initialization, all subsequent attempts
    to initialize a gene with the same identifier will return
    exactly the same object. This is so called multiton pattern.

    A description given on a later call replaces the stored one;
    a call without a description leaves the stored one untouched.

    Example:

        >>> x = Gene('TP53')
        >>> y = Gene('TP53')
        >>> assert x is y   # passes, there is only one gene
    """

    # registry of already created genes, keyed by their identifier
    instances = {}

    def __new__(cls, name, *args, **kwargs):
        """Return the gene registered under `name`, creating it if needed."""
        try:
            return cls.instances[name]
        except KeyError:
            # only `cls` is forwarded: `object.__new__` raises TypeError
            # when it receives extra arguments (e.g. the description)
            gene = cls.instances[name] = super().__new__(cls)
            return gene

    def __init__(self, name, description=None):
        """Set the name and (when given or not yet set) the description.

        Args:
            name: identifier of the gene
            description: optional free-text description
        """
        self.name = name
        # `__init__` runs on every `Gene(...)` call, also when an existing
        # instance is returned; do not wipe a description stored earlier.
        if description is not None or not hasattr(self, 'description'):
            self.description = description


class Sample:
    """Sample contains expression values for genes."""

    def __init__(self, name, data: Mapping['Gene', float]):
        """
        Args:
            name: name of the sample
            data: mapping of genes to their expression values
        """
        self.name = name
        self.data = data

    @property
    def genes(self):
        """Keys of the `data` mapping, i.e. genes with a value in this sample."""
        return self.data.keys()

    @classmethod
    def from_names(cls, name, data: Mapping[str, float]):
        """Create a sample from a gene_name: value mapping.

        Args:
            name: name of sample
            data: mapping (e.g. dict) where keys represent gene names
        """
        return cls(name, {Gene(gene_name): value for gene_name, value in data.items()})

    @classmethod
    def from_array(cls, name, panda_series: pd.Series, descriptions=False):
        """Create a sample from pd.Series or equivalent.

        Args:
            name: name of the sample
            panda_series:
                series object where values represent values of genes and
                labels are either gene identifiers or tuples:
                ``(gene_identifier, description)``
            descriptions:
                are descriptions present in labels of the series object?
        """
        def make_gene(label):
            # with descriptions each label is a (identifier, description)
            # tuple which has to be unpacked into Gene's arguments
            return Gene(*label) if descriptions else Gene(label)

        return cls(name, {
            make_gene(label): value
            for label, value in panda_series.to_dict().items()
        })

    def as_array(self):
        """

        Returns: one-dimensional labeled array built from `data`
            (keys of the mapping become labels)

        """
        return pd.Series(self.data)

    def __eq__(self, other):
        """Samples are equal when both their names and data are equal."""
        # let Python fall back to its default comparison for objects
        # which do not look like a sample, instead of raising AttributeError
        if not (hasattr(other, 'name') and hasattr(other, 'data')):
            return NotImplemented
        return self.name == other.name and self.data == other.data

    def __repr__(self):
        return f'<Sample "{self.name}" with {len(self.data)} genes>'


def first_line(file_object):
    """Return the first non-empty line of a file and rewind the file.

    Lines consisting of whitespace only are skipped.

    Args:
        file_object: a readable, seekable file-like object

    Returns:
        the first line with non-whitespace content (including its line
        ending), or an empty result of ``readline()`` if there is none
    """
    while True:
        line = file_object.readline()
        # readline() returns an empty value only at the end of the file;
        # stopping there prevents an infinite loop on empty files
        if not line or line.strip():
            break

    # return to the beginning
    file_object.seek(0)

    return line


# TODO class variable with set of genes + method(s) for checking data integrity
class Phenotype:
    """Phenotype is a collection of samples of common origin or characteristic.

    An example phenotype can be:
        (Breast_cancer_sample_1, Breast_cancer_sample_2) named "Breast cancer".

        The common origin/characteristics for "Breast cancer" phenotype could be
        "a breast tumour", though samples had been collected from two donors.

    Another example are controls:
        (Control_sample_1, Control_sample_2) named "Control".

        The common characteristic for these samples is that both are controls.
    """

    def __init__(self, name, samples=None):
        """
        Args:
            name: name of the phenotype
            samples: list of samples (an empty list is used when omitted)
        """
        self.samples = samples or []
        self.name = name

    def as_array(self):
        """
        Returns: :class:`pandas.DataFrame` object with data for all samples
            (one column per sample, named after the sample).
        """
        return pd.DataFrame({
            sample.name: sample.as_array()
            for sample in self.samples
        })

    def __add__(self, other):
        """Join samples of both phenotypes, keeping the name of the left one."""
        return Phenotype(self.name, self.samples + other.samples)

    @classmethod
    def from_file(
            cls, name, file_object,
            columns_selector: Callable[[Sequence[int]], Sequence[int]]=None,
            samples=None, delimiter: str='\t', index_col: int=0,
            use_header=True, reverse_selection=False, prefix=None,
            header_line=0, description_column=None
    ):
        """Create a phenotype (collection of samples) from csv/tsv file.

        Args:
            name:
                a name of the phenotype (or group of samples) which will
                identify it (like "Tumour_1" or "Control_in_20_degrees")

            file_object: a file (containing gene expression)
                of the following structure:
                    - names of samples separated by the delimiter
                      in the first row,
                    - gene symbol/name followed by gene expression values
                      for every sample in remaining rows;

                an additional column "description" is allowed between genes
                column and sample columns, though it has to be explicitly
                declared with `description_column` argument.

            columns_selector:
                a function which will select (and return) a subset of
                provided column identifiers (do not use with `samples`)

            samples:
                a list of names of samples to extract from the file
                (do not use with `columns_selector`)

            reverse_selection:
                if you want to use all columns but the selected ones
                set this to True

            delimiter: the delimiter of the columns

            index_col: column to use as the gene names

            use_header: does the file have a header?

            prefix:
                prefix for custom samples naming schema; applied to
                the column numbers when the file has no header

            header_line: number of non-empty line with sample names

            description_column:
                is a column with descriptions present in the file
                (on the second position, after gene identifiers)?

        Raises:
            ValueError:
                if the file was read before, if `samples` are used
                without a header or if requested samples are missing
        """
        if file_object.tell() != 0:
            message = f'Passed file object: {file_object} was read before.'
            warn(message)
            raise ValueError(message)

        # NOTE(review): the header is always sniffed from the first
        # non-empty line, regardless of `header_line` - confirm.
        line = first_line(file_object)
        header_items = [item.strip() for item in line.split(delimiter)]
        gene_columns = [index_col]

        if description_column:
            # descriptions are expected on the second position
            description_column = 1
            gene_columns.append(description_column)
        elif any(item.lower() == 'description' for item in header_items):
            warn(
                'First line of your file contains "description" column, '
                'but you did not provide "--description_column" argument.'
            )

        # a reasonable assumption is that the columns with samples
        # start after columns with gene symbol and gene description
        column_shift = max(gene_columns) + 1

        if columns_selector:
            # identifiers (numbers) of all columns following gene columns;
            # the upper bound is the number of columns sniffed from the header
            all_sample_columns = list(range(column_shift, len(header_items)))

            # take the requested subset
            columns = columns_selector(all_sample_columns)

            if reverse_selection:
                excluded = set(columns)
                columns = [c for c in all_sample_columns if c not in excluded]

            # https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100
            columns = gene_columns + list(columns)
        else:
            columns = None

        if not use_header:
            if samples:
                raise ValueError(
                    'To select samples by their name, you need a file with '
                    'samples names in the header. If you use such file, '
                    'please set `use_header=True`, otherwise skip `samples` '
                    'in your arguments.'
                )
            if header_line:
                warn(
                    '`header_line` has no effect when '
                    '`use_header` is set to `False`'
                )

        # we could leave it to pandas, but it shows an ugly,
        # not very helpful message. It is better to show the
        # user where exactly the problem occurs.
        if samples:
            # NOTE(review): `reverse_selection` is not applied to `samples`
            available_samples = header_items[column_shift:]

            lacking_samples = set(samples) - set(available_samples)

            if lacking_samples:
                # in-memory streams do not have the `name` attribute
                file_name = getattr(file_object, 'name', file_object)
                raise ValueError(
                    f'Samples {lacking_samples} are not available in {file_name} file.\n'
                    f'Following samples were found: {", ".join(available_samples)}.'
                )

            if index_col:
                # TODO https://github.com/pandas-dev/pandas/issues/9098
                warn(
                    'Using "samples" with "index_col" 0 may cause an '
                    'unexpected behaviour due to an upstream issue in '
                    'pandas package (pandas-dev/pandas/issues/9098) '
                    'for pandas in versions older than 0.21.'
                )

            additional_column_names = [
                header_items[index]
                for index in gene_columns
            ]

            # https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100
            samples = additional_column_names + list(samples)

        # just to reassure that the pointer is on the beginning
        if file_object.tell() != 0:
            warn('Passed file object was read before.')

        if samples and columns:
            warn(
                'Please, provide either columns or samples, '
                'not both. We will use columns this time.'
            )

        data = pd.read_table(
            file_object,
            delimiter=delimiter,
            # None - do not use, 0 - use first row
            header=header_line if use_header else None,
            index_col=gene_columns,
            usecols=columns or samples
        )

        # the `prefix` argument of pandas readers is deprecated (pandas 1.4),
        # so the prefix is added to the column numbers after reading instead
        if prefix and not use_header:
            data.columns = [f'{prefix}_{column}' for column in data.columns]

        # `description_column` is 1 when descriptions were requested,
        # otherwise it keeps the falsy value given by the caller
        descriptions = bool(description_column)

        loaded_samples = [
            Sample.from_array(sample_name, sample_data, descriptions=descriptions)
            for sample_name, sample_data in data.items()
        ]

        return cls(name, loaded_samples)

    @classmethod
    def from_gsea_file(cls):
        """Stub: if we need to handle very specific files,
        for various analysis methods, we can extend Phenotype
        with class methods like from_gsea_file."""
        pass


# TODO class variable with set of genes + method(s) for checking data integrity
# TODO unify file reading with argument_parser
class Experiment:
    """Pairs a case phenotype with the control phenotype."""

    def __init__(self, case: 'Phenotype', control: 'Phenotype'):
        """
        Args:
            case: phenotype with samples of interest
            control: phenotype with control samples
        """
        self.control = control
        self.case = case

    def get_all(self):
        """Return control and case joined with ``+`` (control goes first)."""
        return self.control + self.case

    # TODO: are there many ways to compute fold-change?
    def get_fold_change(self, sample_from_case, use_log=False):
        """Compute fold change of a case sample against the control.

        Args:
            sample_from_case: a sample which belongs to the case phenotype
            use_log: passed to `calc_fold_change` as `use_log`

        Returns:
            the result of `calc_fold_change`
        """
        assert sample_from_case in self.case.samples
        # TODO: implement inline. The earlier sketch divided case values by
        #  the mean of the corresponding control row (with an arbitrary 0.01
        #  in place of a zero mean) and optionally applied np.log2, which
        #  emits a RuntimeWarning when zeros are encountered.
        # NOTE(review): `calc_fold_change` is neither defined nor imported
        #  in the visible part of this module - confirm where it lives.
        return calc_fold_change(sample_from_case, self.control, use_log=use_log)


class Study:
    """A set of experiments comparing each case against a shared control."""

    def __init__(self, cases: Sequence['Phenotype'], control: 'Phenotype'):
        """
        Args:
            cases: phenotypes to be compared with the control
            control: the control phenotype shared by all experiments
        """
        # one experiment per case; an empty list when there are no cases
        self.experiments = [Experiment(case, control) for case in cases]
Source code for models

Pathway Analysis

Navigation

Related Topics