Source code for cellphonedb.src.core.methods.cpdb_analysis_method


from typing import Tuple
import pandas as pd
import numpy as np
import pickle

from cellphonedb.src.core.core_logger import core_logger

from cellphonedb.src.core.exceptions.AllCountsFilteredException import AllCountsFilteredException
from cellphonedb.src.core.exceptions.MissingRequiredArgumentsException import MissingRequiredArgumentsException
from cellphonedb.src.core.methods import cpdb_statistical_analysis_complex_method
from cellphonedb.src.core.methods import cpdb_statistical_analysis_helper
from cellphonedb.src.core.models.complex import complex_helper
from cellphonedb.utils import db_utils, file_utils, scoring_utils


[docs]def call(
         cpdb_file_path: str = None,
         meta_file_path: str = None,
         counts_file_path: str = None,
         counts_data: str = None,
         output_path: str = None,
         microenvs_file_path: str = None,
         separator: str = "|",
         threshold: float = 0.1,
         result_precision: int = 3,
         debug: bool = False,
         output_suffix: str = None,
         score_interactions: bool = False,
         threads: int = 4
         ) -> dict:
    """Non-statistical method for analysis

    This methods calculates the mean and percent for the cluster interactions
    and for each gene interaction. No shuffling nor DEGs are involved.

    Parameters
    ----------
    cpdb_file_path: str
        CellphoneDB database file path
    meta_file_path: str
        Path to metadata csv file
    counts_file_path: str
        Path to counts csv file
    counts_data: str
        Type of gene identifiers in the counts data: "ensembl", "gene_name", "hgnc_symbol"
    output_path: str
        Output path used to store the analysis results (and to store intermediate files when debugging)
    microenvs_file_path: str, optional
        Path to Micro-environment file. Its content is used to limit cluster interactions
    separator: str
        Separator for pairs of genes (gene1|gene2) and clusters (cluster1|cluster2).
    threshold: float
        Percentage of cells expressing the specific ligand/receptor [0.0 - 1.0]
    result_precision: int
        Number of decimal digits in results.
    debug: bool
        Storge intermediate data as pickle file (debug_intermediate.pkl).
    output_suffix: str, optional
        Suffix to append to the result file's name (if not provided, timestamp will be used)
    score_interactions: bool
        If True, CellphoneDB interactions will be scored per cell type pair, and returned in interaction_scores_dict
    threads: int
        Number of threads to be used when scoring interactions
    Returns
    -------
    Dict with the following keys:
        - means_result
        - deconvoluted_result
        - deconvoluted_percents
        - interaction_scores_dict
    """
    analysis_result = {}
    core_logger.info(
        '[Non Statistical Method] Threshold:{} Precision:{}'.format(threshold,
                                                                    result_precision))
    # Report error unless the required arguments have been provided
    required_arguments = [cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path]
    if None in required_arguments or '' in required_arguments:
        raise MissingRequiredArgumentsException(description="All of the following arguments need to be provided: {}".format(
            "cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path"))

    # Load into memory CellphoneDB data
    interactions, genes, complex_compositions, complexes, gene_synonym2gene_name, receptor2tfs = \
        db_utils.get_interactions_genes_complex(cpdb_file_path)

    # Load user files into memory
    counts, meta, microenvs, degs, _ = file_utils.get_user_files(
        counts_fp=counts_file_path, meta_fp=meta_file_path, microenvs_fp=microenvs_file_path,
        gene_synonym2gene_name=gene_synonym2gene_name, counts_data=counts_data)

    # get reduced interactions (drop duplicates)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    # add id multidata and means to counts input
    counts, counts_relations = cpdb_statistical_analysis_helper.add_multidata_and_means_to_counts(
        counts, genes, counts_data)
    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)
    if interactions_filtered.empty:
        core_logger.info('No CellphoneDB interactions found in this input.')
        return analysis_result

    meta = meta.loc[counts.columns]
    # Make sure all cell types are strings
    meta['cell_type'] = meta['cell_type'].apply(str)
    if not microenvs.empty:
        microenvs['cell_type'] = microenvs['cell_type'].apply(str)

    complex_to_protein_row_ids = complex_helper.map_complex_to_protein_row_ids(complex_composition_filtered, counts_filtered)
    clusters = cpdb_statistical_analysis_helper.build_clusters(meta,
                                                               counts_filtered,
                                                               complex_to_protein_row_ids,
                                                               skip_percent=False)
    core_logger.info('Running Basic Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'], microenvs)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                   clusters,
                                                                   cluster_interactions,
                                                                   separator)

    percent_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                         threshold,
                                                                         interactions_filtered,
                                                                         cluster_interactions,
                                                                         separator)

    if debug:
        with open(f"{output_path}/debug_intermediate.pkl", "wb") as fh:
            pickle.dump({
                "genes": genes,
                "interactions": interactions,
                "interactions_filtered": interactions_filtered,
                "interactions_reduced": interactions_reduced,
                "complex_compositions": complex_compositions,
                "counts": counts,
                "counts_relations": counts_relations,
                "clusters_means_percents": clusters,
                "cluster_interactions": cluster_interactions,
                "base_result": base_result,
                "mean_analysis": mean_analysis,
                "percent_analysis": percent_analysis}, fh)

    means_result, significant_means, deconvoluted_result, deconvoluted_percents = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        mean_analysis,
        percent_analysis,
        clusters['means'],
        clusters['percents'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        counts_data
    )

    max_rank = significant_means['rank'].max()
    significant_means['rank'] = significant_means['rank'].apply(lambda rank: rank if rank != 0 else (1 + max_rank))
    significant_means.sort_values('rank', inplace=True)

    analysis_result['means_result'] = means_result
    analysis_result['deconvoluted'] = deconvoluted_result
    analysis_result['deconvoluted_percents'] = deconvoluted_percents

    if score_interactions:
        interaction_scores = scoring_utils.score_interactions_based_on_participant_expressions_product(
            cpdb_file_path, counts, means_result.copy(), separator, meta, threshold, "cell_type", threads)
        analysis_result['interaction_scores'] = interaction_scores

    file_utils.save_dfs_as_tsv(output_path, output_suffix, "simple_analysis", analysis_result)
    return analysis_result


def build_results(interactions: pd.DataFrame,
                  interactions_original: pd.DataFrame,
                  counts_relations: pd.DataFrame,
                  mean_analysis: pd.DataFrame,
                  percent_analysis: pd.DataFrame,
                  clusters_means: pd.DataFrame,
                  clusters_percents: pd.DataFrame,
                  complex_compositions: pd.DataFrame,
                  counts: pd.DataFrame,
                  genes: pd.DataFrame,
                  result_precision: int,
                  counts_data: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Sets the results data structure from method generated data.

    Results documents are defined by specs.

    Returns
    -------
    Tuple: A tuple containing the results for:
        - means
        - significant_means
        - deconvoluted
        - deconvoluted_percents
    """
    core_logger.info('Building results')
    interactions: pd.DataFrame = interactions_original.loc[interactions.index]
    interactions['interaction_index'] = interactions.index
    interactions = interactions.merge(counts_relations, how='left', left_on='multidata_1_id', right_on='id_multidata', )
    # The column drop below prevents: 'FutureWarning: Passing 'suffixes' which cause duplicate columns {'id_multidata_1'}
    # in the result is deprecated and will raise a MergeError in a future version.'
    interactions = interactions.drop('id_multidata', axis=1)
    interactions = interactions.merge(counts_relations, how='left', left_on='multidata_2_id', right_on='id_multidata',
                                      suffixes=('_1', '_2'))
    interactions.set_index('interaction_index', inplace=True, drop=True)

    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    def simple_complex_indicator(interaction: pd.Series, suffix: str) -> str:
        """
        Add simple/complex prefixes to interaction components
        """
        if interaction['is_complex{}'.format(suffix)]:
            return 'complex:{}'.format(interaction['name{}'.format(suffix)])

        return 'simple:{}'.format(interaction['name{}'.format(suffix)])

    interactions['partner_a'] = interactions.apply(lambda interaction: simple_complex_indicator(interaction, '_1'),
                                                   axis=1)
    interactions['partner_b'] = interactions.apply(lambda interaction: simple_complex_indicator(interaction, '_2'),
                                                   axis=1)

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        mean_analysis,
        percent_analysis)
    significant_means = significant_means.round(result_precision)

    gene_columns = ['{}_{}'.format('gene_name', suffix) for suffix in ('1', '2')]
    gene_renames = {column: 'gene_{}'.format(suffix) for column, suffix in zip(gene_columns, ['a', 'b'])}

    # Cater for DB version-dependent column names
    interaction_columns = []
    if 'directionality' in interactions.columns:
        interaction_columns = ['directionality', 'classification']
    # Remove superfluous columns
    interactions_data_result = pd.DataFrame(
        interactions[['id_cp_interaction', 'partner_a', 'partner_b', 'receptor_1', 'receptor_2', *gene_columns,
                      'annotation_strategy'] + interaction_columns].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secreted_1'] | interactions['secreted_2'])
    interactions_data_result['is_integrin'] = (interactions['integrin_1'] | interactions['integrin_2'])

    interactions_data_result.rename(
        columns={**gene_renames, 'receptor_1': 'receptor_a', 'receptor_2': 'receptor_b'},
        inplace=True)

    # Dedupe rows and filter only desired columns
    interactions_data_result.drop_duplicates(inplace=True)

    means_columns = ['id_cp_interaction', 'interacting_pair', 'partner_a', 'partner_b', 'gene_a', 'gene_b', 'secreted',
                     'receptor_a', 'receptor_b', 'annotation_strategy', 'is_integrin'] + interaction_columns

    interactions_data_result = interactions_data_result[means_columns]

    mean_analysis = mean_analysis.round(result_precision)

    # Round result decimals
    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)
    for key, cluster_percents in clusters_percents.items():
        clusters_percents[key] = cluster_percents.round(result_precision)

    # Document 2
    means_result = pd.concat([interactions_data_result, mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_means_result = pd.concat([interactions_data_result, significant_mean_rank, significant_means], axis=1,
                                         join='inner', sort=False)
    # Document 5
    deconvoluted_result, deconvoluted_percents = cpdb_statistical_analysis_complex_method.deconvoluted_complex_result_build(
        clusters_means,
        clusters_percents,
        interactions,
        complex_compositions,
        counts,
        genes,
        counts_data)

    return means_result, significant_means_result, deconvoluted_result, deconvoluted_percents


def deconvoluted_complex_result_build(clusters_means: dict, interactions: pd.DataFrame,
                                      complex_compositions: pd.DataFrame, counts: pd.DataFrame,
                                      genes: pd.DataFrame, counts_data: str) -> pd.DataFrame:
    genes_counts = list(counts.index)
    genes_filtered = genes[genes[counts_data].apply(lambda gene: gene in genes_counts)]

    deconvoluted_complex_result_1 = deconvolute_complex_interaction_component(complex_compositions, genes_filtered,
                                                                              interactions, '_1', counts_data)
    deconvoluted_simple_result_1 = deconvolute_interaction_component(interactions, '_1', counts_data)

    deconvoluted_complex_result_2 = deconvolute_complex_interaction_component(complex_compositions, genes_filtered,
                                                                              interactions, '_2', counts_data)
    deconvoluted_simple_result_2 = deconvolute_interaction_component(interactions, '_2', counts_data)

    deconvoluted_result = pd.concat([deconvoluted_complex_result_1, deconvoluted_simple_result_1,
                                     deconvoluted_complex_result_2, deconvoluted_simple_result_2], sort=False)

    deconvoluted_result.set_index('gene', inplace=True)

    cluster_counts = pd.DataFrame(index=deconvoluted_result.index)

    for key, cluster_means in clusters_means.items():
        cluster_counts[key] = cluster_means

    cluster_counts = cluster_counts.reindex(sorted(cluster_counts.columns), axis=1)

    # Here we sort and filter unwanted columns
    deconvoluted_columns = ['gene_name', 'name', 'is_complex', 'protein_name', 'complex_name', 'id_cp_interaction']

    deconvoluted_result = deconvoluted_result[deconvoluted_columns]
    deconvoluted_result.rename({'name': 'uniprot'}, axis=1, inplace=True)

    deconvoluted_result = pd.concat([deconvoluted_result, cluster_counts], axis=1, join='inner', sort=False)

    deconvoluted_result.reset_index(inplace=True)
    deconvoluted_result.drop(columns='gene', inplace=True)
    return deconvoluted_result


def deconvolute_interaction_component(interactions, suffix, counts_data):
    interactions = interactions[~interactions['is_complex{}'.format(suffix)]]
    deconvoluted_result = pd.DataFrame()
    deconvoluted_result['gene'] = interactions['{}{}'.format(counts_data, suffix)]

    deconvoluted_result[['protein_name', 'gene_name', 'name', 'is_complex', 'id_cp_interaction', 'receptor']] = \
        interactions[['protein_name{}'.format(suffix), 'gene_name{}'.format(suffix), 'name{}'.format(suffix),
                      'is_complex{}'.format(suffix), 'id_cp_interaction', 'receptor{}'.format(suffix)]]
    deconvoluted_result['complex_name'] = np.nan

    return deconvoluted_result


def deconvolute_complex_interaction_component(complex_compositions, genes_filtered, interactions, suffix, counts_data):
    deconvoluted_result = pd.DataFrame()
    component = pd.DataFrame()
    component[counts_data] = interactions['{}{}'.format(counts_data, suffix)]
    component[['protein_name', 'gene_name', 'name', 'is_complex', 'id_cp_interaction', 'id_multidata', 'receptor']] = \
        interactions[
            ['protein_name{}'.format(suffix), 'gene_name{}'.format(suffix),
             'name{}'.format(suffix), 'is_complex{}'.format(suffix), 'id_cp_interaction',
             'id_multidata{}'.format(suffix), 'receptor{}'.format(suffix)]]

    deconvolution_complex = pd.merge(complex_compositions, component, left_on='complex_multidata_id',
                                     right_on='id_multidata')
    deconvolution_complex = pd.merge(deconvolution_complex, genes_filtered, left_on='protein_multidata_id',
                                     right_on='protein_multidata_id', suffixes=['_complex', '_simple'])

    deconvoluted_result['gene'] = deconvolution_complex['{}_simple'.format(counts_data)]

    deconvoluted_result[
        ['protein_name', 'gene_name', 'name', 'is_complex', 'id_cp_interaction', 'receptor', 'complex_name']] = \
        deconvolution_complex[
            ['protein_name_simple', 'gene_name_simple', 'name_simple',
             'is_complex_complex', 'id_cp_interaction', 'receptor_simple', 'name_complex']]

    return deconvoluted_result