diff --git a/.github/workflows/tox-gha.yml b/.github/workflows/tox-gha.yml
index 696456d..6cf0ec4 100644
--- a/.github/workflows/tox-gha.yml
+++ b/.github/workflows/tox-gha.yml
@@ -2,10 +2,12 @@ name: Python package
 on:
   push:
+    branches: [main]
   pull_request:
-  # release:
-  #   types: [published]
-
+    branches: [main]
+  release:
+    types: [published]
+
 jobs:
   build:
     runs-on: ubuntu-latest
diff --git a/acore/decomposition/__init__.py b/acore/decomposition/__init__.py
new file mode 100644
index 0000000..9f86d45
--- /dev/null
+++ b/acore/decomposition/__init__.py
@@ -0,0 +1,3 @@
+from . import pca
+
+__all__ = ["pca"]
diff --git a/acore/decomposition/pca.py b/acore/decomposition/pca.py
index 229661d..0ae8102 100644
--- a/acore/decomposition/pca.py
+++ b/acore/decomposition/pca.py
@@ -26,10 +26,10 @@ def run_pca(
     n_comp_max = min(df_wide.shape)
     n_comp_max = min(n_comp_max, n_components)
     pca = sklearn.decomposition.PCA(n_components=n_comp_max)
-    PCs = pca.fit_transform(df_wide)
+    pcs = pca.fit_transform(df_wide)
     cols = [
         f"principal component {i+1} ({var_explained*100:.2f} %)"
         for i, var_explained in enumerate(pca.explained_variance_ratio_)
     ]
-    PCs = pd.DataFrame(PCs, index=df_wide.index, columns=cols)
-    return PCs, pca
+    pcs = pd.DataFrame(pcs, index=df_wide.index, columns=cols)
+    return pcs, pca
diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index a3f9082..6838d83 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -1,3 +1,12 @@
+"""Enrichment Analysis Module. Contains different functions to perform enrichment
+analysis.
+
+Most of the concepts in this module are covered in
+https://www.youtube.com/watch?v=2NC1QOXmc5o by Lars Juhl Jensen.
+"""
+
+from __future__ import annotations
+
 import os
 import re
 import uuid
@@ -9,17 +18,32 @@
 from acore.multiple_testing import apply_pvalue_correction

+TYPE_COLS_MSG = """
+columns: 'terms', 'identifiers', 'foreground',
+'background', 'foreground_pop', 'background_pop', 'pvalue', 'padj' and 'rejected'.
+"""
+
+
+def run_fisher(
+    group1: list[int],
+    group2: list[int],
+    alternative: str = "two-sided",
+) -> tuple[float, float]:
+    """Run Fisher's exact test on two groups using `scipy.stats.fisher_exact`_.

-def run_fisher(group1, group2, alternative="two-sided"):
-    """annotated       not-annotated
-    group1      a               b
-    group2      c               d
-    ------------------------------------
+    .. _scipy.stats.fisher_exact: https://docs.scipy.org/doc/scipy/reference/generated/\
+scipy.stats.fisher_exact.html
+
+    Example::
+
+        #          annotated   not-annotated
+        # group1       a             b
+        # group2       c             d

-    group1 = [a, b]
-    group2 = [c, d]
-    odds, pvalue = stats.fisher_exact([[a, b], [c, d]])
+        odds, pvalue = run_fisher(group1=[a, b],
+                                  group2=[c, d],
+                                  )
     """

     odds, pvalue = stats.fisher_exact([group1, group2], alternative)
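For concreteness, a minimal sketch of the new `run_fisher` wrapper on made-up counts (10 of 25 foreground features and 15 of 120 background features carry the annotation; each list is `[annotated, not-annotated]` for one group):

    from acore.enrichment_analysis import run_fisher

    odds, pvalue = run_fisher(group1=[10, 15], group2=[15, 105], alternative="greater")

@@ -29,15 +53,23 @@ def run_fisher(group1, group2, alternative="two-sided"):

 def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"):
     """
-    Compute the Kolmogorov-Smirnov statistic on 2 samples. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html
+    Compute the Kolmogorov-Smirnov statistic on 2 samples.
+    See `scipy.stats.ks_2samp`_
+
+    .. 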
_scipy.stats.ks_2samp: https://docs.scipy.org/doc/scipy/reference/generated/\ +scipy.stats.ks_2samp.html + - :param list dist1: sequence of 1-D ndarray (first distribution to compare) drawn from a continuous distribution - :param list dist2: sequence of 1-D ndarray (second distribution to compare) drawn from a continuous distribution + :param list dist1: sequence of 1-D ndarray (first distribution to compare) + drawn from a continuous distribution + :param list dist2: sequence of 1-D ndarray (second distribution to compare) + drawn from a continuous distribution :param str alternative: defines the alternative hypothesis (default is ‘two-sided’): * **'two-sided'** * **'less'** * **'greater'** :return: statistic float and KS statistic pvalue float Two-tailed p-value. + Example:: result = run_kolmogorov_smirnov(dist1, dist2, alternative='two-sided') @@ -49,193 +81,289 @@ def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"): return result +# ! undocumented for now (find usage example) def run_site_regulation_enrichment( - regulation_data, - annotation, - identifier="identifier", - groups=["group1", "group2"], - annotation_col="annotation", - reject_col="rejected", - group_col="group", - method="fisher", - regex=r"(\w+~.+)_\w\d+\-\w+", - correction="fdr_bh", + regulation_data: pd.DataFrame, + annotation: pd.DataFrame, + identifier: str = "identifier", + groups: list[str] = ("group1", "group2"), + annotation_col: str = "annotation", + rejected_col: str = "rejected", + group_col: str = "group", + method: str = "fisher", + regex: str = "(\\w+~.+)_\\w\\d+\\-\\w+", + correction: str = "fdr_bh", + remove_duplicates: bool = False, ): - """ - This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. - - :param regulation_data: pandas dataframe resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). - :param str identifier: name of the column from annotation containing feature identifiers. - :param list groups: column names from regulation_data containing group identifiers. - :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. - :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. - :param str method: method used to compute enrichment (only 'fisher' is supported currently). + r""" + This function runs a simple enrichment analysis for significantly + regulated protein sites in a dataset. + + :param regulation_data: pandas.DataFrame resulting from differential + regulation analysis. + :param annotation: pandas.DataFrame with annotations for features + (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). + :param str identifier: name of the column from annotation containing + feature identifiers. + :param list groups: column names from regulation_data containing + group identifiers. + :param str annotation_col: name of the column from annotation containing + annotation terms. + :param str rejected_col: name of the column from regulation_data containing + boolean for rejected null hypothesis. + :param str group_col: column name for new column in annotation dataframe + determining if feature belongs to foreground or background. 
+    :param str method: method used to compute enrichment
+        (only 'fisher' is supported currently).
     :param str regex: how to extract the annotated identifier from the site identifier
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'.
+    :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
+        'background', 'foreground_pop', 'background_pop', 'pvalue', 'padj' and 'rejected'.
+
+    :raises ValueError: if regulation_data is `None` or empty.

     Example::

-        result = run_site_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', match="(\w+~.+)_\w\d+\-\w+")
+        result = run_site_regulation_enrichment(regulation_data,
+                                                annotation,
+                                                identifier='identifier',
+                                                groups=['group1', 'group2'],
+                                                annotation_col='annotation',
+                                                rejected_col='rejected',
+                                                group_col='group',
+                                                method='fisher',
+                                                regex="(\\w+~.+)_\\w\\d+\\-\\w+"
+                                                )
     """
     result = pd.DataFrame()
-    if regulation_data is not None:
-        if not regulation_data.empty:
-            new_ids = []
-            for ident in regulation_data[identifier].tolist():
-                match = re.search(regex, ident)
-                if match is not None:
-                    new_ids.append(match.group(1))
-                else:
-                    new_ids.append(ident)
-            regulation_data[identifier] = new_ids
-            regulation_data = regulation_data.drop_duplicates()
-            result = run_regulation_enrichment(
-                regulation_data,
-                annotation,
-                identifier,
-                groups,
-                annotation_col,
-                reject_col,
-                group_col,
-                method,
-                correction,
-            )
+    if regulation_data is None or regulation_data.empty:
+        raise ValueError("regulation_data is empty")
+
+    new_ids = []
+    # find any identifiers with a PTM and keep only the protein+gene identifier,
+    # e.g. 'P04637~TP53_S15-p' -> 'P04637~TP53' (site-identifier shape assumed by the regex)
+    for ident in regulation_data[identifier].tolist():
+        match = re.search(regex, ident)
+        if match is not None:
+            new_ids.append(
+                match.group(1)
+            )  # removes the PTM extension of the identifier of CKG
+        else:
+            new_ids.append(ident)
+    # this normalizes the identifiers to ignore the PTM extension
+    regulation_data[identifier] = new_ids  # matches are used as identifiers
+    if remove_duplicates:
+        regulation_data = regulation_data.drop_duplicates(subset=[identifier])
+    # run_regulation_enrichment no longer takes `groups`, so the remaining
+    # arguments are passed by keyword to avoid misaligned positional arguments
+    result = run_regulation_enrichment(
+        regulation_data,
+        annotation,
+        identifier=identifier,
+        annotation_col=annotation_col,
+        rejected_col=rejected_col,
+        group_col=group_col,
+        method=method,
+        correction=correction,
+    )
     return result


 def run_up_down_regulation_enrichment(
-    regulation_data,
-    annotation,
-    identifier="identifier",
-    groups=["group1", "group2"],
-    annotation_col="annotation",
-    reject_col="rejected",
-    group_col="group",
-    method="fisher",
-    correction="fdr_bh",
-    alpha=0.05,
-    lfc_cutoff=1,
-):
+    regulation_data: pd.DataFrame,
+    annotation: pd.DataFrame,
+    identifier: str = "identifier",
+    groups: list[str] = ("group1", "group2"),
+    annotation_col: str = "annotation",
+    # rejected_col:str="rejected",
+    group_col: str = "group",
+    method: str = "fisher",
+    min_detected_in_set: int = 2,
+    correction: str = "fdr_bh",
+    correction_alpha: float = 0.05,
+    lfc_cutoff: float = 1,
+) -> pd.DataFrame:
     """
-    This function runs a simple enrichment analysis for significantly regulated proteins distinguishing between up- and down-regulated.
+    This function runs a simple enrichment analysis for significantly regulated proteins,
+    distinguishing between up- and down-regulated.

-    :param regulation_data: pandas dataframe resulting from differential regulation analysis (CKG's regulation table).
-    :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
+    :param pandas.DataFrame regulation_data: pandas.DataFrame resulting from differential regulation
+        analysis (CKG's regulation table).
+    :param pandas.DataFrame annotation: pandas.DataFrame with annotations for features
+        (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
-    :param list groups: column names from regulation_data containing group identifiers.
+    :param list[str] groups: column names from regulation_data containing group identifiers.
+        See `pandas.DataFrame.groupby`_ for more information.
+
+    .. _pandas.DataFrame.groupby: https://pandas.pydata.org/pandas-docs/stable/\
+reference/api/pandas.DataFrame.groupby.html
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulation_data containing boolean for rejected null hypothesis.
-    :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background.
-    :param str method: method used to compute enrichment (only 'fisher' is supported currently).
+    :param str rejected_col: name of the column from regulation_data containing a boolean for
+        the rejected null hypothesis (currently determined internally per direction,
+        see the commented-out parameter in the signature).
+    :param str group_col: column name for new column in annotation dataframe determining
+        if feature belongs to foreground or background.
+    :param str method: method used to compute enrichment
+        (only 'fisher' is supported currently).
+    :param int min_detected_in_set: minimum number of foreground features annotated with
+        a term for the term to be tested.
     :param str correction: method to be used for multiple-testing correction
-    :param float alpha: adjusted p-value cutoff to define significance
+    :param float correction_alpha: adjusted p-value cutoff to define significance
     :param float lfc_cutoff: log fold-change cutoff to define practical significance
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'.
+    :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
+        'background', 'pvalue', 'padj', 'rejected', 'direction' and 'comparison'.
     Example::

-        result = run_up_down_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', correction='fdr_bh', alpha=0.05, lfc_cutoff=1)
+        result = run_up_down_regulation_enrichment(
+            regulation_data,
+            annotation,
+            identifier='identifier',
+            groups=['group1', 'group2'],
+            annotation_col='annotation',
+            group_col='group',
+            method='fisher',
+            correction='fdr_bh',
+            correction_alpha=0.05,
+            lfc_cutoff=1,
+        )
     """
-    enrichment_results = {}
+    if isinstance(groups, str):
+        groups = [groups]
+    if isinstance(groups, tuple):
+        groups = list(groups)
+    if len(groups) != 2:
+        raise ValueError("groups should contain exactly two columns.")
+
+    ret = list()
+    # In case of multiple comparisons this is used to get all possible combinations
     for g1, g2 in regulation_data.groupby(groups).groups:
+        df = regulation_data.groupby(groups).get_group((g1, g2))
+
+        padj_name = "padj"
         if "posthoc padj" in df:
-            df["up_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & (
-                df["log2FC"] >= lfc_cutoff
-            )
-            df["down_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & (
-                df["log2FC"] <= -lfc_cutoff
-            )
-        else:
-            df["up_pairwise_regulation"] = (df["padj"] <= alpha) & (
-                df["log2FC"] >= lfc_cutoff
-            )
-            df["down_pairwise_regulation"] = (df["padj"] <= alpha) & (
-                df["log2FC"] <= -lfc_cutoff
-            )
+            padj_name = "posthoc padj"

-        enrichment = run_regulation_enrichment(
-            df,
-            annotation,
-            identifier=identifier,
-            groups=groups,
-            annotation_col=annotation_col,
-            reject_col="up_pairwise_regulation",
-            group_col=group_col,
-            method=method,
-            correction=correction,
-        )
-        enrichment["direction"] = "upregulated"
-        enrichment_results[g1 + "~" + g2] = enrichment
-        enrichment = run_regulation_enrichment(
-            df,
-            annotation,
-            identifier=identifier,
-            groups=groups,
-            annotation_col=annotation_col,
-            reject_col="down_pairwise_regulation",
-            group_col=group_col,
-            method=method,
-            correction=correction,
+        df["up_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & (
+            df["log2FC"] >= lfc_cutoff
         )
-        enrichment["direction"] = "downregulated"
-        enrichment_results[g1 + "~" + g2] = enrichment_results[g1 + "~" + g2].append(
-            enrichment
+        df["down_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & (
+            df["log2FC"] <= -lfc_cutoff
         )
+        comparison_tag = g1 + "~" + g2
+
+        for rej_col, direction in zip(
+            ("up_pairwise_regulation", "down_pairwise_regulation"),
+            ("upregulated", "downregulated"),
+        ):
+            _enrichment = run_regulation_enrichment(
+                df,
+                annotation,
+                identifier=identifier,
+                annotation_col=annotation_col,
+                rejected_col=rej_col,
+                group_col=group_col,
+                method=method,
+                min_detected_in_set=min_detected_in_set,
+                correction=correction,
+                correction_alpha=correction_alpha,
+            )
+            _enrichment["direction"] = direction
+            _enrichment["comparison"] = comparison_tag
+            ret.append(_enrichment)
+
+    ret = pd.concat(ret)
+
+    return ret

-    return enrichment_results
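A runnable toy sketch of the inputs `run_up_down_regulation_enrichment` expects (column names follow the defaults in the signature above; all values are invented):

    import pandas as pd
    from acore.enrichment_analysis import run_up_down_regulation_enrichment

    regulation_data = pd.DataFrame(
        {
            "identifier": ["P1", "P2", "P3", "P4"],
            "group1": ["A"] * 4,
            "group2": ["B"] * 4,
            "padj": [0.01, 0.20, 0.03, 0.04],
            "log2FC": [2.5, 0.1, -1.8, 1.2],
        }
    )
    annotation = pd.DataFrame(
        {
            "identifier": ["P1", "P2", "P3", "P4"],
            "annotation": ["pathway X", "pathway X", "pathway Y", "pathway X"],
            "source": ["toy"] * 4,
        }
    )
    # one comparison ("A~B"), enrichment run once per direction
    result = run_up_down_regulation_enrichment(
        regulation_data, annotation, min_detected_in_set=1, lfc_cutoff=1
    )

+
+
+# ! to move
+def _annotate_features(
+    features: pd.Series,
+    in_foreground: set[str] | list[str],
+    in_background: set[str] | list[str],
+) -> pd.Series:
+    """
+    Annotate features as foreground or background based on their presence in the
+    foreground and background lists.
+
+    :param features: pandas.Series with features and their annotations.
+    :param in_foreground: list of features identifiers in the foreground. 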
+ :type in_foreground: set or list-like + :param in_background: list of features identifiers in the background. + :type in_background: set or list-like + :return: pandas.Series containing 'foreground' or 'background'. + missing values are preserved. + + Example:: + + result = _annotate_features(features, in_foreground, in_background) + """ + in_either_or = features.isin(in_foreground) | features.isin(in_background) + res = ( + features.where(in_either_or, np.nan) + .mask(features.isin(in_foreground), "foreground") + .mask(features.isin(in_background), "background") + ) + return res def run_regulation_enrichment( - regulation_data, - annotation, - identifier="identifier", - groups=["group1", "group2"], - annotation_col="annotation", - reject_col="rejected", - group_col="group", - method="fisher", - correction="fdr_bh", -): + regulation_data: pd.DataFrame, + annotation: pd.DataFrame, + identifier: str = "identifier", + annotation_col: str = "annotation", + rejected_col: str = "rejected", + group_col: str = "group", + method: str = "fisher", + min_detected_in_set: int = 2, + correction: str = "fdr_bh", + correction_alpha: float = 0.05, +) -> pd.DataFrame: """ - This function runs a simple enrichment analysis for significantly regulated features in a dataset. + This function runs a simple enrichment analysis for significantly regulated features + in a dataset. - :param regulation_data: pandas dataframe resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). + :param regulation_data: pandas.DataFrame resulting from differential regulation analysis. + :param annotation: pandas.DataFrame with annotations for features + (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. - :param list groups: column names from regulation_data containing group identifiers. + It should also be present in `regulation_data`. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. - :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. + :param str rejected_col: name of the column from `regulation_data` containing boolean for + rejected null hypothesis. + :param str group_col: column name for new column in annotation dataframe determining + if feature belongs to foreground or background. :param str method: method used to compute enrichment (only 'fisher' is supported currently). :param str correction: method to be used for multiple-testing correction - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'. + :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', + 'background', 'foreground_pop', 'background_pop', 'pvalue', 'padj' and 'rejected'. 
     Example::

-        result = run_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher')
+        result = run_regulation_enrichment(
+            regulation_data,
+            annotation,
+            identifier='identifier',
+            annotation_col='annotation',
+            rejected_col='rejected',
+            group_col='group',
+            method='fisher',
+            min_detected_in_set=2,
+            correction='fdr_bh',
+            correction_alpha=0.05,
+        )
     """
-    result = {}
-    foreground_list = (
-        regulation_data[regulation_data[reject_col]][identifier].unique().tolist()
-    )
-    background_list = (
-        regulation_data[~regulation_data[reject_col]][identifier].unique().tolist()
-    )
+    # ? can we remove NA features in that column?
+    mask_rejected = regulation_data[rejected_col].astype(bool)
+    foreground_list = regulation_data.loc[mask_rejected, identifier].unique()
+    background_list = regulation_data.loc[~mask_rejected, identifier].unique()
     foreground_pop = len(foreground_list)
-    background_pop = len(regulation_data[identifier].unique().tolist())
-    grouping = []
-    for i in annotation[identifier]:
-        if i in foreground_list:
-            grouping.append("foreground")
-        elif i in background_list:
-            grouping.append("background")
-        else:
-            grouping.append(np.nan)
-    annotation[group_col] = grouping
+    background_pop = len(regulation_data[identifier].unique())
+    # needs to allow for missing annotations
+    annotation[group_col] = _annotate_features(
+        features=annotation[identifier],
+        in_foreground=foreground_list,
+        in_background=background_list,
+    )
     annotation = annotation.dropna(subset=[group_col])

     result = run_enrichment(
@@ -249,66 +377,92 @@ def run_regulation_enrichment(
         identifier_col=identifier,
         method=method,
         correction=correction,
+        min_detected_in_set=min_detected_in_set,
+        correction_alpha=correction_alpha,
     )

     return result


 def run_enrichment(
-    data,
-    foreground_id,
-    background_id,
-    foreground_pop,
-    background_pop,
-    annotation_col="annotation",
-    group_col="group",
-    identifier_col="identifier",
-    method="fisher",
-    correction="fdr_bh",
-):
+    data: pd.DataFrame,
+    foreground_id: str,
+    background_id: str,
+    foreground_pop: int,
+    background_pop: int,
+    min_detected_in_set: int = 2,
+    annotation_col: str = "annotation",
+    group_col: str = "group",
+    identifier_col: str = "identifier",
+    method: str = "fisher",
+    correction: str = "fdr_bh",
+    correction_alpha: float = 0.05,
+) -> pd.DataFrame:
     """
-    Computes enrichment of the foreground relative to a given backgroung, using Fisher's exact test, and corrects for multiple hypothesis testing.
+    Computes enrichment of the foreground relative to a given background,
+    using Fisher's exact test, and corrects for multiple hypothesis testing.

-    :param data: pandas dataframe with annotations for dataset features (columns: 'annotation', 'identifier', 'source', 'group').
+    :param data: pandas.DataFrame with annotations for dataset features
+        (columns: 'annotation', 'identifier', 'group').
     :param str foreground_id: group identifier of features that belong to the foreground.
     :param str background_id: group identifier of features that belong to the background.
+    :param int foreground_pop: number of features in the foreground.
+    :param int background_pop: number of features in the total population
+        (foreground plus background).
     :param str annotation_col: name of the column containing annotation terms.
     :param str group_col: name of column containing the group identifiers.
     :param str identifier_col: name of column containing dependent variables identifiers.
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
-    :return: Pandas dataframe with annotation terms, features, number of foregroung/background features in each term, p-values and corrected p-values (columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected').
+    :param str correction: method to be used for multiple-testing correction.
+    :param float correction_alpha: adjusted p-value cutoff to define significance.
+    :return: pandas.DataFrame with columns: annotation terms, features,
+        number of foreground/background features in each term,
+        p-values and corrected p-values
+        (columns: 'terms', 'identifiers', 'foreground',
+        'background', 'pvalue', 'padj' and 'rejected').

     Example::

-        result = run_enrichment(data, foreground='foreground', background='background', foreground_pop=len(foreground_list), background_pop=len(background_list), annotation_col='annotation', group_col='group', identifier_col='identifier', method='fisher')
+        result = run_enrichment(
+            data,
+            foreground_id='foreground',
+            background_id='background',
+            foreground_pop=len(foreground_list),
+            background_pop=len(background_list),
+            annotation_col='annotation',
+            group_col='group',
+            identifier_col='identifier',
+            method='fisher',
+        )
     """
+    if method != "fisher":
+        raise ValueError("Only Fisher's exact test is supported at the moment.")
+
     result = pd.DataFrame()
-    df = data.copy()
     terms = []
     ids = []
     pvalues = []
     fnum = []
     bnum = []
     countsdf = (
-        df.groupby([annotation_col, group_col])
+        data.groupby([annotation_col, group_col])
         .agg(["count"])[(identifier_col, "count")]
         .reset_index()
     )
     countsdf.columns = [annotation_col, group_col, "count"]
-    for annotation in (
-        countsdf[countsdf[group_col] == foreground_id][annotation_col].unique().tolist()
-    ):
+    for annotation in countsdf.loc[
+        countsdf[group_col] == foreground_id, annotation_col
+    ].unique():
         counts = countsdf[countsdf[annotation_col] == annotation]
         num_foreground = counts.loc[counts[group_col] == foreground_id, "count"].values
         num_background = counts.loc[counts[group_col] == background_id, "count"].values
-
+        # ! counts should always be of length one; squeeze?
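+        # The 2x2 table passed to run_fisher below reads (as inferred from the call):
+        #                    annotated        not annotated
+        #   foreground   num_foreground   foreground_pop - num_foreground
+        #   background   num_background   background_pop - foreground_pop - num_background
+        # background_pop counts the whole population, so the background-only
+        # population is background_pop - foreground_pop.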
         if len(num_foreground) == 1:
             num_foreground = num_foreground[0]
         if len(num_background) == 1:
             num_background = num_background[0]
         else:
             num_background = 0
-        if method == "fisher" and num_foreground > 1:
+        if num_foreground >= min_detected_in_set:
             _, pvalue = run_fisher(
                 [num_foreground, foreground_pop - num_foreground],
                 [num_background, background_pop - foreground_pop - num_background],
@@ -319,15 +473,19 @@
             pvalues.append(pvalue)
             ids.append(
                 ",".join(
-                    df.loc[
-                        (df[annotation_col] == annotation)
-                        & (df[group_col] == foreground_id),
+                    data.loc[
+                        (data[annotation_col] == annotation)
+                        & (data[group_col] == foreground_id),
                         identifier_col,
-                    ].tolist()
+                    ]
                 )
             )
-    if len(pvalues) > 1:
-        rejected, padj = apply_pvalue_correction(pvalues, alpha=0.05, method=correction)
+    if len(pvalues) >= 1:
+        rejected, padj = apply_pvalue_correction(
+            pvalues,
+            alpha=correction_alpha,
+            method=correction,
+        )
         result = pd.DataFrame(
             {
                 "terms": terms,
@@ -338,7 +496,7 @@
                 "background_pop": background_pop,
                 "pvalue": pvalues,
                 "padj": padj,
-                "rejected": rejected,
+                "rejected": rejected.astype(bool),
             }
         )
         result = result.sort_values(by="padj", ascending=True)

     return result


 def run_ssgsea(
-    data,
-    annotation,
-    annotation_col="an notation",
-    identifier_col="identifier",
-    set_index=[],
-    outdir="tmp",
-    min_size=15,
-    max_size=500,
-    scale=False,
-    permutations=0,
-):
+    data: pd.DataFrame,
+    annotation: pd.DataFrame,
+    set_index: list[str] | None = None,
+    annotation_col: str = "annotation",
+    identifier_col: str = "identifier",
+    outdir: str = "tmp",
+    min_size: int = 15,
+    max_size: int = 500,
+    scale: bool = False,
+    permutations: int = 0,
+) -> pd.DataFrame:
     """
-    Project each sample within a data set onto a space of gene set enrichment scores using the ssGSEA projection methodology described in Barbie et al., 2009.
-
-    :param data: pandas dataframe with the quantified features (i.e. subject x proteins)
-    :param annotation: pandas dataframe with the annotation to be used in the enrichment (i.e. CKG pathway annotation file)
+    Project each sample within a data set onto a space of gene set enrichment scores using
+    the single sample gene set enrichment analysis (ssGSEA) projection methodology
+    described in Barbie et al., 2009:
+    https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA).
+
+    :param pandas.DataFrame data: pandas.DataFrame with the quantified features
+        (i.e. subject x proteins)
+    :param pandas.DataFrame annotation: pandas.DataFrame with the annotation to be used
+        in the enrichment (i.e. CKG pathway annotation file)
+    :param list[str] set_index: column/s to be used as index. Enrichment will be calculated
+        for these values (i.e. ["subject"] will return a subjects x pathways matrix of
+        enrichment scores)
     :param str annotation_col: name of the column containing annotation terms.
     :param str identifier_col: name of column containing dependent variables identifiers.
-    :param list set_index: column/s to be used as index. Enrichment will be calculated for these values (i.e ["subject"] will return subjects x pathways matrix of enrichment scores)
-    :param str out_dir: directory path where results will be stored (default None, tmp folder is used)
-    :param int min_size: minimum number of features (i.e. proteins) in enriched terms (i.e. pathways)
-    :param int max_size: maximum number of features (i.e. pathways)
+    :param str outdir: directory path where results will be stored
+        (default: "tmp")
+    :param int min_size: minimum number of features (i.e. proteins) in enriched terms
+        (i.e. pathways)
+    :param int max_size: maximum number of features (i.e. proteins) in enriched terms
+        (i.e. pathways)
     :param bool scale: whether or not to scale the data
     :param int permutations: number of permutations used in the ssgsea analysis
-    :return: dictionary with two dataframes: es - enrichment scores, and nes - normalized enrichment scores.
+    :return: pandas.DataFrame containing unnormalized enrichment scores (`ES`) for each sample,
+        and normalized enrichment scores (`NES`) with the enriched `Term` and sample `Name`.
+    :rtype: pandas.DataFrame

     Example::

+        stproject = "P0000008"
-        p = project.Project(stproject, datasets={}, knowledge=None, report={}, configuration_files=None)
+        p = project.Project(
+            stproject,
+            datasets={},
+            knowledge=None,
+            report={},
+            configuration_files=None,
+        )

         p.build_project(False)
         p.generate_report()

         annotations = proteomics_dataset.get_dataframe("pathway annotation")
         processed = proteomics_dataset.get_dataframe('processed')

-        result = run_ssgsea(processed, annotations, annotation_col='annotation', identifier_col='identifier', set_index=['group', 'sample','subject'], outdir=None, min_size=10, scale=False, permutations=0)
-
+        result = run_ssgsea(
+            processed,
+            annotations,
+            annotation_col='annotation',
+            identifier_col='identifier',
+            set_index=['group', 'sample', 'subject'],
+            outdir='tmp',
+            min_size=10,
+            scale=False,
+            permutations=0,
+        )
     """
-    result = {}
     df = data.copy()

     if not os.path.exists(outdir):
         os.makedirs(outdir)

+    # Combine columns to create a unique name for each set (?)
name = [] - index = data[set_index] - for i, row in data[set_index].iterrows(): - name.append("_".join(row[set_index].tolist())) + if set_index is None: + index = data.index.to_frame() + set_index = index.columns.tolist() + else: + index = data[set_index] + df = df.drop(set_index, axis=1) + + for _, row in index.iterrows(): + name.append( + "_".join(row[set_index].tolist()) + ) # this assumes strings as identifiers df["Name"] = name index.index = name - df = df.drop(set_index, axis=1).set_index("Name").transpose() + df = df.set_index("Name").transpose() - if annotation_col in annotation and identifier_col in annotation: - grouped_annotations = ( - annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index() + if not annotation_col in annotation: + raise ValueError( + f"Missing Annotation Column: {annotation_col} as specified by `annotation_col`" ) - fid = uuid.uuid4() - file_path = os.path.join(outdir, str(fid) + ".gmt") - with open(file_path, "w") as out: - for i, row in grouped_annotations.iterrows(): - out.write( - row[annotation_col] - + "\t" - + "\t".join(list(filter(None, row[identifier_col]))) - + "\n" - ) - try: - enrichment = gp.ssgsea( - data=df, - gene_sets=str(file_path), - outdir=outdir, - min_size=min_size, - max_size=max_size, - scale=scale, - permutation_num=permutations, - no_plot=True, - processes=1, - seed=10, - format="png", - ) - - enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose() - enrichment_es = enrichment_es.join(index) - enrichment_nes = enrichment.res2d.transpose() - enrichment_nes = enrichment_nes.join(index) - result = {"es": enrichment_es, "nes": enrichment_nes} - except Exception as e: - print("Error in ssGSEA.", e) - df = None + if not identifier_col in annotation: + raise ValueError( + f"Missing Identifier Column: {identifier_col} as specified by `identifier_col`" + ) + grouped_annotations = ( + annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index() + ) + fid = uuid.uuid4() + file_path = os.path.join(outdir, str(fid) + ".gmt") + with open(file_path, "w", encoding="utf8") as out: + for _, row in grouped_annotations.iterrows(): + out.write( + row[annotation_col] + + "\t" + + "\t".join(list(filter(None, row[identifier_col]))) + + "\n" + ) + enrichment = gp.ssgsea( + data=df, + gene_sets=str(file_path), + outdir=outdir, + min_size=min_size, + max_size=max_size, + scale=scale, + permutation_num=permutations, + no_plot=True, + processes=1, + seed=10, + format="png", + ) + result = pd.DataFrame(enrichment.res2d).set_index("Name") + # potentially return wide format in separate format + # result = {"es": enrichment_es, "nes": enrichment_nes} return result diff --git a/acore/exploratory_analysis.py b/acore/exploratory_analysis.py index b0a09da..356e4a9 100644 --- a/acore/exploratory_analysis.py +++ b/acore/exploratory_analysis.py @@ -6,13 +6,13 @@ from sklearn.manifold import TSNE -def calculate_coefficient_variation(values): +def calculate_coefficient_variation(values: np.ndarray) -> np.ndarray: """ Compute the coefficient of variation, the ratio of the biased standard deviation to the mean, in percentage. For more information visit https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.variation.html. - :param numpy.ndarray values: numpy array + :param numpy.ndarray values: numpy array of log2 transformed values :return: The calculated variation along rows. 
:rtype: numpy.ndarray @@ -29,13 +29,15 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]): """ Extracts the coefficients of variation in each group. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_columns: column labels to be dropped from the dataframe :param str group: column label containing group identifiers. :param list columns: names to use for the variable column(s), and for the value column(s) - :return: Pandas dataframe with columns 'name' (protein identifier), 'x' (coefficient of variation), 'y' (mean) and 'group'. + :return: Pandas dataframe with columns 'name' (protein identifier), + 'x' (coefficient of variation), 'y' (mean) and 'group'. - Exmaple:: + Example:: result = get_coefficient_variation(data, drop_columns=['sample', 'subject'], group='group') """ @@ -46,24 +48,27 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]): cvs_df = pd.DataFrame() for i in cvs.index: gcvs = cvs[i].tolist() - ints = formated_df.set_index("group").mean().values.tolist() + ints = formated_df.set_index(group).mean().values.tolist() tdf = pd.DataFrame(data={"name": cols, "x": gcvs, "y": ints}) - tdf["group"] = i + tdf[group] = i if cvs_df.empty: cvs_df = tdf.copy() else: - cvs_df = cvs_df.append(tdf) + cvs_df = pd.concat([cvs_df, tdf]) return cvs_df def extract_number_missing(data, min_valid, drop_cols=["sample"], group="group"): """ - Counts how many valid values exist in each column and filters column labels with more valid values than the minimum threshold defined. + Counts how many valid values exist in each column and filters column labels with more + valid values than the minimum threshold defined. :param data: pandas DataFrame with group as rows and protein identifier as column. - :param str group: column label containing group identifiers. If None, number of valid values is counted across all samples, otherwise is counted per unique group identifier. + :param str group: column label containing group identifiers. + If None, number of valid values is counted across all samples, + otherwise is counted per unique group identifier. :param int min_valid: minimum number of valid values to be filtered. :param list drop_columns: column labels to be dropped. :return: List of column labels above the threshold. @@ -89,16 +94,21 @@ def extract_percentage_missing( data, missing_max, drop_cols=["sample"], group="group", how="all" ): """ - Extracts ratio of missing/valid values in each column and filters column labels with lower ratio than the minimum threshold defined. + Extracts ratio of missing/valid values in each column and filters column labels with + lower ratio than the minimum threshold defined. :param data: pandas dataframe with group as rows and protein identifier as column. - :param str group: column label containing group identifiers. If None, ratio is calculated across all samples, otherwise is calculated per unique group identifier. + :param str group: column label containing group identifiers. + If None, ratio is calculated across all samples, + otherwise is calculated per unique group identifier. :param float missing_max: maximum ratio of missing/valid values to be filtered. 
- :param str how: define if labels with a higher percentage of missing values than the threshold in any group ('any') or in all groups ('all') should be filtered + :param str how: define if labels with a higher percentage of missing values than the threshold + in any group ('any') or in all groups ('all') should be filtered :return: List of column labels below the threshold. Example:: - result = extract_percentage_missing(data, missing_max=0.3, drop_cols=['sample'], group='group') + result = extract_percentage_missing(data, missing_max=0.3, + drop_cols=['sample'], group='group') """ if group is None: groups = data.loc[:, data.isnull().mean() <= missing_max].columns @@ -128,81 +138,86 @@ def run_pca( dropna=True, ): """ - Performs principal component analysis and returns the values of each component for each sample and each protein, and the loadings for each protein. \ - For information visit https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html. + Performs principal component analysis and returns the values of each component for each sample + and each protein, and the loadings for each protein. + + For information visit + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_cols: column labels to be dropped from the dataframe. :param str group: column label containing group identifiers. :param list annotation_cols: list of columns to be added in the scatter plot annotation :param int components: number of components to keep. :param bool dropna: if True removes all columns with any missing values. - :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2) xaxis and yaxis titles with components loadings for plotly. + :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2) + xaxis and yaxis titles with components loadings for plotly. 
Example:: - result = run_pca(data, drop_cols=['sample', 'subject'], group='group', components=2, dropna=True) + result = run_pca(data, drop_cols=['sample', 'subject'], group='group', + components=2, dropna=True) """ np.random.seed(112736) - resultDf = pd.DataFrame() - loadings = pd.DataFrame() var_exp = [] args = {} - if not data.empty: - df = data.copy() - annotations = pd.DataFrame() - if annotation_cols is not None: - if len(list(set(annotation_cols).intersection(data.columns))) > 0: - annotations = data.set_index(group)[annotation_cols] - drop_cols_int = list(set(drop_cols).intersection(df.columns)) - if len(drop_cols_int) > 0: - df = df.drop(drop_cols_int, axis=1) - - y = df[group].tolist() - df = df.set_index(group) - df = df.select_dtypes(["number"]) - if dropna: - df = df.dropna(axis=1) - X = df.values - - if X.size > 0 and X.shape[1] > components: - pca = PCA(n_components=components) - X = pca.fit_transform(X) - var_exp = pca.explained_variance_ratio_ - loadings = pd.DataFrame(pca.components_.transpose()) - loadings.index = df.columns - values = { - index: np.sqrt(np.power(row, 2).sum()) - for index, row in loadings.iterrows() - } - loadings["value"] = loadings.index.map(values.get) - loadings = loadings.sort_values(by="value", ascending=False) - args = { - "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]), - "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]), - "group": "group", - } - if components == 2: - resultDf = pd.DataFrame(X, index=y, columns=["x", "y"]) - resultDf = resultDf.assign(**annotations) - resultDf = resultDf.reset_index() - resultDf.columns = ["group", "x", "y"] + annotation_cols - - loadings.columns = ["x", "y", "value"] - if components > 2: - args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])}) - resultDf = pd.DataFrame(X, index=y) - resultDf = resultDf.assign(**annotations) - resultDf = resultDf.reset_index() - pca_cols = [] - loading_cols = [] - if components > 3: - pca_cols = [str(i) for i in resultDf.columns[4:]] - loading_cols = [str(i) for i in loadings.columns[3:]] - - resultDf.columns = ["group", "x", "y", "z"] + pca_cols - loadings.columns = ["x", "y", "z"] + loading_cols + if data.empty: + raise ValueError("Dataframe is empty.") + + df = data.copy() + annotations = pd.DataFrame() + if annotation_cols is not None: + if len(list(set(annotation_cols).intersection(data.columns))) > 0: + annotations = data.set_index(group)[annotation_cols] + drop_cols_int = list(set(drop_cols).intersection(df.columns)) + if len(drop_cols_int) > 0: + df = df.drop(drop_cols_int, axis=1) + + y = df[group].tolist() + df = df.set_index(group) + df = df.select_dtypes(["number"]) + if dropna: + df = df.dropna(axis=1) + X = df.values + + if X.size > 0 and X.shape[1] > components: + pca = PCA(n_components=components) + X = pca.fit_transform(X) + var_exp = pca.explained_variance_ratio_ + loadings = pd.DataFrame(pca.components_.transpose()) + loadings.index = df.columns + values = { + index: np.sqrt(np.power(row, 2).sum()) for index, row in loadings.iterrows() + } + loadings["value"] = loadings.index.map(values.get) + loadings = loadings.sort_values(by="value", ascending=False) + args = { + "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]), + "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]), + "group": "group", + } + if components == 2: + resultDf = pd.DataFrame(X, index=y, columns=["x", "y"]) + resultDf = resultDf.assign(**annotations) + resultDf = resultDf.reset_index() + resultDf.columns = ["group", "x", "y"] + annotation_cols + + loadings.columns = ["x", "y", 
"value"] + if components > 2: + args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])}) + resultDf = pd.DataFrame(X, index=y) + resultDf = resultDf.assign(**annotations) + resultDf = resultDf.reset_index() + pca_cols = [] + loading_cols = [] + if components > 3: + pca_cols = [str(i) for i in resultDf.columns[4:]] + loading_cols = [str(i) for i in loadings.columns[3:]] + + resultDf.columns = ["group", "x", "y", "z"] + pca_cols + loadings.columns = ["x", "y", "z"] + loading_cols return (resultDf, loadings, var_exp), args @@ -219,22 +234,39 @@ def run_tsne( dropna=True, ): """ - Performs t-distributed Stochastic Neighbor Embedding analysis. For more information visit https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html. + Performs t-distributed Stochastic Neighbor Embedding analysis. + + For more information visit + https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_cols: column labels to be dropped from the dataframe. :param str group: column label containing group identifiers. :param int components: dimension of the embedded space. :param list annotation_cols: list of columns to be added in the scatter plot annotation - :param int perplexity: related to the number of nearest neighbors that is used in other manifold learning algorithms. Consider selecting a value between 5 and 50. + :param int perplexity: related to the number of nearest neighbors that is used + in other manifold learning algorithms. + Consider selecting a value between 5 and 50. :param int n_iter: maximum number of iterations for the optimization (at least 250). - :param str init: initialization of embedding ('random', 'pca' or numpy array of shape n_samples x n_components). + :param str init: initialization of embedding ('random', 'pca' or + numpy array of shape n_samples x n_components). :param bool dropna: if True removes all columns with any missing values. - :return: Two dictionaries: 1) pandas dataframe with embedding vectors, 2) xaxis and yaxis titles for plotly. + :return: Two dictionaries: + 1) pandas dataframe with embedding vectors, + 2) xaxis and yaxis titles for plotly. Example:: - result = run_tsne(data, drop_cols=['sample', 'subject'], group='group', components=2, perplexity=40, n_iter=1000, init='pca', dropna=True) + result = run_tsne(data, + drop_cols=['sample', 'subject'], + group='group', + components=2, + perplexity=40, + n_iter=1000, + init='pca', + dropna=True + ) """ result = {} args = {} @@ -256,7 +288,7 @@ def run_tsne( n_components=components, verbose=0, perplexity=perplexity, - max_iter=n_iter, + n_iter=n_iter, init=init, ) X = tsne.fit_transform(X) @@ -289,21 +321,34 @@ def run_umap( dropna=True, ): """ - Performs Uniform Manifold Approximation and Projection. For more information vist https://umap-learn.readthedocs.io. + Performs Uniform Manifold Approximation and Projection. + + For more information vist https://umap-learn.readthedocs.io. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). 
+    :param data: pandas dataframe with samples as rows and protein identifiers as columns
+        (with additional columns 'group', 'sample' and 'subject').
     :param list drop_cols: column labels to be dropped from the dataframe.
     :param str group: column label containing group identifiers.
     :param list annotation_cols: list of columns to be added in the scatter plot annotation
-    :param int n_neighbors: number of neighboring points used in local approximations of manifold structure.
+    :param int n_neighbors: number of neighboring points used
+        in local approximations of manifold structure.
     :param float min_dist: controls how tightly the embedding is allowed to compress points together.
     :param str metric: metric used to measure distance in the input space.
     :param bool dropna: if True removes all columns with any missing values.
-    :return: Two dictionaries: 1) pandas dataframe with embedding of the training data in low-dimensional space, 2) xaxis and yaxis titles for plotly.
+    :return: Two dictionaries:
+        1) pandas dataframe with embedding of the training data in low-dimensional space,
+        2) xaxis and yaxis titles for plotly.

     Example::

-        result = run_umap(data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True)
+        result = run_umap(data,
+                          drop_cols=['sample', 'subject'],
+                          group='group',
+                          n_neighbors=10,
+                          min_dist=0.3,
+                          metric='cosine',
+                          dropna=True
+                          )
     """
     np.random.seed(1145536)
     result = {}
diff --git a/acore/io/uniprot/__init__.py b/acore/io/uniprot/__init__.py
new file mode 100644
index 0000000..96e6002
--- /dev/null
+++ b/acore/io/uniprot/__init__.py
@@ -0,0 +1,48 @@
+"""Uniprot API user functions for fetching annotations for UniProt IDs and providing
+the results as a pandas.DataFrame."""
+
+import pandas as pd
+
+from .uniprot import (
+    check_id_mapping_results_ready,
+    get_id_mapping_results_link,
+    get_id_mapping_results_search,
+    submit_id_mapping,
+)
+
+
+# function for outside usage
+def fetch_annotations(
+    ids: pd.Index | list,
+    fields: str = "accession,go_p,go_c,go_f",
+) -> pd.DataFrame:
+    """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's
+    knowledgebase (KB).
+
+    Parameters
+    ----------
+    ids : pd.Index | list
+        Iterable of UniProt IDs. Annotations are fetched for the specified fields.
+    fields : str, optional
+        Fields to fetch, by default "accession,go_p,go_c,go_f". See the available
+        fields here: https://www.uniprot.org/help/return_fields
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with annotations of the UniProt IDs.
+    """
+    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids)
+    # tsv used here to return a DataFrame. Other formats may become available at some point.
+    _format = "tsv"
+    if check_id_mapping_results_ready(job_id):
+        link = get_id_mapping_results_link(job_id)
+        # add fields to the link to get more information
+        # From and Entry (accession) are the same for UniProt IDs.
+        results = get_id_mapping_results_search(
+            link + f"?fields={fields}&format={_format}"
+        )
+        header = results.pop(0).split("\t")
+        results = [line.split("\t") for line in results]
+        df = pd.DataFrame(results, columns=header)
+    return df
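A minimal usage sketch of the new helper (the two UniProt IDs are the ones used in the module's `__main__` example below; requires network access):

    from acore.io.uniprot import fetch_annotations

    # one row per input ID, one column per requested field
    df = fetch_annotations(["P05067", "P12345"], fields="accession,go_p,go_c,go_f")

diff --git a/acore/io/uniprot/uniprot.py b/acore/io/uniprot/uniprot.py
new file mode 100644
index 0000000..c9abf05
--- /dev/null
+++ b/acore/io/uniprot/uniprot.py
@@ -0,0 +1,325 @@
+"""Uniprot ID mapping using Python. 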
+ +Source: https://www.uniprot.org/help/id_mapping +""" + +import json +import re +import time +import zlib +from urllib.parse import parse_qs, urlencode, urlparse +from xml.etree import ElementTree + +import pandas as pd +import requests +from requests.adapters import HTTPAdapter, Retry + +POLLING_INTERVAL = 3 +API_URL = "https://rest.uniprot.org" + + +retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) +session = requests.Session() +session.mount("https://", HTTPAdapter(max_retries=retries)) + + +def check_response(response): + try: + response.raise_for_status() + except requests.HTTPError: + print(response.json()) + raise + + +def submit_id_mapping(from_db, to_db, ids): + request = requests.post( + f"{API_URL}/idmapping/run", + data={"from": from_db, "to": to_db, "ids": ",".join(ids)}, + ) + check_response(request) + return request.json()["jobId"] + + +def get_next_link(headers): + re_next_link = re.compile(r'<(.+)>; rel="next"') + if "Link" in headers: + match = re_next_link.match(headers["Link"]) + if match: + return match.group(1) + + +def check_id_mapping_results_ready(job_id): + while True: + request = session.get(f"{API_URL}/idmapping/status/{job_id}") + check_response(request) + j = request.json() + if "jobStatus" in j: + if j["jobStatus"] in ("NEW", "RUNNING"): + print(f"Retrying in {POLLING_INTERVAL}s") + time.sleep(POLLING_INTERVAL) + else: + raise Exception(j["jobStatus"]) + else: + return bool(j["results"] or j["failedIds"]) + + +def get_batch(batch_response, file_format, compressed): + batch_url = get_next_link(batch_response.headers) + while batch_url: + batch_response = session.get(batch_url) + batch_response.raise_for_status() + yield decode_results(batch_response, file_format, compressed) + batch_url = get_next_link(batch_response.headers) + + +def combine_batches(all_results, batch_results, file_format): + if file_format == "json": + for key in ("results", "failedIds"): + if key in batch_results and batch_results[key]: + all_results[key] += batch_results[key] + elif file_format == "tsv": + return all_results + batch_results[1:] + else: + return all_results + batch_results + return all_results + + +def get_id_mapping_results_link(job_id): + url = f"{API_URL}/idmapping/details/{job_id}" + request = session.get(url) + check_response(request) + return request.json()["redirectURL"] + + +def decode_results(response, file_format, compressed): + if compressed: + decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) + if file_format == "json": + j = json.loads(decompressed.decode("utf-8")) + return j + elif file_format == "tsv": + return [line for line in decompressed.decode("utf-8").split("\n") if line] + elif file_format == "xlsx": + return [decompressed] + elif file_format == "xml": + return [decompressed.decode("utf-8")] + else: + return decompressed.decode("utf-8") + elif file_format == "json": + return response.json() + elif file_format == "tsv": + return [line for line in response.text.split("\n") if line] + elif file_format == "xlsx": + return [response.content] + elif file_format == "xml": + return [response.text] + return response.text + + +def get_xml_namespace(element): + m = re.match(r"\{(.*)\}", element.tag) + return m.groups()[0] if m else "" + + +def merge_xml_results(xml_results): + merged_root = ElementTree.fromstring(xml_results[0]) + for result in xml_results[1:]: + root = ElementTree.fromstring(result) + for child in root.findall("{http://uniprot.org/uniprot}entry"): + merged_root.insert(-1, child) + 
ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+
+def print_progress_batches(batch_index, size, total):
+    n_fetched = min((batch_index + 1) * size, total)
+    print(f"Fetched: {n_fetched} / {total}")
+
+
+def get_id_mapping_results_search(url):
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    if "size" in query:
+        size = int(query["size"][0])
+    else:
+        size = 500
+        query["size"] = size
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    parsed = parsed._replace(query=urlencode(query, doseq=True))
+    url = parsed.geturl()
+    request = session.get(url)
+    check_response(request)
+    results = decode_results(request, file_format, compressed)
+    total = int(request.headers["x-total-results"])
+    print_progress_batches(0, size, total)
+    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+        results = combine_batches(results, batch, file_format)
+        print_progress_batches(i, size, total)
+    if file_format == "xml":
+        return merge_xml_results(results)
+    return results
+
+
+def get_id_mapping_results_stream(url):
+    if "/stream/" not in url:
+        url = url.replace("/results/", "/results/stream/")
+    request = session.get(url)
+    check_response(request)
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    return decode_results(request, file_format, compressed)
+
+
+if __name__ == "__main__":
+    # id mapping is used to create a link to a query (you can see the json in the browser)
+    # UniProtKB is the knowledgebase integrating all kinds of other databases
+    import pandas as pd
+
+    job_id = submit_id_mapping(
+        from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=["P05067", "P12345"]
+    )
+
+    if check_id_mapping_results_ready(job_id):
+        link = get_id_mapping_results_link(job_id)
+        # add fields to the link to get more information
+        # From and Entry (accession) are the same. 
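+        # Note: the same link can also be fetched in one go, without pagination,
+        # via get_id_mapping_results_stream(link) defined above.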
+ results = get_id_mapping_results_search( + link + "?fields=accession,go_p,go_c,go_f&format=tsv" + ) + # see the available fields you can add + # https://www.uniprot.org/help/return_fields + # and the available formats: json and tsv (for most endpoints) + # https://www.uniprot.org/help/api_queries#tips + print(results) + header = results.pop(0).split("\t") + results = [line.split("\t") for line in results] + df = pd.DataFrame(results, columns=header) + # result from: + # df.to_dict(orient="records") + records = [ + { + "From": "P05067", + "Gene Ontology (biological process)": ( + "adult locomotory behavior [GO:0008344]; amyloid fibril formation" + " [GO:1990000]; astrocyte activation [GO:0048143]; astrocyte activation" + " involved in immune response [GO:0002265]; axo-dendritic transport" + " [GO:0008088]; axon midline choice point recognition [GO:0016199];" + " axonogenesis [GO:0007409]; cell adhesion [GO:0007155]; cellular" + " response to amyloid-beta [GO:1904646]; central nervous system" + " development [GO:0007417]; cholesterol metabolic process [GO:0008203];" + " cognition [GO:0050890]; collateral sprouting in absence of injury" + " [GO:0048669]; cytosolic mRNA polyadenylation [GO:0180011]; dendrite" + " development [GO:0016358]; endocytosis [GO:0006897]; extracellular" + " matrix organization [GO:0030198]; forebrain development [GO:0030900];" + " G2/M transition of mitotic cell cycle [GO:0000086]; intracellular" + " copper ion homeostasis [GO:0006878]; ionotropic glutamate receptor" + " signaling pathway [GO:0035235]; learning [GO:0007612]; learning or" + " memory [GO:0007611]; locomotory behavior [GO:0007626]; mating" + " behavior [GO:0007617]; microglia development [GO:0014005]; microglial" + " cell activation [GO:0001774]; modulation of excitatory postsynaptic" + " potential [GO:0098815]; negative regulation of cell population" + " proliferation [GO:0008285]; negative regulation of gene expression" + " [GO:0010629]; negative regulation of long-term synaptic potentiation" + " [GO:1900272]; negative regulation of neuron differentiation" + " [GO:0045665]; neuromuscular process controlling balance [GO:0050885];" + " neuron apoptotic process [GO:0051402]; neuron cellular homeostasis" + " [GO:0070050]; neuron projection development [GO:0031175]; neuron" + " projection maintenance [GO:1990535]; neuron remodeling [GO:0016322];" + " NMDA selective glutamate receptor signaling pathway [GO:0098989];" + " Notch signaling pathway [GO:0007219]; positive regulation of amyloid" + " fibril formation [GO:1905908]; positive regulation of" + " calcium-mediated signaling [GO:0050850]; positive regulation of" + " chemokine production [GO:0032722]; positive regulation of ERK1 and" + " ERK2 cascade [GO:0070374]; positive regulation of G2/M transition of" + " mitotic cell cycle [GO:0010971]; positive regulation of gene" + " expression [GO:0010628]; positive regulation of glycolytic process" + " [GO:0045821]; positive regulation of inflammatory response" + " [GO:0050729]; positive regulation of interleukin-1 beta production" + " [GO:0032731]; positive regulation of interleukin-6 production" + " [GO:0032755]; positive regulation of JNK cascade [GO:0046330];" + " positive regulation of long-term synaptic potentiation [GO:1900273];" + " positive regulation of mitotic cell cycle [GO:0045931]; positive" + " regulation of non-canonical NF-kappaB signal transduction" + " [GO:1901224]; positive regulation of peptidyl-serine phosphorylation" + " [GO:0033138]; positive regulation of peptidyl-threonine" + " 
phosphorylation [GO:0010800]; positive regulation of protein" + " metabolic process [GO:0051247]; positive regulation of protein" + " phosphorylation [GO:0001934]; positive regulation of T cell migration" + " [GO:2000406]; positive regulation of transcription by RNA polymerase" + " II [GO:0045944]; positive regulation of tumor necrosis factor" + " production [GO:0032760]; regulation of gene expression [GO:0010468];" + " regulation of long-term neuronal synaptic plasticity [GO:0048169];" + " regulation of multicellular organism growth [GO:0040014]; regulation" + " of peptidyl-tyrosine phosphorylation [GO:0050730]; regulation of" + " presynapse assembly [GO:1905606]; regulation of spontaneous synaptic" + " transmission [GO:0150003]; regulation of synapse structure or" + " activity [GO:0050803]; regulation of translation [GO:0006417];" + " regulation of Wnt signaling pathway [GO:0030111]; response to" + " interleukin-1 [GO:0070555]; response to oxidative stress" + " [GO:0006979]; smooth endoplasmic reticulum calcium ion homeostasis" + " [GO:0051563]; suckling behavior [GO:0001967]; synapse organization" + " [GO:0050808]; synaptic assembly at neuromuscular junction" + " [GO:0051124]; visual learning [GO:0008542]" + ), + "Gene Ontology (cellular component)": ( + "apical part of cell [GO:0045177]; axon [GO:0030424]; cell surface" + " [GO:0009986]; cell-cell junction [GO:0005911]; ciliary rootlet" + " [GO:0035253]; clathrin-coated pit [GO:0005905]; COPII-coated ER to" + " Golgi transport vesicle [GO:0030134]; cytoplasm [GO:0005737]; cytosol" + " [GO:0005829]; dendrite [GO:0030425]; dendritic shaft [GO:0043198];" + " dendritic spine [GO:0043197]; early endosome [GO:0005769];" + " endoplasmic reticulum [GO:0005783]; endoplasmic reticulum lumen" + " [GO:0005788]; endosome [GO:0005768]; endosome lumen [GO:0031904];" + " extracellular exosome [GO:0070062]; extracellular region" + " [GO:0005576]; extracellular space [GO:0005615]; Golgi apparatus" + " [GO:0005794]; Golgi lumen [GO:0005796]; Golgi-associated vesicle" + " [GO:0005798]; growth cone [GO:0030426]; membrane [GO:0016020];" + " membrane raft [GO:0045121]; mitochondrial inner membrane" + " [GO:0005743]; neuromuscular junction [GO:0031594]; nuclear envelope" + " lumen [GO:0005641]; perikaryon [GO:0043204]; perinuclear region of" + " cytoplasm [GO:0048471]; plasma membrane [GO:0005886]; platelet alpha" + " granule lumen [GO:0031093]; presynaptic active zone [GO:0048786];" + " receptor complex [GO:0043235]; recycling endosome [GO:0055037];" + " smooth endoplasmic reticulum [GO:0005790]; spindle midzone" + " [GO:0051233]; synapse [GO:0045202]; synaptic vesicle [GO:0008021];" + " trans-Golgi network membrane [GO:0032588]" + ), + "Gene Ontology (molecular function)": ( + "DNA binding [GO:0003677]; enzyme binding [GO:0019899]; heparin binding" + " [GO:0008201]; identical protein binding [GO:0042802]; protein" + " serine/threonine kinase binding [GO:0120283]; PTB domain binding" + " [GO:0051425]; receptor ligand activity [GO:0048018]; RNA polymerase" + " II cis-regulatory region sequence-specific DNA binding [GO:0000978];" + " serine-type endopeptidase inhibitor activity [GO:0004867]; signaling" + " receptor activator activity [GO:0030546]; signaling receptor binding" + " [GO:0005102]; transition metal ion binding [GO:0046914]" + ), + }, + { + "From": "P12345", + "Gene Ontology (biological process)": ( + "2-oxoglutarate metabolic process [GO:0006103]; aspartate catabolic" + " process [GO:0006533]; aspartate metabolic process [GO:0006531];" + " glutamate 
metabolic process [GO:0006536]; lipid transport"
+            " [GO:0006869]; protein folding [GO:0006457]"
+        ),
+        "Gene Ontology (cellular component)": (
+            "mitochondrial matrix [GO:0005759]; mitochondrion [GO:0005739]; plasma"
+            " membrane [GO:0005886]"
+        ),
+        "Gene Ontology (molecular function)": (
+            "kynurenine-oxoglutarate transaminase activity [GO:0016212];"
+            " L-aspartate:2-oxoglutarate aminotransferase activity [GO:0004069];"
+            " pyridoxal phosphate binding [GO:0030170]"
+        ),
+    },
+]
diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py
index 35fd5d6..5261b39 100644
--- a/acore/multiple_testing.py
+++ b/acore/multiple_testing.py
@@ -5,37 +5,52 @@
 from sklearn.utils import shuffle
 from statsmodels.stats import multitest

+# ? dictionary with available methods in statsmodels.stats.multitest.multipletests:
+# multitest.multitest_methods_names

-def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"):
+
+def apply_pvalue_correction(
+    pvalues: np.ndarray, alpha: float = 0.05, method: str = "bonferroni"
+) -> tuple[np.ndarray, np.ndarray]:
     """
-    Performs p-value correction using the specified method. For more information visit https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html.
+    Performs p-value correction using the specified method as in
+    statsmodels.stats.multitest.multipletests_.
+
+    .. _statsmodels.stats.multitest.multipletests: \
+https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html

     :param numpy.ndarray pvalues: set of p-values of the individual tests.
     :param float alpha: error rate.
     :param str method: method of p-value correction:
-        - bonferroni : one-step correction
-        - sidak : one-step correction
-        - holm-sidak : step down method using Sidak adjustments
-        - holm : step-down method using Bonferroni adjustments
-        - simes-hochberg : step-up method (independent)
-        - hommel : closed method based on Simes tests (non-negative)
-        - fdr_bh : Benjamini/Hochberg (non-negative)
-        - fdr_by : Benjamini/Yekutieli (negative)
-        - fdr_tsbh : two stage fdr correction (non-negative)
-        - fdr_tsbky : two stage fdr correction (non-negative)
-    :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value.
-    Exmaple::
+        - 'bonferroni' : one-step correction
+        - 'sidak' : one-step correction
+        - 'holm-sidak' : step down method using Sidak adjustments
+        - 'holm' : step-down method using Bonferroni adjustments
+        - 'simes-hochberg' : step-up method (independent)
+        - 'hommel' : closed method based on Simes tests (non-negative)
+        - 'fdr_bh' : Benjamini/Hochberg (non-negative)
+        - 'fdr_by' : Benjamini/Yekutieli (negative)
+        - 'fdr_tsbh' : two stage fdr correction (non-negative)
+        - 'fdr_tsbky' : two stage fdr correction (non-negative)
+
+    :return: Tuple with two `numpy.array`s, boolean for rejecting H0 hypothesis
+        and float for adjusted p-value. Can contain missing values if `pvalues`
+        contains missing values.
+
+    Example::

         result = apply_pvalue_correction(pvalues, alpha=0.05, method='bonferroni')
     """
     p = np.array(pvalues)
     mask = np.isfinite(p)
     pval_corrected = np.full(p.shape, np.nan)
-    pval_corrected[mask] = multitest.multipletests(p[mask], alpha, method)[1]
-    rejected = [p <= alpha for p in pval_corrected]
+    _rejected, _pvals_corrected, _, _ = multitest.multipletests(p[mask], alpha, method)
+    pval_corrected[mask] = _pvals_corrected
+    rejected = np.full(p.shape, np.nan)
+    rejected[mask] = _rejected

-    return (rejected, pval_corrected.tolist())
+    return (rejected, pval_corrected)


 def apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method="indep"):
@@ -47,7 +62,7 @@
     :param str method: method of p-value correction ('indep', 'negcorr').
     :return: Tuple with two arrays, boolean for rejecting H0 hypothesis and float for adjusted p-value.

-    Exmaple::
+    Example::

         result = apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method='indep')
     """
@@ -65,7 +80,7 @@
     :param str method: method of p-value correction ('bky', 'bh').
     :return: Tuple with two arrays, boolean for rejecting H0 hypothesis and float for adjusted p-value.

-    Exmaple::
+    Example::

         result = apply_pvalue_twostage_fdrcorrection(pvalues, alpha=0.05, method='bh')
     """
diff --git a/acore/network_analysis.py b/acore/network_analysis.py
index 6af038e..50f9376 100644
--- a/acore/network_analysis.py
+++ b/acore/network_analysis.py
@@ -2,10 +2,11 @@
 import networkx as nx
 import pandas as pd
 import snf
-import utils
 from sklearn import cluster
 from sklearn.cluster import AffinityPropagation
+
+from . import utils
+

 def get_network_communities(graph, args):
     """
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
new file mode 100644
index 0000000..17e96d8
--- /dev/null
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -0,0 +1,650 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f79a8051",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# Enrichment analysis\n",
+    "\n",
+    "- we need some groups of genes to compute clusters\n",
+    "- we need functional annotations, i.e. a category summarizing a set of genes.\n",
+    "\n",
+    "You can start by watching Lars Juhl Jensen's brief introduction to enrichment analysis\n",
+    "on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o).\n",
+    "\n",
+    "Use example data for ovarian cancer\n",
+    "([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "956ed7b7",
+   "metadata": {
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "%pip install acore vuecore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3030d08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import dsp_pandas\n",
+    "import pandas as pd\n",
+    "\n",
+    "import acore\n",
+    "import acore.differential_regulation\n",
+    "import acore.enrichment_analysis\n",
+    "from acore.io.uniprot import fetch_annotations\n",
+    "\n",
+    "dsp_pandas.format.set_pandas_options(max_colwidth=60)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fddd607c",
+   "metadata": {},
+   "source": [
+    "Parameters of this notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6af9349a",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "base_path: str = (\n",
+    "    \"https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/\"\n",
+    "    \"example_data/PXD010372/processed\"\n",
+    ")\n",
+    "omics: str = f\"{base_path}/omics.csv\"\n",
+    "meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n",
+    "meta: str = f\"{base_path}/meta_patients.csv\"\n",
+    "features_to_sample: int = 100"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10ed1830",
+   "metadata": {},
+   "source": [
+    "# Load processed data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d70ef4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_omics = pd.read_csv(omics, index_col=0)\n",
+    "df_meta_pgs = pd.read_csv(meta_pgs, index_col=0)\n",
+    "df_meta = pd.read_csv(meta, index_col=0)\n",
+    "df_omics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3897e1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ax = df_omics.notna().sum().sort_values(ascending=True).plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ce47108",
+   "metadata": {},
+   "source": [
+    "Keep only features with a certain amount of non-NaN values and select 100 of these\n",
+    "for illustration. Add the ones which were differentially regulated in the ANOVA using all\n",
+    "the protein groups."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3a8ab49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idx_always_included = [\"Q5HYN5\", \"P39059\", \"O43432\", \"O43175\"]\n",
+    "df_omics[idx_always_included]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1145a2cd",
+   "metadata": {
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "df_omics = (\n",
+    "    df_omics\n",
+    "    # .dropna(axis=1)\n",
+    "    .drop(idx_always_included, axis=1)\n",
+    "    .dropna(thresh=18, axis=1)\n",
+    "    .sample(\n",
+    "        features_to_sample - len(idx_always_included),\n",
+    "        axis=1,\n",
+    "        random_state=42,\n",
+    "    )\n",
+    "    .join(df_omics[idx_always_included])\n",
+    ")\n",
+    "df_omics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aea77e80",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "df_meta"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bbf5dc4",
+   "metadata": {},
+   "source": [
+    "## Compute up and downregulated genes\n",
+    "These will be used to find enrichments in the sets of up- and downregulated genes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "231bb6da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group = \"Status\"\n",
+    "covariates = [\"PlatinumValue\"]\n",
+    "diff_reg = acore.differential_regulation.run_anova(\n",
+    "    df_omics.join(df_meta[[group]]),\n",
+    "    drop_cols=[],\n",
+    "    subject=None,\n",
+    "    group=group,\n",
+    ")\n",
+    "diff_reg.describe(exclude=[\"float\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e347b06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "diff_reg[\"rejected\"] = diff_reg[\"rejected\"].astype(bool) # ! needs to be fixed in anova\n",
+    "diff_reg.query(\"rejected\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6c0a225",
+   "metadata": {},
+   "source": [
+    "## Find functional annotations, here pathways\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2668415",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
+    "fname = Path(fname_annotations)\n",
+    "try:\n",
+    "    annotations = pd.read_csv(fname, index_col=0)\n",
+    "    print(f\"Loaded annotations from {fname}\")\n",
+    "except FileNotFoundError:\n",
+    "    print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
+    "    fields = \"go_p,go_c,go_f\"\n",
+    "    annotations = fetch_annotations(df_omics.columns, fields=fields)\n",
+    "    # First column (`From`) is additional to specified fields\n",
+    "    d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n",
+    "\n",
+    "    # expand go terms\n",
+    "    to_expand = list()\n",
+    "    for field in d_fields_to_col:\n",
+    "        if \"go_\" in field:\n",
+    "            col = d_fields_to_col[field]\n",
+    "            annotations[col] = annotations[col].str.split(\";\")\n",
+    "            to_expand.append(col)\n",
+    "    for col in to_expand:\n",
+    "        # this is a bit wasteful. Processing to stack format should be done here.\n",
+    "        annotations = annotations.explode(col, ignore_index=True)\n",
+    "    # process other than go term columns\n",
+    "    annotations = (\n",
+    "        annotations.set_index(\"From\")\n",
+    "        .rename_axis(\"identifier\")\n",
+    "        # .drop(\"Entry\", axis=1)\n",
+    "        .rename_axis(\"source\", axis=1)\n",
+    "        .stack()\n",
+    "        .to_frame(\"annotation\")\n",
+    "        .reset_index()\n",
+    "        .drop_duplicates(ignore_index=True)\n",
+    "    )\n",
+    "    fname.parent.mkdir(exist_ok=True, parents=True)\n",
+    "    annotations.to_csv(fname, index=True)\n",
+    "\n",
+    "annotations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4734452",
+   "metadata": {},
+   "source": [
+    "See how many protein groups are associated with each annotation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57fddefb",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "_ = (\n",
+    "    annotations.groupby(\"annotation\")\n",
+    "    .size()\n",
+    "    .value_counts()\n",
+    "    .sort_index()\n",
+    "    .plot(kind=\"bar\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4165bc94",
+   "metadata": {},
+   "source": [
+    "## Enrichment analysis\n",
+    "This is done separately for up- and downregulated genes, as it is assumed that biological\n",
+    "processes are regulated in one direction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a631495",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "diff_reg.query(\"rejected\")[\n",
+    "    [\n",
+    "        \"identifier\",\n",
+    "        \"group1\",\n",
+    "        \"group2\",\n",
+    "        \"pvalue\",\n",
+    "        \"padj\",\n",
+    "        \"rejected\",\n",
+    "        \"log2FC\",\n",
+    "        \"FC\",\n",
+    "    ]\n",
+    "].sort_values(\"log2FC\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7380a528",
+   "metadata": {},
+   "source": [
+    "- this additionally sets a fold change cutoff\n",
+    "- and the foreground and background populations change due to the separation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a73ed6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=2, # ! the default is 2 (more conservative than 1)\n",
+    "    lfc_cutoff=0.5, # ! the default is 1\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5cb036be",
+   "metadata": {},
+   "source": [
+    "Here we see differences for the same set of differentially regulated protein groups;\n",
+    "the cutoff can be disabled using lfc_cutoff=0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e5e2b61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=1, # ! the default is 2, which is more conservative\n",
+    "    lfc_cutoff=0.1, # ! the default is 1\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e3530547",
+   "metadata": {},
+   "source": [
+    "Or restrict the analysis to functional annotations for which at least 2\n",
+    "protein groups were found to be upregulated."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ea367ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=2,\n",
+    "    lfc_cutoff=0.5, # ! the default is 1\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ecf75e7c",
+   "metadata": {},
+   "source": [
+    "### Site-specific enrichment analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ee3a7c8",
+   "metadata": {},
+   "source": [
+    "The basic example uses a modified peptide sequence to\n",
+    "demonstrate the enrichment analysis.\n",
+    "> TODO: The example on how to do that needs a PTM focused dataset.\n",
+    "\n",
+    "The details of how site-specific enrichment analysis is done will depend on the\n",
+    "dataset and the question at hand.\n",
+    "\n",
+    "If the identifiers contain PTMs, this information is removed to match them to the annotation\n",
+    "using a regular expression (inside the function). For example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88b05e0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "regex = \"(\\\\w+~.+)_\\\\w\\\\d+\\\\-\\\\w+\"\n",
+    "identifier_ckg = \"gnd~P00350_T10-WW\"\n",
+    "match = re.search(regex, identifier_ckg)\n",
+    "match.group(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b9e2fddf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ToDo: Add example for site-specific enrichment analysis\n",
+    "# acore.enrichment_analysis.run_up_down_regulation_enrichment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dbc34b7b",
+   "metadata": {},
+   "source": [
+    "## Single sample GSEA (ssGSEA)\n",
+    "Run a gene set enrichment analysis (GSEA) for each sample,\n",
+    "see the [article](https://www.nature.com/articles/nature08460#Sec3) and\n",
+    "the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)\n",
+    "for more details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e211bf65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichments = acore.enrichment_analysis.run_ssgsea(\n",
+    "    data=df_omics,\n",
+    "    annotation=annotations,\n",
+    "    min_size=1,\n",
+    ")\n",
+    "enrichments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fd08e79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichments.iloc[0].to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca9e4908",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichments[\"NES\"].plot.hist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b9a50cc",
+   "metadata": {},
+   "source": [
+    "The normalised enrichment score (NES) can be used in a PCA plot to see if the samples\n",
+    "cluster according to the enrichment of the gene sets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d917326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nes = enrichments.set_index(\"Term\", append=True).unstack()[\"NES\"].convert_dtypes()\n",
+    "nes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02dbec64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import acore.exploratory_analysis as ea\n",
+    "\n",
+    "pca_result, pca_annotation = ea.run_pca(\n",
+    "    data=nes.join(df_meta[[group]]),\n",
+    "    drop_cols=[],\n",
+    "    annotation_cols=[],\n",
+    "    group=group,\n",
+    "    components=2,\n",
+    "    dropna=False,\n",
+    ")\n",
+    "resultDf, loadings, var_exp = pca_result\n",
+    "resultDf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acd530d6",
+   "metadata": {},
+   "source": [
+    "The loadings show how the variables are correlated with the principal components."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28464b86", + "metadata": {}, + "outputs": [], + "source": [ + "loadings" + ] + }, + { + "cell_type": "markdown", + "id": "5d0c1eba", + "metadata": {}, + "source": [ + "We will plot both on the sample plot (samples on the first two principal components and\n", + "loadings of variables). We use the\n", + "[`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore)\n", + "for this, which is also developed by the Multiomics Analytics Group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c66fcef", + "metadata": {}, + "outputs": [], + "source": [ + "from plotly.offline import iplot\n", + "from vuecore import viz\n", + "\n", + "args = {\"factor\": 1, \"loadings\": 10}\n", + "#! pca_results has three items, but docstring requests only two -> double check\n", + "figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n", + "iplot(figure)" + ] + }, + { + "cell_type": "markdown", + "id": "4be5a8c8", + "metadata": {}, + "source": [ + "## Compare two distributions - KS test\n", + "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.\n", + "- we compare the distributions of the two differently upregulated protein groups\n", + "This is not the best example for comparing distributions, but it shows how to use the\n", + "KS test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dd57b99", + "metadata": {}, + "outputs": [], + "source": [ + "# plot two histograms of intensity values here\n", + "sel_pgs = [\"O43175\", \"P39059\"]\n", + "view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())\n", + "ax = view.plot.hist(bins=20, alpha=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "a480b65b", + "metadata": {}, + "source": [ + "Let us compare the two centered distributions using the KS test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0f8e6a8", + "metadata": {}, + "outputs": [], + "source": [ + "acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]])" + ] + }, + { + "cell_type": "markdown", + "id": "385b0ceb", + "metadata": {}, + "source": [ + "The result suggests that the two distributions are from the same distribution." + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "tags,-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py new file mode 100644 index 0000000..5f09cd0 --- /dev/null +++ b/docs/api_examples/enrichment_analysis.py @@ -0,0 +1,326 @@ +# %% [markdown] +# # Enrichment analysis +# +# - we need some groups of genes to compute clusters +# - we need functional annotations, i.e. a category summarizing a set of genes. +# - +# You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis +# on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o). 
+#
+# Use example data for ovarian cancer
+# ([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))
+
+
+# %% tags=["hide-output"]
+# %pip install acore vuecore
+
+
+# %%
+from pathlib import Path
+
+import dsp_pandas
+import pandas as pd
+
+import acore
+import acore.differential_regulation
+import acore.enrichment_analysis
+from acore.io.uniprot import fetch_annotations
+
+dsp_pandas.format.set_pandas_options(max_colwidth=60)
+
+# %% [markdown]
+# Parameters of this notebook
+
+# %% tags=["parameters"]
+base_path: str = (
+    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/"
+    "example_data/PXD010372/processed"
+)
+omics: str = f"{base_path}/omics.csv"
+meta_pgs: str = f"{base_path}/meta_pgs.csv"
+meta: str = f"{base_path}/meta_patients.csv"
+features_to_sample: int = 100
+
+# %% [markdown]
+# # Load processed data
+
+# %%
+df_omics = pd.read_csv(omics, index_col=0)
+df_meta_pgs = pd.read_csv(meta_pgs, index_col=0)
+df_meta = pd.read_csv(meta, index_col=0)
+df_omics
+
+# %%
+ax = df_omics.notna().sum().sort_values(ascending=True).plot()
+
+# %% [markdown]
+# Keep only features with a certain amount of non-NaN values and select 100 of these
+# for illustration. Add the ones which were differentially regulated in the ANOVA using all
+# the protein groups.
+
+# %%
+idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"]
+df_omics[idx_always_included]
+
+# %% tags=["hide-input"]
+df_omics = (
+    df_omics
+    # .dropna(axis=1)
+    .drop(idx_always_included, axis=1)
+    .dropna(thresh=18, axis=1)
+    .sample(
+        features_to_sample - len(idx_always_included),
+        axis=1,
+        random_state=42,
+    )
+    .join(df_omics[idx_always_included])
+)
+df_omics
+
+
+# %%
+df_meta
+
+
+# %% [markdown]
+# ## Compute up and downregulated genes
+# These will be used to find enrichments in the sets of up- and downregulated genes.
+
+# %%
+group = "Status"
+covariates = ["PlatinumValue"]
+diff_reg = acore.differential_regulation.run_anova(
+    df_omics.join(df_meta[[group]]),
+    drop_cols=[],
+    subject=None,
+    group=group,
+)
+diff_reg.describe(exclude=["float"])
+
+# %%
+diff_reg["rejected"] = diff_reg["rejected"].astype(bool)  # ! needs to be fixed in anova
+diff_reg.query("rejected")
+
+# %% [markdown]
+# ## Find functional annotations, here pathways
+#
+
+# %%
+fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
+fname = Path(fname_annotations)
+try:
+    annotations = pd.read_csv(fname, index_col=0)
+    print(f"Loaded annotations from {fname}")
+except FileNotFoundError:
+    print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
+    fields = "go_p,go_c,go_f"
+    annotations = fetch_annotations(df_omics.columns, fields=fields)
+    # First column (`From`) is additional to specified fields
+    d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}
+
+    # expand go terms
+    to_expand = list()
+    for field in d_fields_to_col:
+        if "go_" in field:
+            col = d_fields_to_col[field]
+            annotations[col] = annotations[col].str.split(";")
+            to_expand.append(col)
+    for col in to_expand:
+        # this is a bit wasteful. Processing to stack format should be done here.
+        annotations = annotations.explode(col, ignore_index=True)
+    # process other than go term columns
+    annotations = (
+        annotations.set_index("From")
+        .rename_axis("identifier")
+        # .drop("Entry", axis=1)
+        .rename_axis("source", axis=1)
+        .stack()
+        .to_frame("annotation")
+        .reset_index()
+        .drop_duplicates(ignore_index=True)
+    )
+    fname.parent.mkdir(exist_ok=True, parents=True)
+    annotations.to_csv(fname, index=True)
+
+annotations
+
+# %% [markdown]
+# See how many protein groups are associated with each annotation.
+
+# %% tags=["hide-input"]
+_ = (
+    annotations.groupby("annotation")
+    .size()
+    .value_counts()
+    .sort_index()
+    .plot(kind="bar")
+)
+
+# %% [markdown]
+# ## Enrichment analysis
+# This is done separately for up- and downregulated genes, as it is assumed that biological
+# processes are regulated in one direction.
+
+# %% tags=["hide-input"]
+diff_reg.query("rejected")[
+    [
+        "identifier",
+        "group1",
+        "group2",
+        "pvalue",
+        "padj",
+        "rejected",
+        "log2FC",
+        "FC",
+    ]
+].sort_values("log2FC")
+
+# %% [markdown]
+# - this additionally sets a fold change cutoff
+# - and the foreground and background populations change due to the separation
+
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=2,  # ! the default is 2 (more conservative than 1)
+    lfc_cutoff=0.5,  # ! the default is 1
+)
+ret
+
+# %% [markdown]
+# Here we see differences for the same set of differentially regulated protein groups;
+# the cutoff can be disabled using lfc_cutoff=0.
+
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=1,  # ! the default is 2, which is more conservative
+    lfc_cutoff=0.1,  # ! the default is 1
+)
+ret
+
+# %% [markdown]
+# Or restrict the analysis to functional annotations for which at least 2
+# protein groups were found to be upregulated.
+
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=2,
+    lfc_cutoff=0.5,  # ! the default is 1
+)
+ret
+
+# %% [markdown]
+# ### Site-specific enrichment analysis
+
+# %% [markdown]
+# The basic example uses a modified peptide sequence to
+# demonstrate the enrichment analysis.
+# > TODO: The example on how to do that needs a PTM focused dataset.
+#
+# The details of how site-specific enrichment analysis is done will depend on the
+# dataset and the question at hand.
+#
+# If the identifiers contain PTMs, this information is removed to match them to the annotation
+# using a regular expression (inside the function). For example:
+
+# %%
+import re
+
+regex = "(\\w+~.+)_\\w\\d+\\-\\w+"
+identifier_ckg = "gnd~P00350_T10-WW"
+match = re.search(regex, identifier_ckg)
+match.group(1)
+
+# %%
+# ToDo: Add example for site-specific enrichment analysis
+# acore.enrichment_analysis.run_up_down_regulation_enrichment
+
+# %% [markdown]
+# ## Single sample GSEA (ssGSEA)
+# Run a gene set enrichment analysis (GSEA) for each sample,
+# see the [article](https://www.nature.com/articles/nature08460#Sec3) and
+# the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)
+# for more details.
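+
+# %% [markdown]
+# As a minimal sketch of the input (an illustration based on the `annotations` frame
+# built above, not a required schema): `annotation` is a long-format table mapping
+# each protein group identifier to one annotation term per row, for example:
+
+# %%
+pd.DataFrame(
+    {
+        "identifier": ["P05067", "P05067", "P12345"],
+        "annotation": [
+            "cell adhesion [GO:0007155]",
+            "endocytosis [GO:0006897]",
+            "protein folding [GO:0006457]",
+        ],
+        "source": ["go_p", "go_p", "go_p"],  # source labels are illustrative
+    }
+)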
+
+# %%
+enrichments = acore.enrichment_analysis.run_ssgsea(
+    data=df_omics,
+    annotation=annotations,
+    min_size=1,
+)
+enrichments
+
+# %%
+enrichments.iloc[0].to_dict()
+
+# %%
+enrichments["NES"].plot.hist()
+
+# %% [markdown]
+# The normalised enrichment score (NES) can be used in a PCA plot to see if the samples
+# cluster according to the enrichment of the gene sets.
+
+# %%
+nes = enrichments.set_index("Term", append=True).unstack()["NES"].convert_dtypes()
+nes
+
+# %%
+import acore.exploratory_analysis as ea
+
+pca_result, pca_annotation = ea.run_pca(
+    data=nes.join(df_meta[[group]]),
+    drop_cols=[],
+    annotation_cols=[],
+    group=group,
+    components=2,
+    dropna=False,
+)
+resultDf, loadings, var_exp = pca_result
+resultDf
+
+# %% [markdown]
+# The loadings show how the variables are correlated with the principal components.
+
+# %%
+loadings
+
+# %% [markdown]
+# We will plot both on the sample plot (samples on the first two principal components and
+# loadings of variables). We use the
+# [`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore)
+# for this, which is also developed by the Multiomics Analytics Group.
+
+# %%
+from plotly.offline import iplot
+from vuecore import viz
+
+args = {"factor": 1, "loadings": 10}
+# #! pca_results has three items, but docstring requests only two -> double check
+figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args)
+iplot(figure)
+
+# %% [markdown]
+# ## Compare two distributions - KS test
+# The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.
+# We compare the distributions of two differentially upregulated protein groups.
+# This is not the best example for comparing distributions, but it shows how to use the
+# KS test.
+
+# %%
+# plot two histograms of intensity values here
+sel_pgs = ["O43175", "P39059"]
+view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())
+ax = view.plot.hist(bins=20, alpha=0.5)
+
+# %% [markdown]
+# Let us compare the two centered distributions using the KS test.
+
+# %%
+acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]])
+
+# %% [markdown]
+# The result suggests that the two samples could stem from the same distribution
+# (the null hypothesis is not rejected).
diff --git a/docs/index.rst b/docs/index.rst
index 91a92e6..eba7982 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,6 +16,7 @@

    api_examples/exploratory_analysis
    api_examples/normalization_analysis
+   api_examples/enrichment_analysis

 ..
toctree:: :maxdepth: 1 diff --git a/tests/test_differential_regulation.py b/tests/test_differential_regulation.py index 2e13f92..169bafa 100644 --- a/tests/test_differential_regulation.py +++ b/tests/test_differential_regulation.py @@ -7,56 +7,85 @@ class TestCalculateTtest(unittest.TestCase): def setUp(self): self.data = { - 'subject': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - 'sample': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12'], - 'group': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'], - 'protein': [1.4, 6.2, 9.03, 2.3, 7.4, 14.01, 4.5, 9.6, 10.4, 7.5, 11.6, 16.4], - 'group2': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b'], + "subject": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + "sample": [ + "S1", + "S2", + "S3", + "S4", + "S5", + "S6", + "S7", + "S8", + "S9", + "S10", + "S11", + "S12", + ], + "group": ["A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"], + "protein": [ + 1.4, + 6.2, + 9.03, + 2.3, + 7.4, + 14.01, + 4.5, + 9.6, + 10.4, + 7.5, + 11.6, + 16.4, + ], + "group2": ["a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"], } self.df = pd.DataFrame(self.data) def test_ttest_ind(self): data = { - 'group1': [1.3, 1.2, 0.2, 2.2, 3.3, 8.43], - 'group2': [1.4, 6.2, 7.3, 0.4, 1.5, 0.6] + "group1": [1.3, 1.2, 0.2, 2.2, 3.3, 8.43], + "group2": [1.4, 6.2, 7.3, 0.4, 1.5, 0.6], } df = pd.DataFrame(data) - condition1 = 'group1' - condition2 = 'group2' + condition1 = "group1" + condition2 = "group2" is_logged = True non_par = False expected_result = pg.ttest(df[condition1], df[condition2], paired=False) - result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par) + result = dr.calculate_ttest( + df, condition1, condition2, is_logged=is_logged, non_par=non_par + ) self.assertAlmostEqual(result[0], expected_result["T"].values[0]) self.assertAlmostEqual(result[1], expected_result["p-val"].values[0]) def test_mann_whitney(self): - data = { - 'group1': [1, 2, 3, 4, 5], - 'group2': [2, 4, 6, 8, 10] - } + data = {"group1": [1, 2, 3, 4, 5], "group2": [2, 4, 6, 8, 10]} df = pd.DataFrame(data) - condition1 = 'group1' - condition2 = 'group2' + condition1 = "group1" + condition2 = "group2" is_logged = True non_par = True expected_result = pg.mwu(df[condition1], df[condition2]) - result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par) + result = dr.calculate_ttest( + df, condition1, condition2, is_logged=is_logged, non_par=non_par + ) self.assertAlmostEqual(result[0], expected_result["U-val"].values[0]) self.assertAlmostEqual(result[1], expected_result["p-val"].values[0]) def test_calculate_anova(self): - column = 'protein' - group = 'group' + column = "protein" + group = "group" expected_result = pg.anova(data=self.df, dv=column, between=group) - expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[['F', 'ddof1', 'ddof2', 'p-unc']].values[0] + expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[ + ["F", "ddof1", "ddof2", "p-unc"] + ].values[0] result = dr.calculate_anova(self.df, column, group=group) @@ -66,14 +95,20 @@ def test_calculate_anova(self): self.assertEqual(result[4], expected_pvalue) def test_calculate_ancova(self): - column = 'protein' - group = 'group' + column = "protein" + group = "group" covariates = [] - expected_result = pg.ancova(data=self.df, dv=column, between=group, covar=covariates) - expected_t, expected_df, expected_pvalue = expected_result.loc[expected_result['Source'] == group, ['F', 'DF', 
'p-unc']].values[0] + expected_result = pg.ancova( + data=self.df, dv=column, between=group, covar=covariates + ) + expected_t, expected_df, expected_pvalue = expected_result.loc[ + expected_result["Source"] == group, ["F", "DF", "p-unc"] + ].values[0] - result = dr.calculate_ancova(self.df, column, group=group, covariates=covariates) + result = dr.calculate_ancova( + self.df, column, group=group, covariates=covariates + ) self.assertEqual(result[1], expected_df) self.assertEqual(result[2], expected_df) @@ -82,16 +117,27 @@ def test_calculate_ancova(self): def test_calculate_repeated_measures_anova(self): """Source SS DF MS F p-unc ng2 eps -0 group 1.50 1 1.500 0.087642 0.795106 0.032609 1.0 -1 Error 34.23 2 17.115 NaN NaN NaN NaN""" - column = 'protein' - subject = 'subject' - within = 'group' - - expected_result = pg.rm_anova(data=self.df, dv=column, within=within, subject=subject, detailed=True, correction=True) - expected_t, expected_pvalue = expected_result.loc[0, ["F", "p-unc"]].values.tolist() + 0 group 1.50 1 1.500 0.087642 0.795106 0.032609 1.0 + 1 Error 34.23 2 17.115 NaN NaN NaN NaN""" + column = "protein" + subject = "subject" + within = "group" + + expected_result = pg.rm_anova( + data=self.df, + dv=column, + within=within, + subject=subject, + detailed=True, + correction=True, + ) + expected_t, expected_pvalue = expected_result.loc[ + 0, ["F", "p-unc"] + ].values.tolist() expected_df1, expected_df2 = expected_result["DF"] - result = dr.calculate_repeated_measures_anova(self.df, column, subject=subject, within=within) + result = dr.calculate_repeated_measures_anova( + self.df, column, subject=subject, within=within + ) self.assertEqual(result[1], expected_df1) self.assertEqual(result[2], expected_df2) @@ -99,19 +145,30 @@ def test_calculate_repeated_measures_anova(self): self.assertEqual(result[4], expected_pvalue) def test_calculate_mixed_anova(self): - column = 'protein' - subject = 'subject' - within = 'group' - between = 'group2' - - expected_result = pg.mixed_anova(data=self.df, dv=column, within=within, between=between, subject=subject, correction=True) - expected_result['identifier'] = column - expected_result = expected_result[['identifier', 'DF1', 'DF2', 'F', 'p-unc', 'Source']] - - result = dr.calculate_mixed_anova(self.df, column, subject=subject, within=within, between=between) + column = "protein" + subject = "subject" + within = "group" + between = "group2" + + expected_result = pg.mixed_anova( + data=self.df, + dv=column, + within=within, + between=between, + subject=subject, + correction=True, + ) + expected_result["identifier"] = column + expected_result = expected_result[ + ["identifier", "DF1", "DF2", "F", "p-unc", "Source"] + ] + + result = dr.calculate_mixed_anova( + self.df, column, subject=subject, within=within, between=between + ) self.assertEqual(result.values.tolist(), expected_result.values.tolist()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py index cfd2e03..6c0930a 100644 --- a/tests/test_enrichment.py +++ b/tests/test_enrichment.py @@ -1,5 +1,9 @@ import unittest + +import numpy as np +import pandas as pd from scipy import stats + import acore.enrichment_analysis as ea @@ -7,9 +11,11 @@ class TestRunFisher(unittest.TestCase): def test_run_fisher(self): group1 = [10, 5] group2 = [8, 12] - alternative = 'two-sided' + alternative = "two-sided" - expected_odds, expected_pvalue = stats.fisher_exact([[10, 5], [8, 12]], alternative) + expected_odds, 
expected_pvalue = stats.fisher_exact( + [[10, 5], [8, 12]], alternative + ) result = ea.run_fisher(group1, group2, alternative=alternative) @@ -21,9 +27,11 @@ class TestRunKolmogorovSmirnov(unittest.TestCase): def test_run_kolmogorov_smirnov(self): dist1 = [1, 2, 3, 4, 5] dist2 = [1, 2, 3, 4, 6] - alternative = 'two-sided' + alternative = "two-sided" - expected_result = stats.ks_2samp(dist1, dist2, alternative=alternative, mode='auto') + expected_result = stats.ks_2samp( + dist1, dist2, alternative=alternative, mode="auto" + ) result = ea.run_kolmogorov_smirnov(dist1, dist2, alternative=alternative) @@ -31,5 +39,62 @@ def test_run_kolmogorov_smirnov(self): self.assertEqual(result[1], expected_result.pvalue) -if __name__ == '__main__': +def test__annotate_features(): + expected = pd.Series( + [ + "foreground", + "foreground", + "background", + "foreground", + "background", + "background", + np.nan, + ] + ) + + features = pd.Series(["G1", "G2", "G3", "G4", "G5", "G6", "G9"]) + in_foreground = ["G1", "G2", "G4"] + in_background = ["G3", "G5", "G6"] + actual = ea._annotate_features(features, in_foreground, in_background) + pd.testing.assert_series_equal(expected, actual) + + +def test_run_regulation_enrichment(): + """Integration test for run_regulation_enrichment. Indirectly tests + run_enrichment from enrichment_analysis module.""" + annotation = { + "annotation": ["path1", "path1", "path1", "path2", "path2", "path3", "path3"], + "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6", "gene9"], + "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P", "GO_P"], + } + annotation = pd.DataFrame(annotation) + regulation_res = { + "identifier": ["gene1", "gene2", "gene3", "gene4", "gene5", "gene6"], + "rejected": [True, True, False, False, True, True], + } + regulation_res = pd.DataFrame(regulation_res) + + actual = ea.run_regulation_enrichment( + regulation_data=regulation_res, + annotation=annotation, + min_detected_in_set=1, + ) + + expected = pd.DataFrame( + { + "terms": ["path1", "path2", "path3"], + "identifiers": ["gene1,gene2", "gene1,gene5", "gene6"], + "foreground": [2, 2, 1], + "background": [1, 0, 0], + "foreground_pop": [4, 4, 4], + "background_pop": [6, 6, 6], + "pvalue": [1.0, 0.4666666666666667, 1.0], + "padj": [1.0, 1.0, 1.0], + "rejected": [False, False, False], + } + ) + assert expected.equals(actual) + + +if __name__ == "__main__": unittest.main() diff --git a/tests/test_exploratory.py b/tests/test_exploratory.py index ef84e3a..97dd673 100644 --- a/tests/test_exploratory.py +++ b/tests/test_exploratory.py @@ -1,94 +1,152 @@ -import unittest import platform +import unittest + import pandas as pd + import acore.exploratory_analysis as ea class TestExtractMissing(unittest.TestCase): def setUp(self): - self.data = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'sample': ['S1', 'S2', 'S3', 'S4'], - 'protein1': [1.4, 2.2, None, 4.2], - 'protein2': [5.6, None, None, 8.1], - 'protein3': [9.1, 10.01, 11.2, 12.9] - }) + self.data = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "sample": ["S1", "S2", "S3", "S4"], + "protein1": [1.4, 2.2, None, 4.2], + "protein2": [5.6, None, None, 8.1], + "protein3": [9.1, 10.01, 11.2, 12.9], + } + ) def test_extract_number_missing(self): min_valid = 2 - expected_result = ['protein1', 'protein3'] + expected_result = ["protein1", "protein3"] - result = ea.extract_number_missing(self.data, min_valid, drop_cols=['sample'], group='group') + result = ea.extract_number_missing( + self.data, min_valid, drop_cols=["sample"], group="group" + ) 
self.assertEqual(result, expected_result) def test_extract_percentage_missing(self): missing_max = 0.2 - expected_result = ['protein1', 'protein3'] + expected_result = ["protein1", "protein3"] - result = ea.extract_percentage_missing(self.data, missing_max, drop_cols=['sample'], group='group') + result = ea.extract_percentage_missing( + self.data, missing_max, drop_cols=["sample"], group="group" + ) self.assertEqual(result, expected_result) class TestDimensionalityReduction(unittest.TestCase): def setUp(self): - self.data = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'protein1': [1.4, 2.2, 5.3, 4.2], - 'protein2': [5.6, 0.3, 2.1, 8.1], - 'protein3': [9.1, 10.01, 11.2, 12.9] - }) + self.data = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "protein1": [1.4, 2.2, 5.3, 4.2], + "protein2": [5.6, 0.3, 2.1, 8.1], + "protein3": [9.1, 10.01, 11.2, 12.9], + } + ) def test_run_pca(self): components = 2 expected_result = ( - pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'x': [0.877154, -3.883458, -1.550791, 4.557095], - 'y': [-2.815737, -0.420979, 2.278209, 0.958507], - }), - pd.DataFrame({ - 'x': [0.958489, 0.092382, 0.269749], - 'y': [-0.235412, 0.790179, 0.565861], - 'value': [0.986975, 0.795561, 0.626868] - }, - index=['protein2', 'protein1', 'protein3']), - pd.Series([0.71760534, 0.26139782]) + pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "x": [0.877154, -3.883458, -1.550791, 4.557095], + "y": [-2.815737, -0.420979, 2.278209, 0.958507], + } + ), + pd.DataFrame( + { + "x": [0.958489, 0.092382, 0.269749], + "y": [-0.235412, 0.790179, 0.565861], + "value": [0.986975, 0.795561, 0.626868], + }, + index=["protein2", "protein1", "protein3"], + ), + pd.Series([0.71760534, 0.26139782]), + ) + exp_dict = {"x_title": "PC1 (0.72)", "y_title": "PC2 (0.26)", "group": "group"} + result, annotation = ea.run_pca( + self.data, + drop_cols=[], + annotation_cols=[], + group="group", + components=components, + dropna=True, ) - exp_dict = {'x_title': 'PC1 (0.72)', - 'y_title': 'PC2 (0.26)', 'group': 'group'} - result, annotation = ea.run_pca(self.data, drop_cols=[], annotation_cols=[], group='group', components=components, dropna=True) assert annotation == exp_dict # results: tuple of 3 DFs: components, loadings, explained_variance pd.testing.assert_frame_equal(result[0], expected_result[0], check_exact=False) pd.testing.assert_frame_equal(result[1], expected_result[1], check_exact=False) - pd.testing.assert_series_equal(pd.Series(result[2]), expected_result[2], check_exact=False, check_dtype=False) - + pd.testing.assert_series_equal( + pd.Series(result[2]), + expected_result[2], + check_exact=False, + check_dtype=False, + ) + def test_run_tsne(self): components = 2 # ! MAC ARM64 yields these results (using the similar installation) - expected_result = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'x': [-113.341728, 36.020717, 57.2992514, -134.729141], - 'y': [131.169082, -59.616630, 110.601203, -39.086254] - }) + expected_result = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "x": [-113.341728, 36.020717, 57.2992514, -134.729141], + "y": [131.169082, -59.616630, 110.601203, -39.086254], + } + ) # ! 
Ubuntu x64 yields these results:
-        if not platform.machine() == 'arm64':
-            expected_result = pd.DataFrame({
-                'group': ['A', 'A', 'B', 'B'],
-                'x': [-27.375560760498047, -7.526909828186035, -28.731882095336914, -8.872751235961914],
-                'y': [1.547648310661316, 0.17019487917423248, -18.321491241455078, -19.713821411132812]
-            })
-        result, _ = ea.run_tsne(self.data, drop_cols=['sample', 'subject'], group='group', components=components, perplexity=3, n_iter=1000, init='pca', dropna=True)
-        result = result['tsne']
-        pd.testing.assert_frame_equal(result, expected_result, check_exact=False, check_dtype=False)
+        if not platform.machine() == "arm64":
+            expected_result = pd.DataFrame(
+                {
+                    "group": ["A", "A", "B", "B"],
+                    "x": [
+                        -27.375560760498047,
+                        -7.526909828186035,
+                        -28.731882095336914,
+                        -8.872751235961914,
+                    ],
+                    "y": [
+                        1.547648310661316,
+                        0.17019487917423248,
+                        -18.321491241455078,
+                        -19.713821411132812,
+                    ],
+                }
+            )
+        result, _ = ea.run_tsne(
+            self.data,
+            drop_cols=["sample", "subject"],
+            group="group",
+            components=components,
+            perplexity=3,
+            n_iter=1000,
+            init="pca",
+            dropna=True,
+        )
+        result = result["tsne"]
+        pd.testing.assert_frame_equal(
+            result, expected_result, check_exact=False, check_dtype=False
+        )

     def test_run_umap(self):
-        _ = ea.run_umap(self.data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True)
+        _ = ea.run_umap(
+            self.data,
+            drop_cols=["sample", "subject"],
+            group="group",
+            n_neighbors=10,
+            min_dist=0.3,
+            metric="cosine",
+            dropna=True,
+        )
         self.assertEqual(1, 1)


-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_multiple_testing.py b/tests/test_multiple_testing.py
new file mode 100644
index 0000000..0f339bc
--- /dev/null
+++ b/tests/test_multiple_testing.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pytest
+
+from acore.multiple_testing import apply_pvalue_correction
+
+# bonferroni correction
+array = [
+    0.1,
+    0.01,
+    0.2,
+    0.4,
+    np.nan,
+    0.5,
+]  # 0.04, 0.02, 0.0001]
+
+test_res = [
+    (
+        "b",
+        np.array([0.0, 1.0, 0.0, 0.0, np.nan, 0.0]),
+        np.array([0.5, 0.05, 1.0, 1.0, np.nan, 1.0]),
+    ),
+    (
+        "fdr_bh",
+        np.array(
+            [
+                0.0,
+                1.0,
+                0.0,
+                0.0,
+                np.nan,
+                0.0,
+            ]
+        ),
+        np.array([0.25, 0.05, 0.33333333, 0.5, np.nan, 0.5]),
+    ),
+]
+
+
+@pytest.mark.parametrize("method,exp_rejected,exp_pvalues", test_res)
+def test_apply_pvalue_correction_alpha_5_percent(method, exp_rejected, exp_pvalues):
+    act_rejected, act_pvalues = apply_pvalue_correction(
+        array, alpha=0.05, method=method
+    )
+    np.testing.assert_array_equal(act_rejected, exp_rejected)
+    np.testing.assert_array_almost_equal(act_pvalues, exp_pvalues)
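+
+
+def test_apply_pvalue_correction_matches_manual_bonferroni():
+    # A minimal sketch added for illustration (not part of the original suite):
+    # for Bonferroni, the adjusted p-value is min(1, p * n) over the n finite inputs,
+    # which matches the expected values parametrized above for method "b".
+    finite = np.array([0.1, 0.01, 0.2, 0.4, 0.5])
+    _, corrected = apply_pvalue_correction(finite, alpha=0.05, method="b")
+    np.testing.assert_array_almost_equal(
+        corrected, np.minimum(finite * finite.size, 1.0)
+    )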