From 6861fe3bfe310db6965862cf5f40a3e2338630b2 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 20 Nov 2024 11:50:49 +0100 Subject: [PATCH 01/40] :art: update docstrings and module string - line lenght restriction to 100 characters --- acore/enrichment_analysis.py | 192 +++++++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 43 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index a3f9082..bc9ba87 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -1,3 +1,7 @@ +"""Enrichment Analysis Module. Contains different functions to perform enrichment +analysis. +""" + import os import re import uuid @@ -29,15 +33,19 @@ def run_fisher(group1, group2, alternative="two-sided"): def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"): """ - Compute the Kolmogorov-Smirnov statistic on 2 samples. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html + Compute the Kolmogorov-Smirnov statistic on 2 samples. + See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html - :param list dist1: sequence of 1-D ndarray (first distribution to compare) drawn from a continuous distribution - :param list dist2: sequence of 1-D ndarray (second distribution to compare) drawn from a continuous distribution + :param list dist1: sequence of 1-D ndarray (first distribution to compare) + drawn from a continuous distribution + :param list dist2: sequence of 1-D ndarray (second distribution to compare) + drawn from a continuous distribution :param str alternative: defines the alternative hypothesis (default is ‘two-sided’): * **'two-sided'** * **'less'** * **'greater'** :return: statistic float and KS statistic pvalue float Two-tailed p-value. + Example:: result = run_kolmogorov_smirnov(dist1, dist2, alternative='two-sided') @@ -58,26 +66,45 @@ def run_site_regulation_enrichment( reject_col="rejected", group_col="group", method="fisher", - regex=r"(\w+~.+)_\w\d+\-\w+", + regex=r"(\w+~.+)_\w\d+\-\w+", # ! add example to docstring of what this matches correction="fdr_bh", ): """ - This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. - - :param regulation_data: pandas dataframe resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). - :param str identifier: name of the column from annotation containing feature identifiers. - :param list groups: column names from regulation_data containing group identifiers. - :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. - :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. - :param str method: method used to compute enrichment (only 'fisher' is supported currently). + This function runs a simple enrichment analysis for significantly + regulated protein sites in a dataset. + + :param regulation_data: pandas dataframe resulting from differential + regulation analysis. + :param annotation: pandas dataframe with annotations for features + (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). + :param str identifier: name of the column from annotation containing + feature identifiers. + :param list groups: column names from regulation_data containing + group identifiers. + :param str annotation_col: name of the column from annotation containing + annotation terms. + :param str reject_col: name of the column from regulatio_data containing + boolean for rejected null hypothesis. + :param str group_col: column name for new column in annotation dataframe + determining if feature belongs to foreground or background. + :param str method: method used to compute enrichment + (only 'fisher' is supported currently). :param str regex: how to extract the annotated identifier from the site identifier - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'. + :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', + 'background', 'pvalue', 'padj' and 'rejected'. Example:: - result = run_site_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', match="(\w+~.+)_\w\d+\-\w+") + result = run_site_regulation_enrichment(regulation_data, + annotation, + identifier='identifier', + groups=['group1', 'group2'], + annotation_col='annotation', + reject_col='rejected', + group_col='group', + method='fisher', + match="(\w+~.+)_\w\d+\-\w+" + ) """ result = pd.DataFrame() if regulation_data is not None: @@ -120,24 +147,44 @@ def run_up_down_regulation_enrichment( lfc_cutoff=1, ): """ - This function runs a simple enrichment analysis for significantly regulated proteins distinguishing between up- and down-regulated. + This function runs a simple enrichment analysis for significantly regulated proteins + distinguishing between up- and down-regulated. - :param regulation_data: pandas dataframe resulting from differential regulation analysis (CKG's regulation table). - :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). + :param regulation_data: pandas dataframe resulting from differential regulation + analysis (CKG's regulation table). + :param annotation: pandas dataframe with annotations for features + (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. :param list groups: column names from regulation_data containing group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulation_data containing boolean for rejected null hypothesis. - :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. - :param str method: method used to compute enrichment (only 'fisher' is supported currently). + :param str reject_col: name of the column from regulation_data containing boolean for + rejected null hypothesis. + :param str group_col: column name for new column in annotation dataframe determining + if feature belongs to foreground or background. + :param str method: method used to compute enrichment + (only 'fisher' is supported currently). :param str correction: method to be used for multiple-testing correction :param float alpha: adjusted p-value cutoff to define significance :param float lfc_cutoff: log fold-change cutoff to define practical significance - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'. + :return: Pandas dataframe with columns:'terms', 'identifiers', 'foreground', + 'background', 'pvalue', 'padj' and 'rejected'. Example:: - result = run_up_down_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', correction='fdr_bh', alpha=0.05, lfc_cutoff=1) + result = run_up_down_regulation_enrichment( + regulation_data, + annotation, + identifier='identifier', + groups=['group1', + 'group2'], + annotation_col='annotation', + reject_col='rejected', + group_col='group', + method='fisher', + correction='fdr_bh', + alpha=0.05, + lfc_cutoff=1, + ) """ enrichment_results = {} for g1, g2 in regulation_data.groupby(groups).groups: @@ -201,22 +248,37 @@ def run_regulation_enrichment( correction="fdr_bh", ): """ - This function runs a simple enrichment analysis for significantly regulated features in a dataset. + This function runs a simple enrichment analysis for significantly regulated features + in a dataset. :param regulation_data: pandas dataframe resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). + :param annotation: pandas dataframe with annotations for features + (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. :param list groups: column names from regulation_data containing group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. - :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. + :param str reject_col: name of the column from regulatio_data containing boolean for + rejected null hypothesis. + :param str group_col: column name for new column in annotation dataframe determining + if feature belongs to foreground or background. :param str method: method used to compute enrichment (only 'fisher' is supported currently). :param str correction: method to be used for multiple-testing correction - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'. + :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', + 'background', 'pvalue', 'padj' and 'rejected'. Example:: - result = run_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher') + result = run_regulation_enrichment( + regulation_data, + annotation, + identifier='identifier', + groups=['group1', + 'group2'], + annotation_col='annotation', + reject_col='rejected', + group_col='group', + method='fisher', + ) """ result = {} foreground_list = ( @@ -267,20 +329,36 @@ def run_enrichment( correction="fdr_bh", ): """ - Computes enrichment of the foreground relative to a given backgroung, using Fisher's exact test, and corrects for multiple hypothesis testing. + Computes enrichment of the foreground relative to a given backgroung, + using Fisher's exact test, and corrects for multiple hypothesis testing. - :param data: pandas dataframe with annotations for dataset features (columns: 'annotation', 'identifier', 'source', 'group'). + :param data: pandas dataframe with annotations for dataset features + (columns: 'annotation', 'identifier', 'source', 'group'). :param str foreground_id: group identifier of features that belong to the foreground. :param str background_id: group identifier of features that belong to the background. :param str annotation_col: name of the column containing annotation terms. :param str group_col: name of column containing the group identifiers. :param str identifier_col: name of column containing dependent variables identifiers. :param str method: method used to compute enrichment (only 'fisher' is supported currently). - :return: Pandas dataframe with annotation terms, features, number of foregroung/background features in each term, p-values and corrected p-values (columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'). + :return: Pandas dataframe with annotation terms, features, + number of foregroung/background features in each term, + p-values and corrected p-values + (columns: 'terms', 'identifiers', 'foreground', + 'background', 'pvalue', 'padj' and 'rejected'). Example:: - result = run_enrichment(data, foreground='foreground', background='background', foreground_pop=len(foreground_list), background_pop=len(background_list), annotation_col='annotation', group_col='group', identifier_col='identifier', method='fisher') + result = run_enrichment( + data, + foreground='foreground', + background='background', + foreground_pop=len(foreground_list), + background_pop=len(background_list), + annotation_col='annotation', + group_col='group', + identifier_col='identifier', + method='fisher', + ) """ result = pd.DataFrame() df = data.copy() @@ -359,23 +437,39 @@ def run_ssgsea( permutations=0, ): """ - Project each sample within a data set onto a space of gene set enrichment scores using the ssGSEA projection methodology described in Barbie et al., 2009. + Project each sample within a data set onto a space of gene set enrichment scores using + the single sample gene set enrichment analysis (ssGSEA) projection methodology + described in Barbie et al., 2009: + https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA). :param data: pandas dataframe with the quantified features (i.e. subject x proteins) - :param annotation: pandas dataframe with the annotation to be used in the enrichment (i.e. CKG pathway annotation file) + :param annotation: pandas dataframe with the annotation to be used in the enrichment + (i.e. CKG pathway annotation file) :param str annotation_col: name of the column containing annotation terms. :param str identifier_col: name of column containing dependent variables identifiers. - :param list set_index: column/s to be used as index. Enrichment will be calculated for these values (i.e ["subject"] will return subjects x pathways matrix of enrichment scores) - :param str out_dir: directory path where results will be stored (default None, tmp folder is used) - :param int min_size: minimum number of features (i.e. proteins) in enriched terms (i.e. pathways) - :param int max_size: maximum number of features (i.e. proteins) in enriched terms (i.e. pathways) + :param list set_index: column/s to be used as index. Enrichment will be calculated + for these values (i.e ["subject"] will return subjects x pathways matrix of + enrichment scores) + :param str out_dir: directory path where results will be stored + (default None, tmp folder is used) + :param int min_size: minimum number of features (i.e. proteins) in enriched terms + (i.e. pathways) + :param int max_size: maximum number of features (i.e. proteins) in enriched terms + (i.e. pathways) :param bool scale: whether or not to scale the data :param int permutations: number of permutations used in the ssgsea analysis - :return: dictionary with two dataframes: es - enrichment scores, and nes - normalized enrichment scores. + :return: dictionary with two dataframes: es - enrichment scores, + and nes - normalized enrichment scores. Example:: stproject = "P0000008" - p = project.Project(stproject, datasets={}, knowledge=None, report={}, configuration_files=None) + p = project.Project( + stproject, + datasets={}, + knowledge=None, + report={}, + configuration_files=None, + ) p.build_project(False) p.generate_report() @@ -383,7 +477,19 @@ def run_ssgsea( annotations = proteomics_dataset.get_dataframe("pathway annotation") processed = proteomics_dataset.get_dataframe('processed') - result = run_ssgsea(processed, annotations, annotation_col='annotation', identifier_col='identifier', set_index=['group', 'sample','subject'], outdir=None, min_size=10, scale=False, permutations=0) + result = run_ssgsea( + processed, + annotations, + annotation_col='annotation', + identifier_col='identifier', + set_index=['group', + 'sample', + 'subject'], + outdir=None, + min_size=10, + scale=False, + permutations=0 + ) """ result = {} From 64d34145343f8ae188334f63b10eb7bad976f155 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Fri, 22 Nov 2024 11:29:20 +0100 Subject: [PATCH 02/40] :construction: test enrichment analysis :art: type annotations maybe the dataset is not he best one to test enrichment analysis (few diff. reg. protein groups) --- acore/enrichment_analysis.py | 44 +++---- docs/api_examples/enrichment_analysis.py | 139 +++++++++++++++++++++++ 2 files changed, 161 insertions(+), 22 deletions(-) create mode 100644 docs/api_examples/enrichment_analysis.py diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index bc9ba87..6b35cfd 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -237,16 +237,16 @@ def run_up_down_regulation_enrichment( def run_regulation_enrichment( - regulation_data, - annotation, - identifier="identifier", - groups=["group1", "group2"], - annotation_col="annotation", - reject_col="rejected", - group_col="group", - method="fisher", - correction="fdr_bh", -): + regulation_data: pd.DataFrame, + annotation: pd.DataFrame, + identifier: str = "identifier", + groups: list[str] = ["group1", "group2"], + annotation_col: str = "annotation", + reject_col: str = "rejected", + group_col: str = "group", + method: str = "fisher", + correction: str = "fdr_bh", +) -> pd.DataFrame: """ This function runs a simple enrichment analysis for significantly regulated features in a dataset. @@ -255,7 +255,7 @@ def run_regulation_enrichment( :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. - :param list groups: column names from regulation_data containing group identifiers. + :param list[str] groups: column names from regulation_data containing group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. @@ -280,7 +280,6 @@ def run_regulation_enrichment( method='fisher', ) """ - result = {} foreground_list = ( regulation_data[regulation_data[reject_col]][identifier].unique().tolist() ) @@ -318,16 +317,16 @@ def run_regulation_enrichment( def run_enrichment( data, - foreground_id, - background_id, - foreground_pop, - background_pop, - annotation_col="annotation", - group_col="group", - identifier_col="identifier", - method="fisher", - correction="fdr_bh", -): + foreground_id: str, + background_id: str, + foreground_pop: list[str], + background_pop: list[str], + annotation_col: str = "annotation", + group_col: str = "group", + identifier_col: str = "identifier", + method: str = "fisher", + correction: str = "fdr_bh", +) -> pd.DataFrame: """ Computes enrichment of the foreground relative to a given backgroung, using Fisher's exact test, and corrects for multiple hypothesis testing. @@ -386,6 +385,7 @@ def run_enrichment( num_background = num_background[0] else: num_background = 0 + # ! what happens if this is not the case? if method == "fisher" and num_foreground > 1: _, pvalue = run_fisher( [num_foreground, foreground_pop - num_foreground], diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py new file mode 100644 index 0000000..8ade04a --- /dev/null +++ b/docs/api_examples/enrichment_analysis.py @@ -0,0 +1,139 @@ +# %% [markdown] +# # Enrichment analysis +# +# - we need some groups of genes to compute clusters +# - we need functional annotations, i.e. a category summarizing a set of genes. +# - +# You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis +# on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o). +# +# Use example data for ovarian cancer +# ([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372)) + + +# %% tags=["hide-output"] +# %pip install acore + + +# %% +import pandas as pd + +import acore +import acore.differential_regulation +import acore.enrichment_analysis + +# %% [markdown] +# Parameters of this notebook + +# %% tags=["parameters"] +omics: str = ( + "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/" + "example_data/PXD010372/processed/omics.csv" +) +meta: str = ( + "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/" + "example_data/PXD010372/processed/meta_patients.csv" +) + +# %% [markdown] +# # Load processed data + +# %% +df_omics = pd.read_csv(omics, index_col=0) +df_meta = pd.read_csv(meta, index_col=0) +df_omics + +# %% +df_omics.notna().sum().sort_values(ascending=True).plot() + +# %% [markdown] +# Keep only features with a certain amount of non-NaN values and select 100 of these +# for illustration + +# %% +df_omics = df_omics.dropna(axis=1) +# .sample( +# 500, +# axis=1, +# random_state=42, +# ) + +# %% +df_meta + +# %% [markdown] +# ## Compute up and downregulated genes +# These will be used to find enrichments in the set of both up and downregulated genes. +# +# - only CT45 shows to be regulated + +# %% +group = "Status" +covariates = ["PlatinumValue"] +diff_reg = acore.differential_regulation.run_anova( + df_omics.join(df_meta[[group]]), + # df_omics.join(df_meta[[group] + covariates]), + # covariates=covariates, + drop_cols=[], + subject=None, + group=group, +) +diff_reg.describe(exclude=["float"]) + +# %% +diff_reg.describe(exclude=[float]) + +# %% [markdown] +# ## Find functional annotations, here pathways +# + +# %% +from dsp_phospho.ptms.uniprot_id_mapping import ( + check_id_mapping_results_ready, + get_id_mapping_results_link, + get_id_mapping_results_search, + submit_id_mapping, +) + +job_id = submit_id_mapping( + from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=df_omics.columns +) + +if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + # add fields to the link to get more information + # From and Entry (accession) are the same. + results = get_id_mapping_results_search( + link + "?fields=accession,go_p,go_c,go_f&format=tsv" + ) +header = results.pop(0).split("\t") +results = [line.split("\t") for line in results] +results = pd.DataFrame(results, columns=header) +results + +# %% +annotation = ( + results.set_index("Entry") + .rename_axis("identifier") + .drop("From", axis=1) + .rename_axis("source", axis=1) + .stack() + .to_frame("annotation") + .replace("", pd.NA) + .dropna() + .sort_values(["source", "annotation"]) + .reset_index() +) +annotation + +# %% [markdown] +# ## Enrichment analysis +# + +# %% +ret = acore.enrichment_analysis.run_regulation_enrichment( + regulation_data=diff_reg, annotation=annotation +) +ret + +# %% From d37dc5e84301d97a4ea205ac5f6d7d281906ec8a Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 10:48:54 +0100 Subject: [PATCH 03/40] :art::bug: Minimum number of genes rejected to be eligable how many proteins/genes do need to be rejected to be considered valid. Before it was at least 2 genes. --- acore/enrichment_analysis.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 6b35cfd..51e1ad6 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -319,8 +319,9 @@ def run_enrichment( data, foreground_id: str, background_id: str, - foreground_pop: list[str], - background_pop: list[str], + foreground_pop: int, + background_pop: int, + min_detected_in_set: int = 1, annotation_col: str = "annotation", group_col: str = "group", identifier_col: str = "identifier", @@ -335,6 +336,8 @@ def run_enrichment( (columns: 'annotation', 'identifier', 'source', 'group'). :param str foreground_id: group identifier of features that belong to the foreground. :param str background_id: group identifier of features that belong to the background. + :param int foreground_pop: number of features in the foreground. + :param int background_pop: number of features in the background. :param str annotation_col: name of the column containing annotation terms. :param str group_col: name of column containing the group identifiers. :param str identifier_col: name of column containing dependent variables identifiers. @@ -386,7 +389,7 @@ def run_enrichment( else: num_background = 0 # ! what happens if this is not the case? - if method == "fisher" and num_foreground > 1: + if method == "fisher" and num_foreground >= min_detected_in_set: _, pvalue = run_fisher( [num_foreground, foreground_pop - num_foreground], [num_background, background_pop - foreground_pop - num_background], From faf7158627ea1189e694577eb4ec27aba01e67d3 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 10:50:04 +0100 Subject: [PATCH 04/40] :white_check_mark: add minimal test for enrichment analysis --- tests/test_enrichment.py | 51 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py index cfd2e03..bc463f1 100644 --- a/tests/test_enrichment.py +++ b/tests/test_enrichment.py @@ -1,5 +1,8 @@ import unittest + +import pandas as pd from scipy import stats + import acore.enrichment_analysis as ea @@ -7,9 +10,11 @@ class TestRunFisher(unittest.TestCase): def test_run_fisher(self): group1 = [10, 5] group2 = [8, 12] - alternative = 'two-sided' + alternative = "two-sided" - expected_odds, expected_pvalue = stats.fisher_exact([[10, 5], [8, 12]], alternative) + expected_odds, expected_pvalue = stats.fisher_exact( + [[10, 5], [8, 12]], alternative + ) result = ea.run_fisher(group1, group2, alternative=alternative) @@ -21,9 +26,11 @@ class TestRunKolmogorovSmirnov(unittest.TestCase): def test_run_kolmogorov_smirnov(self): dist1 = [1, 2, 3, 4, 5] dist2 = [1, 2, 3, 4, 6] - alternative = 'two-sided' + alternative = "two-sided" - expected_result = stats.ks_2samp(dist1, dist2, alternative=alternative, mode='auto') + expected_result = stats.ks_2samp( + dist1, dist2, alternative=alternative, mode="auto" + ) result = ea.run_kolmogorov_smirnov(dist1, dist2, alternative=alternative) @@ -31,5 +38,39 @@ def test_run_kolmogorov_smirnov(self): self.assertEqual(result[1], expected_result.pvalue) -if __name__ == '__main__': +def test_run_enrichtment(): + annotation = { + "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"], + "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"], + "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P"], + } + annotation = pd.DataFrame(annotation) + regulation_res = { + "identifier": ["gene1", "gene2", "gene3", "gene4", "gene5", "gene6"], + "rejected": [True, True, False, False, True, True], + } + regulation_res = pd.DataFrame(regulation_res) + + actual = ea.run_regulation_enrichment( + regulation_data=regulation_res, + annotation=annotation, + ) + + expected = pd.DataFrame( + { + "terms": ["path1", "path2", "path3"], + "identifiers": ["gene1,gene2", "gene1,gene5", "gene6"], + "foreground": [2, 2, 1], + "background": [1, 0, 0], + "foreground_pop": [4, 4, 4], + "background_pop": [6, 6, 6], + "pvalue": [1.0, 0.4666666666666667, 1.0], + "padj": [1.0, 1.0, 1.0], + "rejected": [False, False, False], + } + ) + assert expected.equals(actual) + + +if __name__ == "__main__": unittest.main() From c1c66f947642797f3ce8778ae802fcf94c1210b5 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 10:53:40 +0100 Subject: [PATCH 05/40] :sparkles: add fetching information from Uniprot KB - will be tested by building docs (maybe add a unittest later) --- acore/io/uniprot.py | 322 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 acore/io/uniprot.py diff --git a/acore/io/uniprot.py b/acore/io/uniprot.py new file mode 100644 index 0000000..cedc98b --- /dev/null +++ b/acore/io/uniprot.py @@ -0,0 +1,322 @@ +"""Uniprot ID mapping using Python. + +Source: https://www.uniprot.org/help/id_mapping +""" + +import json +import re +import time +import zlib +from urllib.parse import parse_qs, urlencode, urlparse +from xml.etree import ElementTree + +import requests +from requests.adapters import HTTPAdapter, Retry + +POLLING_INTERVAL = 3 +API_URL = "https://rest.uniprot.org" + + +retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) +session = requests.Session() +session.mount("https://", HTTPAdapter(max_retries=retries)) + + +def check_response(response): + try: + response.raise_for_status() + except requests.HTTPError: + print(response.json()) + raise + + +def submit_id_mapping(from_db, to_db, ids): + request = requests.post( + f"{API_URL}/idmapping/run", + data={"from": from_db, "to": to_db, "ids": ",".join(ids)}, + ) + check_response(request) + return request.json()["jobId"] + + +def get_next_link(headers): + re_next_link = re.compile(r'<(.+)>; rel="next"') + if "Link" in headers: + match = re_next_link.match(headers["Link"]) + if match: + return match.group(1) + + +def check_id_mapping_results_ready(job_id): + while True: + request = session.get(f"{API_URL}/idmapping/status/{job_id}") + check_response(request) + j = request.json() + if "jobStatus" in j: + if j["jobStatus"] in ("NEW", "RUNNING"): + print(f"Retrying in {POLLING_INTERVAL}s") + time.sleep(POLLING_INTERVAL) + else: + raise Exception(j["jobStatus"]) + else: + return bool(j["results"] or j["failedIds"]) + + +def get_batch(batch_response, file_format, compressed): + batch_url = get_next_link(batch_response.headers) + while batch_url: + batch_response = session.get(batch_url) + batch_response.raise_for_status() + yield decode_results(batch_response, file_format, compressed) + batch_url = get_next_link(batch_response.headers) + + +def combine_batches(all_results, batch_results, file_format): + if file_format == "json": + for key in ("results", "failedIds"): + if key in batch_results and batch_results[key]: + all_results[key] += batch_results[key] + elif file_format == "tsv": + return all_results + batch_results[1:] + else: + return all_results + batch_results + return all_results + + +def get_id_mapping_results_link(job_id): + url = f"{API_URL}/idmapping/details/{job_id}" + request = session.get(url) + check_response(request) + return request.json()["redirectURL"] + + +def decode_results(response, file_format, compressed): + if compressed: + decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) + if file_format == "json": + j = json.loads(decompressed.decode("utf-8")) + return j + elif file_format == "tsv": + return [line for line in decompressed.decode("utf-8").split("\n") if line] + elif file_format == "xlsx": + return [decompressed] + elif file_format == "xml": + return [decompressed.decode("utf-8")] + else: + return decompressed.decode("utf-8") + elif file_format == "json": + return response.json() + elif file_format == "tsv": + return [line for line in response.text.split("\n") if line] + elif file_format == "xlsx": + return [response.content] + elif file_format == "xml": + return [response.text] + return response.text + + +def get_xml_namespace(element): + m = re.match(r"\{(.*)\}", element.tag) + return m.groups()[0] if m else "" + + +def merge_xml_results(xml_results): + merged_root = ElementTree.fromstring(xml_results[0]) + for result in xml_results[1:]: + root = ElementTree.fromstring(result) + for child in root.findall("{http://uniprot.org/uniprot}entry"): + merged_root.insert(-1, child) + ElementTree.register_namespace("", get_xml_namespace(merged_root[0])) + return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True) + + +def print_progress_batches(batch_index, size, total): + n_fetched = min((batch_index + 1) * size, total) + print(f"Fetched: {n_fetched} / {total}") + + +def get_id_mapping_results_search(url): + parsed = urlparse(url) + query = parse_qs(parsed.query) + file_format = query["format"][0] if "format" in query else "json" + if "size" in query: + size = int(query["size"][0]) + else: + size = 500 + query["size"] = size + compressed = ( + query["compressed"][0].lower() == "true" if "compressed" in query else False + ) + parsed = parsed._replace(query=urlencode(query, doseq=True)) + url = parsed.geturl() + request = session.get(url) + check_response(request) + results = decode_results(request, file_format, compressed) + total = int(request.headers["x-total-results"]) + print_progress_batches(0, size, total) + for i, batch in enumerate(get_batch(request, file_format, compressed), 1): + results = combine_batches(results, batch, file_format) + print_progress_batches(i, size, total) + if file_format == "xml": + return merge_xml_results(results) + return results + + +def get_id_mapping_results_stream(url): + if "/stream/" not in url: + url = url.replace("/results/", "/results/stream/") + request = session.get(url) + check_response(request) + parsed = urlparse(url) + query = parse_qs(parsed.query) + file_format = query["format"][0] if "format" in query else "json" + compressed = ( + query["compressed"][0].lower() == "true" if "compressed" in query else False + ) + return decode_results(request, file_format, compressed) + + +if __name__ == "__main__": + # id mapping is used to create a link to a query (you can see the json in the browser) + # UniProtKB is the knowleadgebase integrating all kind of other databases + import pandas as pd + + job_id = submit_id_mapping( + from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=["P05067", "P12345"] + ) + + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + # add fields to the link to get more information + # From and Entry (accession) are the same. + results = get_id_mapping_results_search( + link + "?fields=accession,go_p,go_c,go_f&format=tsv" + ) + # see the available fields you can add + # https://www.uniprot.org/help/return_fields + # and the available formats: json and tsv (for most endpoints) + # https://www.uniprot.org/help/api_queries#tips + print(results) + header = results.pop(0).split("\t") + results = [line.split("\t") for line in results] + df = pd.DataFrame(results, columns=header) + # result from: + # df.to_dict(orient="records") + records = [ + { + "From": "P05067", + "Gene Ontology (biological process)": ( + "adult locomotory behavior [GO:0008344]; amyloid fibril formation" + " [GO:1990000]; astrocyte activation [GO:0048143]; astrocyte activation" + " involved in immune response [GO:0002265]; axo-dendritic transport" + " [GO:0008088]; axon midline choice point recognition [GO:0016199];" + " axonogenesis [GO:0007409]; cell adhesion [GO:0007155]; cellular" + " response to amyloid-beta [GO:1904646]; central nervous system" + " development [GO:0007417]; cholesterol metabolic process [GO:0008203];" + " cognition [GO:0050890]; collateral sprouting in absence of injury" + " [GO:0048669]; cytosolic mRNA polyadenylation [GO:0180011]; dendrite" + " development [GO:0016358]; endocytosis [GO:0006897]; extracellular" + " matrix organization [GO:0030198]; forebrain development [GO:0030900];" + " G2/M transition of mitotic cell cycle [GO:0000086]; intracellular" + " copper ion homeostasis [GO:0006878]; ionotropic glutamate receptor" + " signaling pathway [GO:0035235]; learning [GO:0007612]; learning or" + " memory [GO:0007611]; locomotory behavior [GO:0007626]; mating" + " behavior [GO:0007617]; microglia development [GO:0014005]; microglial" + " cell activation [GO:0001774]; modulation of excitatory postsynaptic" + " potential [GO:0098815]; negative regulation of cell population" + " proliferation [GO:0008285]; negative regulation of gene expression" + " [GO:0010629]; negative regulation of long-term synaptic potentiation" + " [GO:1900272]; negative regulation of neuron differentiation" + " [GO:0045665]; neuromuscular process controlling balance [GO:0050885];" + " neuron apoptotic process [GO:0051402]; neuron cellular homeostasis" + " [GO:0070050]; neuron projection development [GO:0031175]; neuron" + " projection maintenance [GO:1990535]; neuron remodeling [GO:0016322];" + " NMDA selective glutamate receptor signaling pathway [GO:0098989];" + " Notch signaling pathway [GO:0007219]; positive regulation of amyloid" + " fibril formation [GO:1905908]; positive regulation of" + " calcium-mediated signaling [GO:0050850]; positive regulation of" + " chemokine production [GO:0032722]; positive regulation of ERK1 and" + " ERK2 cascade [GO:0070374]; positive regulation of G2/M transition of" + " mitotic cell cycle [GO:0010971]; positive regulation of gene" + " expression [GO:0010628]; positive regulation of glycolytic process" + " [GO:0045821]; positive regulation of inflammatory response" + " [GO:0050729]; positive regulation of interleukin-1 beta production" + " [GO:0032731]; positive regulation of interleukin-6 production" + " [GO:0032755]; positive regulation of JNK cascade [GO:0046330];" + " positive regulation of long-term synaptic potentiation [GO:1900273];" + " positive regulation of mitotic cell cycle [GO:0045931]; positive" + " regulation of non-canonical NF-kappaB signal transduction" + " [GO:1901224]; positive regulation of peptidyl-serine phosphorylation" + " [GO:0033138]; positive regulation of peptidyl-threonine" + " phosphorylation [GO:0010800]; positive regulation of protein" + " metabolic process [GO:0051247]; positive regulation of protein" + " phosphorylation [GO:0001934]; positive regulation of T cell migration" + " [GO:2000406]; positive regulation of transcription by RNA polymerase" + " II [GO:0045944]; positive regulation of tumor necrosis factor" + " production [GO:0032760]; regulation of gene expression [GO:0010468];" + " regulation of long-term neuronal synaptic plasticity [GO:0048169];" + " regulation of multicellular organism growth [GO:0040014]; regulation" + " of peptidyl-tyrosine phosphorylation [GO:0050730]; regulation of" + " presynapse assembly [GO:1905606]; regulation of spontaneous synaptic" + " transmission [GO:0150003]; regulation of synapse structure or" + " activity [GO:0050803]; regulation of translation [GO:0006417];" + " regulation of Wnt signaling pathway [GO:0030111]; response to" + " interleukin-1 [GO:0070555]; response to oxidative stress" + " [GO:0006979]; smooth endoplasmic reticulum calcium ion homeostasis" + " [GO:0051563]; suckling behavior [GO:0001967]; synapse organization" + " [GO:0050808]; synaptic assembly at neuromuscular junction" + " [GO:0051124]; visual learning [GO:0008542]" + ), + "Gene Ontology (cellular component)": ( + "apical part of cell [GO:0045177]; axon [GO:0030424]; cell surface" + " [GO:0009986]; cell-cell junction [GO:0005911]; ciliary rootlet" + " [GO:0035253]; clathrin-coated pit [GO:0005905]; COPII-coated ER to" + " Golgi transport vesicle [GO:0030134]; cytoplasm [GO:0005737]; cytosol" + " [GO:0005829]; dendrite [GO:0030425]; dendritic shaft [GO:0043198];" + " dendritic spine [GO:0043197]; early endosome [GO:0005769];" + " endoplasmic reticulum [GO:0005783]; endoplasmic reticulum lumen" + " [GO:0005788]; endosome [GO:0005768]; endosome lumen [GO:0031904];" + " extracellular exosome [GO:0070062]; extracellular region" + " [GO:0005576]; extracellular space [GO:0005615]; Golgi apparatus" + " [GO:0005794]; Golgi lumen [GO:0005796]; Golgi-associated vesicle" + " [GO:0005798]; growth cone [GO:0030426]; membrane [GO:0016020];" + " membrane raft [GO:0045121]; mitochondrial inner membrane" + " [GO:0005743]; neuromuscular junction [GO:0031594]; nuclear envelope" + " lumen [GO:0005641]; perikaryon [GO:0043204]; perinuclear region of" + " cytoplasm [GO:0048471]; plasma membrane [GO:0005886]; platelet alpha" + " granule lumen [GO:0031093]; presynaptic active zone [GO:0048786];" + " receptor complex [GO:0043235]; recycling endosome [GO:0055037];" + " smooth endoplasmic reticulum [GO:0005790]; spindle midzone" + " [GO:0051233]; synapse [GO:0045202]; synaptic vesicle [GO:0008021];" + " trans-Golgi network membrane [GO:0032588]" + ), + "Gene Ontology (molecular function)": ( + "DNA binding [GO:0003677]; enzyme binding [GO:0019899]; heparin binding" + " [GO:0008201]; identical protein binding [GO:0042802]; protein" + " serine/threonine kinase binding [GO:0120283]; PTB domain binding" + " [GO:0051425]; receptor ligand activity [GO:0048018]; RNA polymerase" + " II cis-regulatory region sequence-specific DNA binding [GO:0000978];" + " serine-type endopeptidase inhibitor activity [GO:0004867]; signaling" + " receptor activator activity [GO:0030546]; signaling receptor binding" + " [GO:0005102]; transition metal ion binding [GO:0046914]" + ), + }, + { + "From": "P12345", + "Gene Ontology (biological process)": ( + "2-oxoglutarate metabolic process [GO:0006103]; aspartate catabolic" + " process [GO:0006533]; aspartate metabolic process [GO:0006531];" + " glutamate metabolic process [GO:0006536]; lipid transport" + " [GO:0006869]; protein folding [GO:0006457]" + ), + "Gene Ontology (cellular component)": ( + "mitochondrial matrix [GO:0005759]; mitochondrion [GO:0005739]; plasma" + " membrane [GO:0005886]" + ), + "Gene Ontology (molecular function)": ( + "kynurenine-oxoglutarate transaminase activity [GO:0016212];" + " L-aspartate:2-oxoglutarate aminotransferase activity [GO:0004069];" + " pyridoxal phosphate binding [GO:0030170]" + ), + }, + ] From 0b3cd6eaac039cdaa30082496b352491180b31ec Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 11:05:57 +0100 Subject: [PATCH 06/40] :art: fetch annotation from uniprot in enrichment example not sure if the example is the best. --- docs/api_examples/enrichment_analysis.py | 138 +++++++++++++++-------- 1 file changed, 91 insertions(+), 47 deletions(-) diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 8ade04a..af89143 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -16,6 +16,8 @@ # %% +from pathlib import Path + import pandas as pd import acore @@ -26,20 +28,21 @@ # Parameters of this notebook # %% tags=["parameters"] -omics: str = ( - "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/" - "example_data/PXD010372/processed/omics.csv" -) -meta: str = ( +base_path: str = ( "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/" - "example_data/PXD010372/processed/meta_patients.csv" + "example_data/PXD010372/processed" ) +omics: str = f"{base_path}/omics.csv" +meta_pgs: str = f"{base_path}/meta_pgs.csv" +meta: str = f"{base_path}/meta_patients.csv" +N_to_sample: int = 1_000 # %% [markdown] # # Load processed data # %% df_omics = pd.read_csv(omics, index_col=0) +df_meta_pgs = pd.read_csv(meta_pgs, index_col=0) df_meta = pd.read_csv(meta, index_col=0) df_omics @@ -48,32 +51,42 @@ # %% [markdown] # Keep only features with a certain amount of non-NaN values and select 100 of these -# for illustration +# for illustration. Add the ones which were differently regulated in the ANOVA using all +# the protein groups. # %% -df_omics = df_omics.dropna(axis=1) -# .sample( -# 500, -# axis=1, -# random_state=42, -# ) +idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"] +df_omics[idx_always_included] + +# %% +df_omics = ( + df_omics + # .dropna(axis=1) + .drop(idx_always_included, axis=1) + .dropna(thresh=18, axis=1) + .sample( + N_to_sample - len(idx_always_included), + axis=1, + random_state=42, + ) + .join(df_omics[idx_always_included]) +) +df_omics + # %% df_meta + # %% [markdown] # ## Compute up and downregulated genes # These will be used to find enrichments in the set of both up and downregulated genes. -# -# - only CT45 shows to be regulated # %% group = "Status" covariates = ["PlatinumValue"] diff_reg = acore.differential_regulation.run_anova( df_omics.join(df_meta[[group]]), - # df_omics.join(df_meta[[group] + covariates]), - # covariates=covariates, drop_cols=[], subject=None, group=group, @@ -81,50 +94,77 @@ diff_reg.describe(exclude=["float"]) # %% -diff_reg.describe(exclude=[float]) +diff_reg.query("rejected == True") # %% [markdown] # ## Find functional annotations, here pathways # # %% -from dsp_phospho.ptms.uniprot_id_mapping import ( +from acore.io.uniprot import ( check_id_mapping_results_ready, get_id_mapping_results_link, get_id_mapping_results_search, submit_id_mapping, ) -job_id = submit_id_mapping( - from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=df_omics.columns -) -if check_id_mapping_results_ready(job_id): - link = get_id_mapping_results_link(job_id) - # add fields to the link to get more information - # From and Entry (accession) are the same. - results = get_id_mapping_results_search( - link + "?fields=accession,go_p,go_c,go_f&format=tsv" +def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: + """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's + knowledgebase (KB). + + Parameters + ---------- + ids : pd.Index | list + Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields. + fields : str, optional + Fields to fetch, by default "accession,go_p,go_c. See for availble fields: + https://www.uniprot.org/help/return_fields + + Returns + ------- + pd.DataFrame + DataFrame with annotations of the UniProt IDs. + """ + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids) + + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + # add fields to the link to get more information + # From and Entry (accession) are the same for UniProt IDs. + results = get_id_mapping_results_search( + link + "?fields=accession,go_p,go_c,go_f&format=tsv" + ) + header = results.pop(0).split("\t") + results = [line.split("\t") for line in results] + df = pd.DataFrame(results, columns=header) + return df + + +fname_annotations = "downloaded/annotations.csv" +fname = Path(fname_annotations) +try: + annotations = pd.read_csv(fname, index_col=0) + print(f"Loaded annotations from {fname}") +except FileNotFoundError: + print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.") + annotations = fetch_annotations(df_omics.columns) + annotations = ( + annotations.set_index("Entry") + .rename_axis("identifier") + .drop("From", axis=1) + .rename_axis("source", axis=1) + .stack() + .to_frame("annotation") + .replace("", pd.NA) + .dropna() + .sort_values(["source", "annotation"]) + .reset_index() ) -header = results.pop(0).split("\t") -results = [line.split("\t") for line in results] -results = pd.DataFrame(results, columns=header) -results + fname.parent.mkdir(exist_ok=True, parents=True) + annotations.to_csv(fname, index=True) -# %% -annotation = ( - results.set_index("Entry") - .rename_axis("identifier") - .drop("From", axis=1) - .rename_axis("source", axis=1) - .stack() - .to_frame("annotation") - .replace("", pd.NA) - .dropna() - .sort_values(["source", "annotation"]) - .reset_index() -) -annotation +annotations # %% [markdown] # ## Enrichment analysis @@ -132,8 +172,12 @@ # %% ret = acore.enrichment_analysis.run_regulation_enrichment( - regulation_data=diff_reg, annotation=annotation + regulation_data=diff_reg, + annotation=annotations, + # group_col="annotation", + groups=["groups1", "groups2"], ) ret + # %% From 545299bb58b101292aee09a88b169dcd88aba491 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 12:12:02 +0100 Subject: [PATCH 07/40] :bug: make decomposition a module --- acore/decomposition/__init__.py | 3 +++ acore/decomposition/pca.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 acore/decomposition/__init__.py diff --git a/acore/decomposition/__init__.py b/acore/decomposition/__init__.py new file mode 100644 index 0000000..9f86d45 --- /dev/null +++ b/acore/decomposition/__init__.py @@ -0,0 +1,3 @@ +from . import pca + +__all__ = ["pca"] diff --git a/acore/decomposition/pca.py b/acore/decomposition/pca.py index 229661d..0ae8102 100644 --- a/acore/decomposition/pca.py +++ b/acore/decomposition/pca.py @@ -26,10 +26,10 @@ def run_pca( n_comp_max = min(df_wide.shape) n_comp_max = min(n_comp_max, n_components) pca = sklearn.decomposition.PCA(n_components=n_comp_max) - PCs = pca.fit_transform(df_wide) + pcs = pca.fit_transform(df_wide) cols = [ f"principal component {i+1} ({var_explained*100:.2f} %)" for i, var_explained in enumerate(pca.explained_variance_ratio_) ] - PCs = pd.DataFrame(PCs, index=df_wide.index, columns=cols) - return PCs, pca + pcs = pd.DataFrame(pcs, index=df_wide.index, columns=cols) + return pcs, pca From 18121c9841856bfdd5f2ff70bf16a1cab1250670 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 16:04:00 +0100 Subject: [PATCH 08/40] :art::white_check_mark: add fdr alpha parameters as option, optimize and test multiple testing --- acore/enrichment_analysis.py | 11 +++++- acore/multiple_testing.py | 23 ++++++++---- docs/api_examples/enrichment_analysis.py | 3 +- tests/test_multiple_testing.py | 45 ++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 tests/test_multiple_testing.py diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 51e1ad6..7d113c5 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -246,6 +246,7 @@ def run_regulation_enrichment( group_col: str = "group", method: str = "fisher", correction: str = "fdr_bh", + correction_alpha: float = 0.05, ) -> pd.DataFrame: """ This function runs a simple enrichment analysis for significantly regulated features @@ -310,6 +311,7 @@ def run_regulation_enrichment( identifier_col=identifier, method=method, correction=correction, + correction_alpha=correction_alpha, ) return result @@ -327,6 +329,7 @@ def run_enrichment( identifier_col: str = "identifier", method: str = "fisher", correction: str = "fdr_bh", + correction_alpha: float = 0.05, ) -> pd.DataFrame: """ Computes enrichment of the foreground relative to a given backgroung, @@ -342,6 +345,8 @@ def run_enrichment( :param str group_col: name of column containing the group identifiers. :param str identifier_col: name of column containing dependent variables identifiers. :param str method: method used to compute enrichment (only 'fisher' is supported currently). + :param str correction: method to be used for multiple-testing correction. + :param float correction_alpha: adjusted p-value cutoff to define significance. :return: Pandas dataframe with annotation terms, features, number of foregroung/background features in each term, p-values and corrected p-values @@ -408,7 +413,11 @@ def run_enrichment( ) ) if len(pvalues) > 1: - rejected, padj = apply_pvalue_correction(pvalues, alpha=0.05, method=correction) + rejected, padj = apply_pvalue_correction( + pvalues, + alpha=correction_alpha, + method=correction, + ) result = pd.DataFrame( { "terms": terms, diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py index 35fd5d6..bf22e6b 100644 --- a/acore/multiple_testing.py +++ b/acore/multiple_testing.py @@ -5,10 +5,17 @@ from sklearn.utils import shuffle from statsmodels.stats import multitest +# ? dictionary with available methods in statsmodels.stats.multitest.multipletests: +# multitest.multitest_methods_names -def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"): + +def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferroni"): """ - Performs p-value correction using the specified method. For more information visit https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html. + Performs p-value correction using the specified method as in + statsmodels.stats.multitest.multipletests. + + For more information visit + https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html. :param numpy.ndarray pvalues: et of p-values of the individual tests. :param float alpha: error rate. @@ -23,7 +30,9 @@ def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"): - fdr_by : Benjamini/Yekutieli (negative) - fdr_tsbh : two stage fdr correction (non-negative) - fdr_tsbky : two stage fdr correction (non-negative) - :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value. + :return: Tuple with two `numpy.array`s, boolen for rejecting H0 hypothesis + and float for adjusted p-value. Can contain missing values if `pvalues` + contain missing values. Exmaple:: @@ -32,10 +41,12 @@ def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"): p = np.array(pvalues) mask = np.isfinite(p) pval_corrected = np.full(p.shape, np.nan) - pval_corrected[mask] = multitest.multipletests(p[mask], alpha, method)[1] - rejected = [p <= alpha for p in pval_corrected] + _rejected, _pvals_corrected, _, _ = multitest.multipletests(p[mask], alpha, method) + pval_corrected[mask] = _pvals_corrected + rejected = np.full(p.shape, np.nan) + rejected[mask] = _rejected - return (rejected, pval_corrected.tolist()) + return (rejected, pval_corrected) def apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method="indep"): diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index af89143..d7dada2 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -174,8 +174,7 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: ret = acore.enrichment_analysis.run_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, - # group_col="annotation", - groups=["groups1", "groups2"], + correction_alpha=0.01, ) ret diff --git a/tests/test_multiple_testing.py b/tests/test_multiple_testing.py new file mode 100644 index 0000000..0f339bc --- /dev/null +++ b/tests/test_multiple_testing.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + +from acore.multiple_testing import apply_pvalue_correction + +# benferoni correction +array = [ + 0.1, + 0.01, + 0.2, + 0.4, + np.nan, + 0.5, +] # 0.04, 0.02, 0.0001] + +test_res = [ + ( + "b", + np.array([0.0, 1.0, 0.0, 0.0, np.nan, 0.0]), + np.array([0.5, 0.05, 1.0, 1.0, np.nan, 1.0]), + ), + ( + "fdr_bh", + np.array( + [ + 0.0, + 1.0, + 0.0, + 0.0, + np.nan, + 0.0, + ] + ), + np.array([0.25, 0.05, 0.33333333, 0.5, np.nan, 0.5]), + ), +] + + +@pytest.mark.parametrize("method,exp_rejected,exp_pvalues", test_res) +def test_apply_pvalue_correction_alpha_5_percent(method, exp_rejected, exp_pvalues): + act_rejected, act_pvalues = apply_pvalue_correction( + array, alpha=0.05, method=method + ) + np.testing.assert_array_equal(act_rejected, exp_rejected) + np.testing.assert_array_almost_equal(act_pvalues, exp_pvalues) From 301504560aa01cf9e5fc7c78c19a7a11c25c0420 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 16:05:18 +0100 Subject: [PATCH 09/40] :zap: just work with numpy array, not lists --- acore/enrichment_analysis.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 7d113c5..6e352d6 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -281,23 +281,22 @@ def run_regulation_enrichment( method='fisher', ) """ - foreground_list = ( - regulation_data[regulation_data[reject_col]][identifier].unique().tolist() - ) - background_list = ( - regulation_data[~regulation_data[reject_col]][identifier].unique().tolist() - ) + mask_rejected = regulation_data[reject_col] + foreground_list = regulation_data.loc[mask_rejected, identifier].unique() + background_list = regulation_data.loc[~mask_rejected, identifier].unique() foreground_pop = len(foreground_list) - background_pop = len(regulation_data[identifier].unique().tolist()) - grouping = [] - for i in annotation[identifier]: - if i in foreground_list: - grouping.append("foreground") - elif i in background_list: - grouping.append("background") - else: - grouping.append(np.nan) - annotation[group_col] = grouping + background_pop = len(regulation_data[identifier].unique()) + # needs to allow for missing annotations + annotation[group_col] = np.where( + annotation[identifier].isin(foreground_list), + "foreground", + # if in background list, then background, else np.nan + np.where( + annotation[identifier].isin(background_list), + "background", + np.nan, + ), + ) annotation = annotation.dropna(subset=[group_col]) result = run_enrichment( From 13f45e9ccad61649714b40600d595235a7a5e320 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 16:06:41 +0100 Subject: [PATCH 10/40] :bug: remove unused parameter - maybe it was there for compability, but then we could add kwargs to the function to catch any other arguments (without using these) --- acore/enrichment_analysis.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 6e352d6..405034f 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -240,7 +240,6 @@ def run_regulation_enrichment( regulation_data: pd.DataFrame, annotation: pd.DataFrame, identifier: str = "identifier", - groups: list[str] = ["group1", "group2"], annotation_col: str = "annotation", reject_col: str = "rejected", group_col: str = "group", @@ -256,7 +255,6 @@ def run_regulation_enrichment( :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. - :param list[str] groups: column names from regulation_data containing group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis. From 3961d84ccba6c720dab3297d48597f54949fa97d Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 16:18:46 +0100 Subject: [PATCH 11/40] :bug: correct name of testing fuction (to fct tested) --- tests/test_enrichment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py index bc463f1..e8d761c 100644 --- a/tests/test_enrichment.py +++ b/tests/test_enrichment.py @@ -38,7 +38,9 @@ def test_run_kolmogorov_smirnov(self): self.assertEqual(result[1], expected_result.pvalue) -def test_run_enrichtment(): +def test_run_regulation_enrichment(): + """Integration test for run_regulation_enrichment. Indirectly tests + run_enrichment from enrichment_analysis module.""" annotation = { "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"], "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"], From 7f51e02aec9a6adab9a737f64e0c0c3643542fd2 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 25 Nov 2024 16:20:52 +0100 Subject: [PATCH 12/40] :constructions: add enrichtment example to docs --- docs/api_examples/enrichment_analysis.ipynb | 331 ++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 332 insertions(+) create mode 100644 docs/api_examples/enrichment_analysis.ipynb diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb new file mode 100644 index 0000000..9a547ba --- /dev/null +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f79a8051", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Enrichment analysis\n", + "\n", + "- we need some groups of genes to compute clusters\n", + "- we need functional annotations, i.e. a category summarizing a set of genes.\n", + "-\n", + "You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis\n", + "on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o).\n", + "\n", + "Use example data for ovarian cancer\n", + "([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "956ed7b7", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "%pip install acore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3030d08", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "\n", + "import acore\n", + "import acore.differential_regulation\n", + "import acore.enrichment_analysis" + ] + }, + { + "cell_type": "markdown", + "id": "fddd607c", + "metadata": {}, + "source": [ + "Parameters of this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6af9349a", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "base_path: str = (\n", + " \"https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/\"\n", + " \"example_data/PXD010372/processed\"\n", + ")\n", + "omics: str = f\"{base_path}/omics.csv\"\n", + "meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n", + "meta: str = f\"{base_path}/meta_patients.csv\"\n", + "N_to_sample: int = 1_000" + ] + }, + { + "cell_type": "markdown", + "id": "10ed1830", + "metadata": {}, + "source": [ + "# Load processed data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d70ef4c", + "metadata": {}, + "outputs": [], + "source": [ + "df_omics = pd.read_csv(omics, index_col=0)\n", + "df_meta_pgs = pd.read_csv(meta_pgs, index_col=0)\n", + "df_meta = pd.read_csv(meta, index_col=0)\n", + "df_omics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3897e1f", + "metadata": {}, + "outputs": [], + "source": [ + "df_omics.notna().sum().sort_values(ascending=True).plot()" + ] + }, + { + "cell_type": "markdown", + "id": "8ce47108", + "metadata": {}, + "source": [ + "Keep only features with a certain amount of non-NaN values and select 100 of these\n", + "for illustration. Add the ones which were differently regulated in the ANOVA using all\n", + "the protein groups." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3a8ab49", + "metadata": {}, + "outputs": [], + "source": [ + "idx_always_included = [\"Q5HYN5\", \"P39059\", \"O43432\", \"O43175\"]\n", + "df_omics[idx_always_included]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1145a2cd", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "df_omics = (\n", + " df_omics\n", + " # .dropna(axis=1)\n", + " .drop(idx_always_included, axis=1)\n", + " .dropna(thresh=18, axis=1)\n", + " .sample(\n", + " N_to_sample - len(idx_always_included),\n", + " axis=1,\n", + " random_state=42,\n", + " )\n", + " .join(df_omics[idx_always_included])\n", + ")\n", + "df_omics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea77e80", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "df_meta" + ] + }, + { + "cell_type": "markdown", + "id": "4bbf5dc4", + "metadata": {}, + "source": [ + "## Compute up and downregulated genes\n", + "These will be used to find enrichments in the set of both up and downregulated genes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "231bb6da", + "metadata": {}, + "outputs": [], + "source": [ + "group = \"Status\"\n", + "covariates = [\"PlatinumValue\"]\n", + "diff_reg = acore.differential_regulation.run_anova(\n", + " df_omics.join(df_meta[[group]]),\n", + " drop_cols=[],\n", + " subject=None,\n", + " group=group,\n", + ")\n", + "diff_reg.describe(exclude=[\"float\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e347b06", + "metadata": {}, + "outputs": [], + "source": [ + "diff_reg.query(\"rejected == True\")" + ] + }, + { + "cell_type": "markdown", + "id": "d6c0a225", + "metadata": {}, + "source": [ + "## Find functional annotations, here pathways\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2668415", + "metadata": {}, + "outputs": [], + "source": [ + "from acore.io.uniprot import (\n", + " check_id_mapping_results_ready,\n", + " get_id_mapping_results_link,\n", + " get_id_mapping_results_search,\n", + " submit_id_mapping,\n", + ")\n", + "\n", + "\n", + "def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n", + " \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n", + " knowledgebase (KB).\n", + "\n", + " Parameters\n", + " ----------\n", + " ids : pd.Index | list\n", + " Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.\n", + " fields : str, optional\n", + " Fields to fetch, by default \"accession,go_p,go_c. See for availble fields:\n", + " https://www.uniprot.org/help/return_fields\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " DataFrame with annotations of the UniProt IDs.\n", + " \"\"\"\n", + " job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n", + "\n", + " if check_id_mapping_results_ready(job_id):\n", + " link = get_id_mapping_results_link(job_id)\n", + " # add fields to the link to get more information\n", + " # From and Entry (accession) are the same for UniProt IDs.\n", + " results = get_id_mapping_results_search(\n", + " link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n", + " )\n", + " header = results.pop(0).split(\"\\t\")\n", + " results = [line.split(\"\\t\") for line in results]\n", + " df = pd.DataFrame(results, columns=header)\n", + " return df\n", + "\n", + "\n", + "fname_annotations = \"downloaded/annotations.csv\"\n", + "fname = Path(fname_annotations)\n", + "try:\n", + " annotations = pd.read_csv(fname, index_col=0)\n", + " print(f\"Loaded annotations from {fname}\")\n", + "except FileNotFoundError:\n", + " print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n", + " annotations = fetch_annotations(df_omics.columns)\n", + " annotations = (\n", + " annotations.set_index(\"Entry\")\n", + " .rename_axis(\"identifier\")\n", + " .drop(\"From\", axis=1)\n", + " .rename_axis(\"source\", axis=1)\n", + " .stack()\n", + " .to_frame(\"annotation\")\n", + " .replace(\"\", pd.NA)\n", + " .dropna()\n", + " .sort_values([\"source\", \"annotation\"])\n", + " .reset_index()\n", + " )\n", + " fname.parent.mkdir(exist_ok=True, parents=True)\n", + " annotations.to_csv(fname, index=True)\n", + "\n", + "annotations" + ] + }, + { + "cell_type": "markdown", + "id": "4165bc94", + "metadata": {}, + "source": [ + "## Enrichment analysis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f300c5b5", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "ret = acore.enrichment_analysis.run_regulation_enrichment(\n", + " regulation_data=diff_reg,\n", + " annotation=annotations,\n", + " correction_alpha=0.01,\n", + ")\n", + "ret" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dd57b99", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "tags,-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/index.rst b/docs/index.rst index 91a92e6..eba7982 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,6 +16,7 @@ api_examples/exploratory_analysis api_examples/normalization_analysis + api_examples/enrichment_analysis .. toctree:: :maxdepth: 1 From e63f9379d58b0630fd03f7cbc82b519b89153eb4 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 26 Nov 2024 10:42:22 +0100 Subject: [PATCH 13/40] :bug::sparkles: cast rejected to bool, annotate features using pandas functionality --- acore/enrichment_analysis.py | 44 ++++++++++++++++++++++++++++-------- tests/test_enrichment.py | 27 +++++++++++++++++++--- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 405034f..259fe73 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -236,6 +236,35 @@ def run_up_down_regulation_enrichment( return enrichment_results +# ! to move +def _annotate_features( + features: pd.Series, in_foreground: list[str], in_background: list[str] +) -> pd.Series: + """ + Annotate features as foreground or background based on their presence in the + foreground and background lists. + + :param features: pandas dataframe with features and their annotations. + :param in_foreground: list of features identifiers in the foreground. + :type in_foreground: set or list-like + :param in_background: list of features identifiers in the background. + :type in_background: set or list-like + :return: pandas dataframe with a new column 'group' containing 'foreground' or 'background'. + missing values are preserved. + + Example:: + + result = _annotate_features(features, in_foreground, in_background) + """ + in_either_or = features.isin(in_foreground) | features.isin(in_background) + res = ( + features.where(in_either_or, np.nan) + .mask(features.isin(in_foreground), "foreground") + .mask(features.isin(in_background), "background") + ) + return res + + def run_regulation_enrichment( regulation_data: pd.DataFrame, annotation: pd.DataFrame, @@ -285,15 +314,10 @@ def run_regulation_enrichment( foreground_pop = len(foreground_list) background_pop = len(regulation_data[identifier].unique()) # needs to allow for missing annotations - annotation[group_col] = np.where( - annotation[identifier].isin(foreground_list), - "foreground", - # if in background list, then background, else np.nan - np.where( - annotation[identifier].isin(background_list), - "background", - np.nan, - ), + annotation[group_col] = _annotate_features( + features=annotation[identifier], + in_foreground=foreground_list, + in_background=background_list, ) annotation = annotation.dropna(subset=[group_col]) @@ -425,7 +449,7 @@ def run_enrichment( "background_pop": background_pop, "pvalue": pvalues, "padj": padj, - "rejected": rejected, + "rejected": rejected.astype(bool), } ) result = result.sort_values(by="padj", ascending=True) diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py index e8d761c..75dbb86 100644 --- a/tests/test_enrichment.py +++ b/tests/test_enrichment.py @@ -1,5 +1,6 @@ import unittest +import numpy as np import pandas as pd from scipy import stats @@ -38,13 +39,33 @@ def test_run_kolmogorov_smirnov(self): self.assertEqual(result[1], expected_result.pvalue) +def test__annotate_features(): + expected = pd.Series( + [ + "foreground", + "foreground", + "background", + "foreground", + "background", + "background", + np.nan, + ] + ) + + features = pd.Series(["G1", "G2", "G3", "G4", "G5", "G6", "G9"]) + in_foreground = ["G1", "G2", "G4"] + in_background = ["G3", "G5", "G6"] + actual = ea._annotate_features(features, in_foreground, in_background) + pd.testing.assert_series_equal(expected, actual) + + def test_run_regulation_enrichment(): """Integration test for run_regulation_enrichment. Indirectly tests run_enrichment from enrichment_analysis module.""" annotation = { - "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"], - "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"], - "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P"], + "annotation": ["path1", "path1", "path1", "path2", "path2", "path3", "path3"], + "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6", "gene9"], + "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P", "GO_P"], } annotation = pd.DataFrame(annotation) regulation_res = { From 4adc62420d2bd171470f3a8aed8ca0c565181dc1 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 26 Nov 2024 10:18:31 +0000 Subject: [PATCH 14/40] :bug: newer versions of numpy need explicit type - NA can lead to float columns instead of 0. to investigate --- acore/enrichment_analysis.py | 3 ++- docs/api_examples/enrichment_analysis.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 259fe73..f8ac681 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -308,7 +308,8 @@ def run_regulation_enrichment( method='fisher', ) """ - mask_rejected = regulation_data[reject_col] + # ? can we remove NA features in that column? + mask_rejected = regulation_data[reject_col].astype(bool) foreground_list = regulation_data.loc[mask_rejected, identifier].unique() background_list = regulation_data.loc[~mask_rejected, identifier].unique() foreground_pop = len(foreground_list) diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index d7dada2..598a12a 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -94,6 +94,7 @@ diff_reg.describe(exclude=["float"]) # %% +diff_reg["rejected"] = diff_reg["rejected"].astype(bool) # ! needs to be fixed in anova diff_reg.query("rejected == True") # %% [markdown] From 6a7db73925f7966b781acb1a455cab1900a80b79 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 26 Nov 2024 16:30:47 +0100 Subject: [PATCH 15/40] :construction: annotate and use some more formatting option --- acore/enrichment_analysis.py | 64 ++++++++++++------------ docs/api_examples/enrichment_analysis.py | 3 ++ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index f8ac681..ae76202 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -1,5 +1,8 @@ """Enrichment Analysis Module. Contains different functions to perform enrichment analysis. + +Most things in this module are covered in https://www.youtube.com/watch?v=2NC1QOXmc5o +by Lars Juhl Jensen. """ import os @@ -461,14 +464,14 @@ def run_enrichment( def run_ssgsea( data, annotation, - annotation_col="an notation", - identifier_col="identifier", - set_index=[], - outdir="tmp", - min_size=15, - max_size=500, - scale=False, - permutations=0, + set_index: list[str], + annotation_col: str = "an notation", + identifier_col: str = "identifier", + outdir: str = "tmp", + min_size: int = 15, + max_size: int = 500, + scale: bool = False, + permutations: int = 0, ): """ Project each sample within a data set onto a space of gene set enrichment scores using @@ -531,10 +534,13 @@ def run_ssgsea( if not os.path.exists(outdir): os.makedirs(outdir) + # Comine columns to create a unique name for each set (?) name = [] index = data[set_index] for i, row in data[set_index].iterrows(): - name.append("_".join(row[set_index].tolist())) + name.append( + "_".join(row[set_index].tolist()) + ) # this assumes strings as identifiers df["Name"] = name index.index = name @@ -554,29 +560,25 @@ def run_ssgsea( + "\t".join(list(filter(None, row[identifier_col]))) + "\n" ) - try: - enrichment = gp.ssgsea( - data=df, - gene_sets=str(file_path), - outdir=outdir, - min_size=min_size, - max_size=max_size, - scale=scale, - permutation_num=permutations, - no_plot=True, - processes=1, - seed=10, - format="png", - ) + enrichment = gp.ssgsea( + data=df, + gene_sets=str(file_path), + outdir=outdir, + min_size=min_size, + max_size=max_size, + scale=scale, + permutation_num=permutations, + no_plot=True, + processes=1, + seed=10, + format="png", + ) - enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose() - enrichment_es = enrichment_es.join(index) - enrichment_nes = enrichment.res2d.transpose() - enrichment_nes = enrichment_nes.join(index) + enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose() + enrichment_es = enrichment_es.join(index) + enrichment_nes = enrichment.res2d.transpose() + enrichment_nes = enrichment_nes.join(index) - result = {"es": enrichment_es, "nes": enrichment_nes} - except Exception as e: - print("Error in ssGSEA.", e) - df = None + result = {"es": enrichment_es, "nes": enrichment_nes} return result diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 598a12a..13d0f1f 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -18,12 +18,15 @@ # %% from pathlib import Path +import dsp_pandas import pandas as pd import acore import acore.differential_regulation import acore.enrichment_analysis +dsp_pandas.format.set_pandas_options(max_colwidth=15) + # %% [markdown] # Parameters of this notebook From 142b7787682793c819ec7c048d530bed32347ddf Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 2 Dec 2024 22:47:05 +0100 Subject: [PATCH 16/40] :art: pass on parameter for min set, docstrings resolve things discussed with Alberto. --- acore/enrichment_analysis.py | 8 +++++--- tests/test_enrichment.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index ae76202..ba31d62 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -247,12 +247,12 @@ def _annotate_features( Annotate features as foreground or background based on their presence in the foreground and background lists. - :param features: pandas dataframe with features and their annotations. + :param features: pandas.Series with features and their annotations. :param in_foreground: list of features identifiers in the foreground. :type in_foreground: set or list-like :param in_background: list of features identifiers in the background. :type in_background: set or list-like - :return: pandas dataframe with a new column 'group' containing 'foreground' or 'background'. + :return: pandas.Series containing 'foreground' or 'background'. missing values are preserved. Example:: @@ -276,6 +276,7 @@ def run_regulation_enrichment( reject_col: str = "rejected", group_col: str = "group", method: str = "fisher", + min_detected_in_set: int = 2, correction: str = "fdr_bh", correction_alpha: float = 0.05, ) -> pd.DataFrame: @@ -336,6 +337,7 @@ def run_regulation_enrichment( identifier_col=identifier, method=method, correction=correction, + min_detected_in_set=min_detected_in_set, correction_alpha=correction_alpha, ) @@ -348,7 +350,7 @@ def run_enrichment( background_id: str, foreground_pop: int, background_pop: int, - min_detected_in_set: int = 1, + min_detected_in_set: int = 2, annotation_col: str = "annotation", group_col: str = "group", identifier_col: str = "identifier", diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py index 75dbb86..6c0930a 100644 --- a/tests/test_enrichment.py +++ b/tests/test_enrichment.py @@ -77,6 +77,7 @@ def test_run_regulation_enrichment(): actual = ea.run_regulation_enrichment( regulation_data=regulation_res, annotation=annotation, + min_detected_in_set=1, ) expected = pd.DataFrame( From 27c6e3800a66ca85407f585231add2bc593b2005 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 3 Dec 2024 12:59:07 +0100 Subject: [PATCH 17/40] :art: improve visibility of data and inspect annotations --- acore/enrichment_analysis.py | 6 ++--- docs/api_examples/enrichment_analysis.ipynb | 30 ++++++++++++++++++++- docs/api_examples/enrichment_analysis.py | 12 +++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index ba31d62..6c896cb 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -211,7 +211,6 @@ def run_up_down_regulation_enrichment( df, annotation, identifier=identifier, - groups=groups, annotation_col=annotation_col, reject_col="up_pairwise_regulation", group_col=group_col, @@ -224,7 +223,6 @@ def run_up_down_regulation_enrichment( df, annotation, identifier=identifier, - groups=groups, annotation_col=annotation_col, reject_col="down_pairwise_regulation", group_col=group_col, @@ -241,7 +239,9 @@ def run_up_down_regulation_enrichment( # ! to move def _annotate_features( - features: pd.Series, in_foreground: list[str], in_background: list[str] + features: pd.Series, + in_foreground: set[str] | list[str], + in_background: set[str] | list[str], ) -> pd.Series: """ Annotate features as foreground or background based on their presence in the diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 9a547ba..93ff6ee 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -43,11 +43,14 @@ "source": [ "from pathlib import Path\n", "\n", + "import dsp_pandas\n", "import pandas as pd\n", "\n", "import acore\n", "import acore.differential_regulation\n", - "import acore.enrichment_analysis" + "import acore.enrichment_analysis\n", + "\n", + "dsp_pandas.format.set_pandas_options(max_colwidth=15)" ] }, { @@ -201,6 +204,7 @@ "metadata": {}, "outputs": [], "source": [ + "diff_reg[\"rejected\"] = diff_reg[\"rejected\"].astype(bool) # ! needs to be fixed in anova\n", "diff_reg.query(\"rejected == True\")" ] }, @@ -285,6 +289,30 @@ "annotations" ] }, + { + "cell_type": "markdown", + "id": "d4734452", + "metadata": {}, + "source": [ + "See how many protein groups are associated with each annotation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57fddefb", + "metadata": {}, + "outputs": [], + "source": [ + "_ = (\n", + " annotations.groupby(\"annotation\")\n", + " .size()\n", + " .value_counts()\n", + " .sort_index()\n", + " .plot(kind=\"bar\")\n", + ")" + ] + }, { "cell_type": "markdown", "id": "4165bc94", diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 13d0f1f..00115c9 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -170,6 +170,18 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: annotations +# %% [markdown] +# See how many protein groups are associated with each annotation. + +# %% +_ = ( + annotations.groupby("annotation") + .size() + .value_counts() + .sort_index() + .plot(kind="bar") +) + # %% [markdown] # ## Enrichment analysis # From 8ed860c98d4e708119ed8b5c2db58c17d2f57a4a Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 3 Dec 2024 14:31:42 +0100 Subject: [PATCH 18/40] :bug: make it Python 3.9 compatible --- acore/enrichment_analysis.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 6c896cb..de20275 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -5,6 +5,8 @@ by Lars Juhl Jensen. """ +from __future__ import annotations + import os import re import uuid From ec10c71f42992bfb83266698d31e4c430b7a12c6 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 3 Dec 2024 16:47:32 +0100 Subject: [PATCH 19/40] :art: type annotations and docstring updates - prepare to defien a common return type by specifying message TYPE_COLS_MSG --- acore/enrichment_analysis.py | 87 ++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index de20275..e8ee6ad 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -18,8 +18,17 @@ from acore.multiple_testing import apply_pvalue_correction +TYPE_COLS_MSG = """ +columns: 'terms', 'identifiers', 'foreground', + 'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'. +""" + -def run_fisher(group1, group2, alternative="two-sided"): +def run_fisher( + group1: list[int], + group2: list[int], + alternative="two-sided", +) -> tuple[float, float]: """annotated not-annotated group1 a b group2 c d @@ -66,7 +75,7 @@ def run_site_regulation_enrichment( regulation_data, annotation, identifier="identifier", - groups=["group1", "group2"], + groups=("group1", "group2"), annotation_col="annotation", reject_col="rejected", group_col="group", @@ -78,9 +87,9 @@ def run_site_regulation_enrichment( This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. - :param regulation_data: pandas dataframe resulting from differential + :param regulation_data: pandas.DataFrame resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features + :param annotation: pandas.DataFrame with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. @@ -95,8 +104,8 @@ def run_site_regulation_enrichment( :param str method: method used to compute enrichment (only 'fisher' is supported currently). :param str regex: how to extract the annotated identifier from the site identifier - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', - 'background', 'pvalue', 'padj' and 'rejected'. + :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', + 'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'. Example:: @@ -108,7 +117,7 @@ def run_site_regulation_enrichment( reject_col='rejected', group_col='group', method='fisher', - match="(\w+~.+)_\w\d+\-\w+" + match="(\\w+~.+)_\\w\\d+\\-\\w+" ) """ result = pd.DataFrame() @@ -142,9 +151,9 @@ def run_up_down_regulation_enrichment( regulation_data, annotation, identifier="identifier", - groups=["group1", "group2"], + groups=("group1", "group2"), annotation_col="annotation", - reject_col="rejected", + # reject_col="rejected", group_col="group", method="fisher", correction="fdr_bh", @@ -155,9 +164,9 @@ def run_up_down_regulation_enrichment( This function runs a simple enrichment analysis for significantly regulated proteins distinguishing between up- and down-regulated. - :param regulation_data: pandas dataframe resulting from differential regulation + :param regulation_data: pandas.DataFrame resulting from differential regulation analysis (CKG's regulation table). - :param annotation: pandas dataframe with annotations for features + :param annotation: pandas.DataFrame with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. :param list groups: column names from regulation_data containing group identifiers. @@ -171,7 +180,7 @@ def run_up_down_regulation_enrichment( :param str correction: method to be used for multiple-testing correction :param float alpha: adjusted p-value cutoff to define significance :param float lfc_cutoff: log fold-change cutoff to define practical significance - :return: Pandas dataframe with columns:'terms', 'identifiers', 'foreground', + :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'. Example:: @@ -286,8 +295,8 @@ def run_regulation_enrichment( This function runs a simple enrichment analysis for significantly regulated features in a dataset. - :param regulation_data: pandas dataframe resulting from differential regulation analysis. - :param annotation: pandas dataframe with annotations for features + :param regulation_data: pandas.DataFrame resulting from differential regulation analysis. + :param annotation: pandas.DataFrame with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. @@ -297,8 +306,8 @@ def run_regulation_enrichment( if feature belongs to foreground or background. :param str method: method used to compute enrichment (only 'fisher' is supported currently). :param str correction: method to be used for multiple-testing correction - :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', - 'background', 'pvalue', 'padj' and 'rejected'. + :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', + 'background', 'foreground_pop', 'background_pop', 'pvalue', 'padj' and 'rejected'. Example:: @@ -306,12 +315,13 @@ def run_regulation_enrichment( regulation_data, annotation, identifier='identifier', - groups=['group1', - 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', + min_detected_in_set=2, + correction='fdr_bh', + correction_alpha=0.05, ) """ # ? can we remove NA features in that column? @@ -364,8 +374,8 @@ def run_enrichment( Computes enrichment of the foreground relative to a given backgroung, using Fisher's exact test, and corrects for multiple hypothesis testing. - :param data: pandas dataframe with annotations for dataset features - (columns: 'annotation', 'identifier', 'source', 'group'). + :param data: pandas.DataFrame with annotations for dataset features + (columns: 'annotation', 'identifier', 'group'). :param str foreground_id: group identifier of features that belong to the foreground. :param str background_id: group identifier of features that belong to the background. :param int foreground_pop: number of features in the foreground. @@ -376,7 +386,7 @@ def run_enrichment( :param str method: method used to compute enrichment (only 'fisher' is supported currently). :param str correction: method to be used for multiple-testing correction. :param float correction_alpha: adjusted p-value cutoff to define significance. - :return: Pandas dataframe with annotation terms, features, + :return: pandas.DataFrame with columns: annotation terms, features, number of foregroung/background features in each term, p-values and corrected p-values (columns: 'terms', 'identifiers', 'foreground', @@ -396,34 +406,35 @@ def run_enrichment( method='fisher', ) """ + if method != "fisher": + raise ValueError("Only Fisher's exact test is supported at the moment.") + result = pd.DataFrame() - df = data.copy() terms = [] ids = [] pvalues = [] fnum = [] bnum = [] countsdf = ( - df.groupby([annotation_col, group_col]) + data.groupby([annotation_col, group_col]) .agg(["count"])[(identifier_col, "count")] .reset_index() ) countsdf.columns = [annotation_col, group_col, "count"] - for annotation in ( - countsdf[countsdf[group_col] == foreground_id][annotation_col].unique().tolist() - ): + for annotation in countsdf.loc[ + countsdf[group_col] == foreground_id, annotation_col + ].unique(): counts = countsdf[countsdf[annotation_col] == annotation] num_foreground = counts.loc[counts[group_col] == foreground_id, "count"].values num_background = counts.loc[counts[group_col] == background_id, "count"].values - + # ! counts should always be of length one count? squeeze? if len(num_foreground) == 1: num_foreground = num_foreground[0] if len(num_background) == 1: num_background = num_background[0] else: num_background = 0 - # ! what happens if this is not the case? - if method == "fisher" and num_foreground >= min_detected_in_set: + if num_foreground >= min_detected_in_set: _, pvalue = run_fisher( [num_foreground, foreground_pop - num_foreground], [num_background, background_pop - foreground_pop - num_background], @@ -434,11 +445,11 @@ def run_enrichment( pvalues.append(pvalue) ids.append( ",".join( - df.loc[ - (df[annotation_col] == annotation) - & (df[group_col] == foreground_id), + data.loc[ + (data[annotation_col] == annotation) + & (data[group_col] == foreground_id), identifier_col, - ].tolist() + ] ) ) if len(pvalues) > 1: @@ -466,8 +477,8 @@ def run_enrichment( def run_ssgsea( - data, - annotation, + data: pd.DataFrame, + annotation: str, set_index: list[str], annotation_col: str = "an notation", identifier_col: str = "identifier", @@ -483,8 +494,8 @@ def run_ssgsea( described in Barbie et al., 2009: https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA). - :param data: pandas dataframe with the quantified features (i.e. subject x proteins) - :param annotation: pandas dataframe with the annotation to be used in the enrichment + :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins) + :param annotation: pandas.DataFrame with the annotation to be used in the enrichment (i.e. CKG pathway annotation file) :param str annotation_col: name of the column containing annotation terms. :param str identifier_col: name of column containing dependent variables identifiers. @@ -556,7 +567,7 @@ def run_ssgsea( ) fid = uuid.uuid4() file_path = os.path.join(outdir, str(fid) + ".gmt") - with open(file_path, "w") as out: + with open(file_path, "w", encoding="utf8") as out: for i, row in grouped_annotations.iterrows(): out.write( row[annotation_col] From 9c96af7c677bd13a7e58150fb16f3da76bca2887 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 13:22:07 +0100 Subject: [PATCH 20/40] :art: docstrings fmt --- acore/enrichment_analysis.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index e8ee6ad..0c2a2ed 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -27,7 +27,7 @@ def run_fisher( group1: list[int], group2: list[int], - alternative="two-sided", + alternative: str = "two-sided", ) -> tuple[float, float]: """annotated not-annotated group1 a b @@ -83,7 +83,7 @@ def run_site_regulation_enrichment( regex=r"(\w+~.+)_\w\d+\-\w+", # ! add example to docstring of what this matches correction="fdr_bh", ): - """ + r""" This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. @@ -117,7 +117,7 @@ def run_site_regulation_enrichment( reject_col='rejected', group_col='group', method='fisher', - match="(\\w+~.+)_\\w\\d+\\-\\w+" + match="(\w+~.+)_\w\d+\-\w+" ) """ result = pd.DataFrame() @@ -357,7 +357,7 @@ def run_regulation_enrichment( def run_enrichment( - data, + data: pd.DataFrame, foreground_id: str, background_id: str, foreground_pop: int, @@ -390,7 +390,7 @@ def run_enrichment( number of foregroung/background features in each term, p-values and corrected p-values (columns: 'terms', 'identifiers', 'foreground', - 'background', 'pvalue', 'padj' and 'rejected'). + 'background', 'pvalue', 'padj' and 'rejected'). Example:: @@ -514,6 +514,7 @@ def run_ssgsea( and nes - normalized enrichment scores. Example:: + stproject = "P0000008" p = project.Project( stproject, @@ -542,7 +543,6 @@ def run_ssgsea( scale=False, permutations=0 ) - """ result = {} df = data.copy() From a6f21c90bb58a8c8345bed059b5a3552e1537e3e Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 14:29:10 +0100 Subject: [PATCH 21/40] :art: more docstring updates --- acore/enrichment_analysis.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 0c2a2ed..d925a51 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -29,15 +29,18 @@ def run_fisher( group2: list[int], alternative: str = "two-sided", ) -> tuple[float, float]: - """annotated not-annotated - group1 a b - group2 c d - ------------------------------------ + """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`. + + Example:: + + # annotated not-annotated + # group1 a b + # group2 c d - group1 = [a, b] - group2 = [c, d] - odds, pvalue = stats.fisher_exact([[a, b], [c, d]]) + odds, pvalue = stats.fisher_exact(group1=[a, b], + group2 =[c, d] + ) """ odds, pvalue = stats.fisher_exact([group1, group2], alternative) @@ -80,10 +83,10 @@ def run_site_regulation_enrichment( reject_col="rejected", group_col="group", method="fisher", - regex=r"(\w+~.+)_\w\d+\-\w+", # ! add example to docstring of what this matches + regex="(\\w+~.+)_\\w\\d+\\-\\w+", # ! add example to docstring of what kind of string this matches correction="fdr_bh", ): - r""" + """ This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. @@ -97,7 +100,7 @@ def run_site_regulation_enrichment( group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing + :param str reject_col: name of the column from regulation_data containing boolean for rejected null hypothesis. :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. @@ -117,7 +120,7 @@ def run_site_regulation_enrichment( reject_col='rejected', group_col='group', method='fisher', - match="(\w+~.+)_\w\d+\-\w+" + match="(\\w+~.+)_\\w\\d+\\-\\w+" ) """ result = pd.DataFrame() @@ -300,7 +303,7 @@ def run_regulation_enrichment( (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulatio_data containing boolean for + :param str reject_col: name of the column from `regulation_data` containing boolean for rejected null hypothesis. :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. @@ -487,7 +490,7 @@ def run_ssgsea( max_size: int = 500, scale: bool = False, permutations: int = 0, -): +) -> dict[str, pd.DataFrame]: """ Project each sample within a data set onto a space of gene set enrichment scores using the single sample gene set enrichment analysis (ssGSEA) projection methodology From 4d98dd51900f57025ed3c2dd50c8ffeb34f0cde6 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 14:59:58 +0100 Subject: [PATCH 22/40] :bug: regex in parameter name vs interpreded in docstring sphinx deals differently with parsing the strings... --- acore/enrichment_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index d925a51..c0bc0c3 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -86,7 +86,7 @@ def run_site_regulation_enrichment( regex="(\\w+~.+)_\\w\\d+\\-\\w+", # ! add example to docstring of what kind of string this matches correction="fdr_bh", ): - """ + r""" This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset. From 9f4d50d9cc2a6277fc8e16269fa0f52a708b5f3a Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 15:40:26 +0100 Subject: [PATCH 23/40] :art: docstrings: test if listing works with blank lines, write Example correctly otherwise numpy notation should work fine --- acore/multiple_testing.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py index bf22e6b..07bc19c 100644 --- a/acore/multiple_testing.py +++ b/acore/multiple_testing.py @@ -20,21 +20,23 @@ def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferr :param numpy.ndarray pvalues: et of p-values of the individual tests. :param float alpha: error rate. :param str method: method of p-value correction: - - bonferroni : one-step correction - - sidak : one-step correction - - holm-sidak : step down method using Sidak adjustments - - holm : step-down method using Bonferroni adjustments - - simes-hochberg : step-up method (independent) - - hommel : closed method based on Simes tests (non-negative) - - fdr_bh : Benjamini/Hochberg (non-negative) - - fdr_by : Benjamini/Yekutieli (negative) - - fdr_tsbh : two stage fdr correction (non-negative) - - fdr_tsbky : two stage fdr correction (non-negative) + + - 'bonferroni' : one-step correction + - 'sidak' : one-step correction + - 'holm-sidak' : step down method using Sidak adjustments + - 'holm' : step-down method using Bonferroni adjustments + - 'simes-hochberg' : step-up method (independent) + - 'hommel' : closed method based on Simes tests (non-negative) + - 'fdr_bh' : Benjamini/Hochberg (non-negative) + - 'fdr_by' : Benjamini/Yekutieli (negative) + - 'fdr_tsbh' : two stage fdr correction (non-negative) + - 'fdr_tsbky' : two stage fdr correction (non-negative) + :return: Tuple with two `numpy.array`s, boolen for rejecting H0 hypothesis and float for adjusted p-value. Can contain missing values if `pvalues` contain missing values. - Exmaple:: + Example:: result = apply_pvalue_correction(pvalues, alpha=0.05, method='bonferroni') """ @@ -58,7 +60,7 @@ def apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method="indep"): :param str method: method of p-value correction ('indep', 'negcorr'). :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value. - Exmaple:: + Example:: result = apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method='indep') """ @@ -76,7 +78,7 @@ def apply_pvalue_twostage_fdrcorrection(pvalues, alpha=0.05, method="bh"): :param str method: method of p-value correction ('bky', 'bh'). :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value. - Exmaple:: + Example:: result = apply_pvalue_twostage_fdrcorrection(pvalues, alpha=0.05, method='bh') """ From bc5ab2c72c5769f94dd58157c95d6776a274d3cf Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 15:59:44 +0100 Subject: [PATCH 24/40] :art: rename reject_col to rejected_col after typing it wrongly several times - default parameter is rejected, so it is easier to not mistype parameter name --- acore/enrichment_analysis.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index c0bc0c3..672b5cd 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -80,7 +80,7 @@ def run_site_regulation_enrichment( identifier="identifier", groups=("group1", "group2"), annotation_col="annotation", - reject_col="rejected", + rejected_col="rejected", group_col="group", method="fisher", regex="(\\w+~.+)_\\w\\d+\\-\\w+", # ! add example to docstring of what kind of string this matches @@ -100,7 +100,7 @@ def run_site_regulation_enrichment( group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulation_data containing + :param str rejected_col: name of the column from regulation_data containing boolean for rejected null hypothesis. :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. @@ -117,7 +117,7 @@ def run_site_regulation_enrichment( identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', - reject_col='rejected', + rejected_col='rejected', group_col='group', method='fisher', match="(\\w+~.+)_\\w\\d+\\-\\w+" @@ -141,7 +141,7 @@ def run_site_regulation_enrichment( identifier, groups, annotation_col, - reject_col, + rejected_col, group_col, method, correction, @@ -156,7 +156,7 @@ def run_up_down_regulation_enrichment( identifier="identifier", groups=("group1", "group2"), annotation_col="annotation", - # reject_col="rejected", + # rejected_col="rejected", group_col="group", method="fisher", correction="fdr_bh", @@ -174,7 +174,7 @@ def run_up_down_regulation_enrichment( :param str identifier: name of the column from annotation containing feature identifiers. :param list groups: column names from regulation_data containing group identifiers. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from regulation_data containing boolean for + :param str rejected_col: name of the column from regulation_data containing boolean for rejected null hypothesis. :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. @@ -195,7 +195,7 @@ def run_up_down_regulation_enrichment( groups=['group1', 'group2'], annotation_col='annotation', - reject_col='rejected', + rejected_col='rejected', group_col='group', method='fisher', correction='fdr_bh', @@ -226,7 +226,7 @@ def run_up_down_regulation_enrichment( annotation, identifier=identifier, annotation_col=annotation_col, - reject_col="up_pairwise_regulation", + rejected_col="up_pairwise_regulation", group_col=group_col, method=method, correction=correction, @@ -238,7 +238,7 @@ def run_up_down_regulation_enrichment( annotation, identifier=identifier, annotation_col=annotation_col, - reject_col="down_pairwise_regulation", + rejected_col="down_pairwise_regulation", group_col=group_col, method=method, correction=correction, @@ -287,7 +287,7 @@ def run_regulation_enrichment( annotation: pd.DataFrame, identifier: str = "identifier", annotation_col: str = "annotation", - reject_col: str = "rejected", + rejected_col: str = "rejected", group_col: str = "group", method: str = "fisher", min_detected_in_set: int = 2, @@ -302,8 +302,9 @@ def run_regulation_enrichment( :param annotation: pandas.DataFrame with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. + It should also be present in `regulation_data`. :param str annotation_col: name of the column from annotation containing annotation terms. - :param str reject_col: name of the column from `regulation_data` containing boolean for + :param str rejected_col: name of the column from `regulation_data` containing boolean for rejected null hypothesis. :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background. @@ -319,7 +320,7 @@ def run_regulation_enrichment( annotation, identifier='identifier', annotation_col='annotation', - reject_col='rejected', + rejected_col='rejected', group_col='group', method='fisher', min_detected_in_set=2, @@ -328,7 +329,7 @@ def run_regulation_enrichment( ) """ # ? can we remove NA features in that column? - mask_rejected = regulation_data[reject_col].astype(bool) + mask_rejected = regulation_data[rejected_col].astype(bool) foreground_list = regulation_data.loc[mask_rejected, identifier].unique() background_list = regulation_data.loc[~mask_rejected, identifier].unique() foreground_pop = len(foreground_list) From 834b54e25fcf9783ab9af73ce3a476160cb02332 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 16:00:08 +0100 Subject: [PATCH 25/40] :art: try to set hyperlink --- acore/exploratory_analysis.py | 2 +- acore/multiple_testing.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/acore/exploratory_analysis.py b/acore/exploratory_analysis.py index b0a09da..b1384dd 100644 --- a/acore/exploratory_analysis.py +++ b/acore/exploratory_analysis.py @@ -35,7 +35,7 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]): :param list columns: names to use for the variable column(s), and for the value column(s) :return: Pandas dataframe with columns 'name' (protein identifier), 'x' (coefficient of variation), 'y' (mean) and 'group'. - Exmaple:: + Example:: result = get_coefficient_variation(data, drop_columns=['sample', 'subject'], group='group') """ diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py index 07bc19c..ae36349 100644 --- a/acore/multiple_testing.py +++ b/acore/multiple_testing.py @@ -9,10 +9,14 @@ # multitest.multitest_methods_names -def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferroni"): +def apply_pvalue_correction( + pvalues: np.ndarray, alpha: float = 0.05, method: str = "bonferroni" +) -> tuple[np.ndarray, np.ndarray]: """ Performs p-value correction using the specified method as in - statsmodels.stats.multitest.multipletests. + statsmodels.stats.multitest.multipletests_. + + .. _statsmodels.stats.multitest.multipletests: https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html For more information visit https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html. From 794c1affa3ee2ea7e12517638c1a9629f68fe77e Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 4 Dec 2024 16:05:17 +0100 Subject: [PATCH 26/40] :truck::sparkles: move uniprot code to module, creating user-facing function --- acore/io/uniprot/__init__.py | 44 ++++++++++++++++++++++++ acore/io/{ => uniprot}/uniprot.py | 3 ++ docs/api_examples/enrichment_analysis.py | 41 +--------------------- 3 files changed, 48 insertions(+), 40 deletions(-) create mode 100644 acore/io/uniprot/__init__.py rename acore/io/{ => uniprot}/uniprot.py (99%) diff --git a/acore/io/uniprot/__init__.py b/acore/io/uniprot/__init__.py new file mode 100644 index 0000000..f7d0468 --- /dev/null +++ b/acore/io/uniprot/__init__.py @@ -0,0 +1,44 @@ +"""Uniprot API user functions for fetching annotations for UniProt IDs and providing +the results as a pandas.DataFrame.""" + +import pandas as pd + +from .uniprot import ( + check_id_mapping_results_ready, + get_id_mapping_results_link, + get_id_mapping_results_search, + submit_id_mapping, +) + + +# function for outside usage +def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: + """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's + knowledgebase (KB). + + Parameters + ---------- + ids : pd.Index | list + Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields. + fields : str, optional + Fields to fetch, by default "accession,go_p,go_c. See for availble fields: + https://www.uniprot.org/help/return_fields + + Returns + ------- + pd.DataFrame + DataFrame with annotations of the UniProt IDs. + """ + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids) + + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + # add fields to the link to get more information + # From and Entry (accession) are the same for UniProt IDs. + results = get_id_mapping_results_search( + link + "?fields=accession,go_p,go_c,go_f&format=tsv" + ) + header = results.pop(0).split("\t") + results = [line.split("\t") for line in results] + df = pd.DataFrame(results, columns=header) + return df diff --git a/acore/io/uniprot.py b/acore/io/uniprot/uniprot.py similarity index 99% rename from acore/io/uniprot.py rename to acore/io/uniprot/uniprot.py index cedc98b..c9abf05 100644 --- a/acore/io/uniprot.py +++ b/acore/io/uniprot/uniprot.py @@ -10,6 +10,7 @@ from urllib.parse import parse_qs, urlencode, urlparse from xml.etree import ElementTree +import pandas as pd import requests from requests.adapters import HTTPAdapter, Retry @@ -176,6 +177,8 @@ def get_id_mapping_results_stream(url): return decode_results(request, file_format, compressed) + + if __name__ == "__main__": # id mapping is used to create a link to a query (you can see the json in the browser) # UniProtKB is the knowleadgebase integrating all kind of other databases diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 00115c9..25e7cf9 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -24,6 +24,7 @@ import acore import acore.differential_regulation import acore.enrichment_analysis +from acore.io.uniprot import fetch_annotations dsp_pandas.format.set_pandas_options(max_colwidth=15) @@ -105,46 +106,6 @@ # # %% -from acore.io.uniprot import ( - check_id_mapping_results_ready, - get_id_mapping_results_link, - get_id_mapping_results_search, - submit_id_mapping, -) - - -def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: - """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's - knowledgebase (KB). - - Parameters - ---------- - ids : pd.Index | list - Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields. - fields : str, optional - Fields to fetch, by default "accession,go_p,go_c. See for availble fields: - https://www.uniprot.org/help/return_fields - - Returns - ------- - pd.DataFrame - DataFrame with annotations of the UniProt IDs. - """ - job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids) - - if check_id_mapping_results_ready(job_id): - link = get_id_mapping_results_link(job_id) - # add fields to the link to get more information - # From and Entry (accession) are the same for UniProt IDs. - results = get_id_mapping_results_search( - link + "?fields=accession,go_p,go_c,go_f&format=tsv" - ) - header = results.pop(0).split("\t") - results = [line.split("\t") for line in results] - df = pd.DataFrame(results, columns=header) - return df - - fname_annotations = "downloaded/annotations.csv" fname = Path(fname_annotations) try: From 9d7c5f21fff8a5f28ab0c1ca43f3b4d3a38afb58 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Thu, 5 Dec 2024 09:17:12 +0100 Subject: [PATCH 27/40] :bug: test line break in docstring works locally. see: https://stackoverflow.com/a/14414965/9684872 --- acore/multiple_testing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py index ae36349..5261b39 100644 --- a/acore/multiple_testing.py +++ b/acore/multiple_testing.py @@ -16,10 +16,8 @@ def apply_pvalue_correction( Performs p-value correction using the specified method as in statsmodels.stats.multitest.multipletests_. - .. _statsmodels.stats.multitest.multipletests: https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html - - For more information visit - https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html. + .. _statsmodels.stats.multitest.multipletests: \ +https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html :param numpy.ndarray pvalues: et of p-values of the individual tests. :param float alpha: error rate. From dba200081444a4c20625e01acf4e30304f7bb95a Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 9 Dec 2024 16:30:44 +0100 Subject: [PATCH 28/40] :bug: specify fields as parameter - output should be tsv due to how pandas return is constructed for now --- acore/io/uniprot/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/acore/io/uniprot/__init__.py b/acore/io/uniprot/__init__.py index f7d0468..96e6002 100644 --- a/acore/io/uniprot/__init__.py +++ b/acore/io/uniprot/__init__.py @@ -12,7 +12,10 @@ # function for outside usage -def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: +def fetch_annotations( + ids: pd.Index | list, + fields: str = "accession,go_p,go_c,go_f", +) -> pd.DataFrame: """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's knowledgebase (KB). @@ -30,13 +33,14 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame: DataFrame with annotations of the UniProt IDs. """ job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids) - + # tsv used here to return a DataFrame. Maybe other formats are availale at some points + _format = "tsv" if check_id_mapping_results_ready(job_id): link = get_id_mapping_results_link(job_id) # add fields to the link to get more information # From and Entry (accession) are the same for UniProt IDs. results = get_id_mapping_results_search( - link + "?fields=accession,go_p,go_c,go_f&format=tsv" + link + f"?fields={fields}&format={_format}" ) header = results.pop(0).split("\t") results = [line.split("\t") for line in results] From ddac13c8e0b9b79809e83fbc65cc3fd3fbdcf927 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 9 Dec 2024 16:33:58 +0100 Subject: [PATCH 29/40] :construction: use fetch_annotations from pkg, add ssgsea --- acore/enrichment_analysis.py | 90 ++++++++++++--------- docs/api_examples/enrichment_analysis.ipynb | 77 +++++++----------- docs/api_examples/enrichment_analysis.py | 17 +++- 3 files changed, 97 insertions(+), 87 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 672b5cd..d5cc803 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -483,8 +483,8 @@ def run_enrichment( def run_ssgsea( data: pd.DataFrame, annotation: str, - set_index: list[str], - annotation_col: str = "an notation", + set_index: list[str] = None, + annotation_col: str = "annotation", identifier_col: str = "identifier", outdir: str = "tmp", min_size: int = 15, @@ -501,11 +501,11 @@ def run_ssgsea( :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins) :param annotation: pandas.DataFrame with the annotation to be used in the enrichment (i.e. CKG pathway annotation file) - :param str annotation_col: name of the column containing annotation terms. - :param str identifier_col: name of column containing dependent variables identifiers. :param list set_index: column/s to be used as index. Enrichment will be calculated for these values (i.e ["subject"] will return subjects x pathways matrix of enrichment scores) + :param str annotation_col: name of the column containing annotation terms. + :param str identifier_col: name of column containing dependent variables identifiers. :param str out_dir: directory path where results will be stored (default None, tmp folder is used) :param int min_size: minimum number of features (i.e. proteins) in enriched terms @@ -548,56 +548,70 @@ def run_ssgsea( permutations=0 ) """ - result = {} df = data.copy() if not os.path.exists(outdir): os.makedirs(outdir) # Comine columns to create a unique name for each set (?) name = [] - index = data[set_index] - for i, row in data[set_index].iterrows(): + if set_index is None: + index = data.index.to_frame() + set_index = index.columns.tolist() + else: + index = data[set_index] + df = df.drop(set_index, axis=1) + + for i, row in index.iterrows(): name.append( "_".join(row[set_index].tolist()) ) # this assumes strings as identifiers df["Name"] = name index.index = name - df = df.drop(set_index, axis=1).set_index("Name").transpose() + df = df.set_index("Name").transpose() - if annotation_col in annotation and identifier_col in annotation: - grouped_annotations = ( - annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index() + if not annotation_col in annotation: + raise ValueError( + f"Missing Annotation Column: {annotation_col} as specified by `annotation_col`" ) - fid = uuid.uuid4() - file_path = os.path.join(outdir, str(fid) + ".gmt") - with open(file_path, "w", encoding="utf8") as out: - for i, row in grouped_annotations.iterrows(): - out.write( - row[annotation_col] - + "\t" - + "\t".join(list(filter(None, row[identifier_col]))) - + "\n" - ) - enrichment = gp.ssgsea( - data=df, - gene_sets=str(file_path), - outdir=outdir, - min_size=min_size, - max_size=max_size, - scale=scale, - permutation_num=permutations, - no_plot=True, - processes=1, - seed=10, - format="png", + + if not identifier_col in annotation: + raise ValueError( + f"Missing Identifier Column: {identifier_col} as specified by `identifier_col`" ) - enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose() - enrichment_es = enrichment_es.join(index) - enrichment_nes = enrichment.res2d.transpose() - enrichment_nes = enrichment_nes.join(index) + grouped_annotations = ( + annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index() + ) + fid = uuid.uuid4() + file_path = os.path.join(outdir, str(fid) + ".gmt") + with open(file_path, "w", encoding="utf8") as out: + for i, row in grouped_annotations.iterrows(): + out.write( + row[annotation_col] + + "\t" + + "\t".join(list(filter(None, row[identifier_col]))) + + "\n" + ) + enrichment = gp.ssgsea( + data=df, + gene_sets=str(file_path), + outdir=outdir, + min_size=min_size, + max_size=max_size, + scale=scale, + permutation_num=permutations, + no_plot=True, + processes=1, + seed=10, + format="png", + ) + + enrichment_es = pd.DataFrame(enrichment.results).transpose() # resultsOnSamples + enrichment_es = enrichment_es.join(index) + enrichment_nes = enrichment.res2d.transpose() + # enrichment_nes = enrichment_nes.join(index) # ! To Fix - result = {"es": enrichment_es, "nes": enrichment_nes} + result = {"es": enrichment_es, "nes": enrichment_nes} return result diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 93ff6ee..1f39dfa 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -49,6 +49,7 @@ "import acore\n", "import acore.differential_regulation\n", "import acore.enrichment_analysis\n", + "from acore.io.uniprot import fetch_annotations\n", "\n", "dsp_pandas.format.set_pandas_options(max_colwidth=15)" ] @@ -79,7 +80,7 @@ "omics: str = f\"{base_path}/omics.csv\"\n", "meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n", "meta: str = f\"{base_path}/meta_patients.csv\"\n", - "N_to_sample: int = 1_000" + "features_to_sample: int = 100" ] }, { @@ -149,7 +150,7 @@ " .drop(idx_always_included, axis=1)\n", " .dropna(thresh=18, axis=1)\n", " .sample(\n", - " N_to_sample - len(idx_always_included),\n", + " features_to_sample - len(idx_always_included),\n", " axis=1,\n", " random_state=42,\n", " )\n", @@ -223,47 +224,7 @@ "metadata": {}, "outputs": [], "source": [ - "from acore.io.uniprot import (\n", - " check_id_mapping_results_ready,\n", - " get_id_mapping_results_link,\n", - " get_id_mapping_results_search,\n", - " submit_id_mapping,\n", - ")\n", - "\n", - "\n", - "def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n", - " \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n", - " knowledgebase (KB).\n", - "\n", - " Parameters\n", - " ----------\n", - " ids : pd.Index | list\n", - " Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.\n", - " fields : str, optional\n", - " Fields to fetch, by default \"accession,go_p,go_c. See for availble fields:\n", - " https://www.uniprot.org/help/return_fields\n", - "\n", - " Returns\n", - " -------\n", - " pd.DataFrame\n", - " DataFrame with annotations of the UniProt IDs.\n", - " \"\"\"\n", - " job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n", - "\n", - " if check_id_mapping_results_ready(job_id):\n", - " link = get_id_mapping_results_link(job_id)\n", - " # add fields to the link to get more information\n", - " # From and Entry (accession) are the same for UniProt IDs.\n", - " results = get_id_mapping_results_search(\n", - " link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n", - " )\n", - " header = results.pop(0).split(\"\\t\")\n", - " results = [line.split(\"\\t\") for line in results]\n", - " df = pd.DataFrame(results, columns=header)\n", - " return df\n", - "\n", - "\n", - "fname_annotations = \"downloaded/annotations.csv\"\n", + "fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n", "fname = Path(fname_annotations)\n", "try:\n", " annotations = pd.read_csv(fname, index_col=0)\n", @@ -325,9 +286,7 @@ "cell_type": "code", "execution_count": null, "id": "f300c5b5", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "ret = acore.enrichment_analysis.run_regulation_enrichment(\n", @@ -338,6 +297,32 @@ "ret" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b05e0c", + "metadata": {}, + "outputs": [], + "source": [ + "res_dict = acore.enrichment_analysis.run_ssgsea(\n", + " data=df_omics,\n", + " annotation=annotations,\n", + " min_size=1,\n", + ")\n", + "print(res_dict.keys())\n", + "res_dict[\"es\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9e2fddf", + "metadata": {}, + "outputs": [], + "source": [ + "res_dict[\"nes\"]" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 25e7cf9..64af662 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -39,7 +39,7 @@ omics: str = f"{base_path}/omics.csv" meta_pgs: str = f"{base_path}/meta_pgs.csv" meta: str = f"{base_path}/meta_patients.csv" -N_to_sample: int = 1_000 +features_to_sample: int = 100 # %% [markdown] # # Load processed data @@ -69,7 +69,7 @@ .drop(idx_always_included, axis=1) .dropna(thresh=18, axis=1) .sample( - N_to_sample - len(idx_always_included), + features_to_sample - len(idx_always_included), axis=1, random_state=42, ) @@ -106,7 +106,7 @@ # # %% -fname_annotations = "downloaded/annotations.csv" +fname_annotations = f"downloaded/annotations_{features_to_sample}.csv" fname = Path(fname_annotations) try: annotations = pd.read_csv(fname, index_col=0) @@ -155,5 +155,16 @@ ) ret +# %% +res_dict = acore.enrichment_analysis.run_ssgsea( + data=df_omics, + annotation=annotations, + min_size=1, +) +print(res_dict.keys()) +res_dict["es"] + +# %% +res_dict["nes"] # %% From 5373372091d24dab9e71c28d9764163d61c507b1 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Dec 2024 20:43:34 +0100 Subject: [PATCH 30/40] :art: type hints, formatting, add pca example to api example of enrichment analysis --- acore/enrichment_analysis.py | 88 ++++---- acore/exploratory_analysis.py | 217 ++++++++++++-------- acore/network_analysis.py | 3 +- docs/api_examples/enrichment_analysis.ipynb | 170 ++++++++++++++- docs/api_examples/enrichment_analysis.py | 82 +++++++- 5 files changed, 423 insertions(+), 137 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index d5cc803..ab277e6 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -74,16 +74,17 @@ def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"): return result +# ! undocumented for now (find usage example) def run_site_regulation_enrichment( - regulation_data, - annotation, + regulation_data: pd.DataFrame, + annotation: pd.DataFrame, identifier="identifier", groups=("group1", "group2"), annotation_col="annotation", rejected_col="rejected", group_col="group", method="fisher", - regex="(\\w+~.+)_\\w\\d+\\-\\w+", # ! add example to docstring of what kind of string this matches + regex="(\\w+~.+)_\\w\\d+\\-\\w+", correction="fdr_bh", ): r""" @@ -110,6 +111,8 @@ def run_site_regulation_enrichment( :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', 'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'. + :raises ValueError: if regulation_data is `None` or empty. + Example:: result = run_site_regulation_enrichment(regulation_data, @@ -124,28 +127,35 @@ def run_site_regulation_enrichment( ) """ result = pd.DataFrame() - if regulation_data is not None: - if not regulation_data.empty: - new_ids = [] - for ident in regulation_data[identifier].tolist(): - match = re.search(regex, ident) - if match is not None: - new_ids.append(match.group(1)) - else: - new_ids.append(ident) - regulation_data[identifier] = new_ids - regulation_data = regulation_data.drop_duplicates() - result = run_regulation_enrichment( - regulation_data, - annotation, - identifier, - groups, - annotation_col, - rejected_col, - group_col, - method, - correction, - ) + if regulation_data is None or regulation_data.empty: + raise ValueError("regulation_data is empty") + + new_ids = [] + # find any identifiers with a PTM and save only prot+gene identifer + for ident in regulation_data[identifier].tolist(): + match = re.search(regex, ident) + if match is not None: + new_ids.append( + match.group(1) + ) # removes the PTM extension of the identifierm of CKG + else: + new_ids.append(ident) + # so this is normalizing the identifiers to ignore the PTM extension + regulation_data[identifier] = new_ids # matches are used as identifiers + regulation_data = ( + regulation_data.drop_duplicates() + ) # depends on regulation_data if this has an effect (with floats probably not) + result = run_regulation_enrichment( + regulation_data, + annotation, + identifier, + groups, + annotation_col, + rejected_col, + group_col, + method, + correction, + ) return result @@ -491,17 +501,17 @@ def run_ssgsea( max_size: int = 500, scale: bool = False, permutations: int = 0, -) -> dict[str, pd.DataFrame]: +) -> pd.DataFrame: """ Project each sample within a data set onto a space of gene set enrichment scores using the single sample gene set enrichment analysis (ssGSEA) projection methodology described in Barbie et al., 2009: https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA). - :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins) - :param annotation: pandas.DataFrame with the annotation to be used in the enrichment + :param pd.DataFrame data: pandas.DataFrame with the quantified features (i.e. subject x proteins) + :param str annotation: pandas.DataFrame with the annotation to be used in the enrichment (i.e. CKG pathway annotation file) - :param list set_index: column/s to be used as index. Enrichment will be calculated + :param list[str] set_index: column/s to be used as index. Enrichment will be calculated for these values (i.e ["subject"] will return subjects x pathways matrix of enrichment scores) :param str annotation_col: name of the column containing annotation terms. @@ -514,8 +524,9 @@ def run_ssgsea( (i.e. pathways) :param bool scale: whether or not to scale the data :param int permutations: number of permutations used in the ssgsea analysis - :return: dictionary with two dataframes: es - enrichment scores, - and nes - normalized enrichment scores. + :return: pandas.DataFrame containing unnormalized enrichment scores (`ES`) for each sample, + and normalized enrichment scores (`NES`) with the enriched `Term` and sample `Name`. + :rtype: pandas.DataFrame Example:: @@ -561,7 +572,7 @@ def run_ssgsea( index = data[set_index] df = df.drop(set_index, axis=1) - for i, row in index.iterrows(): + for _, row in index.iterrows(): name.append( "_".join(row[set_index].tolist()) ) # this assumes strings as identifiers @@ -586,7 +597,7 @@ def run_ssgsea( fid = uuid.uuid4() file_path = os.path.join(outdir, str(fid) + ".gmt") with open(file_path, "w", encoding="utf8") as out: - for i, row in grouped_annotations.iterrows(): + for _, row in grouped_annotations.iterrows(): out.write( row[annotation_col] + "\t" @@ -606,12 +617,7 @@ def run_ssgsea( seed=10, format="png", ) - - enrichment_es = pd.DataFrame(enrichment.results).transpose() # resultsOnSamples - enrichment_es = enrichment_es.join(index) - enrichment_nes = enrichment.res2d.transpose() - # enrichment_nes = enrichment_nes.join(index) # ! To Fix - - result = {"es": enrichment_es, "nes": enrichment_nes} - + result = pd.DataFrame(enrichment.res2d).set_index("Name") + # potentially return wide format in separate format + # result = {"es": enrichment_es, "nes": enrichment_nes} return result diff --git a/acore/exploratory_analysis.py b/acore/exploratory_analysis.py index b1384dd..356e4a9 100644 --- a/acore/exploratory_analysis.py +++ b/acore/exploratory_analysis.py @@ -6,13 +6,13 @@ from sklearn.manifold import TSNE -def calculate_coefficient_variation(values): +def calculate_coefficient_variation(values: np.ndarray) -> np.ndarray: """ Compute the coefficient of variation, the ratio of the biased standard deviation to the mean, in percentage. For more information visit https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.variation.html. - :param numpy.ndarray values: numpy array + :param numpy.ndarray values: numpy array of log2 transformed values :return: The calculated variation along rows. :rtype: numpy.ndarray @@ -29,11 +29,13 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]): """ Extracts the coefficients of variation in each group. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_columns: column labels to be dropped from the dataframe :param str group: column label containing group identifiers. :param list columns: names to use for the variable column(s), and for the value column(s) - :return: Pandas dataframe with columns 'name' (protein identifier), 'x' (coefficient of variation), 'y' (mean) and 'group'. + :return: Pandas dataframe with columns 'name' (protein identifier), + 'x' (coefficient of variation), 'y' (mean) and 'group'. Example:: @@ -46,24 +48,27 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]): cvs_df = pd.DataFrame() for i in cvs.index: gcvs = cvs[i].tolist() - ints = formated_df.set_index("group").mean().values.tolist() + ints = formated_df.set_index(group).mean().values.tolist() tdf = pd.DataFrame(data={"name": cols, "x": gcvs, "y": ints}) - tdf["group"] = i + tdf[group] = i if cvs_df.empty: cvs_df = tdf.copy() else: - cvs_df = cvs_df.append(tdf) + cvs_df = pd.concat([cvs_df, tdf]) return cvs_df def extract_number_missing(data, min_valid, drop_cols=["sample"], group="group"): """ - Counts how many valid values exist in each column and filters column labels with more valid values than the minimum threshold defined. + Counts how many valid values exist in each column and filters column labels with more + valid values than the minimum threshold defined. :param data: pandas DataFrame with group as rows and protein identifier as column. - :param str group: column label containing group identifiers. If None, number of valid values is counted across all samples, otherwise is counted per unique group identifier. + :param str group: column label containing group identifiers. + If None, number of valid values is counted across all samples, + otherwise is counted per unique group identifier. :param int min_valid: minimum number of valid values to be filtered. :param list drop_columns: column labels to be dropped. :return: List of column labels above the threshold. @@ -89,16 +94,21 @@ def extract_percentage_missing( data, missing_max, drop_cols=["sample"], group="group", how="all" ): """ - Extracts ratio of missing/valid values in each column and filters column labels with lower ratio than the minimum threshold defined. + Extracts ratio of missing/valid values in each column and filters column labels with + lower ratio than the minimum threshold defined. :param data: pandas dataframe with group as rows and protein identifier as column. - :param str group: column label containing group identifiers. If None, ratio is calculated across all samples, otherwise is calculated per unique group identifier. + :param str group: column label containing group identifiers. + If None, ratio is calculated across all samples, + otherwise is calculated per unique group identifier. :param float missing_max: maximum ratio of missing/valid values to be filtered. - :param str how: define if labels with a higher percentage of missing values than the threshold in any group ('any') or in all groups ('all') should be filtered + :param str how: define if labels with a higher percentage of missing values than the threshold + in any group ('any') or in all groups ('all') should be filtered :return: List of column labels below the threshold. Example:: - result = extract_percentage_missing(data, missing_max=0.3, drop_cols=['sample'], group='group') + result = extract_percentage_missing(data, missing_max=0.3, + drop_cols=['sample'], group='group') """ if group is None: groups = data.loc[:, data.isnull().mean() <= missing_max].columns @@ -128,81 +138,86 @@ def run_pca( dropna=True, ): """ - Performs principal component analysis and returns the values of each component for each sample and each protein, and the loadings for each protein. \ - For information visit https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html. + Performs principal component analysis and returns the values of each component for each sample + and each protein, and the loadings for each protein. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + For information visit + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html. + + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_cols: column labels to be dropped from the dataframe. :param str group: column label containing group identifiers. :param list annotation_cols: list of columns to be added in the scatter plot annotation :param int components: number of components to keep. :param bool dropna: if True removes all columns with any missing values. - :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2) xaxis and yaxis titles with components loadings for plotly. + :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2) + xaxis and yaxis titles with components loadings for plotly. Example:: - result = run_pca(data, drop_cols=['sample', 'subject'], group='group', components=2, dropna=True) + result = run_pca(data, drop_cols=['sample', 'subject'], group='group', + components=2, dropna=True) """ np.random.seed(112736) - resultDf = pd.DataFrame() - loadings = pd.DataFrame() var_exp = [] args = {} - if not data.empty: - df = data.copy() - annotations = pd.DataFrame() - if annotation_cols is not None: - if len(list(set(annotation_cols).intersection(data.columns))) > 0: - annotations = data.set_index(group)[annotation_cols] - drop_cols_int = list(set(drop_cols).intersection(df.columns)) - if len(drop_cols_int) > 0: - df = df.drop(drop_cols_int, axis=1) - - y = df[group].tolist() - df = df.set_index(group) - df = df.select_dtypes(["number"]) - if dropna: - df = df.dropna(axis=1) - X = df.values - - if X.size > 0 and X.shape[1] > components: - pca = PCA(n_components=components) - X = pca.fit_transform(X) - var_exp = pca.explained_variance_ratio_ - loadings = pd.DataFrame(pca.components_.transpose()) - loadings.index = df.columns - values = { - index: np.sqrt(np.power(row, 2).sum()) - for index, row in loadings.iterrows() - } - loadings["value"] = loadings.index.map(values.get) - loadings = loadings.sort_values(by="value", ascending=False) - args = { - "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]), - "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]), - "group": "group", - } - if components == 2: - resultDf = pd.DataFrame(X, index=y, columns=["x", "y"]) - resultDf = resultDf.assign(**annotations) - resultDf = resultDf.reset_index() - resultDf.columns = ["group", "x", "y"] + annotation_cols - - loadings.columns = ["x", "y", "value"] - if components > 2: - args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])}) - resultDf = pd.DataFrame(X, index=y) - resultDf = resultDf.assign(**annotations) - resultDf = resultDf.reset_index() - pca_cols = [] - loading_cols = [] - if components > 3: - pca_cols = [str(i) for i in resultDf.columns[4:]] - loading_cols = [str(i) for i in loadings.columns[3:]] - - resultDf.columns = ["group", "x", "y", "z"] + pca_cols - loadings.columns = ["x", "y", "z"] + loading_cols + if data.empty: + raise ValueError("Dataframe is empty.") + + df = data.copy() + annotations = pd.DataFrame() + if annotation_cols is not None: + if len(list(set(annotation_cols).intersection(data.columns))) > 0: + annotations = data.set_index(group)[annotation_cols] + drop_cols_int = list(set(drop_cols).intersection(df.columns)) + if len(drop_cols_int) > 0: + df = df.drop(drop_cols_int, axis=1) + + y = df[group].tolist() + df = df.set_index(group) + df = df.select_dtypes(["number"]) + if dropna: + df = df.dropna(axis=1) + X = df.values + + if X.size > 0 and X.shape[1] > components: + pca = PCA(n_components=components) + X = pca.fit_transform(X) + var_exp = pca.explained_variance_ratio_ + loadings = pd.DataFrame(pca.components_.transpose()) + loadings.index = df.columns + values = { + index: np.sqrt(np.power(row, 2).sum()) for index, row in loadings.iterrows() + } + loadings["value"] = loadings.index.map(values.get) + loadings = loadings.sort_values(by="value", ascending=False) + args = { + "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]), + "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]), + "group": "group", + } + if components == 2: + resultDf = pd.DataFrame(X, index=y, columns=["x", "y"]) + resultDf = resultDf.assign(**annotations) + resultDf = resultDf.reset_index() + resultDf.columns = ["group", "x", "y"] + annotation_cols + + loadings.columns = ["x", "y", "value"] + if components > 2: + args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])}) + resultDf = pd.DataFrame(X, index=y) + resultDf = resultDf.assign(**annotations) + resultDf = resultDf.reset_index() + pca_cols = [] + loading_cols = [] + if components > 3: + pca_cols = [str(i) for i in resultDf.columns[4:]] + loading_cols = [str(i) for i in loadings.columns[3:]] + + resultDf.columns = ["group", "x", "y", "z"] + pca_cols + loadings.columns = ["x", "y", "z"] + loading_cols return (resultDf, loadings, var_exp), args @@ -219,22 +234,39 @@ def run_tsne( dropna=True, ): """ - Performs t-distributed Stochastic Neighbor Embedding analysis. For more information visit https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html. + Performs t-distributed Stochastic Neighbor Embedding analysis. + + For more information visit + https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_cols: column labels to be dropped from the dataframe. :param str group: column label containing group identifiers. :param int components: dimension of the embedded space. :param list annotation_cols: list of columns to be added in the scatter plot annotation - :param int perplexity: related to the number of nearest neighbors that is used in other manifold learning algorithms. Consider selecting a value between 5 and 50. + :param int perplexity: related to the number of nearest neighbors that is used + in other manifold learning algorithms. + Consider selecting a value between 5 and 50. :param int n_iter: maximum number of iterations for the optimization (at least 250). - :param str init: initialization of embedding ('random', 'pca' or numpy array of shape n_samples x n_components). + :param str init: initialization of embedding ('random', 'pca' or + numpy array of shape n_samples x n_components). :param bool dropna: if True removes all columns with any missing values. - :return: Two dictionaries: 1) pandas dataframe with embedding vectors, 2) xaxis and yaxis titles for plotly. + :return: Two dictionaries: + 1) pandas dataframe with embedding vectors, + 2) xaxis and yaxis titles for plotly. Example:: - result = run_tsne(data, drop_cols=['sample', 'subject'], group='group', components=2, perplexity=40, n_iter=1000, init='pca', dropna=True) + result = run_tsne(data, + drop_cols=['sample', 'subject'], + group='group', + components=2, + perplexity=40, + n_iter=1000, + init='pca', + dropna=True + ) """ result = {} args = {} @@ -256,7 +288,7 @@ def run_tsne( n_components=components, verbose=0, perplexity=perplexity, - max_iter=n_iter, + n_iter=n_iter, init=init, ) X = tsne.fit_transform(X) @@ -289,21 +321,34 @@ def run_umap( dropna=True, ): """ - Performs Uniform Manifold Approximation and Projection. For more information vist https://umap-learn.readthedocs.io. + Performs Uniform Manifold Approximation and Projection. + + For more information vist https://umap-learn.readthedocs.io. - :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject'). + :param data: pandas dataframe with samples as rows and protein identifiers as columns + (with additional columns 'group', 'sample' and 'subject'). :param list drop_cols: column labels to be dropped from the dataframe. :param str group: column label containing group identifiers. :param list annotation_cols: list of columns to be added in the scatter plot annotation - :param int n_neighbors: number of neighboring points used in local approximations of manifold structure. + :param int n_neighbors: number of neighboring points used + in local approximations of manifold structure. :param float min_dist: controls how tightly the embedding is allowed compress points together. :param str metric: metric used to measure distance in the input space. :param bool dropna: if True removes all columns with any missing values. - :return: Two dictionaries: 1) pandas dataframe with embedding of the training data in low-dimensional space, 2) xaxis and yaxis titles for plotly. + :return: Two dictionaries: + 1) pandas dataframe with embedding of the training data in low-dimensional space, + 2) xaxis and yaxis titles for plotly. Example:: - result = run_umap(data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True) + result = run_umap(data, + drop_cols=['sample', 'subject'], + group='group', + n_neighbors=10, + min_dist=0.3, + metric='cosine', + dropna=True + ) """ np.random.seed(1145536) result = {} diff --git a/acore/network_analysis.py b/acore/network_analysis.py index 6af038e..50f9376 100644 --- a/acore/network_analysis.py +++ b/acore/network_analysis.py @@ -2,10 +2,11 @@ import networkx as nx import pandas as pd import snf -import utils from sklearn import cluster from sklearn.cluster import AffinityPropagation +from . import utils + def get_network_communities(graph, args): """ diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 1f39dfa..5ed9eca 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -297,6 +297,32 @@ "ret" ] }, + { + "cell_type": "markdown", + "id": "834d5a85", + "metadata": {}, + "source": [ + "### For up- and downregulated genes separately" + ] + }, + { + "cell_type": "markdown", + "id": "ecf75e7c", + "metadata": {}, + "source": [ + "### Site specific enrichment analysis" + ] + }, + { + "cell_type": "markdown", + "id": "0ee3a7c8", + "metadata": {}, + "source": [ + "The basic example uses a modified peptide sequence to\n", + "demonstrate the enrichment analysis.\n", + "- compare groups per amino acid modified (kinases targeting certain motifs?)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -304,23 +330,157 @@ "metadata": {}, "outputs": [], "source": [ - "res_dict = acore.enrichment_analysis.run_ssgsea(\n", + "import re\n", + "\n", + "regex = \"(\\\\w+~.+)_\\\\w\\\\d+\\\\-\\\\w+\"\n", + "identifier_ckg = \"gnd~P00350_T10-WW\"\n", + "match = re.search(regex, identifier_ckg)\n", + "match.group(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9e2fddf", + "metadata": {}, + "outputs": [], + "source": [ + "seq_mod = \"_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR\"\n", + "regex = \"\\(\\w\\w\\)\"\n", + "matches = re.findall(regex, seq_mod)\n", + "matches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eab37e1", + "metadata": {}, + "outputs": [], + "source": [ + "# acore.enrichment_analysis.run_up_down_regulation_enrichment(" + ] + }, + { + "cell_type": "markdown", + "id": "dbc34b7b", + "metadata": {}, + "source": [ + "## Single sample GSEA (ssGSEA)\n", + "Run a gene set enrichment analysis (GSEA) for each sample,\n", + "see [article](https://www.nature.com/articles/nature08460#Sec3) and\n", + "the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)\n", + "for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e211bf65", + "metadata": {}, + "outputs": [], + "source": [ + "enrichtments = acore.enrichment_analysis.run_ssgsea(\n", " data=df_omics,\n", " annotation=annotations,\n", " min_size=1,\n", ")\n", - "print(res_dict.keys())\n", - "res_dict[\"es\"]" + "enrichtments" ] }, { "cell_type": "code", "execution_count": null, - "id": "b9e2fddf", + "id": "0fd08e79", + "metadata": {}, + "outputs": [], + "source": [ + "enrichtments.iloc[0].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca9e4908", + "metadata": {}, + "outputs": [], + "source": [ + "enrichtments[\"NES\"].plot.hist()" + ] + }, + { + "cell_type": "markdown", + "id": "6b9a50cc", + "metadata": {}, + "source": [ + "The normalised enrichment score (NES) can be used in a PCA plot to see if the samples\n", + "cluster according to the enrichment of the gene sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d917326", + "metadata": {}, + "outputs": [], + "source": [ + "nes = enrichtments.set_index(\"Term\", append=True).unstack()[\"NES\"].convert_dtypes()\n", + "nes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02dbec64", + "metadata": {}, + "outputs": [], + "source": [ + "import acore.exploratory_analysis as ea\n", + "\n", + "pca_result, pca_annotation = ea.run_pca(\n", + " data=nes.join(df_meta[[group]]),\n", + " drop_cols=[],\n", + " annotation_cols=[],\n", + " group=group,\n", + " components=2,\n", + " dropna=False,\n", + ")\n", + "pca_result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28464b86", + "metadata": {}, + "outputs": [], + "source": [ + "# from plotly.offline import iplot\n", + "# from vuecore import viz\n", + "\n", + "# args = {\"factor\": 1, \"loadings\": 10}\n", + "# #! pca_results has three items, but docstring requests only two -> double check\n", + "# figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n", + "# iplot(figure)" + ] + }, + { + "cell_type": "markdown", + "id": "5d0c1eba", + "metadata": {}, + "source": [ + "## Compare two distributions - KS test\n", + "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c66fcef", "metadata": {}, "outputs": [], "source": [ - "res_dict[\"nes\"]" + "# plot two histograms of intensity values here" ] }, { diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 64af662..e4ccb77 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -155,16 +155,90 @@ ) ret +# %% [markdown] +# ### For up- and downregulated genes separately + +# %% [markdown] +# ### Site specific enrichment analysis + +# %% [markdown] +# The basic example uses a modified peptide sequence to +# demonstrate the enrichment analysis. +# - compare groups per amino acid modified (kinases targeting certain motifs?) + +# %% +import re + +regex = "(\\w+~.+)_\\w\\d+\\-\\w+" +identifier_ckg = "gnd~P00350_T10-WW" +match = re.search(regex, identifier_ckg) +match.group(1) + # %% -res_dict = acore.enrichment_analysis.run_ssgsea( +seq_mod = "_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR" +regex = "\(\w\w\)" +matches = re.findall(regex, seq_mod) +matches + +# %% +# acore.enrichment_analysis.run_up_down_regulation_enrichment( + +# %% [markdown] +# ## Single sample GSEA (ssGSEA) +# Run a gene set enrichment analysis (GSEA) for each sample, +# see [article](https://www.nature.com/articles/nature08460#Sec3) and +# the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea) +# for more details. + +# %% +enrichtments = acore.enrichment_analysis.run_ssgsea( data=df_omics, annotation=annotations, min_size=1, ) -print(res_dict.keys()) -res_dict["es"] +enrichtments + +# %% +enrichtments.iloc[0].to_dict() + +# %% +enrichtments["NES"].plot.hist() + +# %% [markdown] +# The normalised enrichment score (NES) can be used in a PCA plot to see if the samples +# cluster according to the enrichment of the gene sets. + +# %% +nes = enrichtments.set_index("Term", append=True).unstack()["NES"].convert_dtypes() +nes + +# %% +import acore.exploratory_analysis as ea + +pca_result, pca_annotation = ea.run_pca( + data=nes.join(df_meta[[group]]), + drop_cols=[], + annotation_cols=[], + group=group, + components=2, + dropna=False, +) +pca_result + +# %% +# from plotly.offline import iplot +# from vuecore import viz + +# args = {"factor": 1, "loadings": 10} +# # #! pca_results has three items, but docstring requests only two -> double check +# figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args) +# iplot(figure) + +# %% [markdown] +# ## Compare two distributions - KS test +# The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions. # %% -res_dict["nes"] +# plot two histograms of intensity values here # %% From 52ce9c5d113f5fc54514b0b1a3af05a12f3cf021 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Dec 2024 20:44:06 +0100 Subject: [PATCH 31/40] :art: run CI only on PR --- .github/workflows/tox-gha.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tox-gha.yml b/.github/workflows/tox-gha.yml index 696456d..6cf0ec4 100644 --- a/.github/workflows/tox-gha.yml +++ b/.github/workflows/tox-gha.yml @@ -2,10 +2,12 @@ name: Python package on: push: + branches: [main] pull_request: - # release: - # types: [published] - + branches: [main] + release: + types: [published] + jobs: build: runs-on: ubuntu-latest From 47bcdbd3329a5a02c0502004be8b9d421ec86aba Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 17 Dec 2024 14:58:02 +0100 Subject: [PATCH 32/40] :art: format tests --- tests/test_differential_regulation.py | 147 +++++++++++++++-------- tests/test_exploratory.py | 160 ++++++++++++++++++-------- 2 files changed, 211 insertions(+), 96 deletions(-) diff --git a/tests/test_differential_regulation.py b/tests/test_differential_regulation.py index 2e13f92..169bafa 100644 --- a/tests/test_differential_regulation.py +++ b/tests/test_differential_regulation.py @@ -7,56 +7,85 @@ class TestCalculateTtest(unittest.TestCase): def setUp(self): self.data = { - 'subject': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - 'sample': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12'], - 'group': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'], - 'protein': [1.4, 6.2, 9.03, 2.3, 7.4, 14.01, 4.5, 9.6, 10.4, 7.5, 11.6, 16.4], - 'group2': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b'], + "subject": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + "sample": [ + "S1", + "S2", + "S3", + "S4", + "S5", + "S6", + "S7", + "S8", + "S9", + "S10", + "S11", + "S12", + ], + "group": ["A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"], + "protein": [ + 1.4, + 6.2, + 9.03, + 2.3, + 7.4, + 14.01, + 4.5, + 9.6, + 10.4, + 7.5, + 11.6, + 16.4, + ], + "group2": ["a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"], } self.df = pd.DataFrame(self.data) def test_ttest_ind(self): data = { - 'group1': [1.3, 1.2, 0.2, 2.2, 3.3, 8.43], - 'group2': [1.4, 6.2, 7.3, 0.4, 1.5, 0.6] + "group1": [1.3, 1.2, 0.2, 2.2, 3.3, 8.43], + "group2": [1.4, 6.2, 7.3, 0.4, 1.5, 0.6], } df = pd.DataFrame(data) - condition1 = 'group1' - condition2 = 'group2' + condition1 = "group1" + condition2 = "group2" is_logged = True non_par = False expected_result = pg.ttest(df[condition1], df[condition2], paired=False) - result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par) + result = dr.calculate_ttest( + df, condition1, condition2, is_logged=is_logged, non_par=non_par + ) self.assertAlmostEqual(result[0], expected_result["T"].values[0]) self.assertAlmostEqual(result[1], expected_result["p-val"].values[0]) def test_mann_whitney(self): - data = { - 'group1': [1, 2, 3, 4, 5], - 'group2': [2, 4, 6, 8, 10] - } + data = {"group1": [1, 2, 3, 4, 5], "group2": [2, 4, 6, 8, 10]} df = pd.DataFrame(data) - condition1 = 'group1' - condition2 = 'group2' + condition1 = "group1" + condition2 = "group2" is_logged = True non_par = True expected_result = pg.mwu(df[condition1], df[condition2]) - result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par) + result = dr.calculate_ttest( + df, condition1, condition2, is_logged=is_logged, non_par=non_par + ) self.assertAlmostEqual(result[0], expected_result["U-val"].values[0]) self.assertAlmostEqual(result[1], expected_result["p-val"].values[0]) def test_calculate_anova(self): - column = 'protein' - group = 'group' + column = "protein" + group = "group" expected_result = pg.anova(data=self.df, dv=column, between=group) - expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[['F', 'ddof1', 'ddof2', 'p-unc']].values[0] + expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[ + ["F", "ddof1", "ddof2", "p-unc"] + ].values[0] result = dr.calculate_anova(self.df, column, group=group) @@ -66,14 +95,20 @@ def test_calculate_anova(self): self.assertEqual(result[4], expected_pvalue) def test_calculate_ancova(self): - column = 'protein' - group = 'group' + column = "protein" + group = "group" covariates = [] - expected_result = pg.ancova(data=self.df, dv=column, between=group, covar=covariates) - expected_t, expected_df, expected_pvalue = expected_result.loc[expected_result['Source'] == group, ['F', 'DF', 'p-unc']].values[0] + expected_result = pg.ancova( + data=self.df, dv=column, between=group, covar=covariates + ) + expected_t, expected_df, expected_pvalue = expected_result.loc[ + expected_result["Source"] == group, ["F", "DF", "p-unc"] + ].values[0] - result = dr.calculate_ancova(self.df, column, group=group, covariates=covariates) + result = dr.calculate_ancova( + self.df, column, group=group, covariates=covariates + ) self.assertEqual(result[1], expected_df) self.assertEqual(result[2], expected_df) @@ -82,16 +117,27 @@ def test_calculate_ancova(self): def test_calculate_repeated_measures_anova(self): """Source SS DF MS F p-unc ng2 eps -0 group 1.50 1 1.500 0.087642 0.795106 0.032609 1.0 -1 Error 34.23 2 17.115 NaN NaN NaN NaN""" - column = 'protein' - subject = 'subject' - within = 'group' - - expected_result = pg.rm_anova(data=self.df, dv=column, within=within, subject=subject, detailed=True, correction=True) - expected_t, expected_pvalue = expected_result.loc[0, ["F", "p-unc"]].values.tolist() + 0 group 1.50 1 1.500 0.087642 0.795106 0.032609 1.0 + 1 Error 34.23 2 17.115 NaN NaN NaN NaN""" + column = "protein" + subject = "subject" + within = "group" + + expected_result = pg.rm_anova( + data=self.df, + dv=column, + within=within, + subject=subject, + detailed=True, + correction=True, + ) + expected_t, expected_pvalue = expected_result.loc[ + 0, ["F", "p-unc"] + ].values.tolist() expected_df1, expected_df2 = expected_result["DF"] - result = dr.calculate_repeated_measures_anova(self.df, column, subject=subject, within=within) + result = dr.calculate_repeated_measures_anova( + self.df, column, subject=subject, within=within + ) self.assertEqual(result[1], expected_df1) self.assertEqual(result[2], expected_df2) @@ -99,19 +145,30 @@ def test_calculate_repeated_measures_anova(self): self.assertEqual(result[4], expected_pvalue) def test_calculate_mixed_anova(self): - column = 'protein' - subject = 'subject' - within = 'group' - between = 'group2' - - expected_result = pg.mixed_anova(data=self.df, dv=column, within=within, between=between, subject=subject, correction=True) - expected_result['identifier'] = column - expected_result = expected_result[['identifier', 'DF1', 'DF2', 'F', 'p-unc', 'Source']] - - result = dr.calculate_mixed_anova(self.df, column, subject=subject, within=within, between=between) + column = "protein" + subject = "subject" + within = "group" + between = "group2" + + expected_result = pg.mixed_anova( + data=self.df, + dv=column, + within=within, + between=between, + subject=subject, + correction=True, + ) + expected_result["identifier"] = column + expected_result = expected_result[ + ["identifier", "DF1", "DF2", "F", "p-unc", "Source"] + ] + + result = dr.calculate_mixed_anova( + self.df, column, subject=subject, within=within, between=between + ) self.assertEqual(result.values.tolist(), expected_result.values.tolist()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_exploratory.py b/tests/test_exploratory.py index ef84e3a..97dd673 100644 --- a/tests/test_exploratory.py +++ b/tests/test_exploratory.py @@ -1,94 +1,152 @@ -import unittest import platform +import unittest + import pandas as pd + import acore.exploratory_analysis as ea class TestExtractMissing(unittest.TestCase): def setUp(self): - self.data = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'sample': ['S1', 'S2', 'S3', 'S4'], - 'protein1': [1.4, 2.2, None, 4.2], - 'protein2': [5.6, None, None, 8.1], - 'protein3': [9.1, 10.01, 11.2, 12.9] - }) + self.data = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "sample": ["S1", "S2", "S3", "S4"], + "protein1": [1.4, 2.2, None, 4.2], + "protein2": [5.6, None, None, 8.1], + "protein3": [9.1, 10.01, 11.2, 12.9], + } + ) def test_extract_number_missing(self): min_valid = 2 - expected_result = ['protein1', 'protein3'] + expected_result = ["protein1", "protein3"] - result = ea.extract_number_missing(self.data, min_valid, drop_cols=['sample'], group='group') + result = ea.extract_number_missing( + self.data, min_valid, drop_cols=["sample"], group="group" + ) self.assertEqual(result, expected_result) def test_extract_percentage_missing(self): missing_max = 0.2 - expected_result = ['protein1', 'protein3'] + expected_result = ["protein1", "protein3"] - result = ea.extract_percentage_missing(self.data, missing_max, drop_cols=['sample'], group='group') + result = ea.extract_percentage_missing( + self.data, missing_max, drop_cols=["sample"], group="group" + ) self.assertEqual(result, expected_result) class TestDimensionalityReduction(unittest.TestCase): def setUp(self): - self.data = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'protein1': [1.4, 2.2, 5.3, 4.2], - 'protein2': [5.6, 0.3, 2.1, 8.1], - 'protein3': [9.1, 10.01, 11.2, 12.9] - }) + self.data = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "protein1": [1.4, 2.2, 5.3, 4.2], + "protein2": [5.6, 0.3, 2.1, 8.1], + "protein3": [9.1, 10.01, 11.2, 12.9], + } + ) def test_run_pca(self): components = 2 expected_result = ( - pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'x': [0.877154, -3.883458, -1.550791, 4.557095], - 'y': [-2.815737, -0.420979, 2.278209, 0.958507], - }), - pd.DataFrame({ - 'x': [0.958489, 0.092382, 0.269749], - 'y': [-0.235412, 0.790179, 0.565861], - 'value': [0.986975, 0.795561, 0.626868] - }, - index=['protein2', 'protein1', 'protein3']), - pd.Series([0.71760534, 0.26139782]) + pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "x": [0.877154, -3.883458, -1.550791, 4.557095], + "y": [-2.815737, -0.420979, 2.278209, 0.958507], + } + ), + pd.DataFrame( + { + "x": [0.958489, 0.092382, 0.269749], + "y": [-0.235412, 0.790179, 0.565861], + "value": [0.986975, 0.795561, 0.626868], + }, + index=["protein2", "protein1", "protein3"], + ), + pd.Series([0.71760534, 0.26139782]), + ) + exp_dict = {"x_title": "PC1 (0.72)", "y_title": "PC2 (0.26)", "group": "group"} + result, annotation = ea.run_pca( + self.data, + drop_cols=[], + annotation_cols=[], + group="group", + components=components, + dropna=True, ) - exp_dict = {'x_title': 'PC1 (0.72)', - 'y_title': 'PC2 (0.26)', 'group': 'group'} - result, annotation = ea.run_pca(self.data, drop_cols=[], annotation_cols=[], group='group', components=components, dropna=True) assert annotation == exp_dict # results: tuple of 3 DFs: components, loadings, explained_variance pd.testing.assert_frame_equal(result[0], expected_result[0], check_exact=False) pd.testing.assert_frame_equal(result[1], expected_result[1], check_exact=False) - pd.testing.assert_series_equal(pd.Series(result[2]), expected_result[2], check_exact=False, check_dtype=False) - + pd.testing.assert_series_equal( + pd.Series(result[2]), + expected_result[2], + check_exact=False, + check_dtype=False, + ) + def test_run_tsne(self): components = 2 # ! MAC ARM64 yields these results (using the similar installation) - expected_result = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'x': [-113.341728, 36.020717, 57.2992514, -134.729141], - 'y': [131.169082, -59.616630, 110.601203, -39.086254] - }) + expected_result = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "x": [-113.341728, 36.020717, 57.2992514, -134.729141], + "y": [131.169082, -59.616630, 110.601203, -39.086254], + } + ) # ! Ubuntu x64 yields these results: - if not platform.machine() == 'arm64': - expected_result = pd.DataFrame({ - 'group': ['A', 'A', 'B', 'B'], - 'x': [-27.375560760498047, -7.526909828186035, -28.731882095336914, -8.872751235961914], - 'y': [1.547648310661316, 0.17019487917423248, -18.321491241455078, -19.713821411132812] - }) - result, _ = ea.run_tsne(self.data, drop_cols=['sample', 'subject'], group='group', components=components, perplexity=3, n_iter=1000, init='pca', dropna=True) - result = result['tsne'] - pd.testing.assert_frame_equal(result, expected_result, check_exact=False, check_dtype=False) + if not platform.machine() == "arm64": + expected_result = pd.DataFrame( + { + "group": ["A", "A", "B", "B"], + "x": [ + -27.375560760498047, + -7.526909828186035, + -28.731882095336914, + -8.872751235961914, + ], + "y": [ + 1.547648310661316, + 0.17019487917423248, + -18.321491241455078, + -19.713821411132812, + ], + } + ) + result, _ = ea.run_tsne( + self.data, + drop_cols=["sample", "subject"], + group="group", + components=components, + perplexity=3, + n_iter=1000, + init="pca", + dropna=True, + ) + result = result["tsne"] + pd.testing.assert_frame_equal( + result, expected_result, check_exact=False, check_dtype=False + ) def test_run_umap(self): - _ = ea.run_umap(self.data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True) + _ = ea.run_umap( + self.data, + drop_cols=["sample", "subject"], + group="group", + n_neighbors=10, + min_dist=0.3, + metric="cosine", + dropna=True, + ) self.assertEqual(1, 1) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 7acf4b3136ba611030673aebb5aa4aa03fee97e9 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 17 Dec 2024 15:59:40 +0100 Subject: [PATCH 33/40] :art: add up and down regulated example, clean-up function and annotate --- acore/enrichment_analysis.py | 115 +++++++++++--------- docs/api_examples/enrichment_analysis.ipynb | 16 +++ docs/api_examples/enrichment_analysis.py | 9 ++ 3 files changed, 86 insertions(+), 54 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index ab277e6..c67ac09 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -161,28 +161,33 @@ def run_site_regulation_enrichment( def run_up_down_regulation_enrichment( - regulation_data, - annotation, - identifier="identifier", - groups=("group1", "group2"), - annotation_col="annotation", - # rejected_col="rejected", - group_col="group", - method="fisher", - correction="fdr_bh", - alpha=0.05, - lfc_cutoff=1, -): + regulation_data: pd.DataFrame, + annotation: pd.DataFrame, + identifier: str = "identifier", + groups: list[str] = ("group1", "group2"), + annotation_col: str = "annotation", + # rejected_col:str="rejected", + group_col: str = "group", + method: str = "fisher", + min_detected_in_set: int = 2, + correction: str = "fdr_bh", + correction_alpha: float = 0.05, + lfc_cutoff: float = 1, +) -> pd.DataFrame: """ This function runs a simple enrichment analysis for significantly regulated proteins distinguishing between up- and down-regulated. - :param regulation_data: pandas.DataFrame resulting from differential regulation + :param pandas.DataFrame regulation_data: pandas.DataFrame resulting from differential regulation analysis (CKG's regulation table). - :param annotation: pandas.DataFrame with annotations for features + :param pandas.DataFrame annotation: pandas.DataFrame with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source'). :param str identifier: name of the column from annotation containing feature identifiers. - :param list groups: column names from regulation_data containing group identifiers. + :param list[str] groups: column names from regulation_data containing group identifiers. + See `pandas.DataFrame.groupby`_ for more information. + + .. _pandas.DataFrame.groupby: https://pandas.pydata.org/pandas-docs/stable/\ +reference/api/pandas.DataFrame.groupby.html :param str annotation_col: name of the column from annotation containing annotation terms. :param str rejected_col: name of the column from regulation_data containing boolean for rejected null hypothesis. @@ -213,52 +218,54 @@ def run_up_down_regulation_enrichment( lfc_cutoff=1, ) """ - enrichment_results = {} + if isinstance(groups, str): + groups = [groups] + if isinstance(groups, tuple): + groups = list(groups) + if len(groups) != 2: + raise ValueError("groups should be exactly two.") + + ret = list() + # In case of multiple comparisons this is used to get all possible combinations for g1, g2 in regulation_data.groupby(groups).groups: + df = regulation_data.groupby(groups).get_group((g1, g2)) + + padj_name = "padj" if "posthoc padj" in df: - df["up_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & ( - df["log2FC"] >= lfc_cutoff - ) - df["down_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & ( - df["log2FC"] <= -lfc_cutoff - ) - else: - df["up_pairwise_regulation"] = (df["padj"] <= alpha) & ( - df["log2FC"] >= lfc_cutoff - ) - df["down_pairwise_regulation"] = (df["padj"] <= alpha) & ( - df["log2FC"] <= -lfc_cutoff - ) + padj_name = "posthoc padj" - enrichment = run_regulation_enrichment( - df, - annotation, - identifier=identifier, - annotation_col=annotation_col, - rejected_col="up_pairwise_regulation", - group_col=group_col, - method=method, - correction=correction, + df["up_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & ( + df["log2FC"] >= lfc_cutoff ) - enrichment["direction"] = "upregulated" - enrichment_results[g1 + "~" + g2] = enrichment - enrichment = run_regulation_enrichment( - df, - annotation, - identifier=identifier, - annotation_col=annotation_col, - rejected_col="down_pairwise_regulation", - group_col=group_col, - method=method, - correction=correction, - ) - enrichment["direction"] = "downregulated" - enrichment_results[g1 + "~" + g2] = enrichment_results[g1 + "~" + g2].append( - enrichment + df["down_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & ( + df["log2FC"] <= -lfc_cutoff ) + comparison_tag = g1 + "~" + g2 + + for rej_col, direction in zip( + ("up_pairwise_regulation", "down_pairwise_regulation"), + ("upregulated", "downregulated"), + ): + _enrichment = run_regulation_enrichment( + df, + annotation, + identifier=identifier, + annotation_col=annotation_col, + rejected_col=rej_col, + group_col=group_col, + method=method, + min_detected_in_set=min_detected_in_set, + correction=correction, + correction_alpha=correction_alpha, + ) + _enrichment["direction"] = direction + _enrichment["comparison"] = comparison_tag + ret.append(_enrichment) + + ret = pd.concat(ret) - return enrichment_results + return ret # ! to move diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 5ed9eca..98bb5e5 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -292,6 +292,7 @@ "ret = acore.enrichment_analysis.run_regulation_enrichment(\n", " regulation_data=diff_reg,\n", " annotation=annotations,\n", + " min_detected_in_set=1, # ! default is 2, so more conservative\n", " correction_alpha=0.01,\n", ")\n", "ret" @@ -305,6 +306,21 @@ "### For up- and downregulated genes separately" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a631495", + "metadata": {}, + "outputs": [], + "source": [ + "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", + " regulation_data=diff_reg,\n", + " annotation=annotations,\n", + " min_detected_in_set=1, # ! default is 2, so more conservative\n", + ")\n", + "ret" + ] + }, { "cell_type": "markdown", "id": "ecf75e7c", diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index e4ccb77..7b4d78a 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -151,6 +151,7 @@ ret = acore.enrichment_analysis.run_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, + min_detected_in_set=1, # ! default is 2, so more conservative correction_alpha=0.01, ) ret @@ -158,6 +159,14 @@ # %% [markdown] # ### For up- and downregulated genes separately +# %% +ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( + regulation_data=diff_reg, + annotation=annotations, + min_detected_in_set=1, # ! default is 2, so more conservative +) +ret + # %% [markdown] # ### Site specific enrichment analysis From 436128f6fbfe1a189fd77f3435051c6d3e1f08e7 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 17 Dec 2024 16:38:33 +0100 Subject: [PATCH 34/40] :art: document log fold change, data used and improve docstrings --- acore/enrichment_analysis.py | 4 +- docs/api_examples/enrichment_analysis.ipynb | 73 ++++++++++++++++++++- docs/api_examples/enrichment_analysis.py | 38 ++++++++++- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index c67ac09..e06d201 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -199,7 +199,7 @@ def run_up_down_regulation_enrichment( :param float alpha: adjusted p-value cutoff to define significance :param float lfc_cutoff: log fold-change cutoff to define practical significance :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground', - 'background', 'pvalue', 'padj' and 'rejected'. + 'background', 'pvalue', 'padj', 'rejected', 'direction' and 'comparison'. Example:: @@ -223,7 +223,7 @@ def run_up_down_regulation_enrichment( if isinstance(groups, tuple): groups = list(groups) if len(groups) != 2: - raise ValueError("groups should be exactly two.") + raise ValueError("groups should contains exactly two columns.") ret = list() # In case of multiple comparisons this is used to get all possible combinations diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 98bb5e5..1b0375f 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -140,7 +140,10 @@ "execution_count": null, "id": "1145a2cd", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -206,7 +209,7 @@ "outputs": [], "source": [ "diff_reg[\"rejected\"] = diff_reg[\"rejected\"].astype(bool) # ! needs to be fixed in anova\n", - "diff_reg.query(\"rejected == True\")" + "diff_reg.query(\"rejected\")" ] }, { @@ -262,7 +265,11 @@ "cell_type": "code", "execution_count": null, "id": "57fddefb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "_ = (\n", @@ -310,6 +317,65 @@ "cell_type": "code", "execution_count": null, "id": "1a631495", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "diff_reg.query(\"rejected\")[\n", + " [\n", + " \"identifier\",\n", + " \"group1\",\n", + " \"group2\",\n", + " \"pvalue\",\n", + " \"padj\",\n", + " \"rejected\",\n", + " \"log2FC\",\n", + " \"FC\",\n", + " ]\n", + "].sort_values(\"log2FC\")" + ] + }, + { + "cell_type": "markdown", + "id": "7380a528", + "metadata": {}, + "source": [ + "- this additionally sets a fold change cutoff\n", + "- and the fore and backgroud populations are changed due to the separation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a73ed6e", + "metadata": {}, + "outputs": [], + "source": [ + "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", + " regulation_data=diff_reg,\n", + " annotation=annotations,\n", + " min_detected_in_set=1, # ! default is 2, so more conservative\n", + " lfc_cutoff=1,\n", + ")\n", + "ret" + ] + }, + { + "cell_type": "markdown", + "id": "5cb036be", + "metadata": {}, + "source": [ + "here we see differences for the same set of differently regulated protein groups,\n", + "which can be reset using lfc_cutoff=0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e5e2b61", "metadata": {}, "outputs": [], "source": [ @@ -317,6 +383,7 @@ " regulation_data=diff_reg,\n", " annotation=annotations,\n", " min_detected_in_set=1, # ! default is 2, so more conservative\n", + " lfc_cutoff=0,\n", ")\n", "ret" ] diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 7b4d78a..c48fce2 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -62,7 +62,7 @@ idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"] df_omics[idx_always_included] -# %% +# %% tags=["hide-input"] df_omics = ( df_omics # .dropna(axis=1) @@ -99,7 +99,7 @@ # %% diff_reg["rejected"] = diff_reg["rejected"].astype(bool) # ! needs to be fixed in anova -diff_reg.query("rejected == True") +diff_reg.query("rejected") # %% [markdown] # ## Find functional annotations, here pathways @@ -134,7 +134,7 @@ # %% [markdown] # See how many protein groups are associated with each annotation. -# %% +# %% tags=["hide-input"] _ = ( annotations.groupby("annotation") .size() @@ -159,11 +159,43 @@ # %% [markdown] # ### For up- and downregulated genes separately +# %% tags=["hide-input"] +diff_reg.query("rejected")[ + [ + "identifier", + "group1", + "group2", + "pvalue", + "padj", + "rejected", + "log2FC", + "FC", + ] +].sort_values("log2FC") + +# %% [markdown] +# - this additionally sets a fold change cutoff +# - and the fore and backgroud populations are changed due to the separation + +# %% +ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( + regulation_data=diff_reg, + annotation=annotations, + min_detected_in_set=1, # ! default is 2, so more conservative + lfc_cutoff=1, +) +ret + +# %% [markdown] +# here we see differences for the same set of differently regulated protein groups, +# which can be reset using lfc_cutoff=0. + # %% ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, min_detected_in_set=1, # ! default is 2, so more conservative + lfc_cutoff=0, ) ret From 5bdbfb8f65e8f27027953339a21109d128a2d0fb Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 18 Dec 2024 10:50:45 +0100 Subject: [PATCH 35/40] :memo: extend docs with pca plot from vuecore and ks example --- acore/enrichment_analysis.py | 11 ++- docs/api_examples/enrichment_analysis.ipynb | 80 +++++++++++++++++---- docs/api_examples/enrichment_analysis.py | 42 ++++++++--- 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index e06d201..0ace7b5 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -29,7 +29,10 @@ def run_fisher( group2: list[int], alternative: str = "two-sided", ) -> tuple[float, float]: - """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`. + """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`_. + + .. _scipy.stats.fisher_exact: https://docs.scipy.org/doc/scipy/reference/generated/\ +scipy.stats.fisher_exact.html Example:: @@ -51,7 +54,11 @@ def run_fisher( def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"): """ Compute the Kolmogorov-Smirnov statistic on 2 samples. - See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html + See `scipy.stats.ks_2samp`_ + + .. _scipy.stats.ks_2samp: https://docs.scipy.org/doc/scipy/reference/generated/\ +scipy.stats.ks_2samp.html + :param list dist1: sequence of 1-D ndarray (first distribution to compare) drawn from a continuous distribution diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 1b0375f..92da200 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -31,7 +31,7 @@ }, "outputs": [], "source": [ - "%pip install acore" + "%pip install acore vuecore" ] }, { @@ -528,7 +528,16 @@ " components=2,\n", " dropna=False,\n", ")\n", - "pca_result" + "resultDf, loadings, var_exp = pca_result\n", + "resultDf" + ] + }, + { + "cell_type": "markdown", + "id": "acd530d6", + "metadata": {}, + "source": [ + "The loadings show how the variables are correlated with the principal components." ] }, { @@ -538,13 +547,7 @@ "metadata": {}, "outputs": [], "source": [ - "# from plotly.offline import iplot\n", - "# from vuecore import viz\n", - "\n", - "# args = {\"factor\": 1, \"loadings\": 10}\n", - "# #! pca_results has three items, but docstring requests only two -> double check\n", - "# figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n", - "# iplot(figure)" + "loadings" ] }, { @@ -552,8 +555,10 @@ "id": "5d0c1eba", "metadata": {}, "source": [ - "## Compare two distributions - KS test\n", - "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions." + "We will plot both on the sample plot (samples on the first two principal components and\n", + "loadings of variables). We use the\n", + "[`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore)\n", + "for this, which is also developed by the Multiomics Analytics Group." ] }, { @@ -563,7 +568,25 @@ "metadata": {}, "outputs": [], "source": [ - "# plot two histograms of intensity values here" + "from plotly.offline import iplot\n", + "from vuecore import viz\n", + "\n", + "args = {\"factor\": 1, \"loadings\": 10}\n", + "#! pca_results has three items, but docstring requests only two -> double check\n", + "figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n", + "iplot(figure)" + ] + }, + { + "cell_type": "markdown", + "id": "4be5a8c8", + "metadata": {}, + "source": [ + "## Compare two distributions - KS test\n", + "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.\n", + "- we compare the distributions of the two differently upregulated protein groups\n", + "This is not the best example for comparing distributions, but it shows how to use the\n", + "KS test." ] }, { @@ -572,7 +595,38 @@ "id": "6dd57b99", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# plot two histograms of intensity values here\n", + "sel_pgs = [\"O43175\", \"P39059\"]\n", + "view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())\n", + "view.plot.hist(bins=20, alpha=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "a480b65b", + "metadata": {}, + "source": [ + "Let us compare the two centered distributions using the KS test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0f8e6a8", + "metadata": {}, + "outputs": [], + "source": [ + "acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]])" + ] + }, + { + "cell_type": "markdown", + "id": "385b0ceb", + "metadata": {}, + "source": [ + "The result suggests that the two distributions are from the same distribution." + ] } ], "metadata": { diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index c48fce2..1caea9f 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -12,7 +12,7 @@ # %% tags=["hide-output"] -# %pip install acore +# %pip install acore vuecore # %% @@ -264,22 +264,48 @@ components=2, dropna=False, ) -pca_result +resultDf, loadings, var_exp = pca_result +resultDf + +# %% [markdown] +# The loadings show how the variables are correlated with the principal components. + +# %% +loadings + +# %% [markdown] +# We will plot both on the sample plot (samples on the first two principal components and +# loadings of variables). We use the +# [`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore) +# for this, which is also developed by the Multiomics Analytics Group. # %% -# from plotly.offline import iplot -# from vuecore import viz +from plotly.offline import iplot +from vuecore import viz -# args = {"factor": 1, "loadings": 10} -# # #! pca_results has three items, but docstring requests only two -> double check -# figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args) -# iplot(figure) +args = {"factor": 1, "loadings": 10} +# #! pca_results has three items, but docstring requests only two -> double check +figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args) +iplot(figure) # %% [markdown] # ## Compare two distributions - KS test # The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions. +# - we compare the distributions of the two differently upregulated protein groups +# This is not the best example for comparing distributions, but it shows how to use the +# KS test. # %% # plot two histograms of intensity values here +sel_pgs = ["O43175", "P39059"] +view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean()) +view.plot.hist(bins=20, alpha=0.5) + +# %% [markdown] +# Let us compare the two centered distributions using the KS test. # %% +acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]]) + +# %% [markdown] +# The result suggests that the two distributions are from the same distribution. From ce990386fc9bc8df3ad0b7d4afdddf3e0458d359 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 18 Dec 2024 12:19:26 +0100 Subject: [PATCH 36/40] :art: add changes based on discussion with Alberto - enrichment only separate for up- and down regulated protein groups - keep hint on to do for PTM dataset --- acore/enrichment_analysis.py | 24 ++++----- docs/api_examples/enrichment_analysis.ipynb | 55 +++++---------------- docs/api_examples/enrichment_analysis.py | 35 +++++-------- 3 files changed, 37 insertions(+), 77 deletions(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 0ace7b5..44d4aee 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -85,14 +85,15 @@ def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"): def run_site_regulation_enrichment( regulation_data: pd.DataFrame, annotation: pd.DataFrame, - identifier="identifier", - groups=("group1", "group2"), - annotation_col="annotation", - rejected_col="rejected", - group_col="group", - method="fisher", - regex="(\\w+~.+)_\\w\\d+\\-\\w+", - correction="fdr_bh", + identifier: str = "identifier", + groups: list[str] = ("group1", "group2"), + annotation_col: str = "annotation", + rejected_col: str = "rejected", + group_col: str = "group", + method: str = "fisher", + regex: str = "(\\w+~.+)_\\w\\d+\\-\\w+", + correction: str = "fdr_bh", + remove_duplicates: bool = False, ): r""" This function runs a simple enrichment analysis for significantly @@ -144,14 +145,13 @@ def run_site_regulation_enrichment( if match is not None: new_ids.append( match.group(1) - ) # removes the PTM extension of the identifierm of CKG + ) # removes the PTM extension of the identifier of CKG else: new_ids.append(ident) # so this is normalizing the identifiers to ignore the PTM extension regulation_data[identifier] = new_ids # matches are used as identifiers - regulation_data = ( - regulation_data.drop_duplicates() - ) # depends on regulation_data if this has an effect (with floats probably not) + if remove_duplicates: + regulation_data = regulation_data.drop_duplicates(subset=[identifier]) result = run_regulation_enrichment( regulation_data, annotation, diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 92da200..07c3268 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -111,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_omics.notna().sum().sort_values(ascending=True).plot()" + "ax = df_omics.notna().sum().sort_values(ascending=True).plot()" ] }, { @@ -286,31 +286,9 @@ "id": "4165bc94", "metadata": {}, "source": [ - "## Enrichment analysis\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f300c5b5", - "metadata": {}, - "outputs": [], - "source": [ - "ret = acore.enrichment_analysis.run_regulation_enrichment(\n", - " regulation_data=diff_reg,\n", - " annotation=annotations,\n", - " min_detected_in_set=1, # ! default is 2, so more conservative\n", - " correction_alpha=0.01,\n", - ")\n", - "ret" - ] - }, - { - "cell_type": "markdown", - "id": "834d5a85", - "metadata": {}, - "source": [ - "### For up- and downregulated genes separately" + "## Enrichment analysis\n", + "Is done separately for up- and downregulated genes as it's assumed that biological\n", + "processes are regulated in one direction." ] }, { @@ -403,7 +381,12 @@ "source": [ "The basic example uses a modified peptide sequence to\n", "demonstrate the enrichment analysis.\n", - "- compare groups per amino acid modified (kinases targeting certain motifs?)" + "> TODO: The example on how to do that needs a PTM focused dataset.\n", + "The details of how site specific enrichment analysis is done will depend on the\n", + "dataset and the question at hand.\n", + "\n", + "If the identifiers contain PTMs this information is removed to match it to the annotation\n", + "using a regular expression (in the function). For example:" ] }, { @@ -428,20 +411,8 @@ "metadata": {}, "outputs": [], "source": [ - "seq_mod = \"_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR\"\n", - "regex = \"\\(\\w\\w\\)\"\n", - "matches = re.findall(regex, seq_mod)\n", - "matches" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eab37e1", - "metadata": {}, - "outputs": [], - "source": [ - "# acore.enrichment_analysis.run_up_down_regulation_enrichment(" + "# ToDo: Add example for site specific enrichment analysis\n", + "# acore.enrichment_analysis.run_up_down_regulation_enrichment" ] }, { @@ -599,7 +570,7 @@ "# plot two histograms of intensity values here\n", "sel_pgs = [\"O43175\", \"P39059\"]\n", "view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())\n", - "view.plot.hist(bins=20, alpha=0.5)" + "ax = view.plot.hist(bins=20, alpha=0.5)" ] }, { diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 1caea9f..6acace4 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -51,7 +51,7 @@ df_omics # %% -df_omics.notna().sum().sort_values(ascending=True).plot() +ax = df_omics.notna().sum().sort_values(ascending=True).plot() # %% [markdown] # Keep only features with a certain amount of non-NaN values and select 100 of these @@ -145,19 +145,8 @@ # %% [markdown] # ## Enrichment analysis -# - -# %% -ret = acore.enrichment_analysis.run_regulation_enrichment( - regulation_data=diff_reg, - annotation=annotations, - min_detected_in_set=1, # ! default is 2, so more conservative - correction_alpha=0.01, -) -ret - -# %% [markdown] -# ### For up- and downregulated genes separately +# Is done separately for up- and downregulated genes as it's assumed that biological +# processes are regulated in one direction. # %% tags=["hide-input"] diff_reg.query("rejected")[ @@ -205,7 +194,12 @@ # %% [markdown] # The basic example uses a modified peptide sequence to # demonstrate the enrichment analysis. -# - compare groups per amino acid modified (kinases targeting certain motifs?) +# > TODO: The example on how to do that needs a PTM focused dataset. +# The details of how site specific enrichment analysis is done will depend on the +# dataset and the question at hand. +# +# If the identifiers contain PTMs this information is removed to match it to the annotation +# using a regular expression (in the function). For example: # %% import re @@ -216,13 +210,8 @@ match.group(1) # %% -seq_mod = "_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR" -regex = "\(\w\w\)" -matches = re.findall(regex, seq_mod) -matches - -# %% -# acore.enrichment_analysis.run_up_down_regulation_enrichment( +# ToDo: Add example for site specific enrichment analysis +# acore.enrichment_analysis.run_up_down_regulation_enrichment # %% [markdown] # ## Single sample GSEA (ssGSEA) @@ -299,7 +288,7 @@ # plot two histograms of intensity values here sel_pgs = ["O43175", "P39059"] view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean()) -view.plot.hist(bins=20, alpha=0.5) +ax = view.plot.hist(bins=20, alpha=0.5) # %% [markdown] # Let us compare the two centered distributions using the KS test. From 45c2f5e5191ce58ddaa799788e6e8ef037d9e7fd Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 18 Dec 2024 12:30:09 +0100 Subject: [PATCH 37/40] :art: lower log2fc cutoff to include downregulated protein groups --- docs/api_examples/enrichment_analysis.ipynb | 2 +- docs/api_examples/enrichment_analysis.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 07c3268..2f485d5 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -336,7 +336,7 @@ " regulation_data=diff_reg,\n", " annotation=annotations,\n", " min_detected_in_set=1, # ! default is 2, so more conservative\n", - " lfc_cutoff=1,\n", + " lfc_cutoff=0.5, # ! the default is 1\n", ")\n", "ret" ] diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 6acace4..ed230de 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -171,7 +171,7 @@ regulation_data=diff_reg, annotation=annotations, min_detected_in_set=1, # ! default is 2, so more conservative - lfc_cutoff=1, + lfc_cutoff=0.5, # ! the default is 1 ) ret From b71acc8e37953117f8cc48152f25454343091bb5 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Thu, 19 Dec 2024 15:09:28 +0100 Subject: [PATCH 38/40] :bug: split up GO term annotations Go term annotations had several Go terms concatenated using `;`, which are now split up. Performance can be improved. --- docs/api_examples/enrichment_analysis.ipynb | 56 +++++++++++++++++---- docs/api_examples/enrichment_analysis.py | 44 ++++++++++++---- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 2f485d5..73b1a37 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -51,7 +51,7 @@ "import acore.enrichment_analysis\n", "from acore.io.uniprot import fetch_annotations\n", "\n", - "dsp_pandas.format.set_pandas_options(max_colwidth=15)" + "dsp_pandas.format.set_pandas_options(max_colwidth=60)" ] }, { @@ -234,18 +234,31 @@ " print(f\"Loaded annotations from {fname}\")\n", "except FileNotFoundError:\n", " print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n", - " annotations = fetch_annotations(df_omics.columns)\n", + " fields = \"accession,go_p,go_c,go_f\"\n", + " annotations = fetch_annotations(df_omics.columns, fields=fields)\n", + " # First column (`From`) is additional to specified fields\n", + " d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n", + "\n", + " # expand go terms\n", + " to_expand = list()\n", + " for field in d_fields_to_col:\n", + " if \"go_\" in field:\n", + " col = d_fields_to_col[field]\n", + " annotations[col] = annotations[col].str.split(\";\")\n", + " to_expand.append(col)\n", + " for col in to_expand:\n", + " # this is a bit wastefull. Processing to stack format should be done here.\n", + " annotations = annotations.explode(col, ignore_index=True)\n", + " # process other than go term columns\n", " annotations = (\n", - " annotations.set_index(\"Entry\")\n", + " annotations.set_index(\"From\")\n", " .rename_axis(\"identifier\")\n", - " .drop(\"From\", axis=1)\n", + " # .drop(\"Entry\", axis=1)\n", " .rename_axis(\"source\", axis=1)\n", " .stack()\n", " .to_frame(\"annotation\")\n", - " .replace(\"\", pd.NA)\n", - " .dropna()\n", - " .sort_values([\"source\", \"annotation\"])\n", " .reset_index()\n", + " .drop_duplicates(ignore_index=True)\n", " )\n", " fname.parent.mkdir(exist_ok=True, parents=True)\n", " annotations.to_csv(fname, index=True)\n", @@ -335,7 +348,7 @@ "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", " regulation_data=diff_reg,\n", " annotation=annotations,\n", - " min_detected_in_set=1, # ! default is 2, so more conservative\n", + " min_detected_in_set=2, # ! default is 2, so more conservative\n", " lfc_cutoff=0.5, # ! the default is 1\n", ")\n", "ret" @@ -361,7 +374,32 @@ " regulation_data=diff_reg,\n", " annotation=annotations,\n", " min_detected_in_set=1, # ! default is 2, so more conservative\n", - " lfc_cutoff=0,\n", + " lfc_cutoff=0.1, # ! the default is 1\n", + ")\n", + "ret" + ] + }, + { + "cell_type": "markdown", + "id": "e3530547", + "metadata": {}, + "source": [ + "Or restricting the analysis to functional annotation for which we at least found 2\n", + "protein groups to be upregulated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ea367ca", + "metadata": {}, + "outputs": [], + "source": [ + "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", + " regulation_data=diff_reg,\n", + " annotation=annotations,\n", + " min_detected_in_set=2,\n", + " lfc_cutoff=0.5, # ! the default is 1\n", ")\n", "ret" ] diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index ed230de..3e7e97d 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -26,7 +26,7 @@ import acore.enrichment_analysis from acore.io.uniprot import fetch_annotations -dsp_pandas.format.set_pandas_options(max_colwidth=15) +dsp_pandas.format.set_pandas_options(max_colwidth=60) # %% [markdown] # Parameters of this notebook @@ -113,18 +113,31 @@ print(f"Loaded annotations from {fname}") except FileNotFoundError: print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.") - annotations = fetch_annotations(df_omics.columns) + fields = "accession,go_p,go_c,go_f" + annotations = fetch_annotations(df_omics.columns, fields=fields) + # First column (`From`) is additional to specified fields + d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])} + + # expand go terms + to_expand = list() + for field in d_fields_to_col: + if "go_" in field: + col = d_fields_to_col[field] + annotations[col] = annotations[col].str.split(";") + to_expand.append(col) + for col in to_expand: + # this is a bit wastefull. Processing to stack format should be done here. + annotations = annotations.explode(col, ignore_index=True) + # process other than go term columns annotations = ( - annotations.set_index("Entry") + annotations.set_index("From") .rename_axis("identifier") - .drop("From", axis=1) + # .drop("Entry", axis=1) .rename_axis("source", axis=1) .stack() .to_frame("annotation") - .replace("", pd.NA) - .dropna() - .sort_values(["source", "annotation"]) .reset_index() + .drop_duplicates(ignore_index=True) ) fname.parent.mkdir(exist_ok=True, parents=True) annotations.to_csv(fname, index=True) @@ -170,7 +183,7 @@ ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, - min_detected_in_set=1, # ! default is 2, so more conservative + min_detected_in_set=2, # ! default is 2, so more conservative lfc_cutoff=0.5, # ! the default is 1 ) ret @@ -184,7 +197,20 @@ regulation_data=diff_reg, annotation=annotations, min_detected_in_set=1, # ! default is 2, so more conservative - lfc_cutoff=0, + lfc_cutoff=0.1, # ! the default is 1 +) +ret + +# %% [markdown] +# Or restricting the analysis to functional annotation for which we at least found 2 +# protein groups to be upregulated. + +# %% +ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( + regulation_data=diff_reg, + annotation=annotations, + min_detected_in_set=2, + lfc_cutoff=0.5, # ! the default is 1 ) ret From e685440c3ad4c21766d499a39f6a85bba212eaea Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Thu, 19 Dec 2024 15:10:15 +0100 Subject: [PATCH 39/40] :bug: allow also single protein groups results --- acore/enrichment_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py index 44d4aee..6838d83 100644 --- a/acore/enrichment_analysis.py +++ b/acore/enrichment_analysis.py @@ -480,7 +480,7 @@ def run_enrichment( ] ) ) - if len(pvalues) > 1: + if len(pvalues) >= 1: rejected, padj = apply_pvalue_correction( pvalues, alpha=correction_alpha, From 8a2fac8cc6271f3f6345ebbc11e543b3a869dba4 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 14 Jan 2025 13:47:39 +0100 Subject: [PATCH 40/40] :art: do not have accession as a column ( "Entry") annotation field contains identifier again --- docs/api_examples/enrichment_analysis.ipynb | 2 +- docs/api_examples/enrichment_analysis.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 73b1a37..17e96d8 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -234,7 +234,7 @@ " print(f\"Loaded annotations from {fname}\")\n", "except FileNotFoundError:\n", " print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n", - " fields = \"accession,go_p,go_c,go_f\"\n", + " fields = \"go_p,go_c,go_f\"\n", " annotations = fetch_annotations(df_omics.columns, fields=fields)\n", " # First column (`From`) is additional to specified fields\n", " d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n", diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 3e7e97d..5f09cd0 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -113,7 +113,7 @@ print(f"Loaded annotations from {fname}") except FileNotFoundError: print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.") - fields = "accession,go_p,go_c,go_f" + fields = "go_p,go_c,go_f" annotations = fetch_annotations(df_omics.columns, fields=fields) # First column (`From`) is additional to specified fields d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}