From 6861fe3bfe310db6965862cf5f40a3e2338630b2 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 20 Nov 2024 11:50:49 +0100
Subject: [PATCH 01/40] :art: update docstrings and module string

- line lenght restriction to 100 characters
---
 acore/enrichment_analysis.py | 192 +++++++++++++++++++++++++++--------
 1 file changed, 149 insertions(+), 43 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index a3f9082..bc9ba87 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -1,3 +1,7 @@
+"""Enrichment Analysis Module. Contains different functions to perform enrichment 
+analysis.
+"""
+
 import os
 import re
 import uuid
@@ -29,15 +33,19 @@ def run_fisher(group1, group2, alternative="two-sided"):
 
 def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"):
     """
-    Compute the Kolmogorov-Smirnov statistic on 2 samples. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html
+    Compute the Kolmogorov-Smirnov statistic on 2 samples.
+    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html
 
-    :param list dist1: sequence of 1-D ndarray (first distribution to compare) drawn from a continuous distribution
-    :param list dist2: sequence of 1-D ndarray (second distribution to compare) drawn from a continuous distribution
+    :param list dist1: sequence of 1-D ndarray (first distribution to compare)
+        drawn from a continuous distribution
+    :param list dist2: sequence of 1-D ndarray (second distribution to compare)
+        drawn from a continuous distribution
     :param str alternative: defines the alternative hypothesis (default is ‘two-sided’):
         * **'two-sided'**
         * **'less'**
         * **'greater'**
     :return: statistic float and KS statistic pvalue float Two-tailed p-value.
+
     Example::
 
         result = run_kolmogorov_smirnov(dist1, dist2, alternative='two-sided')
@@ -58,26 +66,45 @@ def run_site_regulation_enrichment(
     reject_col="rejected",
     group_col="group",
     method="fisher",
-    regex=r"(\w+~.+)_\w\d+\-\w+",
+    regex=r"(\w+~.+)_\w\d+\-\w+",  # ! add example to docstring of what this matches
     correction="fdr_bh",
 ):
     """
-    This function runs a simple enrichment analysis for significantly regulated protein sites in a dataset.
-
-    :param regulation_data: pandas dataframe resulting from differential regulation analysis.
-    :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
-    :param str identifier: name of the column from annotation containing feature identifiers.
-    :param list groups: column names from regulation_data containing group identifiers.
-    :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis.
-    :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background.
-    :param str method: method used to compute enrichment (only 'fisher' is supported currently).
+    This function runs a simple enrichment analysis for significantly
+    regulated protein sites in a dataset.
+
+    :param regulation_data: pandas dataframe resulting from differential
+        regulation analysis.
+    :param annotation: pandas dataframe with annotations for features
+        (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
+    :param str identifier: name of the column from annotation containing
+        feature identifiers.
+    :param list groups: column names from regulation_data containing
+        group identifiers.
+    :param str annotation_col: name of the column from annotation containing
+        annotation terms.
+    :param str reject_col: name of the column from regulatio_data containing
+        boolean for rejected null hypothesis.
+    :param str group_col: column name for new column in annotation dataframe
+        determining if feature belongs to foreground or background.
+    :param str method: method used to compute enrichment
+        (only 'fisher' is supported currently).
     :param str regex: how to extract the annotated identifier from the site identifier
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'.
+    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground',
+        'background', 'pvalue', 'padj' and 'rejected'.
 
     Example::
 
-        result = run_site_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', match="(\w+~.+)_\w\d+\-\w+")
+        result = run_site_regulation_enrichment(regulation_data,
+            annotation,
+            identifier='identifier',
+            groups=['group1', 'group2'],
+            annotation_col='annotation',
+            reject_col='rejected',
+            group_col='group',
+            method='fisher',
+            match="(\w+~.+)_\w\d+\-\w+"
+        )
     """
     result = pd.DataFrame()
     if regulation_data is not None:
@@ -120,24 +147,44 @@ def run_up_down_regulation_enrichment(
     lfc_cutoff=1,
 ):
     """
-    This function runs a simple enrichment analysis for significantly regulated proteins distinguishing between up- and down-regulated.
+    This function runs a simple enrichment analysis for significantly regulated proteins
+    distinguishing between up- and down-regulated.
 
-    :param regulation_data: pandas dataframe resulting from differential regulation analysis (CKG's regulation table).
-    :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
+    :param regulation_data: pandas dataframe resulting from differential regulation
+        analysis (CKG's regulation table).
+    :param annotation: pandas dataframe with annotations for features
+        (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param list groups: column names from regulation_data containing group identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulation_data containing boolean for rejected null hypothesis.
-    :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background.
-    :param str method: method used to compute enrichment (only 'fisher' is supported currently).
+    :param str reject_col: name of the column from regulation_data containing boolean for
+        rejected null hypothesis.
+    :param str group_col: column name for new column in annotation dataframe determining
+        if feature belongs to foreground or background.
+    :param str method: method used to compute enrichment
+        (only 'fisher' is supported currently).
     :param str correction: method to be used for multiple-testing correction
     :param float alpha: adjusted p-value cutoff to define significance
     :param float lfc_cutoff: log fold-change cutoff to define practical significance
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'.
+    :return: Pandas dataframe with columns:'terms', 'identifiers', 'foreground',
+        'background', 'pvalue', 'padj' and 'rejected'.
 
     Example::
 
-        result = run_up_down_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher', correction='fdr_bh', alpha=0.05, lfc_cutoff=1)
+        result = run_up_down_regulation_enrichment(
+            regulation_data,
+            annotation,
+            identifier='identifier',
+            groups=['group1',
+            'group2'],
+            annotation_col='annotation',
+            reject_col='rejected',
+            group_col='group',
+            method='fisher',
+            correction='fdr_bh',
+            alpha=0.05,
+            lfc_cutoff=1,
+        )
     """
     enrichment_results = {}
     for g1, g2 in regulation_data.groupby(groups).groups:
@@ -201,22 +248,37 @@ def run_regulation_enrichment(
     correction="fdr_bh",
 ):
     """
-    This function runs a simple enrichment analysis for significantly regulated features in a dataset.
+    This function runs a simple enrichment analysis for significantly regulated features
+    in a dataset.
 
     :param regulation_data: pandas dataframe resulting from differential regulation analysis.
-    :param annotation: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
+    :param annotation: pandas dataframe with annotations for features
+        (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param list groups: column names from regulation_data containing group identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulatio_data containing boolean for rejected null hypothesis.
-    :param str group_col: column name for new column in annotation dataframe determining if feature belongs to foreground or background.
+    :param str reject_col: name of the column from regulatio_data containing boolean for
+        rejected null hypothesis.
+    :param str group_col: column name for new column in annotation dataframe determining
+        if feature belongs to foreground or background.
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
     :param str correction: method to be used for multiple-testing correction
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected'.
+    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground',
+        'background', 'pvalue', 'padj' and 'rejected'.
 
     Example::
 
-        result = run_regulation_enrichment(regulation_data, annotation, identifier='identifier', groups=['group1', 'group2'], annotation_col='annotation', reject_col='rejected', group_col='group', method='fisher')
+        result = run_regulation_enrichment(
+            regulation_data,
+            annotation,
+            identifier='identifier',
+            groups=['group1',
+            'group2'],
+            annotation_col='annotation',
+            reject_col='rejected',
+            group_col='group',
+            method='fisher',
+         )
     """
     result = {}
     foreground_list = (
@@ -267,20 +329,36 @@ def run_enrichment(
     correction="fdr_bh",
 ):
     """
-    Computes enrichment of the foreground relative to a given backgroung, using Fisher's exact test, and corrects for multiple hypothesis testing.
+    Computes enrichment of the foreground relative to a given backgroung,
+    using Fisher's exact test, and corrects for multiple hypothesis testing.
 
-    :param data: pandas dataframe with annotations for dataset features (columns: 'annotation', 'identifier', 'source', 'group').
+    :param data: pandas dataframe with annotations for dataset features
+        (columns: 'annotation', 'identifier', 'source', 'group').
     :param str foreground_id: group identifier of features that belong to the foreground.
     :param str background_id: group identifier of features that belong to the background.
     :param str annotation_col: name of the column containing annotation terms.
     :param str group_col: name of column containing the group identifiers.
     :param str identifier_col: name of column containing dependent variables identifiers.
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
-    :return: Pandas dataframe with annotation terms, features, number of foregroung/background features in each term, p-values and corrected p-values (columns: 'terms', 'identifiers', 'foreground', 'background', 'pvalue', 'padj' and 'rejected').
+    :return: Pandas dataframe with annotation terms, features,
+        number of foregroung/background features in each term,
+        p-values and corrected p-values
+        (columns: 'terms', 'identifiers', 'foreground',
+         'background', 'pvalue', 'padj' and 'rejected').
 
     Example::
 
-        result = run_enrichment(data, foreground='foreground', background='background', foreground_pop=len(foreground_list), background_pop=len(background_list), annotation_col='annotation', group_col='group', identifier_col='identifier', method='fisher')
+        result = run_enrichment(
+            data,
+            foreground='foreground',
+            background='background',
+            foreground_pop=len(foreground_list),
+            background_pop=len(background_list),
+            annotation_col='annotation',
+            group_col='group',
+            identifier_col='identifier',
+            method='fisher',
+         )
     """
     result = pd.DataFrame()
     df = data.copy()
@@ -359,23 +437,39 @@ def run_ssgsea(
     permutations=0,
 ):
     """
-    Project each sample within a data set onto a space of gene set enrichment scores using the ssGSEA projection methodology described in Barbie et al., 2009.
+    Project each sample within a data set onto a space of gene set enrichment scores using
+    the single sample gene set enrichment analysis (ssGSEA) projection methodology
+    described in Barbie et al., 2009:
+    https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA).
 
     :param data: pandas dataframe with the quantified features (i.e. subject x proteins)
-    :param annotation: pandas dataframe with the annotation to be used in the enrichment (i.e. CKG pathway annotation file)
+    :param annotation: pandas dataframe with the annotation to be used in the enrichment
+        (i.e. CKG pathway annotation file)
     :param str annotation_col: name of the column containing annotation terms.
     :param str identifier_col: name of column containing dependent variables identifiers.
-    :param list set_index: column/s to be used as index. Enrichment will be calculated for these values (i.e ["subject"] will return subjects x pathways matrix of enrichment scores)
-    :param str out_dir: directory path where results will be stored (default None, tmp folder is used)
-    :param int min_size: minimum number of features (i.e. proteins) in enriched terms (i.e. pathways)
-    :param int max_size: maximum number of features (i.e. proteins) in enriched terms (i.e. pathways)
+    :param list set_index: column/s to be used as index. Enrichment will be calculated
+        for these values (i.e ["subject"] will return subjects x pathways matrix of
+        enrichment scores)
+    :param str out_dir: directory path where results will be stored
+        (default None, tmp folder is used)
+    :param int min_size: minimum number of features (i.e. proteins) in enriched terms
+        (i.e. pathways)
+    :param int max_size: maximum number of features (i.e. proteins) in enriched terms
+        (i.e. pathways)
     :param bool scale: whether or not to scale the data
     :param int permutations: number of permutations used in the ssgsea analysis
-    :return: dictionary with two dataframes: es - enrichment scores, and nes - normalized enrichment scores.
+    :return: dictionary with two dataframes: es - enrichment scores,
+        and nes - normalized enrichment scores.
 
     Example::
         stproject = "P0000008"
-        p = project.Project(stproject, datasets={}, knowledge=None, report={}, configuration_files=None)
+        p = project.Project(
+            stproject,
+            datasets={},
+            knowledge=None,
+            report={},
+            configuration_files=None,
+        )
         p.build_project(False)
         p.generate_report()
 
@@ -383,7 +477,19 @@ def run_ssgsea(
         annotations = proteomics_dataset.get_dataframe("pathway annotation")
         processed = proteomics_dataset.get_dataframe('processed')
 
-        result = run_ssgsea(processed, annotations, annotation_col='annotation', identifier_col='identifier', set_index=['group', 'sample','subject'], outdir=None, min_size=10, scale=False, permutations=0)
+        result = run_ssgsea(
+            processed,
+            annotations,
+            annotation_col='annotation',
+            identifier_col='identifier',
+            set_index=['group',
+            'sample',
+            'subject'],
+            outdir=None,
+            min_size=10,
+            scale=False,
+            permutations=0
+        )
 
     """
     result = {}

From 64d34145343f8ae188334f63b10eb7bad976f155 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Fri, 22 Nov 2024 11:29:20 +0100
Subject: [PATCH 02/40] :construction: test enrichment analysis :art: type
 annotations

maybe the dataset is not he best one to test enrichment analysis (few diff. reg. protein groups)
---
 acore/enrichment_analysis.py             |  44 +++----
 docs/api_examples/enrichment_analysis.py | 139 +++++++++++++++++++++++
 2 files changed, 161 insertions(+), 22 deletions(-)
 create mode 100644 docs/api_examples/enrichment_analysis.py

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index bc9ba87..6b35cfd 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -237,16 +237,16 @@ def run_up_down_regulation_enrichment(
 
 
 def run_regulation_enrichment(
-    regulation_data,
-    annotation,
-    identifier="identifier",
-    groups=["group1", "group2"],
-    annotation_col="annotation",
-    reject_col="rejected",
-    group_col="group",
-    method="fisher",
-    correction="fdr_bh",
-):
+    regulation_data: pd.DataFrame,
+    annotation: pd.DataFrame,
+    identifier: str = "identifier",
+    groups: list[str] = ["group1", "group2"],
+    annotation_col: str = "annotation",
+    reject_col: str = "rejected",
+    group_col: str = "group",
+    method: str = "fisher",
+    correction: str = "fdr_bh",
+) -> pd.DataFrame:
     """
     This function runs a simple enrichment analysis for significantly regulated features
     in a dataset.
@@ -255,7 +255,7 @@ def run_regulation_enrichment(
     :param annotation: pandas dataframe with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
-    :param list groups: column names from regulation_data containing group identifiers.
+    :param list[str] groups: column names from regulation_data containing group identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
     :param str reject_col: name of the column from regulatio_data containing boolean for
         rejected null hypothesis.
@@ -280,7 +280,6 @@ def run_regulation_enrichment(
             method='fisher',
          )
     """
-    result = {}
     foreground_list = (
         regulation_data[regulation_data[reject_col]][identifier].unique().tolist()
     )
@@ -318,16 +317,16 @@ def run_regulation_enrichment(
 
 def run_enrichment(
     data,
-    foreground_id,
-    background_id,
-    foreground_pop,
-    background_pop,
-    annotation_col="annotation",
-    group_col="group",
-    identifier_col="identifier",
-    method="fisher",
-    correction="fdr_bh",
-):
+    foreground_id: str,
+    background_id: str,
+    foreground_pop: list[str],
+    background_pop: list[str],
+    annotation_col: str = "annotation",
+    group_col: str = "group",
+    identifier_col: str = "identifier",
+    method: str = "fisher",
+    correction: str = "fdr_bh",
+) -> pd.DataFrame:
     """
     Computes enrichment of the foreground relative to a given backgroung,
     using Fisher's exact test, and corrects for multiple hypothesis testing.
@@ -386,6 +385,7 @@ def run_enrichment(
             num_background = num_background[0]
         else:
             num_background = 0
+        # ! what happens if this is not the case?
         if method == "fisher" and num_foreground > 1:
             _, pvalue = run_fisher(
                 [num_foreground, foreground_pop - num_foreground],
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
new file mode 100644
index 0000000..8ade04a
--- /dev/null
+++ b/docs/api_examples/enrichment_analysis.py
@@ -0,0 +1,139 @@
+# %% [markdown]
+# # Enrichment analysis
+#
+# - we need some groups of genes to compute clusters
+# - we need functional annotations, i.e. a category summarizing a set of genes.
+# -
+# You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis
+# on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o).
+#
+# Use example data for ovarian cancer
+# ([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))
+
+
+# %% tags=["hide-output"]
+# %pip install acore
+
+
+# %%
+import pandas as pd
+
+import acore
+import acore.differential_regulation
+import acore.enrichment_analysis
+
+# %% [markdown]
+# Parameters of this notebook
+
+# %% tags=["parameters"]
+omics: str = (
+    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/"
+    "example_data/PXD010372/processed/omics.csv"
+)
+meta: str = (
+    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/"
+    "example_data/PXD010372/processed/meta_patients.csv"
+)
+
+# %% [markdown]
+# # Load processed data
+
+# %%
+df_omics = pd.read_csv(omics, index_col=0)
+df_meta = pd.read_csv(meta, index_col=0)
+df_omics
+
+# %%
+df_omics.notna().sum().sort_values(ascending=True).plot()
+
+# %% [markdown]
+# Keep only features with a certain amount of non-NaN values and select 100 of these
+# for illustration
+
+# %%
+df_omics = df_omics.dropna(axis=1)
+# .sample(
+#     500,
+#     axis=1,
+#     random_state=42,
+# )
+
+# %%
+df_meta
+
+# %% [markdown]
+# ## Compute up and downregulated genes
+# These will be used to find enrichments in the set of both up and downregulated genes.
+#
+# - only CT45 shows to be regulated
+
+# %%
+group = "Status"
+covariates = ["PlatinumValue"]
+diff_reg = acore.differential_regulation.run_anova(
+    df_omics.join(df_meta[[group]]),
+    # df_omics.join(df_meta[[group] + covariates]),
+    # covariates=covariates,
+    drop_cols=[],
+    subject=None,
+    group=group,
+)
+diff_reg.describe(exclude=["float"])
+
+# %%
+diff_reg.describe(exclude=[float])
+
+# %% [markdown]
+# ## Find functional annotations, here pathways
+#
+
+# %%
+from dsp_phospho.ptms.uniprot_id_mapping import (
+    check_id_mapping_results_ready,
+    get_id_mapping_results_link,
+    get_id_mapping_results_search,
+    submit_id_mapping,
+)
+
+job_id = submit_id_mapping(
+    from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=df_omics.columns
+)
+
+if check_id_mapping_results_ready(job_id):
+    link = get_id_mapping_results_link(job_id)
+    # add fields to the link to get more information
+    # From and Entry (accession) are the same.
+    results = get_id_mapping_results_search(
+        link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+    )
+header = results.pop(0).split("\t")
+results = [line.split("\t") for line in results]
+results = pd.DataFrame(results, columns=header)
+results
+
+# %%
+annotation = (
+    results.set_index("Entry")
+    .rename_axis("identifier")
+    .drop("From", axis=1)
+    .rename_axis("source", axis=1)
+    .stack()
+    .to_frame("annotation")
+    .replace("", pd.NA)
+    .dropna()
+    .sort_values(["source", "annotation"])
+    .reset_index()
+)
+annotation
+
+# %% [markdown]
+# ## Enrichment analysis
+#
+
+# %%
+ret = acore.enrichment_analysis.run_regulation_enrichment(
+    regulation_data=diff_reg, annotation=annotation
+)
+ret
+
+# %%

From d37dc5e84301d97a4ea205ac5f6d7d281906ec8a Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 10:48:54 +0100
Subject: [PATCH 03/40] :art::bug: Minimum number of genes rejected to be
 eligable

how many proteins/genes do need to be rejected to
be considered valid. Before it was at least 2 genes.
---
 acore/enrichment_analysis.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 6b35cfd..51e1ad6 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -319,8 +319,9 @@ def run_enrichment(
     data,
     foreground_id: str,
     background_id: str,
-    foreground_pop: list[str],
-    background_pop: list[str],
+    foreground_pop: int,
+    background_pop: int,
+    min_detected_in_set: int = 1,
     annotation_col: str = "annotation",
     group_col: str = "group",
     identifier_col: str = "identifier",
@@ -335,6 +336,8 @@ def run_enrichment(
         (columns: 'annotation', 'identifier', 'source', 'group').
     :param str foreground_id: group identifier of features that belong to the foreground.
     :param str background_id: group identifier of features that belong to the background.
+    :param int foreground_pop: number of features in the foreground.
+    :param int background_pop: number of features in the background.
     :param str annotation_col: name of the column containing annotation terms.
     :param str group_col: name of column containing the group identifiers.
     :param str identifier_col: name of column containing dependent variables identifiers.
@@ -386,7 +389,7 @@ def run_enrichment(
         else:
             num_background = 0
         # ! what happens if this is not the case?
-        if method == "fisher" and num_foreground > 1:
+        if method == "fisher" and num_foreground >= min_detected_in_set:
             _, pvalue = run_fisher(
                 [num_foreground, foreground_pop - num_foreground],
                 [num_background, background_pop - foreground_pop - num_background],

From faf7158627ea1189e694577eb4ec27aba01e67d3 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 10:50:04 +0100
Subject: [PATCH 04/40] :white_check_mark: add minimal test for enrichment
 analysis

---
 tests/test_enrichment.py | 51 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py
index cfd2e03..bc463f1 100644
--- a/tests/test_enrichment.py
+++ b/tests/test_enrichment.py
@@ -1,5 +1,8 @@
 import unittest
+
+import pandas as pd
 from scipy import stats
+
 import acore.enrichment_analysis as ea
 
 
@@ -7,9 +10,11 @@ class TestRunFisher(unittest.TestCase):
     def test_run_fisher(self):
         group1 = [10, 5]
         group2 = [8, 12]
-        alternative = 'two-sided'
+        alternative = "two-sided"
 
-        expected_odds, expected_pvalue = stats.fisher_exact([[10, 5], [8, 12]], alternative)
+        expected_odds, expected_pvalue = stats.fisher_exact(
+            [[10, 5], [8, 12]], alternative
+        )
 
         result = ea.run_fisher(group1, group2, alternative=alternative)
 
@@ -21,9 +26,11 @@ class TestRunKolmogorovSmirnov(unittest.TestCase):
     def test_run_kolmogorov_smirnov(self):
         dist1 = [1, 2, 3, 4, 5]
         dist2 = [1, 2, 3, 4, 6]
-        alternative = 'two-sided'
+        alternative = "two-sided"
 
-        expected_result = stats.ks_2samp(dist1, dist2, alternative=alternative, mode='auto')
+        expected_result = stats.ks_2samp(
+            dist1, dist2, alternative=alternative, mode="auto"
+        )
 
         result = ea.run_kolmogorov_smirnov(dist1, dist2, alternative=alternative)
 
@@ -31,5 +38,39 @@ def test_run_kolmogorov_smirnov(self):
         self.assertEqual(result[1], expected_result.pvalue)
 
 
-if __name__ == '__main__':
+def test_run_enrichtment():
+    annotation = {
+        "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"],
+        "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"],
+        "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P"],
+    }
+    annotation = pd.DataFrame(annotation)
+    regulation_res = {
+        "identifier": ["gene1", "gene2", "gene3", "gene4", "gene5", "gene6"],
+        "rejected": [True, True, False, False, True, True],
+    }
+    regulation_res = pd.DataFrame(regulation_res)
+
+    actual = ea.run_regulation_enrichment(
+        regulation_data=regulation_res,
+        annotation=annotation,
+    )
+
+    expected = pd.DataFrame(
+        {
+            "terms": ["path1", "path2", "path3"],
+            "identifiers": ["gene1,gene2", "gene1,gene5", "gene6"],
+            "foreground": [2, 2, 1],
+            "background": [1, 0, 0],
+            "foreground_pop": [4, 4, 4],
+            "background_pop": [6, 6, 6],
+            "pvalue": [1.0, 0.4666666666666667, 1.0],
+            "padj": [1.0, 1.0, 1.0],
+            "rejected": [False, False, False],
+        }
+    )
+    assert expected.equals(actual)
+
+
+if __name__ == "__main__":
     unittest.main()

From c1c66f947642797f3ce8778ae802fcf94c1210b5 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 10:53:40 +0100
Subject: [PATCH 05/40] :sparkles: add fetching information from Uniprot KB

- will be tested by building docs (maybe add a unittest later)
---
 acore/io/uniprot.py | 322 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100644 acore/io/uniprot.py

diff --git a/acore/io/uniprot.py b/acore/io/uniprot.py
new file mode 100644
index 0000000..cedc98b
--- /dev/null
+++ b/acore/io/uniprot.py
@@ -0,0 +1,322 @@
+"""Uniprot ID mapping using Python.
+
+Source: https://www.uniprot.org/help/id_mapping
+"""
+
+import json
+import re
+import time
+import zlib
+from urllib.parse import parse_qs, urlencode, urlparse
+from xml.etree import ElementTree
+
+import requests
+from requests.adapters import HTTPAdapter, Retry
+
+POLLING_INTERVAL = 3
+API_URL = "https://rest.uniprot.org"
+
+
+retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+session = requests.Session()
+session.mount("https://", HTTPAdapter(max_retries=retries))
+
+
+def check_response(response):
+    try:
+        response.raise_for_status()
+    except requests.HTTPError:
+        print(response.json())
+        raise
+
+
+def submit_id_mapping(from_db, to_db, ids):
+    request = requests.post(
+        f"{API_URL}/idmapping/run",
+        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+    )
+    check_response(request)
+    return request.json()["jobId"]
+
+
+def get_next_link(headers):
+    re_next_link = re.compile(r'<(.+)>; rel="next"')
+    if "Link" in headers:
+        match = re_next_link.match(headers["Link"])
+        if match:
+            return match.group(1)
+
+
+def check_id_mapping_results_ready(job_id):
+    while True:
+        request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+        check_response(request)
+        j = request.json()
+        if "jobStatus" in j:
+            if j["jobStatus"] in ("NEW", "RUNNING"):
+                print(f"Retrying in {POLLING_INTERVAL}s")
+                time.sleep(POLLING_INTERVAL)
+            else:
+                raise Exception(j["jobStatus"])
+        else:
+            return bool(j["results"] or j["failedIds"])
+
+
+def get_batch(batch_response, file_format, compressed):
+    batch_url = get_next_link(batch_response.headers)
+    while batch_url:
+        batch_response = session.get(batch_url)
+        batch_response.raise_for_status()
+        yield decode_results(batch_response, file_format, compressed)
+        batch_url = get_next_link(batch_response.headers)
+
+
+def combine_batches(all_results, batch_results, file_format):
+    if file_format == "json":
+        for key in ("results", "failedIds"):
+            if key in batch_results and batch_results[key]:
+                all_results[key] += batch_results[key]
+    elif file_format == "tsv":
+        return all_results + batch_results[1:]
+    else:
+        return all_results + batch_results
+    return all_results
+
+
+def get_id_mapping_results_link(job_id):
+    url = f"{API_URL}/idmapping/details/{job_id}"
+    request = session.get(url)
+    check_response(request)
+    return request.json()["redirectURL"]
+
+
+def decode_results(response, file_format, compressed):
+    if compressed:
+        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+        if file_format == "json":
+            j = json.loads(decompressed.decode("utf-8"))
+            return j
+        elif file_format == "tsv":
+            return [line for line in decompressed.decode("utf-8").split("\n") if line]
+        elif file_format == "xlsx":
+            return [decompressed]
+        elif file_format == "xml":
+            return [decompressed.decode("utf-8")]
+        else:
+            return decompressed.decode("utf-8")
+    elif file_format == "json":
+        return response.json()
+    elif file_format == "tsv":
+        return [line for line in response.text.split("\n") if line]
+    elif file_format == "xlsx":
+        return [response.content]
+    elif file_format == "xml":
+        return [response.text]
+    return response.text
+
+
+def get_xml_namespace(element):
+    m = re.match(r"\{(.*)\}", element.tag)
+    return m.groups()[0] if m else ""
+
+
+def merge_xml_results(xml_results):
+    merged_root = ElementTree.fromstring(xml_results[0])
+    for result in xml_results[1:]:
+        root = ElementTree.fromstring(result)
+        for child in root.findall("{http://uniprot.org/uniprot}entry"):
+            merged_root.insert(-1, child)
+    ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+
+def print_progress_batches(batch_index, size, total):
+    n_fetched = min((batch_index + 1) * size, total)
+    print(f"Fetched: {n_fetched} / {total}")
+
+
+def get_id_mapping_results_search(url):
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    if "size" in query:
+        size = int(query["size"][0])
+    else:
+        size = 500
+        query["size"] = size
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    parsed = parsed._replace(query=urlencode(query, doseq=True))
+    url = parsed.geturl()
+    request = session.get(url)
+    check_response(request)
+    results = decode_results(request, file_format, compressed)
+    total = int(request.headers["x-total-results"])
+    print_progress_batches(0, size, total)
+    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+        results = combine_batches(results, batch, file_format)
+        print_progress_batches(i, size, total)
+    if file_format == "xml":
+        return merge_xml_results(results)
+    return results
+
+
+def get_id_mapping_results_stream(url):
+    if "/stream/" not in url:
+        url = url.replace("/results/", "/results/stream/")
+    request = session.get(url)
+    check_response(request)
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    return decode_results(request, file_format, compressed)
+
+
+if __name__ == "__main__":
+    # id mapping is used to create a link to a query (you can see the json in the browser)
+    # UniProtKB is the knowleadgebase integrating all kind of other databases
+    import pandas as pd
+
+    job_id = submit_id_mapping(
+        from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=["P05067", "P12345"]
+    )
+
+    if check_id_mapping_results_ready(job_id):
+        link = get_id_mapping_results_link(job_id)
+        # add fields to the link to get more information
+        # From and Entry (accession) are the same.
+        results = get_id_mapping_results_search(
+            link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+        )
+        # see the available fields you can add
+        # https://www.uniprot.org/help/return_fields
+        # and the available formats: json and tsv (for most endpoints)
+        # https://www.uniprot.org/help/api_queries#tips
+    print(results)
+    header = results.pop(0).split("\t")
+    results = [line.split("\t") for line in results]
+    df = pd.DataFrame(results, columns=header)
+    # result from:
+    # df.to_dict(orient="records")
+    records = [
+        {
+            "From": "P05067",
+            "Gene Ontology (biological process)": (
+                "adult locomotory behavior [GO:0008344]; amyloid fibril formation"
+                " [GO:1990000]; astrocyte activation [GO:0048143]; astrocyte activation"
+                " involved in immune response [GO:0002265]; axo-dendritic transport"
+                " [GO:0008088]; axon midline choice point recognition [GO:0016199];"
+                " axonogenesis [GO:0007409]; cell adhesion [GO:0007155]; cellular"
+                " response to amyloid-beta [GO:1904646]; central nervous system"
+                " development [GO:0007417]; cholesterol metabolic process [GO:0008203];"
+                " cognition [GO:0050890]; collateral sprouting in absence of injury"
+                " [GO:0048669]; cytosolic mRNA polyadenylation [GO:0180011]; dendrite"
+                " development [GO:0016358]; endocytosis [GO:0006897]; extracellular"
+                " matrix organization [GO:0030198]; forebrain development [GO:0030900];"
+                " G2/M transition of mitotic cell cycle [GO:0000086]; intracellular"
+                " copper ion homeostasis [GO:0006878]; ionotropic glutamate receptor"
+                " signaling pathway [GO:0035235]; learning [GO:0007612]; learning or"
+                " memory [GO:0007611]; locomotory behavior [GO:0007626]; mating"
+                " behavior [GO:0007617]; microglia development [GO:0014005]; microglial"
+                " cell activation [GO:0001774]; modulation of excitatory postsynaptic"
+                " potential [GO:0098815]; negative regulation of cell population"
+                " proliferation [GO:0008285]; negative regulation of gene expression"
+                " [GO:0010629]; negative regulation of long-term synaptic potentiation"
+                " [GO:1900272]; negative regulation of neuron differentiation"
+                " [GO:0045665]; neuromuscular process controlling balance [GO:0050885];"
+                " neuron apoptotic process [GO:0051402]; neuron cellular homeostasis"
+                " [GO:0070050]; neuron projection development [GO:0031175]; neuron"
+                " projection maintenance [GO:1990535]; neuron remodeling [GO:0016322];"
+                " NMDA selective glutamate receptor signaling pathway [GO:0098989];"
+                " Notch signaling pathway [GO:0007219]; positive regulation of amyloid"
+                " fibril formation [GO:1905908]; positive regulation of"
+                " calcium-mediated signaling [GO:0050850]; positive regulation of"
+                " chemokine production [GO:0032722]; positive regulation of ERK1 and"
+                " ERK2 cascade [GO:0070374]; positive regulation of G2/M transition of"
+                " mitotic cell cycle [GO:0010971]; positive regulation of gene"
+                " expression [GO:0010628]; positive regulation of glycolytic process"
+                " [GO:0045821]; positive regulation of inflammatory response"
+                " [GO:0050729]; positive regulation of interleukin-1 beta production"
+                " [GO:0032731]; positive regulation of interleukin-6 production"
+                " [GO:0032755]; positive regulation of JNK cascade [GO:0046330];"
+                " positive regulation of long-term synaptic potentiation [GO:1900273];"
+                " positive regulation of mitotic cell cycle [GO:0045931]; positive"
+                " regulation of non-canonical NF-kappaB signal transduction"
+                " [GO:1901224]; positive regulation of peptidyl-serine phosphorylation"
+                " [GO:0033138]; positive regulation of peptidyl-threonine"
+                " phosphorylation [GO:0010800]; positive regulation of protein"
+                " metabolic process [GO:0051247]; positive regulation of protein"
+                " phosphorylation [GO:0001934]; positive regulation of T cell migration"
+                " [GO:2000406]; positive regulation of transcription by RNA polymerase"
+                " II [GO:0045944]; positive regulation of tumor necrosis factor"
+                " production [GO:0032760]; regulation of gene expression [GO:0010468];"
+                " regulation of long-term neuronal synaptic plasticity [GO:0048169];"
+                " regulation of multicellular organism growth [GO:0040014]; regulation"
+                " of peptidyl-tyrosine phosphorylation [GO:0050730]; regulation of"
+                " presynapse assembly [GO:1905606]; regulation of spontaneous synaptic"
+                " transmission [GO:0150003]; regulation of synapse structure or"
+                " activity [GO:0050803]; regulation of translation [GO:0006417];"
+                " regulation of Wnt signaling pathway [GO:0030111]; response to"
+                " interleukin-1 [GO:0070555]; response to oxidative stress"
+                " [GO:0006979]; smooth endoplasmic reticulum calcium ion homeostasis"
+                " [GO:0051563]; suckling behavior [GO:0001967]; synapse organization"
+                " [GO:0050808]; synaptic assembly at neuromuscular junction"
+                " [GO:0051124]; visual learning [GO:0008542]"
+            ),
+            "Gene Ontology (cellular component)": (
+                "apical part of cell [GO:0045177]; axon [GO:0030424]; cell surface"
+                " [GO:0009986]; cell-cell junction [GO:0005911]; ciliary rootlet"
+                " [GO:0035253]; clathrin-coated pit [GO:0005905]; COPII-coated ER to"
+                " Golgi transport vesicle [GO:0030134]; cytoplasm [GO:0005737]; cytosol"
+                " [GO:0005829]; dendrite [GO:0030425]; dendritic shaft [GO:0043198];"
+                " dendritic spine [GO:0043197]; early endosome [GO:0005769];"
+                " endoplasmic reticulum [GO:0005783]; endoplasmic reticulum lumen"
+                " [GO:0005788]; endosome [GO:0005768]; endosome lumen [GO:0031904];"
+                " extracellular exosome [GO:0070062]; extracellular region"
+                " [GO:0005576]; extracellular space [GO:0005615]; Golgi apparatus"
+                " [GO:0005794]; Golgi lumen [GO:0005796]; Golgi-associated vesicle"
+                " [GO:0005798]; growth cone [GO:0030426]; membrane [GO:0016020];"
+                " membrane raft [GO:0045121]; mitochondrial inner membrane"
+                " [GO:0005743]; neuromuscular junction [GO:0031594]; nuclear envelope"
+                " lumen [GO:0005641]; perikaryon [GO:0043204]; perinuclear region of"
+                " cytoplasm [GO:0048471]; plasma membrane [GO:0005886]; platelet alpha"
+                " granule lumen [GO:0031093]; presynaptic active zone [GO:0048786];"
+                " receptor complex [GO:0043235]; recycling endosome [GO:0055037];"
+                " smooth endoplasmic reticulum [GO:0005790]; spindle midzone"
+                " [GO:0051233]; synapse [GO:0045202]; synaptic vesicle [GO:0008021];"
+                " trans-Golgi network membrane [GO:0032588]"
+            ),
+            "Gene Ontology (molecular function)": (
+                "DNA binding [GO:0003677]; enzyme binding [GO:0019899]; heparin binding"
+                " [GO:0008201]; identical protein binding [GO:0042802]; protein"
+                " serine/threonine kinase binding [GO:0120283]; PTB domain binding"
+                " [GO:0051425]; receptor ligand activity [GO:0048018]; RNA polymerase"
+                " II cis-regulatory region sequence-specific DNA binding [GO:0000978];"
+                " serine-type endopeptidase inhibitor activity [GO:0004867]; signaling"
+                " receptor activator activity [GO:0030546]; signaling receptor binding"
+                " [GO:0005102]; transition metal ion binding [GO:0046914]"
+            ),
+        },
+        {
+            "From": "P12345",
+            "Gene Ontology (biological process)": (
+                "2-oxoglutarate metabolic process [GO:0006103]; aspartate catabolic"
+                " process [GO:0006533]; aspartate metabolic process [GO:0006531];"
+                " glutamate metabolic process [GO:0006536]; lipid transport"
+                " [GO:0006869]; protein folding [GO:0006457]"
+            ),
+            "Gene Ontology (cellular component)": (
+                "mitochondrial matrix [GO:0005759]; mitochondrion [GO:0005739]; plasma"
+                " membrane [GO:0005886]"
+            ),
+            "Gene Ontology (molecular function)": (
+                "kynurenine-oxoglutarate transaminase activity [GO:0016212];"
+                " L-aspartate:2-oxoglutarate aminotransferase activity [GO:0004069];"
+                " pyridoxal phosphate binding [GO:0030170]"
+            ),
+        },
+    ]

From 0b3cd6eaac039cdaa30082496b352491180b31ec Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 11:05:57 +0100
Subject: [PATCH 06/40] :art: fetch annotation from uniprot in enrichment
 example

not sure if the example is the best.
---
 docs/api_examples/enrichment_analysis.py | 138 +++++++++++++++--------
 1 file changed, 91 insertions(+), 47 deletions(-)

diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 8ade04a..af89143 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -16,6 +16,8 @@
 
 
 # %%
+from pathlib import Path
+
 import pandas as pd
 
 import acore
@@ -26,20 +28,21 @@
 # Parameters of this notebook
 
 # %% tags=["parameters"]
-omics: str = (
-    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/"
-    "example_data/PXD010372/processed/omics.csv"
-)
-meta: str = (
+base_path: str = (
     "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/"
-    "example_data/PXD010372/processed/meta_patients.csv"
+    "example_data/PXD010372/processed"
 )
+omics: str = f"{base_path}/omics.csv"
+meta_pgs: str = f"{base_path}/meta_pgs.csv"
+meta: str = f"{base_path}/meta_patients.csv"
+N_to_sample: int = 1_000
 
 # %% [markdown]
 # # Load processed data
 
 # %%
 df_omics = pd.read_csv(omics, index_col=0)
+df_meta_pgs = pd.read_csv(meta_pgs, index_col=0)
 df_meta = pd.read_csv(meta, index_col=0)
 df_omics
 
@@ -48,32 +51,42 @@
 
 # %% [markdown]
 # Keep only features with a certain amount of non-NaN values and select 100 of these
-# for illustration
+# for illustration. Add the ones which were differently regulated in the ANOVA using all
+# the protein groups.
 
 # %%
-df_omics = df_omics.dropna(axis=1)
-# .sample(
-#     500,
-#     axis=1,
-#     random_state=42,
-# )
+idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"]
+df_omics[idx_always_included]
+
+# %%
+df_omics = (
+    df_omics
+    # .dropna(axis=1)
+    .drop(idx_always_included, axis=1)
+    .dropna(thresh=18, axis=1)
+    .sample(
+        N_to_sample - len(idx_always_included),
+        axis=1,
+        random_state=42,
+    )
+    .join(df_omics[idx_always_included])
+)
+df_omics
+
 
 # %%
 df_meta
 
+
 # %% [markdown]
 # ## Compute up and downregulated genes
 # These will be used to find enrichments in the set of both up and downregulated genes.
-#
-# - only CT45 shows to be regulated
 
 # %%
 group = "Status"
 covariates = ["PlatinumValue"]
 diff_reg = acore.differential_regulation.run_anova(
     df_omics.join(df_meta[[group]]),
-    # df_omics.join(df_meta[[group] + covariates]),
-    # covariates=covariates,
     drop_cols=[],
     subject=None,
     group=group,
@@ -81,50 +94,77 @@
 diff_reg.describe(exclude=["float"])
 
 # %%
-diff_reg.describe(exclude=[float])
+diff_reg.query("rejected == True")
 
 # %% [markdown]
 # ## Find functional annotations, here pathways
 #
 
 # %%
-from dsp_phospho.ptms.uniprot_id_mapping import (
+from acore.io.uniprot import (
     check_id_mapping_results_ready,
     get_id_mapping_results_link,
     get_id_mapping_results_search,
     submit_id_mapping,
 )
 
-job_id = submit_id_mapping(
-    from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=df_omics.columns
-)
 
-if check_id_mapping_results_ready(job_id):
-    link = get_id_mapping_results_link(job_id)
-    # add fields to the link to get more information
-    # From and Entry (accession) are the same.
-    results = get_id_mapping_results_search(
-        link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
+    """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's
+    knowledgebase (KB).
+
+    Parameters
+    ----------
+    ids : pd.Index | list
+        Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.
+    fields : str, optional
+        Fields to fetch, by default "accession,go_p,go_c. See for availble fields:
+        https://www.uniprot.org/help/return_fields
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with annotations of the UniProt IDs.
+    """
+    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids)
+
+    if check_id_mapping_results_ready(job_id):
+        link = get_id_mapping_results_link(job_id)
+        # add fields to the link to get more information
+        # From and Entry (accession) are the same for UniProt IDs.
+        results = get_id_mapping_results_search(
+            link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+        )
+    header = results.pop(0).split("\t")
+    results = [line.split("\t") for line in results]
+    df = pd.DataFrame(results, columns=header)
+    return df
+
+
+fname_annotations = "downloaded/annotations.csv"
+fname = Path(fname_annotations)
+try:
+    annotations = pd.read_csv(fname, index_col=0)
+    print(f"Loaded annotations from {fname}")
+except FileNotFoundError:
+    print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
+    annotations = fetch_annotations(df_omics.columns)
+    annotations = (
+        annotations.set_index("Entry")
+        .rename_axis("identifier")
+        .drop("From", axis=1)
+        .rename_axis("source", axis=1)
+        .stack()
+        .to_frame("annotation")
+        .replace("", pd.NA)
+        .dropna()
+        .sort_values(["source", "annotation"])
+        .reset_index()
     )
-header = results.pop(0).split("\t")
-results = [line.split("\t") for line in results]
-results = pd.DataFrame(results, columns=header)
-results
+    fname.parent.mkdir(exist_ok=True, parents=True)
+    annotations.to_csv(fname, index=True)
 
-# %%
-annotation = (
-    results.set_index("Entry")
-    .rename_axis("identifier")
-    .drop("From", axis=1)
-    .rename_axis("source", axis=1)
-    .stack()
-    .to_frame("annotation")
-    .replace("", pd.NA)
-    .dropna()
-    .sort_values(["source", "annotation"])
-    .reset_index()
-)
-annotation
+annotations
 
 # %% [markdown]
 # ## Enrichment analysis
@@ -132,8 +172,12 @@
 
 # %%
 ret = acore.enrichment_analysis.run_regulation_enrichment(
-    regulation_data=diff_reg, annotation=annotation
+    regulation_data=diff_reg,
+    annotation=annotations,
+    # group_col="annotation",
+    groups=["groups1", "groups2"],
 )
 ret
 
+
 # %%

From 545299bb58b101292aee09a88b169dcd88aba491 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 12:12:02 +0100
Subject: [PATCH 07/40] :bug: make decomposition a module

---
 acore/decomposition/__init__.py | 3 +++
 acore/decomposition/pca.py      | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)
 create mode 100644 acore/decomposition/__init__.py

diff --git a/acore/decomposition/__init__.py b/acore/decomposition/__init__.py
new file mode 100644
index 0000000..9f86d45
--- /dev/null
+++ b/acore/decomposition/__init__.py
@@ -0,0 +1,3 @@
+from . import pca
+
+__all__ = ["pca"]
diff --git a/acore/decomposition/pca.py b/acore/decomposition/pca.py
index 229661d..0ae8102 100644
--- a/acore/decomposition/pca.py
+++ b/acore/decomposition/pca.py
@@ -26,10 +26,10 @@ def run_pca(
         n_comp_max = min(df_wide.shape)
         n_comp_max = min(n_comp_max, n_components)
     pca = sklearn.decomposition.PCA(n_components=n_comp_max)
-    PCs = pca.fit_transform(df_wide)
+    pcs = pca.fit_transform(df_wide)
     cols = [
         f"principal component {i+1} ({var_explained*100:.2f} %)"
         for i, var_explained in enumerate(pca.explained_variance_ratio_)
     ]
-    PCs = pd.DataFrame(PCs, index=df_wide.index, columns=cols)
-    return PCs, pca
+    pcs = pd.DataFrame(pcs, index=df_wide.index, columns=cols)
+    return pcs, pca

From 18121c9841856bfdd5f2ff70bf16a1cab1250670 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 16:04:00 +0100
Subject: [PATCH 08/40] :art::white_check_mark: add fdr alpha parameters as
 option, optimize and test multiple testing

---
 acore/enrichment_analysis.py             | 11 +++++-
 acore/multiple_testing.py                | 23 ++++++++----
 docs/api_examples/enrichment_analysis.py |  3 +-
 tests/test_multiple_testing.py           | 45 ++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_multiple_testing.py

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 51e1ad6..7d113c5 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -246,6 +246,7 @@ def run_regulation_enrichment(
     group_col: str = "group",
     method: str = "fisher",
     correction: str = "fdr_bh",
+    correction_alpha: float = 0.05,
 ) -> pd.DataFrame:
     """
     This function runs a simple enrichment analysis for significantly regulated features
@@ -310,6 +311,7 @@ def run_regulation_enrichment(
         identifier_col=identifier,
         method=method,
         correction=correction,
+        correction_alpha=correction_alpha,
     )
 
     return result
@@ -327,6 +329,7 @@ def run_enrichment(
     identifier_col: str = "identifier",
     method: str = "fisher",
     correction: str = "fdr_bh",
+    correction_alpha: float = 0.05,
 ) -> pd.DataFrame:
     """
     Computes enrichment of the foreground relative to a given backgroung,
@@ -342,6 +345,8 @@ def run_enrichment(
     :param str group_col: name of column containing the group identifiers.
     :param str identifier_col: name of column containing dependent variables identifiers.
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
+    :param str correction: method to be used for multiple-testing correction.
+    :param float correction_alpha: adjusted p-value cutoff to define significance.
     :return: Pandas dataframe with annotation terms, features,
         number of foregroung/background features in each term,
         p-values and corrected p-values
@@ -408,7 +413,11 @@ def run_enrichment(
                 )
             )
     if len(pvalues) > 1:
-        rejected, padj = apply_pvalue_correction(pvalues, alpha=0.05, method=correction)
+        rejected, padj = apply_pvalue_correction(
+            pvalues,
+            alpha=correction_alpha,
+            method=correction,
+        )
         result = pd.DataFrame(
             {
                 "terms": terms,
diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py
index 35fd5d6..bf22e6b 100644
--- a/acore/multiple_testing.py
+++ b/acore/multiple_testing.py
@@ -5,10 +5,17 @@
 from sklearn.utils import shuffle
 from statsmodels.stats import multitest
 
+# ? dictionary with available methods in statsmodels.stats.multitest.multipletests:
+# multitest.multitest_methods_names
 
-def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"):
+
+def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferroni"):
     """
-    Performs p-value correction using the specified method. For more information visit https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html.
+    Performs p-value correction using the specified method as in
+    statsmodels.stats.multitest.multipletests.
+
+    For more information visit
+    https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html.
 
     :param numpy.ndarray pvalues: et of p-values of the individual tests.
     :param float alpha: error rate.
@@ -23,7 +30,9 @@ def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"):
         - fdr_by : Benjamini/Yekutieli (negative)
         - fdr_tsbh : two stage fdr correction (non-negative)
         - fdr_tsbky : two stage fdr correction (non-negative)
-    :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value.
+    :return: Tuple with two `numpy.array`s, boolen for rejecting H0 hypothesis
+             and float for adjusted p-value. Can contain missing values if `pvalues`
+             contain missing values.
 
     Exmaple::
 
@@ -32,10 +41,12 @@ def apply_pvalue_correction(pvalues, alpha=0.05, method="bonferroni"):
     p = np.array(pvalues)
     mask = np.isfinite(p)
     pval_corrected = np.full(p.shape, np.nan)
-    pval_corrected[mask] = multitest.multipletests(p[mask], alpha, method)[1]
-    rejected = [p <= alpha for p in pval_corrected]
+    _rejected, _pvals_corrected, _, _ = multitest.multipletests(p[mask], alpha, method)
+    pval_corrected[mask] = _pvals_corrected
+    rejected = np.full(p.shape, np.nan)
+    rejected[mask] = _rejected
 
-    return (rejected, pval_corrected.tolist())
+    return (rejected, pval_corrected)
 
 
 def apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method="indep"):
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index af89143..d7dada2 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -174,8 +174,7 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
 ret = acore.enrichment_analysis.run_regulation_enrichment(
     regulation_data=diff_reg,
     annotation=annotations,
-    # group_col="annotation",
-    groups=["groups1", "groups2"],
+    correction_alpha=0.01,
 )
 ret
 
diff --git a/tests/test_multiple_testing.py b/tests/test_multiple_testing.py
new file mode 100644
index 0000000..0f339bc
--- /dev/null
+++ b/tests/test_multiple_testing.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pytest
+
+from acore.multiple_testing import apply_pvalue_correction
+
+# benferoni correction
+array = [
+    0.1,
+    0.01,
+    0.2,
+    0.4,
+    np.nan,
+    0.5,
+]  # 0.04, 0.02, 0.0001]
+
+test_res = [
+    (
+        "b",
+        np.array([0.0, 1.0, 0.0, 0.0, np.nan, 0.0]),
+        np.array([0.5, 0.05, 1.0, 1.0, np.nan, 1.0]),
+    ),
+    (
+        "fdr_bh",
+        np.array(
+            [
+                0.0,
+                1.0,
+                0.0,
+                0.0,
+                np.nan,
+                0.0,
+            ]
+        ),
+        np.array([0.25, 0.05, 0.33333333, 0.5, np.nan, 0.5]),
+    ),
+]
+
+
+@pytest.mark.parametrize("method,exp_rejected,exp_pvalues", test_res)
+def test_apply_pvalue_correction_alpha_5_percent(method, exp_rejected, exp_pvalues):
+    act_rejected, act_pvalues = apply_pvalue_correction(
+        array, alpha=0.05, method=method
+    )
+    np.testing.assert_array_equal(act_rejected, exp_rejected)
+    np.testing.assert_array_almost_equal(act_pvalues, exp_pvalues)

From 301504560aa01cf9e5fc7c78c19a7a11c25c0420 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 16:05:18 +0100
Subject: [PATCH 09/40] :zap: just work with numpy array, not lists

---
 acore/enrichment_analysis.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 7d113c5..6e352d6 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -281,23 +281,22 @@ def run_regulation_enrichment(
             method='fisher',
          )
     """
-    foreground_list = (
-        regulation_data[regulation_data[reject_col]][identifier].unique().tolist()
-    )
-    background_list = (
-        regulation_data[~regulation_data[reject_col]][identifier].unique().tolist()
-    )
+    mask_rejected = regulation_data[reject_col]
+    foreground_list = regulation_data.loc[mask_rejected, identifier].unique()
+    background_list = regulation_data.loc[~mask_rejected, identifier].unique()
     foreground_pop = len(foreground_list)
-    background_pop = len(regulation_data[identifier].unique().tolist())
-    grouping = []
-    for i in annotation[identifier]:
-        if i in foreground_list:
-            grouping.append("foreground")
-        elif i in background_list:
-            grouping.append("background")
-        else:
-            grouping.append(np.nan)
-    annotation[group_col] = grouping
+    background_pop = len(regulation_data[identifier].unique())
+    # needs to allow for missing annotations
+    annotation[group_col] = np.where(
+        annotation[identifier].isin(foreground_list),
+        "foreground",
+        # if in background list, then background, else np.nan
+        np.where(
+            annotation[identifier].isin(background_list),
+            "background",
+            np.nan,
+        ),
+    )
     annotation = annotation.dropna(subset=[group_col])
 
     result = run_enrichment(

From 13f45e9ccad61649714b40600d595235a7a5e320 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 16:06:41 +0100
Subject: [PATCH 10/40] :bug: remove unused parameter

- maybe it was there for compability, but then we could add kwargs to the function to catch any other arguments (without using these)
---
 acore/enrichment_analysis.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 6e352d6..405034f 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -240,7 +240,6 @@ def run_regulation_enrichment(
     regulation_data: pd.DataFrame,
     annotation: pd.DataFrame,
     identifier: str = "identifier",
-    groups: list[str] = ["group1", "group2"],
     annotation_col: str = "annotation",
     reject_col: str = "rejected",
     group_col: str = "group",
@@ -256,7 +255,6 @@ def run_regulation_enrichment(
     :param annotation: pandas dataframe with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
-    :param list[str] groups: column names from regulation_data containing group identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
     :param str reject_col: name of the column from regulatio_data containing boolean for
         rejected null hypothesis.

From 3961d84ccba6c720dab3297d48597f54949fa97d Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 16:18:46 +0100
Subject: [PATCH 11/40] :bug: correct name of testing fuction (to fct tested)

---
 tests/test_enrichment.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py
index bc463f1..e8d761c 100644
--- a/tests/test_enrichment.py
+++ b/tests/test_enrichment.py
@@ -38,7 +38,9 @@ def test_run_kolmogorov_smirnov(self):
         self.assertEqual(result[1], expected_result.pvalue)
 
 
-def test_run_enrichtment():
+def test_run_regulation_enrichment():
+    """Integration test for run_regulation_enrichment. Indirectly tests
+    run_enrichment from enrichment_analysis module."""
     annotation = {
         "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"],
         "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"],

From 7f51e02aec9a6adab9a737f64e0c0c3643542fd2 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 25 Nov 2024 16:20:52 +0100
Subject: [PATCH 12/40] :constructions: add enrichtment example to docs

---
 docs/api_examples/enrichment_analysis.ipynb | 331 ++++++++++++++++++++
 docs/index.rst                              |   1 +
 2 files changed, 332 insertions(+)
 create mode 100644 docs/api_examples/enrichment_analysis.ipynb

diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
new file mode 100644
index 0000000..9a547ba
--- /dev/null
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -0,0 +1,331 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f79a8051",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# Enrichment analysis\n",
+    "\n",
+    "- we need some groups of genes to compute clusters\n",
+    "- we need functional annotations, i.e. a category summarizing a set of genes.\n",
+    "-\n",
+    "You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis\n",
+    "on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o).\n",
+    "\n",
+    "Use example data for ovarian cancer\n",
+    "([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "956ed7b7",
+   "metadata": {
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "%pip install acore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3030d08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "import acore\n",
+    "import acore.differential_regulation\n",
+    "import acore.enrichment_analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fddd607c",
+   "metadata": {},
+   "source": [
+    "Parameters of this notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6af9349a",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "base_path: str = (\n",
+    "    \"https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/main/\"\n",
+    "    \"example_data/PXD010372/processed\"\n",
+    ")\n",
+    "omics: str = f\"{base_path}/omics.csv\"\n",
+    "meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n",
+    "meta: str = f\"{base_path}/meta_patients.csv\"\n",
+    "N_to_sample: int = 1_000"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10ed1830",
+   "metadata": {},
+   "source": [
+    "# Load processed data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d70ef4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_omics = pd.read_csv(omics, index_col=0)\n",
+    "df_meta_pgs = pd.read_csv(meta_pgs, index_col=0)\n",
+    "df_meta = pd.read_csv(meta, index_col=0)\n",
+    "df_omics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3897e1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_omics.notna().sum().sort_values(ascending=True).plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ce47108",
+   "metadata": {},
+   "source": [
+    "Keep only features with a certain amount of non-NaN values and select 100 of these\n",
+    "for illustration. Add the ones which were differently regulated in the ANOVA using all\n",
+    "the protein groups."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3a8ab49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idx_always_included = [\"Q5HYN5\", \"P39059\", \"O43432\", \"O43175\"]\n",
+    "df_omics[idx_always_included]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1145a2cd",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "df_omics = (\n",
+    "    df_omics\n",
+    "    # .dropna(axis=1)\n",
+    "    .drop(idx_always_included, axis=1)\n",
+    "    .dropna(thresh=18, axis=1)\n",
+    "    .sample(\n",
+    "        N_to_sample - len(idx_always_included),\n",
+    "        axis=1,\n",
+    "        random_state=42,\n",
+    "    )\n",
+    "    .join(df_omics[idx_always_included])\n",
+    ")\n",
+    "df_omics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aea77e80",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "df_meta"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bbf5dc4",
+   "metadata": {},
+   "source": [
+    "## Compute up and downregulated genes\n",
+    "These will be used to find enrichments in the set of both up and downregulated genes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "231bb6da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group = \"Status\"\n",
+    "covariates = [\"PlatinumValue\"]\n",
+    "diff_reg = acore.differential_regulation.run_anova(\n",
+    "    df_omics.join(df_meta[[group]]),\n",
+    "    drop_cols=[],\n",
+    "    subject=None,\n",
+    "    group=group,\n",
+    ")\n",
+    "diff_reg.describe(exclude=[\"float\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e347b06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "diff_reg.query(\"rejected == True\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6c0a225",
+   "metadata": {},
+   "source": [
+    "## Find functional annotations, here pathways\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2668415",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from acore.io.uniprot import (\n",
+    "    check_id_mapping_results_ready,\n",
+    "    get_id_mapping_results_link,\n",
+    "    get_id_mapping_results_search,\n",
+    "    submit_id_mapping,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n",
+    "    \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n",
+    "    knowledgebase (KB).\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    ids : pd.Index | list\n",
+    "        Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.\n",
+    "    fields : str, optional\n",
+    "        Fields to fetch, by default \"accession,go_p,go_c. See for availble fields:\n",
+    "        https://www.uniprot.org/help/return_fields\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    pd.DataFrame\n",
+    "        DataFrame with annotations of the UniProt IDs.\n",
+    "    \"\"\"\n",
+    "    job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n",
+    "\n",
+    "    if check_id_mapping_results_ready(job_id):\n",
+    "        link = get_id_mapping_results_link(job_id)\n",
+    "        # add fields to the link to get more information\n",
+    "        # From and Entry (accession) are the same for UniProt IDs.\n",
+    "        results = get_id_mapping_results_search(\n",
+    "            link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n",
+    "        )\n",
+    "    header = results.pop(0).split(\"\\t\")\n",
+    "    results = [line.split(\"\\t\") for line in results]\n",
+    "    df = pd.DataFrame(results, columns=header)\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "fname_annotations = \"downloaded/annotations.csv\"\n",
+    "fname = Path(fname_annotations)\n",
+    "try:\n",
+    "    annotations = pd.read_csv(fname, index_col=0)\n",
+    "    print(f\"Loaded annotations from {fname}\")\n",
+    "except FileNotFoundError:\n",
+    "    print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
+    "    annotations = fetch_annotations(df_omics.columns)\n",
+    "    annotations = (\n",
+    "        annotations.set_index(\"Entry\")\n",
+    "        .rename_axis(\"identifier\")\n",
+    "        .drop(\"From\", axis=1)\n",
+    "        .rename_axis(\"source\", axis=1)\n",
+    "        .stack()\n",
+    "        .to_frame(\"annotation\")\n",
+    "        .replace(\"\", pd.NA)\n",
+    "        .dropna()\n",
+    "        .sort_values([\"source\", \"annotation\"])\n",
+    "        .reset_index()\n",
+    "    )\n",
+    "    fname.parent.mkdir(exist_ok=True, parents=True)\n",
+    "    annotations.to_csv(fname, index=True)\n",
+    "\n",
+    "annotations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4165bc94",
+   "metadata": {},
+   "source": [
+    "## Enrichment analysis\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f300c5b5",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    correction_alpha=0.01,\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dd57b99",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "tags,-all",
+   "main_language": "python",
+   "notebook_metadata_filter": "-all"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/index.rst b/docs/index.rst
index 91a92e6..eba7982 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,6 +16,7 @@
 
    api_examples/exploratory_analysis
    api_examples/normalization_analysis
+   api_examples/enrichment_analysis
 
 .. toctree::
    :maxdepth: 1

From e63f9379d58b0630fd03f7cbc82b519b89153eb4 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 26 Nov 2024 10:42:22 +0100
Subject: [PATCH 13/40] :bug::sparkles: cast rejected to bool, annotate
 features using pandas functionality

---
 acore/enrichment_analysis.py | 44 ++++++++++++++++++++++++++++--------
 tests/test_enrichment.py     | 27 +++++++++++++++++++---
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 405034f..259fe73 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -236,6 +236,35 @@ def run_up_down_regulation_enrichment(
     return enrichment_results
 
 
+# ! to move
+def _annotate_features(
+    features: pd.Series, in_foreground: list[str], in_background: list[str]
+) -> pd.Series:
+    """
+    Annotate features as foreground or background based on their presence in the
+    foreground and background lists.
+
+    :param features: pandas dataframe with features and their annotations.
+    :param in_foreground: list of features identifiers in the foreground.
+    :type in_foreground: set or list-like
+    :param in_background: list of features identifiers in the background.
+    :type in_background: set or list-like
+    :return: pandas dataframe with a new column 'group' containing 'foreground' or 'background'.
+             missing values are preserved.
+
+    Example::
+
+        result = _annotate_features(features, in_foreground, in_background)
+    """
+    in_either_or = features.isin(in_foreground) | features.isin(in_background)
+    res = (
+        features.where(in_either_or, np.nan)
+        .mask(features.isin(in_foreground), "foreground")
+        .mask(features.isin(in_background), "background")
+    )
+    return res
+
+
 def run_regulation_enrichment(
     regulation_data: pd.DataFrame,
     annotation: pd.DataFrame,
@@ -285,15 +314,10 @@ def run_regulation_enrichment(
     foreground_pop = len(foreground_list)
     background_pop = len(regulation_data[identifier].unique())
     # needs to allow for missing annotations
-    annotation[group_col] = np.where(
-        annotation[identifier].isin(foreground_list),
-        "foreground",
-        # if in background list, then background, else np.nan
-        np.where(
-            annotation[identifier].isin(background_list),
-            "background",
-            np.nan,
-        ),
+    annotation[group_col] = _annotate_features(
+        features=annotation[identifier],
+        in_foreground=foreground_list,
+        in_background=background_list,
     )
     annotation = annotation.dropna(subset=[group_col])
 
@@ -425,7 +449,7 @@ def run_enrichment(
                 "background_pop": background_pop,
                 "pvalue": pvalues,
                 "padj": padj,
-                "rejected": rejected,
+                "rejected": rejected.astype(bool),
             }
         )
         result = result.sort_values(by="padj", ascending=True)
diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py
index e8d761c..75dbb86 100644
--- a/tests/test_enrichment.py
+++ b/tests/test_enrichment.py
@@ -1,5 +1,6 @@
 import unittest
 
+import numpy as np
 import pandas as pd
 from scipy import stats
 
@@ -38,13 +39,33 @@ def test_run_kolmogorov_smirnov(self):
         self.assertEqual(result[1], expected_result.pvalue)
 
 
+def test__annotate_features():
+    expected = pd.Series(
+        [
+            "foreground",
+            "foreground",
+            "background",
+            "foreground",
+            "background",
+            "background",
+            np.nan,
+        ]
+    )
+
+    features = pd.Series(["G1", "G2", "G3", "G4", "G5", "G6", "G9"])
+    in_foreground = ["G1", "G2", "G4"]
+    in_background = ["G3", "G5", "G6"]
+    actual = ea._annotate_features(features, in_foreground, in_background)
+    pd.testing.assert_series_equal(expected, actual)
+
+
 def test_run_regulation_enrichment():
     """Integration test for run_regulation_enrichment. Indirectly tests
     run_enrichment from enrichment_analysis module."""
     annotation = {
-        "annotation": ["path1", "path1", "path1", "path2", "path2", "path3"],
-        "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6"],
-        "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P"],
+        "annotation": ["path1", "path1", "path1", "path2", "path2", "path3", "path3"],
+        "identifier": ["gene1", "gene2", "gene3", "gene1", "gene5", "gene6", "gene9"],
+        "source": ["GO", "GO", "GO", "GO_P", "GO_P", "GO_P", "GO_P"],
     }
     annotation = pd.DataFrame(annotation)
     regulation_res = {

From 4adc62420d2bd171470f3a8aed8ca0c565181dc1 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 26 Nov 2024 10:18:31 +0000
Subject: [PATCH 14/40] :bug: newer versions of numpy need explicit type

- NA can lead to float columns instead of 0. to investigate
---
 acore/enrichment_analysis.py             | 3 ++-
 docs/api_examples/enrichment_analysis.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 259fe73..f8ac681 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -308,7 +308,8 @@ def run_regulation_enrichment(
             method='fisher',
          )
     """
-    mask_rejected = regulation_data[reject_col]
+    # ? can we remove NA features in that column?
+    mask_rejected = regulation_data[reject_col].astype(bool)
     foreground_list = regulation_data.loc[mask_rejected, identifier].unique()
     background_list = regulation_data.loc[~mask_rejected, identifier].unique()
     foreground_pop = len(foreground_list)
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index d7dada2..598a12a 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -94,6 +94,7 @@
 diff_reg.describe(exclude=["float"])
 
 # %%
+diff_reg["rejected"] = diff_reg["rejected"].astype(bool)  # ! needs to be fixed in anova
 diff_reg.query("rejected == True")
 
 # %% [markdown]

From 6a7db73925f7966b781acb1a455cab1900a80b79 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 26 Nov 2024 16:30:47 +0100
Subject: [PATCH 15/40] :construction: annotate and use some more formatting
 option

---
 acore/enrichment_analysis.py             | 64 ++++++++++++------------
 docs/api_examples/enrichment_analysis.py |  3 ++
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index f8ac681..ae76202 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -1,5 +1,8 @@
 """Enrichment Analysis Module. Contains different functions to perform enrichment 
 analysis.
+
+Most things in this module are covered in https://www.youtube.com/watch?v=2NC1QOXmc5o
+by Lars Juhl Jensen.
 """
 
 import os
@@ -461,14 +464,14 @@ def run_enrichment(
 def run_ssgsea(
     data,
     annotation,
-    annotation_col="an notation",
-    identifier_col="identifier",
-    set_index=[],
-    outdir="tmp",
-    min_size=15,
-    max_size=500,
-    scale=False,
-    permutations=0,
+    set_index: list[str],
+    annotation_col: str = "an notation",
+    identifier_col: str = "identifier",
+    outdir: str = "tmp",
+    min_size: int = 15,
+    max_size: int = 500,
+    scale: bool = False,
+    permutations: int = 0,
 ):
     """
     Project each sample within a data set onto a space of gene set enrichment scores using
@@ -531,10 +534,13 @@ def run_ssgsea(
     if not os.path.exists(outdir):
         os.makedirs(outdir)
 
+    # Comine columns to create a unique name for each set (?)
     name = []
     index = data[set_index]
     for i, row in data[set_index].iterrows():
-        name.append("_".join(row[set_index].tolist()))
+        name.append(
+            "_".join(row[set_index].tolist())
+        )  # this assumes strings as identifiers
 
     df["Name"] = name
     index.index = name
@@ -554,29 +560,25 @@ def run_ssgsea(
                     + "\t".join(list(filter(None, row[identifier_col])))
                     + "\n"
                 )
-        try:
-            enrichment = gp.ssgsea(
-                data=df,
-                gene_sets=str(file_path),
-                outdir=outdir,
-                min_size=min_size,
-                max_size=max_size,
-                scale=scale,
-                permutation_num=permutations,
-                no_plot=True,
-                processes=1,
-                seed=10,
-                format="png",
-            )
+        enrichment = gp.ssgsea(
+            data=df,
+            gene_sets=str(file_path),
+            outdir=outdir,
+            min_size=min_size,
+            max_size=max_size,
+            scale=scale,
+            permutation_num=permutations,
+            no_plot=True,
+            processes=1,
+            seed=10,
+            format="png",
+        )
 
-            enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose()
-            enrichment_es = enrichment_es.join(index)
-            enrichment_nes = enrichment.res2d.transpose()
-            enrichment_nes = enrichment_nes.join(index)
+        enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose()
+        enrichment_es = enrichment_es.join(index)
+        enrichment_nes = enrichment.res2d.transpose()
+        enrichment_nes = enrichment_nes.join(index)
 
-            result = {"es": enrichment_es, "nes": enrichment_nes}
-        except Exception as e:
-            print("Error in ssGSEA.", e)
-    df = None
+        result = {"es": enrichment_es, "nes": enrichment_nes}
 
     return result
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 598a12a..13d0f1f 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -18,12 +18,15 @@
 # %%
 from pathlib import Path
 
+import dsp_pandas
 import pandas as pd
 
 import acore
 import acore.differential_regulation
 import acore.enrichment_analysis
 
+dsp_pandas.format.set_pandas_options(max_colwidth=15)
+
 # %% [markdown]
 # Parameters of this notebook
 

From 142b7787682793c819ec7c048d530bed32347ddf Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 2 Dec 2024 22:47:05 +0100
Subject: [PATCH 16/40] :art: pass on parameter for min set, docstrings

resolve things discussed with Alberto.
---
 acore/enrichment_analysis.py | 8 +++++---
 tests/test_enrichment.py     | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index ae76202..ba31d62 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -247,12 +247,12 @@ def _annotate_features(
     Annotate features as foreground or background based on their presence in the
     foreground and background lists.
 
-    :param features: pandas dataframe with features and their annotations.
+    :param features: pandas.Series with features and their annotations.
     :param in_foreground: list of features identifiers in the foreground.
     :type in_foreground: set or list-like
     :param in_background: list of features identifiers in the background.
     :type in_background: set or list-like
-    :return: pandas dataframe with a new column 'group' containing 'foreground' or 'background'.
+    :return: pandas.Series containing 'foreground' or 'background'.
              missing values are preserved.
 
     Example::
@@ -276,6 +276,7 @@ def run_regulation_enrichment(
     reject_col: str = "rejected",
     group_col: str = "group",
     method: str = "fisher",
+    min_detected_in_set: int = 2,
     correction: str = "fdr_bh",
     correction_alpha: float = 0.05,
 ) -> pd.DataFrame:
@@ -336,6 +337,7 @@ def run_regulation_enrichment(
         identifier_col=identifier,
         method=method,
         correction=correction,
+        min_detected_in_set=min_detected_in_set,
         correction_alpha=correction_alpha,
     )
 
@@ -348,7 +350,7 @@ def run_enrichment(
     background_id: str,
     foreground_pop: int,
     background_pop: int,
-    min_detected_in_set: int = 1,
+    min_detected_in_set: int = 2,
     annotation_col: str = "annotation",
     group_col: str = "group",
     identifier_col: str = "identifier",
diff --git a/tests/test_enrichment.py b/tests/test_enrichment.py
index 75dbb86..6c0930a 100644
--- a/tests/test_enrichment.py
+++ b/tests/test_enrichment.py
@@ -77,6 +77,7 @@ def test_run_regulation_enrichment():
     actual = ea.run_regulation_enrichment(
         regulation_data=regulation_res,
         annotation=annotation,
+        min_detected_in_set=1,
     )
 
     expected = pd.DataFrame(

From 27c6e3800a66ca85407f585231add2bc593b2005 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 3 Dec 2024 12:59:07 +0100
Subject: [PATCH 17/40] :art: improve visibility of data and inspect
 annotations

---
 acore/enrichment_analysis.py                |  6 ++---
 docs/api_examples/enrichment_analysis.ipynb | 30 ++++++++++++++++++++-
 docs/api_examples/enrichment_analysis.py    | 12 +++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index ba31d62..6c896cb 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -211,7 +211,6 @@ def run_up_down_regulation_enrichment(
             df,
             annotation,
             identifier=identifier,
-            groups=groups,
             annotation_col=annotation_col,
             reject_col="up_pairwise_regulation",
             group_col=group_col,
@@ -224,7 +223,6 @@ def run_up_down_regulation_enrichment(
             df,
             annotation,
             identifier=identifier,
-            groups=groups,
             annotation_col=annotation_col,
             reject_col="down_pairwise_regulation",
             group_col=group_col,
@@ -241,7 +239,9 @@ def run_up_down_regulation_enrichment(
 
 # ! to move
 def _annotate_features(
-    features: pd.Series, in_foreground: list[str], in_background: list[str]
+    features: pd.Series,
+    in_foreground: set[str] | list[str],
+    in_background: set[str] | list[str],
 ) -> pd.Series:
     """
     Annotate features as foreground or background based on their presence in the
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 9a547ba..93ff6ee 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -43,11 +43,14 @@
    "source": [
     "from pathlib import Path\n",
     "\n",
+    "import dsp_pandas\n",
     "import pandas as pd\n",
     "\n",
     "import acore\n",
     "import acore.differential_regulation\n",
-    "import acore.enrichment_analysis"
+    "import acore.enrichment_analysis\n",
+    "\n",
+    "dsp_pandas.format.set_pandas_options(max_colwidth=15)"
    ]
   },
   {
@@ -201,6 +204,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "diff_reg[\"rejected\"] = diff_reg[\"rejected\"].astype(bool)  # ! needs to be fixed in anova\n",
     "diff_reg.query(\"rejected == True\")"
    ]
   },
@@ -285,6 +289,30 @@
     "annotations"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "d4734452",
+   "metadata": {},
+   "source": [
+    "See how many protein groups are associated with each annotation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57fddefb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = (\n",
+    "    annotations.groupby(\"annotation\")\n",
+    "    .size()\n",
+    "    .value_counts()\n",
+    "    .sort_index()\n",
+    "    .plot(kind=\"bar\")\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4165bc94",
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 13d0f1f..00115c9 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -170,6 +170,18 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
 
 annotations
 
+# %% [markdown]
+# See how many protein groups are associated with each annotation.
+
+# %%
+_ = (
+    annotations.groupby("annotation")
+    .size()
+    .value_counts()
+    .sort_index()
+    .plot(kind="bar")
+)
+
 # %% [markdown]
 # ## Enrichment analysis
 #

From 8ed860c98d4e708119ed8b5c2db58c17d2f57a4a Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 3 Dec 2024 14:31:42 +0100
Subject: [PATCH 18/40] :bug: make it Python 3.9 compatible

---
 acore/enrichment_analysis.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 6c896cb..de20275 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -5,6 +5,8 @@
 by Lars Juhl Jensen.
 """
 
+from __future__ import annotations
+
 import os
 import re
 import uuid

From ec10c71f42992bfb83266698d31e4c430b7a12c6 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 3 Dec 2024 16:47:32 +0100
Subject: [PATCH 19/40] :art: type annotations and docstring updates

- prepare to defien a common return type by specifying message TYPE_COLS_MSG
---
 acore/enrichment_analysis.py | 87 ++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index de20275..e8ee6ad 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -18,8 +18,17 @@
 
 from acore.multiple_testing import apply_pvalue_correction
 
+TYPE_COLS_MSG = """
+columns: 'terms', 'identifiers', 'foreground',
+    'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'.
+"""
+
 
-def run_fisher(group1, group2, alternative="two-sided"):
+def run_fisher(
+    group1: list[int],
+    group2: list[int],
+    alternative="two-sided",
+) -> tuple[float, float]:
     """annotated   not-annotated
     group1      a               b
     group2      c               d
@@ -66,7 +75,7 @@ def run_site_regulation_enrichment(
     regulation_data,
     annotation,
     identifier="identifier",
-    groups=["group1", "group2"],
+    groups=("group1", "group2"),
     annotation_col="annotation",
     reject_col="rejected",
     group_col="group",
@@ -78,9 +87,9 @@ def run_site_regulation_enrichment(
     This function runs a simple enrichment analysis for significantly
     regulated protein sites in a dataset.
 
-    :param regulation_data: pandas dataframe resulting from differential
+    :param regulation_data: pandas.DataFrame resulting from differential
         regulation analysis.
-    :param annotation: pandas dataframe with annotations for features
+    :param annotation: pandas.DataFrame with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing
         feature identifiers.
@@ -95,8 +104,8 @@ def run_site_regulation_enrichment(
     :param str method: method used to compute enrichment
         (only 'fisher' is supported currently).
     :param str regex: how to extract the annotated identifier from the site identifier
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground',
-        'background', 'pvalue', 'padj' and 'rejected'.
+    :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
+        'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'.
 
     Example::
 
@@ -108,7 +117,7 @@ def run_site_regulation_enrichment(
             reject_col='rejected',
             group_col='group',
             method='fisher',
-            match="(\w+~.+)_\w\d+\-\w+"
+            match="(\\w+~.+)_\\w\\d+\\-\\w+"
         )
     """
     result = pd.DataFrame()
@@ -142,9 +151,9 @@ def run_up_down_regulation_enrichment(
     regulation_data,
     annotation,
     identifier="identifier",
-    groups=["group1", "group2"],
+    groups=("group1", "group2"),
     annotation_col="annotation",
-    reject_col="rejected",
+    # reject_col="rejected",
     group_col="group",
     method="fisher",
     correction="fdr_bh",
@@ -155,9 +164,9 @@ def run_up_down_regulation_enrichment(
     This function runs a simple enrichment analysis for significantly regulated proteins
     distinguishing between up- and down-regulated.
 
-    :param regulation_data: pandas dataframe resulting from differential regulation
+    :param regulation_data: pandas.DataFrame resulting from differential regulation
         analysis (CKG's regulation table).
-    :param annotation: pandas dataframe with annotations for features
+    :param annotation: pandas.DataFrame with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param list groups: column names from regulation_data containing group identifiers.
@@ -171,7 +180,7 @@ def run_up_down_regulation_enrichment(
     :param str correction: method to be used for multiple-testing correction
     :param float alpha: adjusted p-value cutoff to define significance
     :param float lfc_cutoff: log fold-change cutoff to define practical significance
-    :return: Pandas dataframe with columns:'terms', 'identifiers', 'foreground',
+    :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
         'background', 'pvalue', 'padj' and 'rejected'.
 
     Example::
@@ -286,8 +295,8 @@ def run_regulation_enrichment(
     This function runs a simple enrichment analysis for significantly regulated features
     in a dataset.
 
-    :param regulation_data: pandas dataframe resulting from differential regulation analysis.
-    :param annotation: pandas dataframe with annotations for features
+    :param regulation_data: pandas.DataFrame resulting from differential regulation analysis.
+    :param annotation: pandas.DataFrame with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
@@ -297,8 +306,8 @@ def run_regulation_enrichment(
         if feature belongs to foreground or background.
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
     :param str correction: method to be used for multiple-testing correction
-    :return: Pandas dataframe with columns: 'terms', 'identifiers', 'foreground',
-        'background', 'pvalue', 'padj' and 'rejected'.
+    :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
+        'background', 'foreground_pop', 'background_pop', 'pvalue', 'padj' and 'rejected'.
 
     Example::
 
@@ -306,12 +315,13 @@ def run_regulation_enrichment(
             regulation_data,
             annotation,
             identifier='identifier',
-            groups=['group1',
-            'group2'],
             annotation_col='annotation',
             reject_col='rejected',
             group_col='group',
             method='fisher',
+            min_detected_in_set=2,
+            correction='fdr_bh',
+            correction_alpha=0.05,
          )
     """
     # ? can we remove NA features in that column?
@@ -364,8 +374,8 @@ def run_enrichment(
     Computes enrichment of the foreground relative to a given backgroung,
     using Fisher's exact test, and corrects for multiple hypothesis testing.
 
-    :param data: pandas dataframe with annotations for dataset features
-        (columns: 'annotation', 'identifier', 'source', 'group').
+    :param data: pandas.DataFrame with annotations for dataset features
+        (columns: 'annotation', 'identifier', 'group').
     :param str foreground_id: group identifier of features that belong to the foreground.
     :param str background_id: group identifier of features that belong to the background.
     :param int foreground_pop: number of features in the foreground.
@@ -376,7 +386,7 @@ def run_enrichment(
     :param str method: method used to compute enrichment (only 'fisher' is supported currently).
     :param str correction: method to be used for multiple-testing correction.
     :param float correction_alpha: adjusted p-value cutoff to define significance.
-    :return: Pandas dataframe with annotation terms, features,
+    :return: pandas.DataFrame with columns: annotation terms, features,
         number of foregroung/background features in each term,
         p-values and corrected p-values
         (columns: 'terms', 'identifiers', 'foreground',
@@ -396,34 +406,35 @@ def run_enrichment(
             method='fisher',
          )
     """
+    if method != "fisher":
+        raise ValueError("Only Fisher's exact test is supported at the moment.")
+
     result = pd.DataFrame()
-    df = data.copy()
     terms = []
     ids = []
     pvalues = []
     fnum = []
     bnum = []
     countsdf = (
-        df.groupby([annotation_col, group_col])
+        data.groupby([annotation_col, group_col])
         .agg(["count"])[(identifier_col, "count")]
         .reset_index()
     )
     countsdf.columns = [annotation_col, group_col, "count"]
-    for annotation in (
-        countsdf[countsdf[group_col] == foreground_id][annotation_col].unique().tolist()
-    ):
+    for annotation in countsdf.loc[
+        countsdf[group_col] == foreground_id, annotation_col
+    ].unique():
         counts = countsdf[countsdf[annotation_col] == annotation]
         num_foreground = counts.loc[counts[group_col] == foreground_id, "count"].values
         num_background = counts.loc[counts[group_col] == background_id, "count"].values
-
+        # ! counts should always be of length one count? squeeze?
         if len(num_foreground) == 1:
             num_foreground = num_foreground[0]
         if len(num_background) == 1:
             num_background = num_background[0]
         else:
             num_background = 0
-        # ! what happens if this is not the case?
-        if method == "fisher" and num_foreground >= min_detected_in_set:
+        if num_foreground >= min_detected_in_set:
             _, pvalue = run_fisher(
                 [num_foreground, foreground_pop - num_foreground],
                 [num_background, background_pop - foreground_pop - num_background],
@@ -434,11 +445,11 @@ def run_enrichment(
             pvalues.append(pvalue)
             ids.append(
                 ",".join(
-                    df.loc[
-                        (df[annotation_col] == annotation)
-                        & (df[group_col] == foreground_id),
+                    data.loc[
+                        (data[annotation_col] == annotation)
+                        & (data[group_col] == foreground_id),
                         identifier_col,
-                    ].tolist()
+                    ]
                 )
             )
     if len(pvalues) > 1:
@@ -466,8 +477,8 @@ def run_enrichment(
 
 
 def run_ssgsea(
-    data,
-    annotation,
+    data: pd.DataFrame,
+    annotation: str,
     set_index: list[str],
     annotation_col: str = "an notation",
     identifier_col: str = "identifier",
@@ -483,8 +494,8 @@ def run_ssgsea(
     described in Barbie et al., 2009:
     https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA).
 
-    :param data: pandas dataframe with the quantified features (i.e. subject x proteins)
-    :param annotation: pandas dataframe with the annotation to be used in the enrichment
+    :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
+    :param annotation: pandas.DataFrame with the annotation to be used in the enrichment
         (i.e. CKG pathway annotation file)
     :param str annotation_col: name of the column containing annotation terms.
     :param str identifier_col: name of column containing dependent variables identifiers.
@@ -556,7 +567,7 @@ def run_ssgsea(
         )
         fid = uuid.uuid4()
         file_path = os.path.join(outdir, str(fid) + ".gmt")
-        with open(file_path, "w") as out:
+        with open(file_path, "w", encoding="utf8") as out:
             for i, row in grouped_annotations.iterrows():
                 out.write(
                     row[annotation_col]

From 9c96af7c677bd13a7e58150fb16f3da76bca2887 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 13:22:07 +0100
Subject: [PATCH 20/40] :art: docstrings fmt

---
 acore/enrichment_analysis.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index e8ee6ad..0c2a2ed 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -27,7 +27,7 @@
 def run_fisher(
     group1: list[int],
     group2: list[int],
-    alternative="two-sided",
+    alternative: str = "two-sided",
 ) -> tuple[float, float]:
     """annotated   not-annotated
     group1      a               b
@@ -83,7 +83,7 @@ def run_site_regulation_enrichment(
     regex=r"(\w+~.+)_\w\d+\-\w+",  # ! add example to docstring of what this matches
     correction="fdr_bh",
 ):
-    """
+    r"""
     This function runs a simple enrichment analysis for significantly
     regulated protein sites in a dataset.
 
@@ -117,7 +117,7 @@ def run_site_regulation_enrichment(
             reject_col='rejected',
             group_col='group',
             method='fisher',
-            match="(\\w+~.+)_\\w\\d+\\-\\w+"
+            match="(\w+~.+)_\w\d+\-\w+"
         )
     """
     result = pd.DataFrame()
@@ -357,7 +357,7 @@ def run_regulation_enrichment(
 
 
 def run_enrichment(
-    data,
+    data: pd.DataFrame,
     foreground_id: str,
     background_id: str,
     foreground_pop: int,
@@ -390,7 +390,7 @@ def run_enrichment(
         number of foregroung/background features in each term,
         p-values and corrected p-values
         (columns: 'terms', 'identifiers', 'foreground',
-         'background', 'pvalue', 'padj' and 'rejected').
+        'background', 'pvalue', 'padj' and 'rejected').
 
     Example::
 
@@ -514,6 +514,7 @@ def run_ssgsea(
         and nes - normalized enrichment scores.
 
     Example::
+
         stproject = "P0000008"
         p = project.Project(
             stproject,
@@ -542,7 +543,6 @@ def run_ssgsea(
             scale=False,
             permutations=0
         )
-
     """
     result = {}
     df = data.copy()

From a6f21c90bb58a8c8345bed059b5a3552e1537e3e Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 14:29:10 +0100
Subject: [PATCH 21/40] :art: more docstring updates

---
 acore/enrichment_analysis.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 0c2a2ed..d925a51 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -29,15 +29,18 @@ def run_fisher(
     group2: list[int],
     alternative: str = "two-sided",
 ) -> tuple[float, float]:
-    """annotated   not-annotated
-    group1      a               b
-    group2      c               d
-    ------------------------------------
+    """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`.
+
+    Example::
+
+        # annotated   not-annotated
+        # group1      a               b
+        # group2      c               d
 
-    group1 = [a, b]
-    group2 = [c, d]
 
-    odds, pvalue = stats.fisher_exact([[a, b], [c, d]])
+        odds, pvalue = stats.fisher_exact(group1=[a, b],
+                                          group2 =[c, d]
+                        )
     """
 
     odds, pvalue = stats.fisher_exact([group1, group2], alternative)
@@ -80,10 +83,10 @@ def run_site_regulation_enrichment(
     reject_col="rejected",
     group_col="group",
     method="fisher",
-    regex=r"(\w+~.+)_\w\d+\-\w+",  # ! add example to docstring of what this matches
+    regex="(\\w+~.+)_\\w\\d+\\-\\w+",  # ! add example to docstring of what kind of string this matches
     correction="fdr_bh",
 ):
-    r"""
+    """
     This function runs a simple enrichment analysis for significantly
     regulated protein sites in a dataset.
 
@@ -97,7 +100,7 @@ def run_site_regulation_enrichment(
         group identifiers.
     :param str annotation_col: name of the column from annotation containing
         annotation terms.
-    :param str reject_col: name of the column from regulatio_data containing
+    :param str reject_col: name of the column from regulation_data containing
         boolean for rejected null hypothesis.
     :param str group_col: column name for new column in annotation dataframe
         determining if feature belongs to foreground or background.
@@ -117,7 +120,7 @@ def run_site_regulation_enrichment(
             reject_col='rejected',
             group_col='group',
             method='fisher',
-            match="(\w+~.+)_\w\d+\-\w+"
+            match="(\\w+~.+)_\\w\\d+\\-\\w+"
         )
     """
     result = pd.DataFrame()
@@ -300,7 +303,7 @@ def run_regulation_enrichment(
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulatio_data containing boolean for
+    :param str reject_col: name of the column from `regulation_data` containing boolean for
         rejected null hypothesis.
     :param str group_col: column name for new column in annotation dataframe determining
         if feature belongs to foreground or background.
@@ -487,7 +490,7 @@ def run_ssgsea(
     max_size: int = 500,
     scale: bool = False,
     permutations: int = 0,
-):
+) -> dict[str, pd.DataFrame]:
     """
     Project each sample within a data set onto a space of gene set enrichment scores using
     the single sample gene set enrichment analysis (ssGSEA) projection methodology

From 4d98dd51900f57025ed3c2dd50c8ffeb34f0cde6 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 14:59:58 +0100
Subject: [PATCH 22/40] :bug: regex in parameter name vs interpreded in
 docstring

sphinx deals differently with parsing the strings...
---
 acore/enrichment_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index d925a51..c0bc0c3 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -86,7 +86,7 @@ def run_site_regulation_enrichment(
     regex="(\\w+~.+)_\\w\\d+\\-\\w+",  # ! add example to docstring of what kind of string this matches
     correction="fdr_bh",
 ):
-    """
+    r"""
     This function runs a simple enrichment analysis for significantly
     regulated protein sites in a dataset.
 

From 9f4d50d9cc2a6277fc8e16269fa0f52a708b5f3a Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 15:40:26 +0100
Subject: [PATCH 23/40] :art: docstrings: test if listing works with blank
 lines, write Example correctly

otherwise numpy notation should work fine
---
 acore/multiple_testing.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py
index bf22e6b..07bc19c 100644
--- a/acore/multiple_testing.py
+++ b/acore/multiple_testing.py
@@ -20,21 +20,23 @@ def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferr
     :param numpy.ndarray pvalues: et of p-values of the individual tests.
     :param float alpha: error rate.
     :param str method: method of p-value correction:
-        - bonferroni : one-step correction
-        - sidak : one-step correction
-        - holm-sidak : step down method using Sidak adjustments
-        - holm : step-down method using Bonferroni adjustments
-        - simes-hochberg : step-up method (independent)
-        - hommel : closed method based on Simes tests (non-negative)
-        - fdr_bh : Benjamini/Hochberg (non-negative)
-        - fdr_by : Benjamini/Yekutieli (negative)
-        - fdr_tsbh : two stage fdr correction (non-negative)
-        - fdr_tsbky : two stage fdr correction (non-negative)
+
+        - 'bonferroni' : one-step correction
+        - 'sidak' : one-step correction
+        - 'holm-sidak' : step down method using Sidak adjustments
+        - 'holm' : step-down method using Bonferroni adjustments
+        - 'simes-hochberg' : step-up method (independent)
+        - 'hommel' : closed method based on Simes tests (non-negative)
+        - 'fdr_bh' : Benjamini/Hochberg (non-negative)
+        - 'fdr_by' : Benjamini/Yekutieli (negative)
+        - 'fdr_tsbh' : two stage fdr correction (non-negative)
+        - 'fdr_tsbky' : two stage fdr correction (non-negative)
+
     :return: Tuple with two `numpy.array`s, boolen for rejecting H0 hypothesis
              and float for adjusted p-value. Can contain missing values if `pvalues`
              contain missing values.
 
-    Exmaple::
+    Example::
 
         result = apply_pvalue_correction(pvalues, alpha=0.05, method='bonferroni')
     """
@@ -58,7 +60,7 @@ def apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method="indep"):
     :param str method: method of p-value correction ('indep', 'negcorr').
     :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value.
 
-    Exmaple::
+    Example::
 
         result = apply_pvalue_fdrcorrection(pvalues, alpha=0.05, method='indep')
     """
@@ -76,7 +78,7 @@ def apply_pvalue_twostage_fdrcorrection(pvalues, alpha=0.05, method="bh"):
     :param str method: method of p-value correction ('bky', 'bh').
     :return: Tuple with two arrays, boolen for rejecting H0 hypothesis and float for adjusted p-value.
 
-    Exmaple::
+    Example::
 
         result = apply_pvalue_twostage_fdrcorrection(pvalues, alpha=0.05, method='bh')
     """

From bc5ab2c72c5769f94dd58157c95d6776a274d3cf Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 15:59:44 +0100
Subject: [PATCH 24/40] :art: rename reject_col to rejected_col after typing it
 wrongly several times

- default parameter is rejected, so it is easier to not mistype parameter name
---
 acore/enrichment_analysis.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index c0bc0c3..672b5cd 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -80,7 +80,7 @@ def run_site_regulation_enrichment(
     identifier="identifier",
     groups=("group1", "group2"),
     annotation_col="annotation",
-    reject_col="rejected",
+    rejected_col="rejected",
     group_col="group",
     method="fisher",
     regex="(\\w+~.+)_\\w\\d+\\-\\w+",  # ! add example to docstring of what kind of string this matches
@@ -100,7 +100,7 @@ def run_site_regulation_enrichment(
         group identifiers.
     :param str annotation_col: name of the column from annotation containing
         annotation terms.
-    :param str reject_col: name of the column from regulation_data containing
+    :param str rejected_col: name of the column from regulation_data containing
         boolean for rejected null hypothesis.
     :param str group_col: column name for new column in annotation dataframe
         determining if feature belongs to foreground or background.
@@ -117,7 +117,7 @@ def run_site_regulation_enrichment(
             identifier='identifier',
             groups=['group1', 'group2'],
             annotation_col='annotation',
-            reject_col='rejected',
+            rejected_col='rejected',
             group_col='group',
             method='fisher',
             match="(\\w+~.+)_\\w\\d+\\-\\w+"
@@ -141,7 +141,7 @@ def run_site_regulation_enrichment(
                 identifier,
                 groups,
                 annotation_col,
-                reject_col,
+                rejected_col,
                 group_col,
                 method,
                 correction,
@@ -156,7 +156,7 @@ def run_up_down_regulation_enrichment(
     identifier="identifier",
     groups=("group1", "group2"),
     annotation_col="annotation",
-    # reject_col="rejected",
+    # rejected_col="rejected",
     group_col="group",
     method="fisher",
     correction="fdr_bh",
@@ -174,7 +174,7 @@ def run_up_down_regulation_enrichment(
     :param str identifier: name of the column from annotation containing feature identifiers.
     :param list groups: column names from regulation_data containing group identifiers.
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from regulation_data containing boolean for
+    :param str rejected_col: name of the column from regulation_data containing boolean for
         rejected null hypothesis.
     :param str group_col: column name for new column in annotation dataframe determining
         if feature belongs to foreground or background.
@@ -195,7 +195,7 @@ def run_up_down_regulation_enrichment(
             groups=['group1',
             'group2'],
             annotation_col='annotation',
-            reject_col='rejected',
+            rejected_col='rejected',
             group_col='group',
             method='fisher',
             correction='fdr_bh',
@@ -226,7 +226,7 @@ def run_up_down_regulation_enrichment(
             annotation,
             identifier=identifier,
             annotation_col=annotation_col,
-            reject_col="up_pairwise_regulation",
+            rejected_col="up_pairwise_regulation",
             group_col=group_col,
             method=method,
             correction=correction,
@@ -238,7 +238,7 @@ def run_up_down_regulation_enrichment(
             annotation,
             identifier=identifier,
             annotation_col=annotation_col,
-            reject_col="down_pairwise_regulation",
+            rejected_col="down_pairwise_regulation",
             group_col=group_col,
             method=method,
             correction=correction,
@@ -287,7 +287,7 @@ def run_regulation_enrichment(
     annotation: pd.DataFrame,
     identifier: str = "identifier",
     annotation_col: str = "annotation",
-    reject_col: str = "rejected",
+    rejected_col: str = "rejected",
     group_col: str = "group",
     method: str = "fisher",
     min_detected_in_set: int = 2,
@@ -302,8 +302,9 @@ def run_regulation_enrichment(
     :param annotation: pandas.DataFrame with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
+        It should also be present in `regulation_data`.
     :param str annotation_col: name of the column from annotation containing annotation terms.
-    :param str reject_col: name of the column from `regulation_data` containing boolean for
+    :param str rejected_col: name of the column from `regulation_data` containing boolean for
         rejected null hypothesis.
     :param str group_col: column name for new column in annotation dataframe determining
         if feature belongs to foreground or background.
@@ -319,7 +320,7 @@ def run_regulation_enrichment(
             annotation,
             identifier='identifier',
             annotation_col='annotation',
-            reject_col='rejected',
+            rejected_col='rejected',
             group_col='group',
             method='fisher',
             min_detected_in_set=2,
@@ -328,7 +329,7 @@ def run_regulation_enrichment(
          )
     """
     # ? can we remove NA features in that column?
-    mask_rejected = regulation_data[reject_col].astype(bool)
+    mask_rejected = regulation_data[rejected_col].astype(bool)
     foreground_list = regulation_data.loc[mask_rejected, identifier].unique()
     background_list = regulation_data.loc[~mask_rejected, identifier].unique()
     foreground_pop = len(foreground_list)

From 834b54e25fcf9783ab9af73ce3a476160cb02332 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 16:00:08 +0100
Subject: [PATCH 25/40] :art: try to set hyperlink

---
 acore/exploratory_analysis.py | 2 +-
 acore/multiple_testing.py     | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/acore/exploratory_analysis.py b/acore/exploratory_analysis.py
index b0a09da..b1384dd 100644
--- a/acore/exploratory_analysis.py
+++ b/acore/exploratory_analysis.py
@@ -35,7 +35,7 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]):
     :param list columns: names to use for the variable column(s), and for the value column(s)
     :return: Pandas dataframe with columns 'name' (protein identifier), 'x' (coefficient of variation), 'y' (mean) and 'group'.
 
-    Exmaple::
+    Example::
 
         result = get_coefficient_variation(data, drop_columns=['sample', 'subject'], group='group')
     """
diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py
index 07bc19c..ae36349 100644
--- a/acore/multiple_testing.py
+++ b/acore/multiple_testing.py
@@ -9,10 +9,14 @@
 # multitest.multitest_methods_names
 
 
-def apply_pvalue_correction(pvalues, alpha: float = 0.05, method: str = "bonferroni"):
+def apply_pvalue_correction(
+    pvalues: np.ndarray, alpha: float = 0.05, method: str = "bonferroni"
+) -> tuple[np.ndarray, np.ndarray]:
     """
     Performs p-value correction using the specified method as in
-    statsmodels.stats.multitest.multipletests.
+    statsmodels.stats.multitest.multipletests_.
+
+    .. _statsmodels.stats.multitest.multipletests: https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
 
     For more information visit
     https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html.

From 794c1affa3ee2ea7e12517638c1a9629f68fe77e Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 4 Dec 2024 16:05:17 +0100
Subject: [PATCH 26/40] :truck::sparkles: move uniprot code to module, creating
 user-facing function

---
 acore/io/uniprot/__init__.py             | 44 ++++++++++++++++++++++++
 acore/io/{ => uniprot}/uniprot.py        |  3 ++
 docs/api_examples/enrichment_analysis.py | 41 +---------------------
 3 files changed, 48 insertions(+), 40 deletions(-)
 create mode 100644 acore/io/uniprot/__init__.py
 rename acore/io/{ => uniprot}/uniprot.py (99%)

diff --git a/acore/io/uniprot/__init__.py b/acore/io/uniprot/__init__.py
new file mode 100644
index 0000000..f7d0468
--- /dev/null
+++ b/acore/io/uniprot/__init__.py
@@ -0,0 +1,44 @@
+"""Uniprot API user functions for fetching annotations for UniProt IDs and providing 
+the results as a pandas.DataFrame."""
+
+import pandas as pd
+
+from .uniprot import (
+    check_id_mapping_results_ready,
+    get_id_mapping_results_link,
+    get_id_mapping_results_search,
+    submit_id_mapping,
+)
+
+
+# function for outside usage
+def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
+    """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's
+    knowledgebase (KB).
+
+    Parameters
+    ----------
+    ids : pd.Index | list
+        Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.
+    fields : str, optional
+        Fields to fetch, by default "accession,go_p,go_c. See for availble fields:
+        https://www.uniprot.org/help/return_fields
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with annotations of the UniProt IDs.
+    """
+    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids)
+
+    if check_id_mapping_results_ready(job_id):
+        link = get_id_mapping_results_link(job_id)
+        # add fields to the link to get more information
+        # From and Entry (accession) are the same for UniProt IDs.
+        results = get_id_mapping_results_search(
+            link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+        )
+    header = results.pop(0).split("\t")
+    results = [line.split("\t") for line in results]
+    df = pd.DataFrame(results, columns=header)
+    return df
diff --git a/acore/io/uniprot.py b/acore/io/uniprot/uniprot.py
similarity index 99%
rename from acore/io/uniprot.py
rename to acore/io/uniprot/uniprot.py
index cedc98b..c9abf05 100644
--- a/acore/io/uniprot.py
+++ b/acore/io/uniprot/uniprot.py
@@ -10,6 +10,7 @@
 from urllib.parse import parse_qs, urlencode, urlparse
 from xml.etree import ElementTree
 
+import pandas as pd
 import requests
 from requests.adapters import HTTPAdapter, Retry
 
@@ -176,6 +177,8 @@ def get_id_mapping_results_stream(url):
     return decode_results(request, file_format, compressed)
 
 
+
+
 if __name__ == "__main__":
     # id mapping is used to create a link to a query (you can see the json in the browser)
     # UniProtKB is the knowleadgebase integrating all kind of other databases
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 00115c9..25e7cf9 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -24,6 +24,7 @@
 import acore
 import acore.differential_regulation
 import acore.enrichment_analysis
+from acore.io.uniprot import fetch_annotations
 
 dsp_pandas.format.set_pandas_options(max_colwidth=15)
 
@@ -105,46 +106,6 @@
 #
 
 # %%
-from acore.io.uniprot import (
-    check_id_mapping_results_ready,
-    get_id_mapping_results_link,
-    get_id_mapping_results_search,
-    submit_id_mapping,
-)
-
-
-def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
-    """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's
-    knowledgebase (KB).
-
-    Parameters
-    ----------
-    ids : pd.Index | list
-        Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.
-    fields : str, optional
-        Fields to fetch, by default "accession,go_p,go_c. See for availble fields:
-        https://www.uniprot.org/help/return_fields
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with annotations of the UniProt IDs.
-    """
-    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids)
-
-    if check_id_mapping_results_ready(job_id):
-        link = get_id_mapping_results_link(job_id)
-        # add fields to the link to get more information
-        # From and Entry (accession) are the same for UniProt IDs.
-        results = get_id_mapping_results_search(
-            link + "?fields=accession,go_p,go_c,go_f&format=tsv"
-        )
-    header = results.pop(0).split("\t")
-    results = [line.split("\t") for line in results]
-    df = pd.DataFrame(results, columns=header)
-    return df
-
-
 fname_annotations = "downloaded/annotations.csv"
 fname = Path(fname_annotations)
 try:

From 9d7c5f21fff8a5f28ab0c1ca43f3b4d3a38afb58 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Thu, 5 Dec 2024 09:17:12 +0100
Subject: [PATCH 27/40] :bug: test line break in docstring

works locally. see:
https://stackoverflow.com/a/14414965/9684872
---
 acore/multiple_testing.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/acore/multiple_testing.py b/acore/multiple_testing.py
index ae36349..5261b39 100644
--- a/acore/multiple_testing.py
+++ b/acore/multiple_testing.py
@@ -16,10 +16,8 @@ def apply_pvalue_correction(
     Performs p-value correction using the specified method as in
     statsmodels.stats.multitest.multipletests_.
 
-    .. _statsmodels.stats.multitest.multipletests: https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
-
-    For more information visit
-    https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html.
+    .. _statsmodels.stats.multitest.multipletests: \
+https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
 
     :param numpy.ndarray pvalues: et of p-values of the individual tests.
     :param float alpha: error rate.

From dba200081444a4c20625e01acf4e30304f7bb95a Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 9 Dec 2024 16:30:44 +0100
Subject: [PATCH 28/40] :bug: specify fields as parameter

- output should be tsv due to how pandas return is constructed for now
---
 acore/io/uniprot/__init__.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/acore/io/uniprot/__init__.py b/acore/io/uniprot/__init__.py
index f7d0468..96e6002 100644
--- a/acore/io/uniprot/__init__.py
+++ b/acore/io/uniprot/__init__.py
@@ -12,7 +12,10 @@
 
 
 # function for outside usage
-def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
+def fetch_annotations(
+    ids: pd.Index | list,
+    fields: str = "accession,go_p,go_c,go_f",
+) -> pd.DataFrame:
     """Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's
     knowledgebase (KB).
 
@@ -30,13 +33,14 @@ def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:
         DataFrame with annotations of the UniProt IDs.
     """
     job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=ids)
-
+    # tsv used here to return a DataFrame. Maybe other formats are availale at some points
+    _format = "tsv"
     if check_id_mapping_results_ready(job_id):
         link = get_id_mapping_results_link(job_id)
         # add fields to the link to get more information
         # From and Entry (accession) are the same for UniProt IDs.
         results = get_id_mapping_results_search(
-            link + "?fields=accession,go_p,go_c,go_f&format=tsv"
+            link + f"?fields={fields}&format={_format}"
         )
     header = results.pop(0).split("\t")
     results = [line.split("\t") for line in results]

From ddac13c8e0b9b79809e83fbc65cc3fd3fbdcf927 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 9 Dec 2024 16:33:58 +0100
Subject: [PATCH 29/40] :construction: use fetch_annotations from pkg, add
 ssgsea

---
 acore/enrichment_analysis.py                | 90 ++++++++++++---------
 docs/api_examples/enrichment_analysis.ipynb | 77 +++++++-----------
 docs/api_examples/enrichment_analysis.py    | 17 +++-
 3 files changed, 97 insertions(+), 87 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 672b5cd..d5cc803 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -483,8 +483,8 @@ def run_enrichment(
 def run_ssgsea(
     data: pd.DataFrame,
     annotation: str,
-    set_index: list[str],
-    annotation_col: str = "an notation",
+    set_index: list[str] = None,
+    annotation_col: str = "annotation",
     identifier_col: str = "identifier",
     outdir: str = "tmp",
     min_size: int = 15,
@@ -501,11 +501,11 @@ def run_ssgsea(
     :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
     :param annotation: pandas.DataFrame with the annotation to be used in the enrichment
         (i.e. CKG pathway annotation file)
-    :param str annotation_col: name of the column containing annotation terms.
-    :param str identifier_col: name of column containing dependent variables identifiers.
     :param list set_index: column/s to be used as index. Enrichment will be calculated
         for these values (i.e ["subject"] will return subjects x pathways matrix of
         enrichment scores)
+    :param str annotation_col: name of the column containing annotation terms.
+    :param str identifier_col: name of column containing dependent variables identifiers.
     :param str out_dir: directory path where results will be stored
         (default None, tmp folder is used)
     :param int min_size: minimum number of features (i.e. proteins) in enriched terms
@@ -548,56 +548,70 @@ def run_ssgsea(
             permutations=0
         )
     """
-    result = {}
     df = data.copy()
     if not os.path.exists(outdir):
         os.makedirs(outdir)
 
     # Comine columns to create a unique name for each set (?)
     name = []
-    index = data[set_index]
-    for i, row in data[set_index].iterrows():
+    if set_index is None:
+        index = data.index.to_frame()
+        set_index = index.columns.tolist()
+    else:
+        index = data[set_index]
+        df = df.drop(set_index, axis=1)
+
+    for i, row in index.iterrows():
         name.append(
             "_".join(row[set_index].tolist())
         )  # this assumes strings as identifiers
 
     df["Name"] = name
     index.index = name
-    df = df.drop(set_index, axis=1).set_index("Name").transpose()
+    df = df.set_index("Name").transpose()
 
-    if annotation_col in annotation and identifier_col in annotation:
-        grouped_annotations = (
-            annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
+    if not annotation_col in annotation:
+        raise ValueError(
+            f"Missing Annotation Column: {annotation_col} as specified by `annotation_col`"
         )
-        fid = uuid.uuid4()
-        file_path = os.path.join(outdir, str(fid) + ".gmt")
-        with open(file_path, "w", encoding="utf8") as out:
-            for i, row in grouped_annotations.iterrows():
-                out.write(
-                    row[annotation_col]
-                    + "\t"
-                    + "\t".join(list(filter(None, row[identifier_col])))
-                    + "\n"
-                )
-        enrichment = gp.ssgsea(
-            data=df,
-            gene_sets=str(file_path),
-            outdir=outdir,
-            min_size=min_size,
-            max_size=max_size,
-            scale=scale,
-            permutation_num=permutations,
-            no_plot=True,
-            processes=1,
-            seed=10,
-            format="png",
+
+    if not identifier_col in annotation:
+        raise ValueError(
+            f"Missing Identifier Column: {identifier_col} as specified by `identifier_col`"
         )
 
-        enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose()
-        enrichment_es = enrichment_es.join(index)
-        enrichment_nes = enrichment.res2d.transpose()
-        enrichment_nes = enrichment_nes.join(index)
+    grouped_annotations = (
+        annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
+    )
+    fid = uuid.uuid4()
+    file_path = os.path.join(outdir, str(fid) + ".gmt")
+    with open(file_path, "w", encoding="utf8") as out:
+        for i, row in grouped_annotations.iterrows():
+            out.write(
+                row[annotation_col]
+                + "\t"
+                + "\t".join(list(filter(None, row[identifier_col])))
+                + "\n"
+            )
+    enrichment = gp.ssgsea(
+        data=df,
+        gene_sets=str(file_path),
+        outdir=outdir,
+        min_size=min_size,
+        max_size=max_size,
+        scale=scale,
+        permutation_num=permutations,
+        no_plot=True,
+        processes=1,
+        seed=10,
+        format="png",
+    )
+
+    enrichment_es = pd.DataFrame(enrichment.results).transpose()  # resultsOnSamples
+    enrichment_es = enrichment_es.join(index)
+    enrichment_nes = enrichment.res2d.transpose()
+    # enrichment_nes = enrichment_nes.join(index) # ! To Fix
 
-        result = {"es": enrichment_es, "nes": enrichment_nes}
+    result = {"es": enrichment_es, "nes": enrichment_nes}
 
     return result
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 93ff6ee..1f39dfa 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -49,6 +49,7 @@
     "import acore\n",
     "import acore.differential_regulation\n",
     "import acore.enrichment_analysis\n",
+    "from acore.io.uniprot import fetch_annotations\n",
     "\n",
     "dsp_pandas.format.set_pandas_options(max_colwidth=15)"
    ]
@@ -79,7 +80,7 @@
     "omics: str = f\"{base_path}/omics.csv\"\n",
     "meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n",
     "meta: str = f\"{base_path}/meta_patients.csv\"\n",
-    "N_to_sample: int = 1_000"
+    "features_to_sample: int = 100"
    ]
   },
   {
@@ -149,7 +150,7 @@
     "    .drop(idx_always_included, axis=1)\n",
     "    .dropna(thresh=18, axis=1)\n",
     "    .sample(\n",
-    "        N_to_sample - len(idx_always_included),\n",
+    "        features_to_sample - len(idx_always_included),\n",
     "        axis=1,\n",
     "        random_state=42,\n",
     "    )\n",
@@ -223,47 +224,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from acore.io.uniprot import (\n",
-    "    check_id_mapping_results_ready,\n",
-    "    get_id_mapping_results_link,\n",
-    "    get_id_mapping_results_search,\n",
-    "    submit_id_mapping,\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n",
-    "    \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n",
-    "    knowledgebase (KB).\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    ids : pd.Index | list\n",
-    "        Iterable of UniProt IDs. Fetches annotations as speecified by the specified fields.\n",
-    "    fields : str, optional\n",
-    "        Fields to fetch, by default \"accession,go_p,go_c. See for availble fields:\n",
-    "        https://www.uniprot.org/help/return_fields\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    pd.DataFrame\n",
-    "        DataFrame with annotations of the UniProt IDs.\n",
-    "    \"\"\"\n",
-    "    job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n",
-    "\n",
-    "    if check_id_mapping_results_ready(job_id):\n",
-    "        link = get_id_mapping_results_link(job_id)\n",
-    "        # add fields to the link to get more information\n",
-    "        # From and Entry (accession) are the same for UniProt IDs.\n",
-    "        results = get_id_mapping_results_search(\n",
-    "            link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n",
-    "        )\n",
-    "    header = results.pop(0).split(\"\\t\")\n",
-    "    results = [line.split(\"\\t\") for line in results]\n",
-    "    df = pd.DataFrame(results, columns=header)\n",
-    "    return df\n",
-    "\n",
-    "\n",
-    "fname_annotations = \"downloaded/annotations.csv\"\n",
+    "fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
     "fname = Path(fname_annotations)\n",
     "try:\n",
     "    annotations = pd.read_csv(fname, index_col=0)\n",
@@ -325,9 +286,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "f300c5b5",
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
@@ -338,6 +297,32 @@
     "ret"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88b05e0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res_dict = acore.enrichment_analysis.run_ssgsea(\n",
+    "    data=df_omics,\n",
+    "    annotation=annotations,\n",
+    "    min_size=1,\n",
+    ")\n",
+    "print(res_dict.keys())\n",
+    "res_dict[\"es\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b9e2fddf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res_dict[\"nes\"]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 25e7cf9..64af662 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -39,7 +39,7 @@
 omics: str = f"{base_path}/omics.csv"
 meta_pgs: str = f"{base_path}/meta_pgs.csv"
 meta: str = f"{base_path}/meta_patients.csv"
-N_to_sample: int = 1_000
+features_to_sample: int = 100
 
 # %% [markdown]
 # # Load processed data
@@ -69,7 +69,7 @@
     .drop(idx_always_included, axis=1)
     .dropna(thresh=18, axis=1)
     .sample(
-        N_to_sample - len(idx_always_included),
+        features_to_sample - len(idx_always_included),
         axis=1,
         random_state=42,
     )
@@ -106,7 +106,7 @@
 #
 
 # %%
-fname_annotations = "downloaded/annotations.csv"
+fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
 fname = Path(fname_annotations)
 try:
     annotations = pd.read_csv(fname, index_col=0)
@@ -155,5 +155,16 @@
 )
 ret
 
+# %%
+res_dict = acore.enrichment_analysis.run_ssgsea(
+    data=df_omics,
+    annotation=annotations,
+    min_size=1,
+)
+print(res_dict.keys())
+res_dict["es"]
+
+# %%
+res_dict["nes"]
 
 # %%

From 5373372091d24dab9e71c28d9764163d61c507b1 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 16 Dec 2024 20:43:34 +0100
Subject: [PATCH 30/40] :art: type hints, formatting, add pca example to api
 example of enrichment analysis

---
 acore/enrichment_analysis.py                |  88 ++++----
 acore/exploratory_analysis.py               | 217 ++++++++++++--------
 acore/network_analysis.py                   |   3 +-
 docs/api_examples/enrichment_analysis.ipynb | 170 ++++++++++++++-
 docs/api_examples/enrichment_analysis.py    |  82 +++++++-
 5 files changed, 423 insertions(+), 137 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index d5cc803..ab277e6 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -74,16 +74,17 @@ def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"):
     return result
 
 
+# ! undocumented for now (find usage example)
 def run_site_regulation_enrichment(
-    regulation_data,
-    annotation,
+    regulation_data: pd.DataFrame,
+    annotation: pd.DataFrame,
     identifier="identifier",
     groups=("group1", "group2"),
     annotation_col="annotation",
     rejected_col="rejected",
     group_col="group",
     method="fisher",
-    regex="(\\w+~.+)_\\w\\d+\\-\\w+",  # ! add example to docstring of what kind of string this matches
+    regex="(\\w+~.+)_\\w\\d+\\-\\w+",
     correction="fdr_bh",
 ):
     r"""
@@ -110,6 +111,8 @@ def run_site_regulation_enrichment(
     :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
         'background', foreground_pop, background_pop, 'pvalue', 'padj' and 'rejected'.
 
+    :raises ValueError: if regulation_data is `None` or empty.
+
     Example::
 
         result = run_site_regulation_enrichment(regulation_data,
@@ -124,28 +127,35 @@ def run_site_regulation_enrichment(
         )
     """
     result = pd.DataFrame()
-    if regulation_data is not None:
-        if not regulation_data.empty:
-            new_ids = []
-            for ident in regulation_data[identifier].tolist():
-                match = re.search(regex, ident)
-                if match is not None:
-                    new_ids.append(match.group(1))
-                else:
-                    new_ids.append(ident)
-            regulation_data[identifier] = new_ids
-            regulation_data = regulation_data.drop_duplicates()
-            result = run_regulation_enrichment(
-                regulation_data,
-                annotation,
-                identifier,
-                groups,
-                annotation_col,
-                rejected_col,
-                group_col,
-                method,
-                correction,
-            )
+    if regulation_data is None or regulation_data.empty:
+        raise ValueError("regulation_data is empty")
+
+    new_ids = []
+    # find any identifiers with a PTM and save only prot+gene identifer
+    for ident in regulation_data[identifier].tolist():
+        match = re.search(regex, ident)
+        if match is not None:
+            new_ids.append(
+                match.group(1)
+            )  # removes the PTM extension of the identifierm of CKG
+        else:
+            new_ids.append(ident)
+    # so this is normalizing the identifiers to ignore the PTM extension
+    regulation_data[identifier] = new_ids  # matches are used as identifiers
+    regulation_data = (
+        regulation_data.drop_duplicates()
+    )  # depends on regulation_data if this has an effect (with floats probably not)
+    result = run_regulation_enrichment(
+        regulation_data,
+        annotation,
+        identifier,
+        groups,
+        annotation_col,
+        rejected_col,
+        group_col,
+        method,
+        correction,
+    )
 
     return result
 
@@ -491,17 +501,17 @@ def run_ssgsea(
     max_size: int = 500,
     scale: bool = False,
     permutations: int = 0,
-) -> dict[str, pd.DataFrame]:
+) -> pd.DataFrame:
     """
     Project each sample within a data set onto a space of gene set enrichment scores using
     the single sample gene set enrichment analysis (ssGSEA) projection methodology
     described in Barbie et al., 2009:
     https://www.nature.com/articles/nature08460#Sec3 (search "Single Sample" GSEA).
 
-    :param data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
-    :param annotation: pandas.DataFrame with the annotation to be used in the enrichment
+    :param pd.DataFrame data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
+    :param str annotation: pandas.DataFrame with the annotation to be used in the enrichment
         (i.e. CKG pathway annotation file)
-    :param list set_index: column/s to be used as index. Enrichment will be calculated
+    :param list[str] set_index: column/s to be used as index. Enrichment will be calculated
         for these values (i.e ["subject"] will return subjects x pathways matrix of
         enrichment scores)
     :param str annotation_col: name of the column containing annotation terms.
@@ -514,8 +524,9 @@ def run_ssgsea(
         (i.e. pathways)
     :param bool scale: whether or not to scale the data
     :param int permutations: number of permutations used in the ssgsea analysis
-    :return: dictionary with two dataframes: es - enrichment scores,
-        and nes - normalized enrichment scores.
+    :return: pandas.DataFrame containing unnormalized enrichment scores (`ES`) for each sample,
+        and normalized enrichment scores (`NES`) with the enriched `Term` and sample `Name`.
+    :rtype: pandas.DataFrame
 
     Example::
 
@@ -561,7 +572,7 @@ def run_ssgsea(
         index = data[set_index]
         df = df.drop(set_index, axis=1)
 
-    for i, row in index.iterrows():
+    for _, row in index.iterrows():
         name.append(
             "_".join(row[set_index].tolist())
         )  # this assumes strings as identifiers
@@ -586,7 +597,7 @@ def run_ssgsea(
     fid = uuid.uuid4()
     file_path = os.path.join(outdir, str(fid) + ".gmt")
     with open(file_path, "w", encoding="utf8") as out:
-        for i, row in grouped_annotations.iterrows():
+        for _, row in grouped_annotations.iterrows():
             out.write(
                 row[annotation_col]
                 + "\t"
@@ -606,12 +617,7 @@ def run_ssgsea(
         seed=10,
         format="png",
     )
-
-    enrichment_es = pd.DataFrame(enrichment.results).transpose()  # resultsOnSamples
-    enrichment_es = enrichment_es.join(index)
-    enrichment_nes = enrichment.res2d.transpose()
-    # enrichment_nes = enrichment_nes.join(index) # ! To Fix
-
-    result = {"es": enrichment_es, "nes": enrichment_nes}
-
+    result = pd.DataFrame(enrichment.res2d).set_index("Name")
+    # potentially return wide format in separate format
+    # result = {"es": enrichment_es, "nes": enrichment_nes}
     return result
diff --git a/acore/exploratory_analysis.py b/acore/exploratory_analysis.py
index b1384dd..356e4a9 100644
--- a/acore/exploratory_analysis.py
+++ b/acore/exploratory_analysis.py
@@ -6,13 +6,13 @@
 from sklearn.manifold import TSNE
 
 
-def calculate_coefficient_variation(values):
+def calculate_coefficient_variation(values: np.ndarray) -> np.ndarray:
     """
     Compute the coefficient of variation, the ratio of the biased standard
     deviation to the mean, in percentage. For more information
     visit https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.variation.html.
 
-    :param numpy.ndarray values: numpy array
+    :param numpy.ndarray values: numpy array of log2 transformed values
     :return: The calculated variation along rows.
     :rtype: numpy.ndarray
 
@@ -29,11 +29,13 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]):
     """
     Extracts the coefficients of variation in each group.
 
-    :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
+    :param data: pandas dataframe with samples as rows and protein identifiers as columns
+                 (with additional columns 'group', 'sample' and 'subject').
     :param list drop_columns: column labels to be dropped from the dataframe
     :param str group: column label containing group identifiers.
     :param list columns: names to use for the variable column(s), and for the value column(s)
-    :return: Pandas dataframe with columns 'name' (protein identifier), 'x' (coefficient of variation), 'y' (mean) and 'group'.
+    :return: Pandas dataframe with columns 'name' (protein identifier),
+             'x' (coefficient of variation), 'y' (mean) and 'group'.
 
     Example::
 
@@ -46,24 +48,27 @@ def get_coefficient_variation(data, drop_columns, group, columns=["name", "y"]):
     cvs_df = pd.DataFrame()
     for i in cvs.index:
         gcvs = cvs[i].tolist()
-        ints = formated_df.set_index("group").mean().values.tolist()
+        ints = formated_df.set_index(group).mean().values.tolist()
         tdf = pd.DataFrame(data={"name": cols, "x": gcvs, "y": ints})
-        tdf["group"] = i
+        tdf[group] = i
 
         if cvs_df.empty:
             cvs_df = tdf.copy()
         else:
-            cvs_df = cvs_df.append(tdf)
+            cvs_df = pd.concat([cvs_df, tdf])
 
     return cvs_df
 
 
 def extract_number_missing(data, min_valid, drop_cols=["sample"], group="group"):
     """
-    Counts how many valid values exist in each column and filters column labels with more valid values than the minimum threshold defined.
+    Counts how many valid values exist in each column and filters column labels with more
+    valid values than the minimum threshold defined.
 
     :param data: pandas DataFrame with group as rows and protein identifier as column.
-    :param str group: column label containing group identifiers. If None, number of valid values is counted across all samples, otherwise is counted per unique group identifier.
+    :param str group: column label containing group identifiers.
+                      If None, number of valid values is counted across all samples,
+                      otherwise is counted per unique group identifier.
     :param int min_valid: minimum number of valid values to be filtered.
     :param list drop_columns: column labels to be dropped.
     :return: List of column labels above the threshold.
@@ -89,16 +94,21 @@ def extract_percentage_missing(
     data, missing_max, drop_cols=["sample"], group="group", how="all"
 ):
     """
-    Extracts ratio of missing/valid values in each column and filters column labels with lower ratio than the minimum threshold defined.
+    Extracts ratio of missing/valid values in each column and filters column labels with
+    lower ratio than the minimum threshold defined.
 
     :param data: pandas dataframe with group as rows and protein identifier as column.
-    :param str group: column label containing group identifiers. If None, ratio is calculated across all samples, otherwise is calculated per unique group identifier.
+    :param str group: column label containing group identifiers.
+                      If None, ratio is calculated across all samples,
+                      otherwise is calculated per unique group identifier.
     :param float missing_max: maximum ratio of missing/valid values to be filtered.
-    :param str how: define if labels with a higher percentage of missing values than the threshold in any group ('any') or in all groups ('all') should be filtered
+    :param str how: define if labels with a higher percentage of missing values than the threshold
+                    in any group ('any') or in all groups ('all') should be filtered
     :return: List of column labels below the threshold.
 
     Example::
-        result = extract_percentage_missing(data, missing_max=0.3, drop_cols=['sample'], group='group')
+        result = extract_percentage_missing(data, missing_max=0.3,
+                                            drop_cols=['sample'], group='group')
     """
     if group is None:
         groups = data.loc[:, data.isnull().mean() <= missing_max].columns
@@ -128,81 +138,86 @@ def run_pca(
     dropna=True,
 ):
     """
-    Performs principal component analysis and returns the values of each component for each sample and each protein, and the loadings for each protein. \
-    For information visit https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html.
+    Performs principal component analysis and returns the values of each component for each sample
+    and each protein, and the loadings for each protein.
 
-    :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
+    For information visit
+    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html.
+
+    :param data: pandas dataframe with samples as rows and protein identifiers as columns
+                 (with additional columns 'group', 'sample' and 'subject').
     :param list drop_cols: column labels to be dropped from the dataframe.
     :param str group: column label containing group identifiers.
     :param list annotation_cols: list of columns to be added in the scatter plot annotation
     :param int components: number of components to keep.
     :param bool dropna: if True removes all columns with any missing values.
-    :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2) xaxis and yaxis titles with components loadings for plotly.
+    :return: tuple: 1) three pandas dataframes: components, loadings and variance; 2)
+             xaxis and yaxis titles with components loadings for plotly.
 
     Example::
 
-        result = run_pca(data, drop_cols=['sample', 'subject'], group='group', components=2, dropna=True)
+        result = run_pca(data, drop_cols=['sample', 'subject'], group='group',
+                         components=2, dropna=True)
     """
 
     np.random.seed(112736)
-    resultDf = pd.DataFrame()
-    loadings = pd.DataFrame()
     var_exp = []
     args = {}
-    if not data.empty:
-        df = data.copy()
-        annotations = pd.DataFrame()
-        if annotation_cols is not None:
-            if len(list(set(annotation_cols).intersection(data.columns))) > 0:
-                annotations = data.set_index(group)[annotation_cols]
-        drop_cols_int = list(set(drop_cols).intersection(df.columns))
-        if len(drop_cols_int) > 0:
-            df = df.drop(drop_cols_int, axis=1)
-
-        y = df[group].tolist()
-        df = df.set_index(group)
-        df = df.select_dtypes(["number"])
-        if dropna:
-            df = df.dropna(axis=1)
-        X = df.values
-
-        if X.size > 0 and X.shape[1] > components:
-            pca = PCA(n_components=components)
-            X = pca.fit_transform(X)
-            var_exp = pca.explained_variance_ratio_
-            loadings = pd.DataFrame(pca.components_.transpose())
-            loadings.index = df.columns
-            values = {
-                index: np.sqrt(np.power(row, 2).sum())
-                for index, row in loadings.iterrows()
-            }
-            loadings["value"] = loadings.index.map(values.get)
-            loadings = loadings.sort_values(by="value", ascending=False)
-            args = {
-                "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]),
-                "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]),
-                "group": "group",
-            }
-            if components == 2:
-                resultDf = pd.DataFrame(X, index=y, columns=["x", "y"])
-                resultDf = resultDf.assign(**annotations)
-                resultDf = resultDf.reset_index()
-                resultDf.columns = ["group", "x", "y"] + annotation_cols
-
-                loadings.columns = ["x", "y", "value"]
-            if components > 2:
-                args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])})
-                resultDf = pd.DataFrame(X, index=y)
-                resultDf = resultDf.assign(**annotations)
-                resultDf = resultDf.reset_index()
-                pca_cols = []
-                loading_cols = []
-                if components > 3:
-                    pca_cols = [str(i) for i in resultDf.columns[4:]]
-                    loading_cols = [str(i) for i in loadings.columns[3:]]
-
-                resultDf.columns = ["group", "x", "y", "z"] + pca_cols
-                loadings.columns = ["x", "y", "z"] + loading_cols
+    if data.empty:
+        raise ValueError("Dataframe is empty.")
+
+    df = data.copy()
+    annotations = pd.DataFrame()
+    if annotation_cols is not None:
+        if len(list(set(annotation_cols).intersection(data.columns))) > 0:
+            annotations = data.set_index(group)[annotation_cols]
+    drop_cols_int = list(set(drop_cols).intersection(df.columns))
+    if len(drop_cols_int) > 0:
+        df = df.drop(drop_cols_int, axis=1)
+
+    y = df[group].tolist()
+    df = df.set_index(group)
+    df = df.select_dtypes(["number"])
+    if dropna:
+        df = df.dropna(axis=1)
+    X = df.values
+
+    if X.size > 0 and X.shape[1] > components:
+        pca = PCA(n_components=components)
+        X = pca.fit_transform(X)
+        var_exp = pca.explained_variance_ratio_
+        loadings = pd.DataFrame(pca.components_.transpose())
+        loadings.index = df.columns
+        values = {
+            index: np.sqrt(np.power(row, 2).sum()) for index, row in loadings.iterrows()
+        }
+        loadings["value"] = loadings.index.map(values.get)
+        loadings = loadings.sort_values(by="value", ascending=False)
+        args = {
+            "x_title": "PC1" + " ({0:.2f})".format(var_exp[0]),
+            "y_title": "PC2" + " ({0:.2f})".format(var_exp[1]),
+            "group": "group",
+        }
+        if components == 2:
+            resultDf = pd.DataFrame(X, index=y, columns=["x", "y"])
+            resultDf = resultDf.assign(**annotations)
+            resultDf = resultDf.reset_index()
+            resultDf.columns = ["group", "x", "y"] + annotation_cols
+
+            loadings.columns = ["x", "y", "value"]
+        if components > 2:
+            args.update({"z_title": "PC3" + " ({0:.2f})".format(var_exp[2])})
+            resultDf = pd.DataFrame(X, index=y)
+            resultDf = resultDf.assign(**annotations)
+            resultDf = resultDf.reset_index()
+            pca_cols = []
+            loading_cols = []
+            if components > 3:
+                pca_cols = [str(i) for i in resultDf.columns[4:]]
+                loading_cols = [str(i) for i in loadings.columns[3:]]
+
+            resultDf.columns = ["group", "x", "y", "z"] + pca_cols
+            loadings.columns = ["x", "y", "z"] + loading_cols
 
     return (resultDf, loadings, var_exp), args
 
@@ -219,22 +234,39 @@ def run_tsne(
     dropna=True,
 ):
     """
-    Performs t-distributed Stochastic Neighbor Embedding analysis. For more information visit https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html.
+    Performs t-distributed Stochastic Neighbor Embedding analysis.
+
+    For more information visit
+    https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html.
 
-    :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
+    :param data: pandas dataframe with samples as rows and protein identifiers as columns
+                 (with additional columns 'group', 'sample' and 'subject').
     :param list drop_cols: column labels to be dropped from the dataframe.
     :param str group: column label containing group identifiers.
     :param int components: dimension of the embedded space.
     :param list annotation_cols: list of columns to be added in the scatter plot annotation
-    :param int perplexity: related to the number of nearest neighbors that is used in other manifold learning algorithms. Consider selecting a value between 5 and 50.
+    :param int perplexity: related to the number of nearest neighbors that is used
+                           in other manifold learning algorithms.
+                           Consider selecting a value between 5 and 50.
     :param int n_iter: maximum number of iterations for the optimization (at least 250).
-    :param str init: initialization of embedding ('random', 'pca' or numpy array of shape n_samples x n_components).
+    :param str init: initialization of embedding ('random', 'pca' or
+                     numpy array of shape n_samples x n_components).
     :param bool dropna: if True removes all columns with any missing values.
-    :return: Two dictionaries: 1) pandas dataframe with embedding vectors, 2) xaxis and yaxis titles for plotly.
+    :return: Two dictionaries:
+                1) pandas dataframe with embedding vectors,
+                2) xaxis and yaxis titles for plotly.
 
     Example::
 
-        result = run_tsne(data, drop_cols=['sample', 'subject'], group='group', components=2, perplexity=40, n_iter=1000, init='pca', dropna=True)
+        result = run_tsne(data,
+                          drop_cols=['sample', 'subject'],
+                          group='group',
+                          components=2,
+                          perplexity=40,
+                          n_iter=1000,
+                          init='pca',
+                          dropna=True
+                        )
     """
     result = {}
     args = {}
@@ -256,7 +288,7 @@ def run_tsne(
             n_components=components,
             verbose=0,
             perplexity=perplexity,
-            max_iter=n_iter,
+            n_iter=n_iter,
             init=init,
         )
         X = tsne.fit_transform(X)
@@ -289,21 +321,34 @@ def run_umap(
     dropna=True,
 ):
     """
-    Performs Uniform Manifold Approximation and Projection. For more information vist https://umap-learn.readthedocs.io.
+    Performs Uniform Manifold Approximation and Projection.
+
+    For more information vist https://umap-learn.readthedocs.io.
 
-    :param data: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
+    :param data: pandas dataframe with samples as rows and protein identifiers as columns
+                 (with additional columns 'group', 'sample' and 'subject').
     :param list drop_cols: column labels to be dropped from the dataframe.
     :param str group: column label containing group identifiers.
     :param list annotation_cols: list of columns to be added in the scatter plot annotation
-    :param int n_neighbors: number of neighboring points used in local approximations of manifold structure.
+    :param int n_neighbors: number of neighboring points used
+                            in local approximations of manifold structure.
     :param float min_dist: controls how tightly the embedding is allowed compress points together.
     :param str metric: metric used to measure distance in the input space.
     :param bool dropna: if True removes all columns with any missing values.
-    :return: Two dictionaries: 1) pandas dataframe with embedding of the training data in low-dimensional space, 2) xaxis and yaxis titles for plotly.
+    :return: Two dictionaries:
+                1) pandas dataframe with embedding of the training data in low-dimensional space,
+                2) xaxis and yaxis titles for plotly.
 
     Example::
 
-        result = run_umap(data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True)
+        result = run_umap(data,
+                          drop_cols=['sample', 'subject'],
+                          group='group',
+                          n_neighbors=10,
+                          min_dist=0.3,
+                          metric='cosine',
+                          dropna=True
+                        )
     """
     np.random.seed(1145536)
     result = {}
diff --git a/acore/network_analysis.py b/acore/network_analysis.py
index 6af038e..50f9376 100644
--- a/acore/network_analysis.py
+++ b/acore/network_analysis.py
@@ -2,10 +2,11 @@
 import networkx as nx
 import pandas as pd
 import snf
-import utils
 from sklearn import cluster
 from sklearn.cluster import AffinityPropagation
 
+from . import utils
+
 
 def get_network_communities(graph, args):
     """
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 1f39dfa..5ed9eca 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -297,6 +297,32 @@
     "ret"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "834d5a85",
+   "metadata": {},
+   "source": [
+    "### For up- and downregulated genes separately"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ecf75e7c",
+   "metadata": {},
+   "source": [
+    "### Site specific enrichment analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ee3a7c8",
+   "metadata": {},
+   "source": [
+    "The basic example uses a modified peptide sequence to\n",
+    "demonstrate the enrichment analysis.\n",
+    "- compare groups per amino acid modified (kinases targeting certain motifs?)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -304,23 +330,157 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res_dict = acore.enrichment_analysis.run_ssgsea(\n",
+    "import re\n",
+    "\n",
+    "regex = \"(\\\\w+~.+)_\\\\w\\\\d+\\\\-\\\\w+\"\n",
+    "identifier_ckg = \"gnd~P00350_T10-WW\"\n",
+    "match = re.search(regex, identifier_ckg)\n",
+    "match.group(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b9e2fddf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seq_mod = \"_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR\"\n",
+    "regex = \"\\(\\w\\w\\)\"\n",
+    "matches = re.findall(regex, seq_mod)\n",
+    "matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7eab37e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# acore.enrichment_analysis.run_up_down_regulation_enrichment("
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dbc34b7b",
+   "metadata": {},
+   "source": [
+    "## Single sample GSEA (ssGSEA)\n",
+    "Run a gene set enrichment analysis (GSEA) for each sample,\n",
+    "see [article](https://www.nature.com/articles/nature08460#Sec3) and\n",
+    "the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)\n",
+    "for more details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e211bf65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichtments = acore.enrichment_analysis.run_ssgsea(\n",
     "    data=df_omics,\n",
     "    annotation=annotations,\n",
     "    min_size=1,\n",
     ")\n",
-    "print(res_dict.keys())\n",
-    "res_dict[\"es\"]"
+    "enrichtments"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b9e2fddf",
+   "id": "0fd08e79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichtments.iloc[0].to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca9e4908",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enrichtments[\"NES\"].plot.hist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b9a50cc",
+   "metadata": {},
+   "source": [
+    "The normalised enrichment score (NES) can be used in a PCA plot to see if the samples\n",
+    "cluster according to the enrichment of the gene sets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d917326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nes = enrichtments.set_index(\"Term\", append=True).unstack()[\"NES\"].convert_dtypes()\n",
+    "nes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02dbec64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import acore.exploratory_analysis as ea\n",
+    "\n",
+    "pca_result, pca_annotation = ea.run_pca(\n",
+    "    data=nes.join(df_meta[[group]]),\n",
+    "    drop_cols=[],\n",
+    "    annotation_cols=[],\n",
+    "    group=group,\n",
+    "    components=2,\n",
+    "    dropna=False,\n",
+    ")\n",
+    "pca_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28464b86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from plotly.offline import iplot\n",
+    "# from vuecore import viz\n",
+    "\n",
+    "# args = {\"factor\": 1, \"loadings\": 10}\n",
+    "# #! pca_results has three items, but docstring requests only two -> double check\n",
+    "# figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n",
+    "# iplot(figure)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5d0c1eba",
+   "metadata": {},
+   "source": [
+    "## Compare two distributions - KS test\n",
+    "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c66fcef",
    "metadata": {},
    "outputs": [],
    "source": [
-    "res_dict[\"nes\"]"
+    "# plot two histograms of intensity values here"
    ]
   },
   {
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 64af662..e4ccb77 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -155,16 +155,90 @@
 )
 ret
 
+# %% [markdown]
+# ### For up- and downregulated genes separately
+
+# %% [markdown]
+# ### Site specific enrichment analysis
+
+# %% [markdown]
+# The basic example uses a modified peptide sequence to
+# demonstrate the enrichment analysis.
+# - compare groups per amino acid modified (kinases targeting certain motifs?)
+
+# %%
+import re
+
+regex = "(\\w+~.+)_\\w\\d+\\-\\w+"
+identifier_ckg = "gnd~P00350_T10-WW"
+match = re.search(regex, identifier_ckg)
+match.group(1)
+
 # %%
-res_dict = acore.enrichment_analysis.run_ssgsea(
+seq_mod = "_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR"
+regex = "\(\w\w\)"
+matches = re.findall(regex, seq_mod)
+matches
+
+# %%
+# acore.enrichment_analysis.run_up_down_regulation_enrichment(
+
+# %% [markdown]
+# ## Single sample GSEA (ssGSEA)
+# Run a gene set enrichment analysis (GSEA) for each sample,
+# see [article](https://www.nature.com/articles/nature08460#Sec3) and
+# the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)
+# for more details.
+
+# %%
+enrichtments = acore.enrichment_analysis.run_ssgsea(
     data=df_omics,
     annotation=annotations,
     min_size=1,
 )
-print(res_dict.keys())
-res_dict["es"]
+enrichtments
+
+# %%
+enrichtments.iloc[0].to_dict()
+
+# %%
+enrichtments["NES"].plot.hist()
+
+# %% [markdown]
+# The normalised enrichment score (NES) can be used in a PCA plot to see if the samples
+# cluster according to the enrichment of the gene sets.
+
+# %%
+nes = enrichtments.set_index("Term", append=True).unstack()["NES"].convert_dtypes()
+nes
+
+# %%
+import acore.exploratory_analysis as ea
+
+pca_result, pca_annotation = ea.run_pca(
+    data=nes.join(df_meta[[group]]),
+    drop_cols=[],
+    annotation_cols=[],
+    group=group,
+    components=2,
+    dropna=False,
+)
+pca_result
+
+# %%
+# from plotly.offline import iplot
+# from vuecore import viz
+
+# args = {"factor": 1, "loadings": 10}
+# # #! pca_results has three items, but docstring requests only two -> double check
+# figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args)
+# iplot(figure)
+
+# %% [markdown]
+# ## Compare two distributions - KS test
+# The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.
 
 # %%
-res_dict["nes"]
+# plot two histograms of intensity values here
 
 # %%

From 52ce9c5d113f5fc54514b0b1a3af05a12f3cf021 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 16 Dec 2024 20:44:06 +0100
Subject: [PATCH 31/40] :art: run CI only on PR

---
 .github/workflows/tox-gha.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tox-gha.yml b/.github/workflows/tox-gha.yml
index 696456d..6cf0ec4 100644
--- a/.github/workflows/tox-gha.yml
+++ b/.github/workflows/tox-gha.yml
@@ -2,10 +2,12 @@ name: Python package
 
 on:
   push:
+    branches: [main]
   pull_request:
-  # release:
-  #   types: [published]
-
+    branches: [main]
+  release:
+    types: [published]
+    
 jobs:
   build:
     runs-on: ubuntu-latest

From 47bcdbd3329a5a02c0502004be8b9d421ec86aba Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 17 Dec 2024 14:58:02 +0100
Subject: [PATCH 32/40] :art: format tests

---
 tests/test_differential_regulation.py | 147 +++++++++++++++--------
 tests/test_exploratory.py             | 160 ++++++++++++++++++--------
 2 files changed, 211 insertions(+), 96 deletions(-)

diff --git a/tests/test_differential_regulation.py b/tests/test_differential_regulation.py
index 2e13f92..169bafa 100644
--- a/tests/test_differential_regulation.py
+++ b/tests/test_differential_regulation.py
@@ -7,56 +7,85 @@
 class TestCalculateTtest(unittest.TestCase):
     def setUp(self):
         self.data = {
-            'subject': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
-            'sample': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12'],
-            'group': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
-            'protein': [1.4, 6.2, 9.03, 2.3, 7.4, 14.01, 4.5, 9.6, 10.4, 7.5, 11.6, 16.4],
-            'group2': ['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b'],
+            "subject": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
+            "sample": [
+                "S1",
+                "S2",
+                "S3",
+                "S4",
+                "S5",
+                "S6",
+                "S7",
+                "S8",
+                "S9",
+                "S10",
+                "S11",
+                "S12",
+            ],
+            "group": ["A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"],
+            "protein": [
+                1.4,
+                6.2,
+                9.03,
+                2.3,
+                7.4,
+                14.01,
+                4.5,
+                9.6,
+                10.4,
+                7.5,
+                11.6,
+                16.4,
+            ],
+            "group2": ["a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"],
         }
         self.df = pd.DataFrame(self.data)
 
     def test_ttest_ind(self):
         data = {
-            'group1': [1.3, 1.2, 0.2, 2.2, 3.3, 8.43],
-            'group2': [1.4, 6.2, 7.3, 0.4, 1.5, 0.6]
+            "group1": [1.3, 1.2, 0.2, 2.2, 3.3, 8.43],
+            "group2": [1.4, 6.2, 7.3, 0.4, 1.5, 0.6],
         }
         df = pd.DataFrame(data)
 
-        condition1 = 'group1'
-        condition2 = 'group2'
+        condition1 = "group1"
+        condition2 = "group2"
         is_logged = True
         non_par = False
 
         expected_result = pg.ttest(df[condition1], df[condition2], paired=False)
-        result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par)
+        result = dr.calculate_ttest(
+            df, condition1, condition2, is_logged=is_logged, non_par=non_par
+        )
 
         self.assertAlmostEqual(result[0], expected_result["T"].values[0])
         self.assertAlmostEqual(result[1], expected_result["p-val"].values[0])
 
     def test_mann_whitney(self):
-        data = {
-            'group1': [1, 2, 3, 4, 5],
-            'group2': [2, 4, 6, 8, 10]
-        }
+        data = {"group1": [1, 2, 3, 4, 5], "group2": [2, 4, 6, 8, 10]}
         df = pd.DataFrame(data)
 
-        condition1 = 'group1'
-        condition2 = 'group2'
+        condition1 = "group1"
+        condition2 = "group2"
         is_logged = True
         non_par = True
 
         expected_result = pg.mwu(df[condition1], df[condition2])
-        result = dr.calculate_ttest(df, condition1, condition2, is_logged=is_logged, non_par=non_par)
+        result = dr.calculate_ttest(
+            df, condition1, condition2, is_logged=is_logged, non_par=non_par
+        )
 
         self.assertAlmostEqual(result[0], expected_result["U-val"].values[0])
         self.assertAlmostEqual(result[1], expected_result["p-val"].values[0])
 
     def test_calculate_anova(self):
-        column = 'protein'
-        group = 'group'
+        column = "protein"
+        group = "group"
 
         expected_result = pg.anova(data=self.df, dv=column, between=group)
-        expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[['F', 'ddof1', 'ddof2', 'p-unc']].values[0]
+        expected_t, expected_df1, expected_df2, expected_pvalue = expected_result[
+            ["F", "ddof1", "ddof2", "p-unc"]
+        ].values[0]
 
         result = dr.calculate_anova(self.df, column, group=group)
 
@@ -66,14 +95,20 @@ def test_calculate_anova(self):
         self.assertEqual(result[4], expected_pvalue)
 
     def test_calculate_ancova(self):
-        column = 'protein'
-        group = 'group'
+        column = "protein"
+        group = "group"
         covariates = []
 
-        expected_result = pg.ancova(data=self.df, dv=column, between=group, covar=covariates)
-        expected_t, expected_df, expected_pvalue = expected_result.loc[expected_result['Source'] == group, ['F', 'DF', 'p-unc']].values[0]
+        expected_result = pg.ancova(
+            data=self.df, dv=column, between=group, covar=covariates
+        )
+        expected_t, expected_df, expected_pvalue = expected_result.loc[
+            expected_result["Source"] == group, ["F", "DF", "p-unc"]
+        ].values[0]
 
-        result = dr.calculate_ancova(self.df, column, group=group, covariates=covariates)
+        result = dr.calculate_ancova(
+            self.df, column, group=group, covariates=covariates
+        )
 
         self.assertEqual(result[1], expected_df)
         self.assertEqual(result[2], expected_df)
@@ -82,16 +117,27 @@ def test_calculate_ancova(self):
 
     def test_calculate_repeated_measures_anova(self):
         """Source     SS  DF      MS         F     p-unc       ng2  eps
-0  group   1.50   1   1.500  0.087642  0.795106  0.032609  1.0
-1  Error  34.23   2  17.115       NaN       NaN       NaN  NaN"""
-        column = 'protein'
-        subject = 'subject'
-        within = 'group'
-
-        expected_result = pg.rm_anova(data=self.df, dv=column, within=within, subject=subject, detailed=True, correction=True)
-        expected_t, expected_pvalue = expected_result.loc[0, ["F", "p-unc"]].values.tolist()
+        0  group   1.50   1   1.500  0.087642  0.795106  0.032609  1.0
+        1  Error  34.23   2  17.115       NaN       NaN       NaN  NaN"""
+        column = "protein"
+        subject = "subject"
+        within = "group"
+
+        expected_result = pg.rm_anova(
+            data=self.df,
+            dv=column,
+            within=within,
+            subject=subject,
+            detailed=True,
+            correction=True,
+        )
+        expected_t, expected_pvalue = expected_result.loc[
+            0, ["F", "p-unc"]
+        ].values.tolist()
         expected_df1, expected_df2 = expected_result["DF"]
-        result = dr.calculate_repeated_measures_anova(self.df, column, subject=subject, within=within)
+        result = dr.calculate_repeated_measures_anova(
+            self.df, column, subject=subject, within=within
+        )
 
         self.assertEqual(result[1], expected_df1)
         self.assertEqual(result[2], expected_df2)
@@ -99,19 +145,30 @@ def test_calculate_repeated_measures_anova(self):
         self.assertEqual(result[4], expected_pvalue)
 
     def test_calculate_mixed_anova(self):
-        column = 'protein'
-        subject = 'subject'
-        within = 'group'
-        between = 'group2'
-
-        expected_result = pg.mixed_anova(data=self.df, dv=column, within=within, between=between, subject=subject, correction=True)
-        expected_result['identifier'] = column
-        expected_result = expected_result[['identifier', 'DF1', 'DF2', 'F', 'p-unc', 'Source']]
-
-        result = dr.calculate_mixed_anova(self.df, column, subject=subject, within=within, between=between)
+        column = "protein"
+        subject = "subject"
+        within = "group"
+        between = "group2"
+
+        expected_result = pg.mixed_anova(
+            data=self.df,
+            dv=column,
+            within=within,
+            between=between,
+            subject=subject,
+            correction=True,
+        )
+        expected_result["identifier"] = column
+        expected_result = expected_result[
+            ["identifier", "DF1", "DF2", "F", "p-unc", "Source"]
+        ]
+
+        result = dr.calculate_mixed_anova(
+            self.df, column, subject=subject, within=within, between=between
+        )
 
         self.assertEqual(result.values.tolist(), expected_result.values.tolist())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_exploratory.py b/tests/test_exploratory.py
index ef84e3a..97dd673 100644
--- a/tests/test_exploratory.py
+++ b/tests/test_exploratory.py
@@ -1,94 +1,152 @@
-import unittest
 import platform
+import unittest
+
 import pandas as pd
+
 import acore.exploratory_analysis as ea
 
 
 class TestExtractMissing(unittest.TestCase):
     def setUp(self):
-        self.data = pd.DataFrame({
-            'group': ['A', 'A', 'B', 'B'],
-            'sample': ['S1', 'S2', 'S3', 'S4'],
-            'protein1': [1.4, 2.2, None, 4.2],
-            'protein2': [5.6, None, None, 8.1],
-            'protein3': [9.1, 10.01, 11.2, 12.9]
-        })
+        self.data = pd.DataFrame(
+            {
+                "group": ["A", "A", "B", "B"],
+                "sample": ["S1", "S2", "S3", "S4"],
+                "protein1": [1.4, 2.2, None, 4.2],
+                "protein2": [5.6, None, None, 8.1],
+                "protein3": [9.1, 10.01, 11.2, 12.9],
+            }
+        )
 
     def test_extract_number_missing(self):
         min_valid = 2
-        expected_result = ['protein1', 'protein3']
+        expected_result = ["protein1", "protein3"]
 
-        result = ea.extract_number_missing(self.data, min_valid, drop_cols=['sample'], group='group')
+        result = ea.extract_number_missing(
+            self.data, min_valid, drop_cols=["sample"], group="group"
+        )
 
         self.assertEqual(result, expected_result)
 
     def test_extract_percentage_missing(self):
         missing_max = 0.2
-        expected_result = ['protein1', 'protein3']
+        expected_result = ["protein1", "protein3"]
 
-        result = ea.extract_percentage_missing(self.data, missing_max, drop_cols=['sample'], group='group')
+        result = ea.extract_percentage_missing(
+            self.data, missing_max, drop_cols=["sample"], group="group"
+        )
 
         self.assertEqual(result, expected_result)
 
 
 class TestDimensionalityReduction(unittest.TestCase):
     def setUp(self):
-        self.data = pd.DataFrame({
-            'group': ['A', 'A', 'B', 'B'],
-            'protein1': [1.4, 2.2, 5.3, 4.2],
-            'protein2': [5.6, 0.3, 2.1, 8.1],
-            'protein3': [9.1, 10.01, 11.2, 12.9]
-        })
+        self.data = pd.DataFrame(
+            {
+                "group": ["A", "A", "B", "B"],
+                "protein1": [1.4, 2.2, 5.3, 4.2],
+                "protein2": [5.6, 0.3, 2.1, 8.1],
+                "protein3": [9.1, 10.01, 11.2, 12.9],
+            }
+        )
 
     def test_run_pca(self):
         components = 2
         expected_result = (
-            pd.DataFrame({
-                'group': ['A', 'A', 'B', 'B'],
-                'x': [0.877154, -3.883458, -1.550791, 4.557095],
-                'y': [-2.815737, -0.420979, 2.278209, 0.958507],
-            }),
-            pd.DataFrame({
-                'x': [0.958489, 0.092382, 0.269749],
-                'y': [-0.235412, 0.790179, 0.565861],
-                'value': [0.986975, 0.795561, 0.626868]
-            },
-                index=['protein2', 'protein1', 'protein3']),
-            pd.Series([0.71760534, 0.26139782])
+            pd.DataFrame(
+                {
+                    "group": ["A", "A", "B", "B"],
+                    "x": [0.877154, -3.883458, -1.550791, 4.557095],
+                    "y": [-2.815737, -0.420979, 2.278209, 0.958507],
+                }
+            ),
+            pd.DataFrame(
+                {
+                    "x": [0.958489, 0.092382, 0.269749],
+                    "y": [-0.235412, 0.790179, 0.565861],
+                    "value": [0.986975, 0.795561, 0.626868],
+                },
+                index=["protein2", "protein1", "protein3"],
+            ),
+            pd.Series([0.71760534, 0.26139782]),
+        )
+        exp_dict = {"x_title": "PC1 (0.72)", "y_title": "PC2 (0.26)", "group": "group"}
+        result, annotation = ea.run_pca(
+            self.data,
+            drop_cols=[],
+            annotation_cols=[],
+            group="group",
+            components=components,
+            dropna=True,
         )
-        exp_dict = {'x_title': 'PC1 (0.72)',
-                    'y_title': 'PC2 (0.26)', 'group': 'group'}
-        result, annotation = ea.run_pca(self.data, drop_cols=[], annotation_cols=[], group='group', components=components, dropna=True)
         assert annotation == exp_dict
         # results: tuple of 3 DFs: components, loadings, explained_variance
         pd.testing.assert_frame_equal(result[0], expected_result[0], check_exact=False)
         pd.testing.assert_frame_equal(result[1], expected_result[1], check_exact=False)
-        pd.testing.assert_series_equal(pd.Series(result[2]), expected_result[2], check_exact=False, check_dtype=False)
-        
+        pd.testing.assert_series_equal(
+            pd.Series(result[2]),
+            expected_result[2],
+            check_exact=False,
+            check_dtype=False,
+        )
+
     def test_run_tsne(self):
         components = 2
         # ! MAC ARM64 yields these results (using the similar installation)
-        expected_result = pd.DataFrame({
-            'group': ['A', 'A', 'B', 'B'],
-            'x': [-113.341728, 36.020717, 57.2992514, -134.729141],
-            'y': [131.169082, -59.616630, 110.601203, -39.086254]
-            })
+        expected_result = pd.DataFrame(
+            {
+                "group": ["A", "A", "B", "B"],
+                "x": [-113.341728, 36.020717, 57.2992514, -134.729141],
+                "y": [131.169082, -59.616630, 110.601203, -39.086254],
+            }
+        )
         # ! Ubuntu x64 yields these results:
-        if not platform.machine() == 'arm64':
-            expected_result = pd.DataFrame({
-                'group': ['A', 'A', 'B', 'B'],
-                'x': [-27.375560760498047, -7.526909828186035, -28.731882095336914, -8.872751235961914],
-                'y': [1.547648310661316, 0.17019487917423248, -18.321491241455078, -19.713821411132812]
-            })
-        result, _ = ea.run_tsne(self.data, drop_cols=['sample', 'subject'], group='group', components=components, perplexity=3, n_iter=1000, init='pca', dropna=True)
-        result = result['tsne']
-        pd.testing.assert_frame_equal(result, expected_result, check_exact=False, check_dtype=False)
+        if not platform.machine() == "arm64":
+            expected_result = pd.DataFrame(
+                {
+                    "group": ["A", "A", "B", "B"],
+                    "x": [
+                        -27.375560760498047,
+                        -7.526909828186035,
+                        -28.731882095336914,
+                        -8.872751235961914,
+                    ],
+                    "y": [
+                        1.547648310661316,
+                        0.17019487917423248,
+                        -18.321491241455078,
+                        -19.713821411132812,
+                    ],
+                }
+            )
+        result, _ = ea.run_tsne(
+            self.data,
+            drop_cols=["sample", "subject"],
+            group="group",
+            components=components,
+            perplexity=3,
+            n_iter=1000,
+            init="pca",
+            dropna=True,
+        )
+        result = result["tsne"]
+        pd.testing.assert_frame_equal(
+            result, expected_result, check_exact=False, check_dtype=False
+        )
 
     def test_run_umap(self):
-        _ = ea.run_umap(self.data, drop_cols=['sample', 'subject'], group='group', n_neighbors=10, min_dist=0.3, metric='cosine', dropna=True)
+        _ = ea.run_umap(
+            self.data,
+            drop_cols=["sample", "subject"],
+            group="group",
+            n_neighbors=10,
+            min_dist=0.3,
+            metric="cosine",
+            dropna=True,
+        )
 
         self.assertEqual(1, 1)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()

From 7acf4b3136ba611030673aebb5aa4aa03fee97e9 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 17 Dec 2024 15:59:40 +0100
Subject: [PATCH 33/40] :art: add up and down regulated example, clean-up
 function and annotate

---
 acore/enrichment_analysis.py                | 115 +++++++++++---------
 docs/api_examples/enrichment_analysis.ipynb |  16 +++
 docs/api_examples/enrichment_analysis.py    |   9 ++
 3 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index ab277e6..c67ac09 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -161,28 +161,33 @@ def run_site_regulation_enrichment(
 
 
 def run_up_down_regulation_enrichment(
-    regulation_data,
-    annotation,
-    identifier="identifier",
-    groups=("group1", "group2"),
-    annotation_col="annotation",
-    # rejected_col="rejected",
-    group_col="group",
-    method="fisher",
-    correction="fdr_bh",
-    alpha=0.05,
-    lfc_cutoff=1,
-):
+    regulation_data: pd.DataFrame,
+    annotation: pd.DataFrame,
+    identifier: str = "identifier",
+    groups: list[str] = ("group1", "group2"),
+    annotation_col: str = "annotation",
+    # rejected_col:str="rejected",
+    group_col: str = "group",
+    method: str = "fisher",
+    min_detected_in_set: int = 2,
+    correction: str = "fdr_bh",
+    correction_alpha: float = 0.05,
+    lfc_cutoff: float = 1,
+) -> pd.DataFrame:
     """
     This function runs a simple enrichment analysis for significantly regulated proteins
     distinguishing between up- and down-regulated.
 
-    :param regulation_data: pandas.DataFrame resulting from differential regulation
+    :param pandas.DataFrame regulation_data: pandas.DataFrame resulting from differential regulation
         analysis (CKG's regulation table).
-    :param annotation: pandas.DataFrame with annotations for features
+    :param pandas.DataFrame annotation: pandas.DataFrame with annotations for features
         (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
     :param str identifier: name of the column from annotation containing feature identifiers.
-    :param list groups: column names from regulation_data containing group identifiers.
+    :param list[str] groups: column names from regulation_data containing group identifiers.
+            See `pandas.DataFrame.groupby`_ for more information.
+            
+            .. _pandas.DataFrame.groupby: https://pandas.pydata.org/pandas-docs/stable/\
+reference/api/pandas.DataFrame.groupby.html
     :param str annotation_col: name of the column from annotation containing annotation terms.
     :param str rejected_col: name of the column from regulation_data containing boolean for
         rejected null hypothesis.
@@ -213,52 +218,54 @@ def run_up_down_regulation_enrichment(
             lfc_cutoff=1,
         )
     """
-    enrichment_results = {}
+    if isinstance(groups, str):
+        groups = [groups]
+    if isinstance(groups, tuple):
+        groups = list(groups)
+    if len(groups) != 2:
+        raise ValueError("groups should be exactly two.")
+
+    ret = list()
+    # In case of multiple comparisons this is used to get all possible combinations
     for g1, g2 in regulation_data.groupby(groups).groups:
+
         df = regulation_data.groupby(groups).get_group((g1, g2))
+
+        padj_name = "padj"
         if "posthoc padj" in df:
-            df["up_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & (
-                df["log2FC"] >= lfc_cutoff
-            )
-            df["down_pairwise_regulation"] = (df["posthoc padj"] <= alpha) & (
-                df["log2FC"] <= -lfc_cutoff
-            )
-        else:
-            df["up_pairwise_regulation"] = (df["padj"] <= alpha) & (
-                df["log2FC"] >= lfc_cutoff
-            )
-            df["down_pairwise_regulation"] = (df["padj"] <= alpha) & (
-                df["log2FC"] <= -lfc_cutoff
-            )
+            padj_name = "posthoc padj"
 
-        enrichment = run_regulation_enrichment(
-            df,
-            annotation,
-            identifier=identifier,
-            annotation_col=annotation_col,
-            rejected_col="up_pairwise_regulation",
-            group_col=group_col,
-            method=method,
-            correction=correction,
+        df["up_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & (
+            df["log2FC"] >= lfc_cutoff
         )
-        enrichment["direction"] = "upregulated"
-        enrichment_results[g1 + "~" + g2] = enrichment
-        enrichment = run_regulation_enrichment(
-            df,
-            annotation,
-            identifier=identifier,
-            annotation_col=annotation_col,
-            rejected_col="down_pairwise_regulation",
-            group_col=group_col,
-            method=method,
-            correction=correction,
-        )
-        enrichment["direction"] = "downregulated"
-        enrichment_results[g1 + "~" + g2] = enrichment_results[g1 + "~" + g2].append(
-            enrichment
+        df["down_pairwise_regulation"] = (df[padj_name] <= correction_alpha) & (
+            df["log2FC"] <= -lfc_cutoff
         )
+        comparison_tag = g1 + "~" + g2
+
+        for rej_col, direction in zip(
+            ("up_pairwise_regulation", "down_pairwise_regulation"),
+            ("upregulated", "downregulated"),
+        ):
+            _enrichment = run_regulation_enrichment(
+                df,
+                annotation,
+                identifier=identifier,
+                annotation_col=annotation_col,
+                rejected_col=rej_col,
+                group_col=group_col,
+                method=method,
+                min_detected_in_set=min_detected_in_set,
+                correction=correction,
+                correction_alpha=correction_alpha,
+            )
+            _enrichment["direction"] = direction
+            _enrichment["comparison"] = comparison_tag
+            ret.append(_enrichment)
+
+    ret = pd.concat(ret)
 
-    return enrichment_results
+    return ret
 
 
 # ! to move
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 5ed9eca..98bb5e5 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -292,6 +292,7 @@
     "ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
     "    regulation_data=diff_reg,\n",
     "    annotation=annotations,\n",
+    "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
     "    correction_alpha=0.01,\n",
     ")\n",
     "ret"
@@ -305,6 +306,21 @@
     "### For up- and downregulated genes separately"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a631495",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
+    ")\n",
+    "ret"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ecf75e7c",
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index e4ccb77..7b4d78a 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -151,6 +151,7 @@
 ret = acore.enrichment_analysis.run_regulation_enrichment(
     regulation_data=diff_reg,
     annotation=annotations,
+    min_detected_in_set=1,  # ! default is 2, so more conservative
     correction_alpha=0.01,
 )
 ret
@@ -158,6 +159,14 @@
 # %% [markdown]
 # ### For up- and downregulated genes separately
 
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=1,  # ! default is 2, so more conservative
+)
+ret
+
 # %% [markdown]
 # ### Site specific enrichment analysis
 

From 436128f6fbfe1a189fd77f3435051c6d3e1f08e7 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 17 Dec 2024 16:38:33 +0100
Subject: [PATCH 34/40] :art: document log fold change, data used and improve
 docstrings

---
 acore/enrichment_analysis.py                |  4 +-
 docs/api_examples/enrichment_analysis.ipynb | 73 ++++++++++++++++++++-
 docs/api_examples/enrichment_analysis.py    | 38 ++++++++++-
 3 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index c67ac09..e06d201 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -199,7 +199,7 @@ def run_up_down_regulation_enrichment(
     :param float alpha: adjusted p-value cutoff to define significance
     :param float lfc_cutoff: log fold-change cutoff to define practical significance
     :return: pandas.DataFrame with columns: 'terms', 'identifiers', 'foreground',
-        'background', 'pvalue', 'padj' and 'rejected'.
+        'background', 'pvalue', 'padj', 'rejected', 'direction' and 'comparison'.
 
     Example::
 
@@ -223,7 +223,7 @@ def run_up_down_regulation_enrichment(
     if isinstance(groups, tuple):
         groups = list(groups)
     if len(groups) != 2:
-        raise ValueError("groups should be exactly two.")
+        raise ValueError("groups should contains exactly two columns.")
 
     ret = list()
     # In case of multiple comparisons this is used to get all possible combinations
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 98bb5e5..1b0375f 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -140,7 +140,10 @@
    "execution_count": null,
    "id": "1145a2cd",
    "metadata": {
-    "lines_to_next_cell": 2
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-input"
+    ]
    },
    "outputs": [],
    "source": [
@@ -206,7 +209,7 @@
    "outputs": [],
    "source": [
     "diff_reg[\"rejected\"] = diff_reg[\"rejected\"].astype(bool)  # ! needs to be fixed in anova\n",
-    "diff_reg.query(\"rejected == True\")"
+    "diff_reg.query(\"rejected\")"
    ]
   },
   {
@@ -262,7 +265,11 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "57fddefb",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "_ = (\n",
@@ -310,6 +317,65 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "1a631495",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "diff_reg.query(\"rejected\")[\n",
+    "    [\n",
+    "        \"identifier\",\n",
+    "        \"group1\",\n",
+    "        \"group2\",\n",
+    "        \"pvalue\",\n",
+    "        \"padj\",\n",
+    "        \"rejected\",\n",
+    "        \"log2FC\",\n",
+    "        \"FC\",\n",
+    "    ]\n",
+    "].sort_values(\"log2FC\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7380a528",
+   "metadata": {},
+   "source": [
+    "- this additionally sets a fold change cutoff\n",
+    "- and the fore and backgroud populations are changed due to the separation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a73ed6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
+    "    lfc_cutoff=1,\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5cb036be",
+   "metadata": {},
+   "source": [
+    "here we see differences for the same set of differently regulated protein groups,\n",
+    "which can be reset using lfc_cutoff=0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e5e2b61",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -317,6 +383,7 @@
     "    regulation_data=diff_reg,\n",
     "    annotation=annotations,\n",
     "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
+    "    lfc_cutoff=0,\n",
     ")\n",
     "ret"
    ]
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 7b4d78a..c48fce2 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -62,7 +62,7 @@
 idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"]
 df_omics[idx_always_included]
 
-# %%
+# %% tags=["hide-input"]
 df_omics = (
     df_omics
     # .dropna(axis=1)
@@ -99,7 +99,7 @@
 
 # %%
 diff_reg["rejected"] = diff_reg["rejected"].astype(bool)  # ! needs to be fixed in anova
-diff_reg.query("rejected == True")
+diff_reg.query("rejected")
 
 # %% [markdown]
 # ## Find functional annotations, here pathways
@@ -134,7 +134,7 @@
 # %% [markdown]
 # See how many protein groups are associated with each annotation.
 
-# %%
+# %% tags=["hide-input"]
 _ = (
     annotations.groupby("annotation")
     .size()
@@ -159,11 +159,43 @@
 # %% [markdown]
 # ### For up- and downregulated genes separately
 
+# %% tags=["hide-input"]
+diff_reg.query("rejected")[
+    [
+        "identifier",
+        "group1",
+        "group2",
+        "pvalue",
+        "padj",
+        "rejected",
+        "log2FC",
+        "FC",
+    ]
+].sort_values("log2FC")
+
+# %% [markdown]
+# - this additionally sets a fold change cutoff
+# - and the fore and backgroud populations are changed due to the separation
+
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=1,  # ! default is 2, so more conservative
+    lfc_cutoff=1,
+)
+ret
+
+# %% [markdown]
+# here we see differences for the same set of differently regulated protein groups,
+# which can be reset using lfc_cutoff=0.
+
 # %%
 ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
     regulation_data=diff_reg,
     annotation=annotations,
     min_detected_in_set=1,  # ! default is 2, so more conservative
+    lfc_cutoff=0,
 )
 ret
 

From 5bdbfb8f65e8f27027953339a21109d128a2d0fb Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 18 Dec 2024 10:50:45 +0100
Subject: [PATCH 35/40] :memo: extend docs with pca plot from vuecore and ks
 example

---
 acore/enrichment_analysis.py                | 11 ++-
 docs/api_examples/enrichment_analysis.ipynb | 80 +++++++++++++++++----
 docs/api_examples/enrichment_analysis.py    | 42 ++++++++---
 3 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index e06d201..0ace7b5 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -29,7 +29,10 @@ def run_fisher(
     group2: list[int],
     alternative: str = "two-sided",
 ) -> tuple[float, float]:
-    """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`.
+    """Run fisher's exact test on two groups using `scipy.stats.fisher_exact`_.
+
+    .. _scipy.stats.fisher_exact: https://docs.scipy.org/doc/scipy/reference/generated/\
+scipy.stats.fisher_exact.html
 
     Example::
 
@@ -51,7 +54,11 @@ def run_fisher(
 def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"):
     """
     Compute the Kolmogorov-Smirnov statistic on 2 samples.
-    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html
+    See `scipy.stats.ks_2samp`_
+
+    .. _scipy.stats.ks_2samp: https://docs.scipy.org/doc/scipy/reference/generated/\
+scipy.stats.ks_2samp.html
+
 
     :param list dist1: sequence of 1-D ndarray (first distribution to compare)
         drawn from a continuous distribution
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 1b0375f..92da200 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -31,7 +31,7 @@
    },
    "outputs": [],
    "source": [
-    "%pip install acore"
+    "%pip install acore vuecore"
    ]
   },
   {
@@ -528,7 +528,16 @@
     "    components=2,\n",
     "    dropna=False,\n",
     ")\n",
-    "pca_result"
+    "resultDf, loadings, var_exp = pca_result\n",
+    "resultDf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acd530d6",
+   "metadata": {},
+   "source": [
+    "The loadings show how the variables are correlated with the principal components."
    ]
   },
   {
@@ -538,13 +547,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# from plotly.offline import iplot\n",
-    "# from vuecore import viz\n",
-    "\n",
-    "# args = {\"factor\": 1, \"loadings\": 10}\n",
-    "# #! pca_results has three items, but docstring requests only two -> double check\n",
-    "# figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n",
-    "# iplot(figure)"
+    "loadings"
    ]
   },
   {
@@ -552,8 +555,10 @@
    "id": "5d0c1eba",
    "metadata": {},
    "source": [
-    "## Compare two distributions - KS test\n",
-    "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions."
+    "We will plot both on the sample plot (samples on the first two principal components and\n",
+    "loadings of variables). We use the\n",
+    "[`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore)\n",
+    "for this, which is also developed by the Multiomics Analytics Group."
    ]
   },
   {
@@ -563,7 +568,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# plot two histograms of intensity values here"
+    "from plotly.offline import iplot\n",
+    "from vuecore import viz\n",
+    "\n",
+    "args = {\"factor\": 1, \"loadings\": 10}\n",
+    "#! pca_results has three items, but docstring requests only two -> double check\n",
+    "figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n",
+    "iplot(figure)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4be5a8c8",
+   "metadata": {},
+   "source": [
+    "## Compare two distributions - KS test\n",
+    "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.\n",
+    "- we compare the distributions of the two differently upregulated protein groups\n",
+    "This is not the best example for comparing distributions, but it shows how to use the\n",
+    "KS test."
    ]
   },
   {
@@ -572,7 +595,38 @@
    "id": "6dd57b99",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# plot two histograms of intensity values here\n",
+    "sel_pgs = [\"O43175\", \"P39059\"]\n",
+    "view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())\n",
+    "view.plot.hist(bins=20, alpha=0.5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a480b65b",
+   "metadata": {},
+   "source": [
+    "Let us compare the two centered distributions using the KS test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0f8e6a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "385b0ceb",
+   "metadata": {},
+   "source": [
+    "The result suggests that the two distributions are from the same distribution."
+   ]
   }
  ],
  "metadata": {
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index c48fce2..1caea9f 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -12,7 +12,7 @@
 
 
 # %% tags=["hide-output"]
-# %pip install acore
+# %pip install acore vuecore
 
 
 # %%
@@ -264,22 +264,48 @@
     components=2,
     dropna=False,
 )
-pca_result
+resultDf, loadings, var_exp = pca_result
+resultDf
+
+# %% [markdown]
+# The loadings show how the variables are correlated with the principal components.
+
+# %%
+loadings
+
+# %% [markdown]
+# We will plot both on the sample plot (samples on the first two principal components and
+# loadings of variables). We use the
+# [`vuecore` package](https://github.com/Multiomics-Analytics-Group/vuecore)
+# for this, which is also developed by the Multiomics Analytics Group.
 
 # %%
-# from plotly.offline import iplot
-# from vuecore import viz
+from plotly.offline import iplot
+from vuecore import viz
 
-# args = {"factor": 1, "loadings": 10}
-# # #! pca_results has three items, but docstring requests only two -> double check
-# figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args)
-# iplot(figure)
+args = {"factor": 1, "loadings": 10}
+# #! pca_results has three items, but docstring requests only two -> double check
+figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args)
+iplot(figure)
 
 # %% [markdown]
 # ## Compare two distributions - KS test
 # The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.
+# - we compare the distributions of the two differently upregulated protein groups
+# This is not the best example for comparing distributions, but it shows how to use the
+# KS test.
 
 # %%
 # plot two histograms of intensity values here
+sel_pgs = ["O43175", "P39059"]
+view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())
+view.plot.hist(bins=20, alpha=0.5)
+
+# %% [markdown]
+# Let us compare the two centered distributions using the KS test.
 
 # %%
+acore.enrichment_analysis.run_kolmogorov_smirnov(view[sel_pgs[0]], view[sel_pgs[1]])
+
+# %% [markdown]
+# The result suggests that the two distributions are from the same distribution.

From ce990386fc9bc8df3ad0b7d4afdddf3e0458d359 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 18 Dec 2024 12:19:26 +0100
Subject: [PATCH 36/40] :art: add changes based on discussion with Alberto

- enrichment only separate for up- and down regulated protein groups
- keep hint on to do for PTM dataset
---
 acore/enrichment_analysis.py                | 24 ++++-----
 docs/api_examples/enrichment_analysis.ipynb | 55 +++++----------------
 docs/api_examples/enrichment_analysis.py    | 35 +++++--------
 3 files changed, 37 insertions(+), 77 deletions(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 0ace7b5..44d4aee 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -85,14 +85,15 @@ def run_kolmogorov_smirnov(dist1, dist2, alternative="two-sided"):
 def run_site_regulation_enrichment(
     regulation_data: pd.DataFrame,
     annotation: pd.DataFrame,
-    identifier="identifier",
-    groups=("group1", "group2"),
-    annotation_col="annotation",
-    rejected_col="rejected",
-    group_col="group",
-    method="fisher",
-    regex="(\\w+~.+)_\\w\\d+\\-\\w+",
-    correction="fdr_bh",
+    identifier: str = "identifier",
+    groups: list[str] = ("group1", "group2"),
+    annotation_col: str = "annotation",
+    rejected_col: str = "rejected",
+    group_col: str = "group",
+    method: str = "fisher",
+    regex: str = "(\\w+~.+)_\\w\\d+\\-\\w+",
+    correction: str = "fdr_bh",
+    remove_duplicates: bool = False,
 ):
     r"""
     This function runs a simple enrichment analysis for significantly
@@ -144,14 +145,13 @@ def run_site_regulation_enrichment(
         if match is not None:
             new_ids.append(
                 match.group(1)
-            )  # removes the PTM extension of the identifierm of CKG
+            )  # removes the PTM extension of the identifier of CKG
         else:
             new_ids.append(ident)
     # so this is normalizing the identifiers to ignore the PTM extension
     regulation_data[identifier] = new_ids  # matches are used as identifiers
-    regulation_data = (
-        regulation_data.drop_duplicates()
-    )  # depends on regulation_data if this has an effect (with floats probably not)
+    if remove_duplicates:
+        regulation_data = regulation_data.drop_duplicates(subset=[identifier])
     result = run_regulation_enrichment(
         regulation_data,
         annotation,
diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 92da200..07c3268 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -111,7 +111,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_omics.notna().sum().sort_values(ascending=True).plot()"
+    "ax = df_omics.notna().sum().sort_values(ascending=True).plot()"
    ]
   },
   {
@@ -286,31 +286,9 @@
    "id": "4165bc94",
    "metadata": {},
    "source": [
-    "## Enrichment analysis\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f300c5b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
-    "    regulation_data=diff_reg,\n",
-    "    annotation=annotations,\n",
-    "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
-    "    correction_alpha=0.01,\n",
-    ")\n",
-    "ret"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "834d5a85",
-   "metadata": {},
-   "source": [
-    "### For up- and downregulated genes separately"
+    "## Enrichment analysis\n",
+    "Is done separately for up- and downregulated genes as it's assumed that biological\n",
+    "processes are regulated in one direction."
    ]
   },
   {
@@ -403,7 +381,12 @@
    "source": [
     "The basic example uses a modified peptide sequence to\n",
     "demonstrate the enrichment analysis.\n",
-    "- compare groups per amino acid modified (kinases targeting certain motifs?)"
+    "> TODO: The example on how to do that needs a PTM focused dataset.\n",
+    "The details of how site specific enrichment analysis is done will depend on the\n",
+    "dataset and the question at hand.\n",
+    "\n",
+    "If the identifiers contain PTMs this information is removed to match it to the annotation\n",
+    "using a regular expression (in the function). For example:"
    ]
   },
   {
@@ -428,20 +411,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "seq_mod = \"_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR\"\n",
-    "regex = \"\\(\\w\\w\\)\"\n",
-    "matches = re.findall(regex, seq_mod)\n",
-    "matches"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7eab37e1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# acore.enrichment_analysis.run_up_down_regulation_enrichment("
+    "# ToDo: Add example for site specific enrichment analysis\n",
+    "# acore.enrichment_analysis.run_up_down_regulation_enrichment"
    ]
   },
   {
@@ -599,7 +570,7 @@
     "# plot two histograms of intensity values here\n",
     "sel_pgs = [\"O43175\", \"P39059\"]\n",
     "view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())\n",
-    "view.plot.hist(bins=20, alpha=0.5)"
+    "ax = view.plot.hist(bins=20, alpha=0.5)"
    ]
   },
   {
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 1caea9f..6acace4 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -51,7 +51,7 @@
 df_omics
 
 # %%
-df_omics.notna().sum().sort_values(ascending=True).plot()
+ax = df_omics.notna().sum().sort_values(ascending=True).plot()
 
 # %% [markdown]
 # Keep only features with a certain amount of non-NaN values and select 100 of these
@@ -145,19 +145,8 @@
 
 # %% [markdown]
 # ## Enrichment analysis
-#
-
-# %%
-ret = acore.enrichment_analysis.run_regulation_enrichment(
-    regulation_data=diff_reg,
-    annotation=annotations,
-    min_detected_in_set=1,  # ! default is 2, so more conservative
-    correction_alpha=0.01,
-)
-ret
-
-# %% [markdown]
-# ### For up- and downregulated genes separately
+# Is done separately for up- and downregulated genes as it's assumed that biological
+# processes are regulated in one direction.
 
 # %% tags=["hide-input"]
 diff_reg.query("rejected")[
@@ -205,7 +194,12 @@
 # %% [markdown]
 # The basic example uses a modified peptide sequence to
 # demonstrate the enrichment analysis.
-# - compare groups per amino acid modified (kinases targeting certain motifs?)
+# > TODO: The example on how to do that needs a PTM focused dataset.
+# The details of how site specific enrichment analysis is done will depend on the
+# dataset and the question at hand.
+#
+# If the identifiers contain PTMs this information is removed to match it to the annotation
+# using a regular expression (in the function). For example:
 
 # %%
 import re
@@ -216,13 +210,8 @@
 match.group(1)
 
 # %%
-seq_mod = "_AAADQET(ph)DTDPEPQPVVGPDAADHRPTVM(ox)LLGGGALSR"
-regex = "\(\w\w\)"
-matches = re.findall(regex, seq_mod)
-matches
-
-# %%
-# acore.enrichment_analysis.run_up_down_regulation_enrichment(
+# ToDo: Add example for site specific enrichment analysis
+# acore.enrichment_analysis.run_up_down_regulation_enrichment
 
 # %% [markdown]
 # ## Single sample GSEA (ssGSEA)
@@ -299,7 +288,7 @@
 # plot two histograms of intensity values here
 sel_pgs = ["O43175", "P39059"]
 view = df_omics[sel_pgs].sub(df_omics[sel_pgs].mean())
-view.plot.hist(bins=20, alpha=0.5)
+ax = view.plot.hist(bins=20, alpha=0.5)
 
 # %% [markdown]
 # Let us compare the two centered distributions using the KS test.

From 45c2f5e5191ce58ddaa799788e6e8ef037d9e7fd Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 18 Dec 2024 12:30:09 +0100
Subject: [PATCH 37/40] :art: lower log2fc cutoff to include downregulated
 protein groups

---
 docs/api_examples/enrichment_analysis.ipynb | 2 +-
 docs/api_examples/enrichment_analysis.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 07c3268..2f485d5 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -336,7 +336,7 @@
     "    regulation_data=diff_reg,\n",
     "    annotation=annotations,\n",
     "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
-    "    lfc_cutoff=1,\n",
+    "    lfc_cutoff=0.5,  # ! the default is 1\n",
     ")\n",
     "ret"
    ]
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 6acace4..ed230de 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -171,7 +171,7 @@
     regulation_data=diff_reg,
     annotation=annotations,
     min_detected_in_set=1,  # ! default is 2, so more conservative
-    lfc_cutoff=1,
+    lfc_cutoff=0.5,  # ! the default is 1
 )
 ret
 

From b71acc8e37953117f8cc48152f25454343091bb5 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Thu, 19 Dec 2024 15:09:28 +0100
Subject: [PATCH 38/40] :bug: split up GO term annotations

Go term annotations had several Go terms concatenated using `;`, which are now split up. Performance can be improved.
---
 docs/api_examples/enrichment_analysis.ipynb | 56 +++++++++++++++++----
 docs/api_examples/enrichment_analysis.py    | 44 ++++++++++++----
 2 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 2f485d5..73b1a37 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -51,7 +51,7 @@
     "import acore.enrichment_analysis\n",
     "from acore.io.uniprot import fetch_annotations\n",
     "\n",
-    "dsp_pandas.format.set_pandas_options(max_colwidth=15)"
+    "dsp_pandas.format.set_pandas_options(max_colwidth=60)"
    ]
   },
   {
@@ -234,18 +234,31 @@
     "    print(f\"Loaded annotations from {fname}\")\n",
     "except FileNotFoundError:\n",
     "    print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
-    "    annotations = fetch_annotations(df_omics.columns)\n",
+    "    fields = \"accession,go_p,go_c,go_f\"\n",
+    "    annotations = fetch_annotations(df_omics.columns, fields=fields)\n",
+    "    # First column (`From`) is additional to specified fields\n",
+    "    d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n",
+    "\n",
+    "    # expand go terms\n",
+    "    to_expand = list()\n",
+    "    for field in d_fields_to_col:\n",
+    "        if \"go_\" in field:\n",
+    "            col = d_fields_to_col[field]\n",
+    "            annotations[col] = annotations[col].str.split(\";\")\n",
+    "            to_expand.append(col)\n",
+    "    for col in to_expand:\n",
+    "        # this is a bit wastefull. Processing to stack format should be done here.\n",
+    "        annotations = annotations.explode(col, ignore_index=True)\n",
+    "    # process other than go term columns\n",
     "    annotations = (\n",
-    "        annotations.set_index(\"Entry\")\n",
+    "        annotations.set_index(\"From\")\n",
     "        .rename_axis(\"identifier\")\n",
-    "        .drop(\"From\", axis=1)\n",
+    "        # .drop(\"Entry\", axis=1)\n",
     "        .rename_axis(\"source\", axis=1)\n",
     "        .stack()\n",
     "        .to_frame(\"annotation\")\n",
-    "        .replace(\"\", pd.NA)\n",
-    "        .dropna()\n",
-    "        .sort_values([\"source\", \"annotation\"])\n",
     "        .reset_index()\n",
+    "        .drop_duplicates(ignore_index=True)\n",
     "    )\n",
     "    fname.parent.mkdir(exist_ok=True, parents=True)\n",
     "    annotations.to_csv(fname, index=True)\n",
@@ -335,7 +348,7 @@
     "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
     "    regulation_data=diff_reg,\n",
     "    annotation=annotations,\n",
-    "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
+    "    min_detected_in_set=2,  # ! default is 2, so more conservative\n",
     "    lfc_cutoff=0.5,  # ! the default is 1\n",
     ")\n",
     "ret"
@@ -361,7 +374,32 @@
     "    regulation_data=diff_reg,\n",
     "    annotation=annotations,\n",
     "    min_detected_in_set=1,  # ! default is 2, so more conservative\n",
-    "    lfc_cutoff=0,\n",
+    "    lfc_cutoff=0.1,  # ! the default is 1\n",
+    ")\n",
+    "ret"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e3530547",
+   "metadata": {},
+   "source": [
+    "Or restricting the analysis to functional annotation for which we at least found 2\n",
+    "protein groups to be upregulated."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ea367ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n",
+    "    regulation_data=diff_reg,\n",
+    "    annotation=annotations,\n",
+    "    min_detected_in_set=2,\n",
+    "    lfc_cutoff=0.5,  # ! the default is 1\n",
     ")\n",
     "ret"
    ]
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index ed230de..3e7e97d 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -26,7 +26,7 @@
 import acore.enrichment_analysis
 from acore.io.uniprot import fetch_annotations
 
-dsp_pandas.format.set_pandas_options(max_colwidth=15)
+dsp_pandas.format.set_pandas_options(max_colwidth=60)
 
 # %% [markdown]
 # Parameters of this notebook
@@ -113,18 +113,31 @@
     print(f"Loaded annotations from {fname}")
 except FileNotFoundError:
     print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
-    annotations = fetch_annotations(df_omics.columns)
+    fields = "accession,go_p,go_c,go_f"
+    annotations = fetch_annotations(df_omics.columns, fields=fields)
+    # First column (`From`) is additional to specified fields
+    d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}
+
+    # expand go terms
+    to_expand = list()
+    for field in d_fields_to_col:
+        if "go_" in field:
+            col = d_fields_to_col[field]
+            annotations[col] = annotations[col].str.split(";")
+            to_expand.append(col)
+    for col in to_expand:
+        # this is a bit wastefull. Processing to stack format should be done here.
+        annotations = annotations.explode(col, ignore_index=True)
+    # process other than go term columns
     annotations = (
-        annotations.set_index("Entry")
+        annotations.set_index("From")
         .rename_axis("identifier")
-        .drop("From", axis=1)
+        # .drop("Entry", axis=1)
         .rename_axis("source", axis=1)
         .stack()
         .to_frame("annotation")
-        .replace("", pd.NA)
-        .dropna()
-        .sort_values(["source", "annotation"])
         .reset_index()
+        .drop_duplicates(ignore_index=True)
     )
     fname.parent.mkdir(exist_ok=True, parents=True)
     annotations.to_csv(fname, index=True)
@@ -170,7 +183,7 @@
 ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
     regulation_data=diff_reg,
     annotation=annotations,
-    min_detected_in_set=1,  # ! default is 2, so more conservative
+    min_detected_in_set=2,  # ! default is 2, so more conservative
     lfc_cutoff=0.5,  # ! the default is 1
 )
 ret
@@ -184,7 +197,20 @@
     regulation_data=diff_reg,
     annotation=annotations,
     min_detected_in_set=1,  # ! default is 2, so more conservative
-    lfc_cutoff=0,
+    lfc_cutoff=0.1,  # ! the default is 1
+)
+ret
+
+# %% [markdown]
+# Or restricting the analysis to functional annotation for which we at least found 2
+# protein groups to be upregulated.
+
+# %%
+ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(
+    regulation_data=diff_reg,
+    annotation=annotations,
+    min_detected_in_set=2,
+    lfc_cutoff=0.5,  # ! the default is 1
 )
 ret
 

From e685440c3ad4c21766d499a39f6a85bba212eaea Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Thu, 19 Dec 2024 15:10:15 +0100
Subject: [PATCH 39/40] :bug: allow also single protein groups results

---
 acore/enrichment_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/acore/enrichment_analysis.py b/acore/enrichment_analysis.py
index 44d4aee..6838d83 100644
--- a/acore/enrichment_analysis.py
+++ b/acore/enrichment_analysis.py
@@ -480,7 +480,7 @@ def run_enrichment(
                     ]
                 )
             )
-    if len(pvalues) > 1:
+    if len(pvalues) >= 1:
         rejected, padj = apply_pvalue_correction(
             pvalues,
             alpha=correction_alpha,

From 8a2fac8cc6271f3f6345ebbc11e543b3a869dba4 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Tue, 14 Jan 2025 13:47:39 +0100
Subject: [PATCH 40/40] :art: do not have accession as a column ( "Entry")

annotation field contains identifier again
---
 docs/api_examples/enrichment_analysis.ipynb | 2 +-
 docs/api_examples/enrichment_analysis.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb
index 73b1a37..17e96d8 100644
--- a/docs/api_examples/enrichment_analysis.ipynb
+++ b/docs/api_examples/enrichment_analysis.ipynb
@@ -234,7 +234,7 @@
     "    print(f\"Loaded annotations from {fname}\")\n",
     "except FileNotFoundError:\n",
     "    print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
-    "    fields = \"accession,go_p,go_c,go_f\"\n",
+    "    fields = \"go_p,go_c,go_f\"\n",
     "    annotations = fetch_annotations(df_omics.columns, fields=fields)\n",
     "    # First column (`From`) is additional to specified fields\n",
     "    d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n",
diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py
index 3e7e97d..5f09cd0 100644
--- a/docs/api_examples/enrichment_analysis.py
+++ b/docs/api_examples/enrichment_analysis.py
@@ -113,7 +113,7 @@
     print(f"Loaded annotations from {fname}")
 except FileNotFoundError:
     print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
-    fields = "accession,go_p,go_c,go_f"
+    fields = "go_p,go_c,go_f"
     annotations = fetch_annotations(df_omics.columns, fields=fields)
     # First column (`From`) is additional to specified fields
     d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}