From 8d3a437f4a53c93602e28897dffd6ba8ac2c8ee5 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:25:34 +0100
Subject: [PATCH 01/12] Add optional dataframe conversion to speed up
 fingerprinting

---
 schemist/converting.py |  47 ++++++++++--------
 schemist/features.py   | 110 ++++++++++++++++++++++++++++++++++++++---
 schemist/tables.py     |   3 +-
 3 files changed, 132 insertions(+), 28 deletions(-)

diff --git a/schemist/converting.py b/schemist/converting.py
index 591e19c..f0a4fc1 100644
--- a/schemist/converting.py
+++ b/schemist/converting.py
@@ -303,33 +303,38 @@ def _mol2selfies(m: Mol,
                    "minihelm": _mini_helm2mol}
 
 
-def _x2mol(strings: Union[Iterable[str], str],
-           input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
+def _x2mol(
+    strings: Union[Iterable[str], str],
+    input_representation: str = 'smiles'
+) -> Union[Mol, None, Iterable[Union[Mol, None]]]:
 
     from_function = _FROM_FUNCTIONS[input_representation.casefold()]
-
     return from_function(strings)
 
 
-def _mol2x(mols: Union[Iterable[Mol], Mol],
-           output_representation: str = 'smiles',
-           **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
+def _mol2x(
+    mols: Union[Iterable[Mol], Mol],
+    output_representation: str = 'smiles',
+    **kwargs
+) -> Union[str, None, Iterable[Union[str, None]]]:
 
     to_function = _TO_FUNCTIONS[output_representation.casefold()]
 
     return to_function(mols, **kwargs)
 
 
-def convert_string_representation(strings: Union[Iterable[str], str],
-                                  input_representation: str = 'smiles', 
-                                  output_representation: Union[Iterable[str], str] = 'smiles', 
-                                  **kwargs) -> Union[str, None, Iterable[Union[str, None]], Dict[str, Union[str, None, Iterable[Union[str, None]]]]]:
+def convert_string_representation(
+    strings: Union[Iterable[str], str],
+    input_representation: str = 'smiles', 
+    output_representation: Union[Iterable[str], str] = 'smiles', 
+    **kwargs
+) -> Union[str, None, Iterable[Union[str, None]], Dict[str, Union[str, None, Iterable[Union[str, None]]]]]:
     
     """Convert between string representations of chemical structures.
     
     """
 
-    mols = _x2mol(strings, input_representation)
+    mols = _x2mol(cast(strings, to=list), input_representation)
     # print_err(mols)
 
     if not isinstance(output_representation, str) and isinstance(output_representation, Iterable):
@@ -348,15 +353,17 @@ def convert_string_representation(strings: Union[Iterable[str], str],
 def _convert_input_to_smiles(f: Callable) -> Callable:
 
     @wraps(f)
-    def _f(strings: Union[Iterable[str], str], 
-           input_representation: str = 'smiles',
-           *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
+    def _f(
+        strings: Union[Iterable[str], str], 
+        input_representation: str = 'smiles',
+        *args, **kwargs
+    ) -> Union[str, None, Iterable[Union[str, None]]]:
         
-        smiles = convert_string_representation(strings, 
-                                               output_representation='smiles', 
-                                               input_representation=input_representation)
-
-        return f(strings=smiles, 
-                 *args, **kwargs)
+        smiles = convert_string_representation(
+            cast(strings, to=list), 
+            output_representation='smiles', 
+            input_representation=input_representation
+        )
+        return f(strings=smiles, *args, **kwargs)
 
     return _f
diff --git a/schemist/features.py b/schemist/features.py
index fc1166c..a1e1bdd 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -1,11 +1,14 @@
 """Tools for generating chemical features."""
 
-from typing import Any, Callable, Iterable, List, Optional, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
 from functools import wraps
 
+from carabiner.cast import cast
 from descriptastorus.descriptors import MakeGenerator
 from pandas import DataFrame, Series
 import numpy as np
+from rdkit import RDLogger
+RDLogger.DisableLog('rdApp.*')
 from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
 
 from .converting import _smiles2mol, _convert_input_to_smiles
@@ -35,7 +38,7 @@ def _get_descriptastorus_features(
 ) -> Union[DataFrame, Tuple[np.ndarray, List[str]]]:
 
     generator = MakeGenerator((generator, ))
-    features = map(generator.process, smiles)    
+    features = list(map(generator.process, smiles))    
     return np.stack(features, axis=0), [col for col, _ in generator.GetColumns()]
 
 
@@ -49,7 +52,46 @@ def calculate_2d_features(
 ) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
 
     """Calculate 2d features from string representation.
-    
+
+    Parameters
+    ----------
+    strings : str
+        Input string representation(s).
+    input_representation : str
+        Representation type
+    normalized : bool, optional
+        Whether to return normalized features. Default: `True`.
+    histogram_normalized : bool, optional
+        Whether to return histogram normalized features (faster). Default: `True`.
+    return_dataframe : bool, optional
+        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.
+
+    Returns
+    -------
+    DataFrame, Tuple of numpy Arrays
+        If `return_dataframe = True`, a DataFrame with named feature columns, and 
+        the final column called `"meta_feature_valid"` being the validity indicator.
+        Otherwise returns a tuple of Arrays with the first being the matrix of 
+        features and the second being the vector of validity indicators.
+
+    Examples
+    --------
+    >>> features, validity = calculate_2d_features(strings='CCC')
+    >>> features[:,:3]
+    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05]])
+    >>> validity
+    array([1.])
+    >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO'])
+    >>> features[:,:3]
+    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05],
+        [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]])
+    >>> validity
+    array([1., 1.])
+    >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
+    CCC     True
+    CCCO    True
+    Name: meta_feature_valid, dtype: bool
+
     """  
 
     if normalized:
@@ -59,7 +101,8 @@ def calculate_2d_features(
             generator_name = "RDKit2DNormalized"
     else:
         generator_name = "RDKit2D"
-
+    
+    strings = cast(strings, to=list)
     feature_matrix, columns = _get_descriptastorus_features(
         strings,
         generator=generator_name,
@@ -111,7 +154,58 @@ def calculate_fingerprints(
 ) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
     
     """Calculate the binary fingerprint of string representation(s).
+
+    Only Morgan fingerprints are allowed.
+
+    Parameters
+    ----------
+    strings : str
+        Input string representation(s).
+    input_representation : str
+        Representation type
+    fp_type : str, optional
+        Which fingerprint type to calculate. Default: `'morgan'`.
+    radius : int, optional
+        Atom radius for fingerprints. Default: `2`.
+    chiral : bool, optional
+        Whether to take chirality into account. Default: `True`.
+    on_bits : bool, optional
+        Whether to return the non-zero indices instead of the full binary vector. Default: `True`.
+    return_dataframe : bool, optional
+        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.
+
+    Returns
+    -------
+    DataFrame, Tuple of numpy Arrays
+        If `return_dataframe = True`, a DataFrame with named feature columns, and 
+        the final column called `"meta_feature_valid"` being the validity indicator.
+        Otherwise returns a tuple of Arrays with the first being the matrix of 
+        features and the second being the vector of validity indicators.
+
+    Raises
+    ------
+    NotImplementedError
+        If `fp_type` is not `'morgan'`.
     
+    Examples
+    --------
+    >>> bits, validity = calculate_fingerprints(strings='CCC')
+    >>> bits
+    ['80;294;1057;1344']
+    >>> validity
+    [True] 
+    >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
+    >>> bits
+    ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
+    >>> validity
+    [True, True]
+    >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)
+    array([4, 8])
+    >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
+    CCC     True
+    CCCO    True
+    Name: meta_feature_valid, dtype: bool
+
     """
     
     if fp_type.casefold() == 'morgan':
@@ -121,7 +215,7 @@ def calculate_fingerprints(
     
     fp_generator = generator_class(radius=radius, 
                                    includeChirality=chiral)
-    mols = (_smiles2mol(s) for s in strings)
+    mols = (_smiles2mol(s) for s in cast(strings, to=list))
     fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits) 
                   for mol in mols)
 
@@ -161,8 +255,10 @@ def calculate_fingerprints(
     "fp": calculate_fingerprints,
 }
 
-def calculate_feature(feature_type: str,
-                      *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
+def calculate_feature(
+    feature_type: str,
+    return_dataframe: bool = False,
+    *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
     
     """Calculate the binary fingerprint or descriptor vector of string representation(s).
     
diff --git a/schemist/tables.py b/schemist/tables.py
index 7d2decf..cbe3c5c 100644
--- a/schemist/tables.py
+++ b/schemist/tables.py
@@ -114,7 +114,8 @@ def featurizer(df: DataFrame,
     feature_df = calculate_feature(feature_type=feature_type,
                                    strings=_get_column_values(df, column), 
                                    prefix=prefix,
-                                   input_representation=input_representation)
+                                   input_representation=input_representation,
+                                   return_dataframe=True)
     
     if len(ids) > 0:
         df = concat([df[ids], feature_df], axis=1)

From e14b1ee69f7ce290d04d8b18f994af964db0cd43 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:29:05 +0100
Subject: [PATCH 02/12] Fix doctest

---
 schemist/features.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/schemist/features.py b/schemist/features.py
index a1e1bdd..8cfc035 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -192,13 +192,13 @@ def calculate_fingerprints(
     >>> bits, validity = calculate_fingerprints(strings='CCC')
     >>> bits
     ['80;294;1057;1344']
-    >>> validity
-    [True] 
+    >>> sum(validity)
+    1 
     >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
     >>> bits
     ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
-    >>> validity
-    [True, True]
+    >>> sum(validity)
+    2
     >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)
     array([4, 8])
     >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid

From 3b08a0daf018e4b58e5957b401e2f5c68a02494d Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:41:38 +0100
Subject: [PATCH 03/12] Add norm whitespace to doctest

---
 schemist/features.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/schemist/features.py b/schemist/features.py
index 8cfc035..8e7e763 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -192,12 +192,12 @@ def calculate_fingerprints(
     >>> bits, validity = calculate_fingerprints(strings='CCC')
     >>> bits
     ['80;294;1057;1344']
-    >>> sum(validity)
+    >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     1 
     >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
     >>> bits
     ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
-    >>> sum(validity)
+    >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     2
     >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)
     array([4, 8])

From e86e39c74a00ecddc1d44bb3c9069be403e5e44f Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:42:01 +0100
Subject: [PATCH 04/12] Update test versions and dependencies

---
 .github/workflows/python-package.yml | 2 +-
 pyproject.toml                       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 7c86656..3f6312b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/pyproject.toml b/pyproject.toml
index c5bfa74..c1e73e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ classifiers = [
 ]
 
 dependencies = [ 
-  "carabiner-tools[pd]",
+  "carabiner-tools[pd]>=0.0.3.post1",
   "datamol",
   "descriptastorus",
   "nemony",
@@ -36,7 +36,7 @@ dependencies = [
   "pandas",
   "rdkit",
   "requests",
-  "selfies"
+  "selfies",
 ]
 
 [project.urls]

From 9b950712cabafb92a6a52b2daf8c2d1ab1761dee Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:46:45 +0100
Subject: [PATCH 05/12] Make sure fingerprint matrix is ndarray

---
 .github/workflows/python-package.yml | 4 ++--
 schemist/features.py                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3f6312b..2de9753 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,9 +15,9 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
diff --git a/schemist/features.py b/schemist/features.py
index 8e7e763..4559d51 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -225,7 +225,7 @@ def calculate_fingerprints(
                         for fp_string in fp_strings)
         fingerprints = [';'.join(fp) for fp in fingerprints]
         validity = [len(fp) > 0 for fp in fingerprints]
-        feature_matrix = fingerprints
+        feature_matrix = np.asarray(fingerprints)
     
     else:
         

From 582802a382d53946ed887d5890153ecd39d11eb6 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:56:17 +0100
Subject: [PATCH 06/12] Pin working descriptastorus version

---
 pyproject.toml       | 2 +-
 schemist/features.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c1e73e7..afe0696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
 dependencies = [ 
   "carabiner-tools[pd]>=0.0.3.post1",
   "datamol",
-  "descriptastorus",
+  "descriptastorus==2.6.1",
   "nemony",
   "openpyxl==3.1.0", 
   "pandas",
diff --git a/schemist/features.py b/schemist/features.py
index 4559d51..1e2a807 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -84,7 +84,7 @@ def calculate_2d_features(
     >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO'])
     >>> features[:,:3]
     array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05],
-        [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]])
+           [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]])
     >>> validity
     array([1., 1.])
     >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
@@ -191,12 +191,12 @@ def calculate_fingerprints(
     --------
     >>> bits, validity = calculate_fingerprints(strings='CCC')
     >>> bits
-    ['80;294;1057;1344']
+    array(['80;294;1057;1344'], dtype='<U16')
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     1 
     >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
     >>> bits
-    ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
+    array(['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'], dtype='<U16')
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     2
     >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)

From da452f40d999cc7cdf96a9c082b6004da68844fa Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 15:58:16 +0100
Subject: [PATCH 07/12] Fix ndarray doctest

---
 schemist/features.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/schemist/features.py b/schemist/features.py
index 1e2a807..6e4a81b 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -190,13 +190,13 @@ def calculate_fingerprints(
     Examples
     --------
     >>> bits, validity = calculate_fingerprints(strings='CCC')
-    >>> bits
-    array(['80;294;1057;1344'], dtype='<U16')
+    >>> list(bits)
+    ['80;294;1057;1344']
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     1 
     >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
-    >>> bits
-    array(['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'], dtype='<U16')
+    >>> list(bits)
+    ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     2
     >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)

From 04b802ebbc48859f530f92049cf4661b00e71d40 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 16:08:29 +0100
Subject: [PATCH 08/12] Correct ndarray list conversion doctest

---
 =2.0                 | 1 +
 schemist/features.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 =2.0

diff --git a/=2.0 b/=2.0
new file mode 100644
index 0000000..8c3d0ec
--- /dev/null
+++ b/=2.0
@@ -0,0 +1 @@
+Requirement already satisfied: numpy in /camp/home/johnsoe/.conda/envs/schemist/lib/python3.12/site-packages (1.26.4)
diff --git a/schemist/features.py b/schemist/features.py
index 6e4a81b..6172a08 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -190,12 +190,12 @@ def calculate_fingerprints(
     Examples
     --------
     >>> bits, validity = calculate_fingerprints(strings='CCC')
-    >>> list(bits)
+    >>> bits.tolist()
     ['80;294;1057;1344']
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     1 
     >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
-    >>> list(bits)
+    >>> bits.tolist()
     ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277']
     >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
     2

From e8336a6021f9a677d072ea552677069035bbe383 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 16:23:04 +0100
Subject: [PATCH 09/12] Fix on_bits=True, return_dataframe=True

---
 schemist/features.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/schemist/features.py b/schemist/features.py
index 6172a08..703db83 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -13,7 +13,7 @@
 
 from .converting import _smiles2mol, _convert_input_to_smiles
 
-def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
+def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], Union[DataFrame, Tuple[np.ndarray, np.ndarray]]]:
 
     @wraps(f)
     def _f(prefix: Optional[str] = None,
@@ -225,7 +225,6 @@ def calculate_fingerprints(
                         for fp_string in fp_strings)
         fingerprints = [';'.join(fp) for fp in fingerprints]
         validity = [len(fp) > 0 for fp in fingerprints]
-        feature_matrix = np.asarray(fingerprints)
     
     else:
         
@@ -234,16 +233,16 @@ def calculate_fingerprints(
                         else (-np.ones((fp_generator.GetOptions().fpSize, )))
                         for fp_string in fp_strings]
         validity = [np.all(fp >= 0) for fp in fingerprints]
-        feature_matrix = np.stack(fingerprints, axis=0)
+        
+    feature_matrix = np.stack(fingerprints, axis=0)
 
     if return_dataframe:
-        ncol = feature_matrix.shape[-1]
-        if ncol == 1:
+        if feature_matrix.ndim == 1:  # on_bits only
             feature_matrix = DataFrame(feature_matrix, 
                                        columns=['fp_bits'])
         else:
             feature_matrix = DataFrame(feature_matrix,
-                                        columns=[f"fp_{i}" for i in range(ncol)])
+                                       columns=[f"fp_{i}" for i, _ in enumerate(feature_matrix.T)])
         return feature_matrix.assign(meta_feature_type=fp_type.casefold(), 
                                      meta_feature_valid=validity)
     else:

From 812fa19ea25a36ea3bc8e0a57654cb1f058e599e Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 16:28:07 +0100
Subject: [PATCH 10/12] Add SMILES index to dataframe

---
 schemist/features.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/schemist/features.py b/schemist/features.py
index 703db83..548fffd 100644
--- a/schemist/features.py
+++ b/schemist/features.py
@@ -215,7 +215,8 @@ def calculate_fingerprints(
     
     fp_generator = generator_class(radius=radius, 
                                    includeChirality=chiral)
-    mols = (_smiles2mol(s) for s in cast(strings, to=list))
+    strings = cast(strings, to=list)
+    mols = (_smiles2mol(s) for s in strings)
     fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits) 
                   for mol in mols)
 
@@ -238,8 +239,11 @@ def calculate_fingerprints(
 
     if return_dataframe:
         if feature_matrix.ndim == 1:  # on_bits only
-            feature_matrix = DataFrame(feature_matrix, 
-                                       columns=['fp_bits'])
+            feature_matrix = DataFrame(
+                feature_matrix, 
+                columns=['fp_bits'],
+                index=strings,
+            )
         else:
             feature_matrix = DataFrame(feature_matrix,
                                        columns=[f"fp_{i}" for i, _ in enumerate(feature_matrix.T)])

From 54ab3939e2736b1e68ac5eb5c986e589643e5d20 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 16:49:41 +0100
Subject: [PATCH 11/12] Update docs

---
 =2.0                        |   1 -
 README.md                   |  37 +++++++++---
 docs/source/index.md        |   8 ++-
 docs/source/installation.md |  17 ++++++
 docs/source/modules.rst     |   7 +++
 docs/source/schemist.rst    | 109 ++++++++++++++++++++++++++++++++++++
 docs/source/usage.md        |  55 ++++++++++++++++++
 7 files changed, 224 insertions(+), 10 deletions(-)
 delete mode 100644 =2.0
 create mode 100644 docs/source/installation.md
 create mode 100644 docs/source/modules.rst
 create mode 100644 docs/source/schemist.rst
 create mode 100644 docs/source/usage.md

diff --git a/=2.0 b/=2.0
deleted file mode 100644
index 8c3d0ec..0000000
--- a/=2.0
+++ /dev/null
@@ -1 +0,0 @@
-Requirement already satisfied: numpy in /camp/home/johnsoe/.conda/envs/schemist/lib/python3.12/site-packages (1.26.4)
diff --git a/README.md b/README.md
index 33032ef..435cf63 100644
--- a/README.md
+++ b/README.md
@@ -33,17 +33,40 @@ pip install -e .
 
 ## Command-line usage
 
-**schemist**  provides command-line utlities to ... The tools complete specific tasks which 
-can be easily composed into analysis pipelines, because the TSV table output goes to
-`stdout` by default so they can be piped from one tool to another.
-
-To get a list of commands (tools), do
+**schemist**  provides command-line utilities. The list of commands can be checked like so:
 
 ```bash
-schemist --help
+$ schemist --help
+usage: schemist [-h] [--version] {clean,convert,featurize,collate,dedup,enumerate,react,split} ...
+
+Tools for cleaning, collating, and augmenting chemical datasets.
+
+options:
+  -h, --help            show this help message and exit
+  --version, -v         show program's version number and exit
+
+Sub-commands:
+  {clean,convert,featurize,collate,dedup,enumerate,react,split}
+                        Use these commands to specify the tool you want to use.
+    clean               Clean and normalize SMILES column of a table.
+    convert             Convert between string representations of chemical structures.
+    featurize           Convert between string representations of chemical structures.
+    collate             Collect disparate tables or SDF files of libraries into a single table.
+    dedup               Deduplicate chemical structures and retain references.
+    enumerate           Enumerate bio-chemical structures within length and sequence constraints.
+    react               React compounds in silico in indicated columns using a named reaction.
+    split               Split table based on chosen algorithm, optionally taking account of chemical structure during splits.
 ```
 
-And to get help for a specific command, do
+Each command is designed to work on large data files in a streaming fashion, so that the entire file is not held in memory at once. One caveat is that the scaffold-based splits are very slow with tables of millions of rows.
+
+All commands (except `collate`) take from the input table a named column with a SMILES, SELFIES, amino-acid sequence, HELM, or InChI representation of compounds.
+
+The tools complete specific tasks which 
+can be easily composed into analysis pipelines, because the TSV table output goes to
+`stdout` by default so they can be piped from one tool to another.
+
+To get help for a specific command, do
 
 ```bash
 schemist <command> --help
diff --git a/docs/source/index.md b/docs/source/index.md
index fdc8e0c..3f449d6 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -4,7 +4,7 @@
 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist)
 ![PyPI](https://img.shields.io/pypi/v/schemist)
 
-Cleaning, collating, and augmenting chemical datasets.
+Organizing and processing tables of chemical structures.
 
 ```{toctree}
 :maxdepth: 2
@@ -16,6 +16,10 @@ python
 modules
 ```
 
+## Issues, problems, suggestions
+
+Add to the [issue tracker](https://github.com/scbirlab/schemist/issues).
+
 ## Source
 
-`GitHub <https://github.com/scbirlab/schemist>`_
\ No newline at end of file
+View source at [GitHub](https://github.com/scbirlab/schemist).
\ No newline at end of file
diff --git a/docs/source/installation.md b/docs/source/installation.md
new file mode 100644
index 0000000..b6e8f6a
--- /dev/null
+++ b/docs/source/installation.md
@@ -0,0 +1,17 @@
+# Installation
+
+## The easy way
+
+Install the pre-built package from PyPI:
+
+```bash
+$ pip install schemist
+```
+
+## From source
+
+Clone the [repository](https://github.com/scbirlab/schemist), then `cd` into it. Then run:
+
+```bash
+pip install -e .
+```
\ No newline at end of file
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
new file mode 100644
index 0000000..7ea050d
--- /dev/null
+++ b/docs/source/modules.rst
@@ -0,0 +1,7 @@
+schemist
+========
+
+.. toctree::
+   :maxdepth: 4
+
+   schemist
diff --git a/docs/source/schemist.rst b/docs/source/schemist.rst
new file mode 100644
index 0000000..0a238c5
--- /dev/null
+++ b/docs/source/schemist.rst
@@ -0,0 +1,109 @@
+schemist package
+================
+
+Submodules
+----------
+
+schemist.cleaning module
+------------------------
+
+.. automodule:: schemist.cleaning
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.cli module
+-------------------
+
+.. automodule:: schemist.cli
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.collating module
+-------------------------
+
+.. automodule:: schemist.collating
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.converting module
+--------------------------
+
+.. automodule:: schemist.converting
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.features module
+------------------------
+
+.. automodule:: schemist.features
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.generating module
+--------------------------
+
+.. automodule:: schemist.generating
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.io module
+------------------
+
+.. automodule:: schemist.io
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.rest\_lookup module
+----------------------------
+
+.. automodule:: schemist.rest_lookup
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.splitting module
+-------------------------
+
+.. automodule:: schemist.splitting
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.tables module
+----------------------
+
+.. automodule:: schemist.tables
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.typing module
+----------------------
+
+.. automodule:: schemist.typing
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+schemist.utils module
+---------------------
+
+.. automodule:: schemist.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: schemist
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/usage.md b/docs/source/usage.md
new file mode 100644
index 0000000..a7916ce
--- /dev/null
+++ b/docs/source/usage.md
@@ -0,0 +1,55 @@
+# Usage
+
+**schemist** has a variety of utilities which can be used through the command-line or the [Python API](#python-api).
+
+## Command-line usage
+
+**schemist**  provides command-line utilities. The list of commands can be checked like so:
+
+```bash
+$ schemist --help
+usage: schemist [-h] [--version] {clean,convert,featurize,collate,dedup,enumerate,react,split} ...
+
+Tools for cleaning, collating, and augmenting chemical datasets.
+
+options:
+  -h, --help            show this help message and exit
+  --version, -v         show program's version number and exit
+
+Sub-commands:
+  {clean,convert,featurize,collate,dedup,enumerate,react,split}
+                        Use these commands to specify the tool you want to use.
+    clean               Clean and normalize SMILES column of a table.
+    convert             Convert between string representations of chemical structures.
+    featurize           Convert between string representations of chemical structures.
+    collate             Collect disparate tables or SDF files of libraries into a single table.
+    dedup               Deduplicate chemical structures and retain references.
+    enumerate           Enumerate bio-chemical structures within length and sequence constraints.
+    react               React compounds in silico in indicated columns using a named reaction.
+    split               Split table based on chosen algorithm, optionally taking account of chemical structure during splits.
+```
+
+Each command is designed to work on large data files in a streaming fashion, so that the entire file is not held in memory at once. One caveat is that the scaffold-based splits are very slow with tables of millions of rows.
+
+All commands (except `collate`) take from the input table a named column with a SMILES, SELFIES, amino-acid sequence, HELM, or InChI representation of compounds.
+
+The tools complete specific tasks which 
+can be easily composed into analysis pipelines, because the TSV table output goes to
+`stdout` by default so they can be piped from one tool to another.
+
+To get help for a specific command, do
+
+```bash
+schemist <command> --help
+```
+
+For the Python API, [see below](#python-api).
+
+
+## Python API
+
+You can access the underlying functions of `schemist` to help custom analyses or develop other tools.
+
+```python
+>>> import schemist as sch
+```
\ No newline at end of file

From dcfdcecdae1693eb5025d66bff25e7039ebdac2e Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 11 Oct 2024 16:50:10 +0100
Subject: [PATCH 12/12] Update docs

---
 README.md | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/README.md b/README.md
index 435cf63..f6d5c23 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,6 @@ Cleaning, collating, and augmenting chemical datasets.
 
 - [Installation](#installation)
 - [Command-line usage](#command-line-usage)
-    - [Example](#example)
-    - [Other commands](#other-commands)
 - [Python API](#python-api)
 - [Documentation](#documentation)
 
@@ -74,12 +72,6 @@ schemist <command> --help
 
 For the Python API, [see below](#python-api).
 
-## Example
-
-
-## Other commands
-
-
 
 ## Python API
 
@@ -89,8 +81,6 @@ For the Python API, [see below](#python-api).
 >>> import schemist as sch
 ```
 
-
-
 ## Documentation
 
 Full API documentation is at [ReadTheDocs](https://schemist.readthedocs.org).
\ No newline at end of file