From 8d3a437f4a53c93602e28897dffd6ba8ac2c8ee5 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:25:34 +0100 Subject: [PATCH 01/12] Add optional dataframe conversion to speed up fingerprinting --- schemist/converting.py | 47 ++++++++++-------- schemist/features.py | 110 ++++++++++++++++++++++++++++++++++++++--- schemist/tables.py | 3 +- 3 files changed, 132 insertions(+), 28 deletions(-) diff --git a/schemist/converting.py b/schemist/converting.py index 591e19c..f0a4fc1 100644 --- a/schemist/converting.py +++ b/schemist/converting.py @@ -303,33 +303,38 @@ def _mol2selfies(m: Mol, "minihelm": _mini_helm2mol} -def _x2mol(strings: Union[Iterable[str], str], - input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]: +def _x2mol( + strings: Union[Iterable[str], str], + input_representation: str = 'smiles' +) -> Union[Mol, None, Iterable[Union[Mol, None]]]: from_function = _FROM_FUNCTIONS[input_representation.casefold()] - return from_function(strings) -def _mol2x(mols: Union[Iterable[Mol], Mol], - output_representation: str = 'smiles', - **kwargs) -> Union[str, None, Iterable[Union[str, None]]]: +def _mol2x( + mols: Union[Iterable[Mol], Mol], + output_representation: str = 'smiles', + **kwargs +) -> Union[str, None, Iterable[Union[str, None]]]: to_function = _TO_FUNCTIONS[output_representation.casefold()] return to_function(mols, **kwargs) -def convert_string_representation(strings: Union[Iterable[str], str], - input_representation: str = 'smiles', - output_representation: Union[Iterable[str], str] = 'smiles', - **kwargs) -> Union[str, None, Iterable[Union[str, None]], Dict[str, Union[str, None, Iterable[Union[str, None]]]]]: +def convert_string_representation( + strings: Union[Iterable[str], str], + input_representation: str = 'smiles', + output_representation: Union[Iterable[str], str] = 'smiles', + **kwargs +) -> Union[str, None, Iterable[Union[str, None]], Dict[str, Union[str, None, Iterable[Union[str, None]]]]]: 
"""Convert between string representations of chemical structures. """ - mols = _x2mol(strings, input_representation) + mols = _x2mol(cast(strings, to=list), input_representation) # print_err(mols) if not isinstance(output_representation, str) and isinstance(output_representation, Iterable): @@ -348,15 +353,17 @@ def convert_string_representation(strings: Union[Iterable[str], str], def _convert_input_to_smiles(f: Callable) -> Callable: @wraps(f) - def _f(strings: Union[Iterable[str], str], - input_representation: str = 'smiles', - *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]: + def _f( + strings: Union[Iterable[str], str], + input_representation: str = 'smiles', + *args, **kwargs + ) -> Union[str, None, Iterable[Union[str, None]]]: - smiles = convert_string_representation(strings, - output_representation='smiles', - input_representation=input_representation) - - return f(strings=smiles, - *args, **kwargs) + smiles = convert_string_representation( + cast(strings, to=list), + output_representation='smiles', + input_representation=input_representation + ) + return f(strings=smiles, *args, **kwargs) return _f diff --git a/schemist/features.py b/schemist/features.py index fc1166c..a1e1bdd 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -1,11 +1,14 @@ """Tools for generating chemical features.""" -from typing import Any, Callable, Iterable, List, Optional, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Union from functools import wraps +from carabiner.cast import cast from descriptastorus.descriptors import MakeGenerator from pandas import DataFrame, Series import numpy as np +from rdkit import RDLogger +RDLogger.DisableLog('rdApp.*') from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol from .converting import _smiles2mol, _convert_input_to_smiles @@ -35,7 +38,7 @@ def _get_descriptastorus_features( ) -> Union[DataFrame, Tuple[np.ndarray, List[str]]]: generator = 
MakeGenerator((generator, )) - features = map(generator.process, smiles) + features = list(map(generator.process, smiles)) return np.stack(features, axis=0), [col for col, _ in generator.GetColumns()] @@ -49,7 +52,46 @@ def calculate_2d_features( ) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]: """Calculate 2d features from string representation. - + + Parameters + ---------- + strings : str + Input string representation(s). + input_representation : str + Representation type + normalized : bool, optional + Whether to return normalized features. Default: `True`. + histogram_normalized : bool, optional + Whether to return histogram normalized features (faster). Default: `True`. + return_dataframe : bool, optional + Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`. + + Returns + ------- + DataFrame, Tuple of numpy Arrays + If `return_dataframe = True`, a DataFrame with named feature columns, and + the final column called `"meta_feature_valid"` being the validity indicator. + Otherwise returns a tuple of Arrays with the first being the matrix of + features and the second being the vector of validity indicators. 
+ + Examples + -------- + >>> features, validity = calculate_2d_features(strings='CCC') + >>> features[:,:3] + array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05]]) + >>> validity + array([1.]) + >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO']) + >>> features[:,:3] + array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05], + [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]]) + >>> validity + array([1., 1.]) + >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid + CCC True + CCCO True + Name: meta_feature_valid, dtype: bool + """ if normalized: @@ -59,7 +101,8 @@ def calculate_2d_features( generator_name = "RDKit2DNormalized" else: generator_name = "RDKit2D" - + + strings = cast(strings, to=list) feature_matrix, columns = _get_descriptastorus_features( strings, generator=generator_name, @@ -111,7 +154,58 @@ def calculate_fingerprints( ) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]: """Calculate the binary fingerprint of string representation(s). + + Only Morgan fingerprints are allowed. + + Parameters + ---------- + strings : str + Input string representation(s). + input_representation : str + Representation type + fp_type : str, optional + Which fingerprint type to calculate. Default: `'morgan'`. + radius : int, optional + Atom radius for fingerprints. Default: `2`. + chiral : bool, optional + Whether to take chirality into account. Default: `True`. + on_bits : bool, optional + Whether to return the non-zero indices instead of the full binary vector. Default: `True`. + return_dataframe : bool, optional + Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`. + + Returns + ------- + DataFrame, Tuple of numpy Arrays + If `return_dataframe = True`, a DataFrame with named feature columns, and + the final column called `"meta_feature_valid"` being the validity indicator. 
+ Otherwise returns a tuple of Arrays with the first being the matrix of + features and the second being the vector of validity indicators. + + Raises + ------ + NotImplementedError + If `fp_type` is not `'morgan'`. + Examples + -------- + >>> bits, validity = calculate_fingerprints(strings='CCC') + >>> bits + ['80;294;1057;1344'] + >>> validity + [True] + >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) + >>> bits + ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] + >>> validity + [True, True] + >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1) + array([4, 8]) + >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid + CCC True + CCCO True + Name: meta_feature_valid, dtype: bool + """ if fp_type.casefold() == 'morgan': @@ -121,7 +215,7 @@ def calculate_fingerprints( fp_generator = generator_class(radius=radius, includeChirality=chiral) - mols = (_smiles2mol(s) for s in strings) + mols = (_smiles2mol(s) for s in cast(strings, to=list)) fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits) for mol in mols) @@ -161,8 +255,10 @@ def calculate_fingerprints( "fp": calculate_fingerprints, } -def calculate_feature(feature_type: str, - *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]: +def calculate_feature( + feature_type: str, + return_dataframe: bool = False, + *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]: """Calculate the binary fingerprint or descriptor vector of string representation(s). 
diff --git a/schemist/tables.py b/schemist/tables.py index 7d2decf..cbe3c5c 100644 --- a/schemist/tables.py +++ b/schemist/tables.py @@ -114,7 +114,8 @@ def featurizer(df: DataFrame, feature_df = calculate_feature(feature_type=feature_type, strings=_get_column_values(df, column), prefix=prefix, - input_representation=input_representation) + input_representation=input_representation, + return_dataframe=True) if len(ids) > 0: df = concat([df[ids], feature_df], axis=1) From e14b1ee69f7ce290d04d8b18f994af964db0cd43 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:29:05 +0100 Subject: [PATCH 02/12] Fix doctest --- schemist/features.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/schemist/features.py b/schemist/features.py index a1e1bdd..8cfc035 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -192,13 +192,13 @@ def calculate_fingerprints( >>> bits, validity = calculate_fingerprints(strings='CCC') >>> bits ['80;294;1057;1344'] - >>> validity - [True] + >>> sum(validity) + 1 >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) >>> bits ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] - >>> validity - [True, True] + >>> sum(validity) + 2 >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1) array([4, 8]) >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid From 3b08a0daf018e4b58e5957b401e2f5c68a02494d Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:41:38 +0100 Subject: [PATCH 03/12] Add norm whitespace to doctest --- schemist/features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schemist/features.py b/schemist/features.py index 8cfc035..8e7e763 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -192,12 +192,12 @@ def calculate_fingerprints( >>> bits, validity = calculate_fingerprints(strings='CCC') >>> bits ['80;294;1057;1344'] - >>> sum(validity) 
+ >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 1 >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) >>> bits ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] - >>> sum(validity) + >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 2 >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1) array([4, 8]) From e86e39c74a00ecddc1d44bb3c9069be403e5e44f Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:42:01 +0100 Subject: [PATCH 04/12] Update test versions and dependencies --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7c86656..3f6312b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/pyproject.toml b/pyproject.toml index c5bfa74..c1e73e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ ] dependencies = [ - "carabiner-tools[pd]", + "carabiner-tools[pd]>=0.0.3.post1", "datamol", "descriptastorus", "nemony", @@ -36,7 +36,7 @@ dependencies = [ "pandas", "rdkit", "requests", - "selfies" + "selfies", ] [project.urls] From 9b950712cabafb92a6a52b2daf8c2d1ab1761dee Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:46:45 +0100 Subject: [PATCH 05/12] Make sure fingerprint matrix is ndarray --- .github/workflows/python-package.yml | 4 ++-- schemist/features.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3f6312b..2de9753 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,9 
+15,9 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/schemist/features.py b/schemist/features.py index 8e7e763..4559d51 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -225,7 +225,7 @@ def calculate_fingerprints( for fp_string in fp_strings) fingerprints = [';'.join(fp) for fp in fingerprints] validity = [len(fp) > 0 for fp in fingerprints] - feature_matrix = fingerprints + feature_matrix = np.asarray(fingerprints) else: From 582802a382d53946ed887d5890153ecd39d11eb6 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:56:17 +0100 Subject: [PATCH 06/12] Pin working descriptastorus version --- pyproject.toml | 2 +- schemist/features.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c1e73e7..afe0696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ dependencies = [ "carabiner-tools[pd]>=0.0.3.post1", "datamol", - "descriptastorus", + "descriptastorus==2.6.1", "nemony", "openpyxl==3.1.0", "pandas", diff --git a/schemist/features.py b/schemist/features.py index 4559d51..1e2a807 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -84,7 +84,7 @@ def calculate_2d_features( >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO']) >>> features[:,:3] array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05], - [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]]) + [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]]) >>> validity array([1., 1.]) >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid @@ -191,12 +191,12 @@ def calculate_fingerprints( -------- >>> bits, validity = 
calculate_fingerprints(strings='CCC') >>> bits - ['80;294;1057;1344'] + array(['80;294;1057;1344'], dtype='<U16') >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 1 >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) >>> bits - ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] + array(['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'], dtype='<U32') >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 2 >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1) From da452f40d999cc7cdf96a9c082b6004da68844fa Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 15:58:16 +0100 Subject: [PATCH 07/12] Fix ndarray doctest --- schemist/features.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/schemist/features.py b/schemist/features.py index 1e2a807..6e4a81b 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -190,13 +190,13 @@ def calculate_fingerprints( Examples -------- >>> bits, validity = calculate_fingerprints(strings='CCC') - >>> bits - array(['80;294;1057;1344'], dtype='<U16') + >>> list(bits) + ['80;294;1057;1344'] >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 1 >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) - >>> bits - array(['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'], dtype='<U32') + >>> list(bits) + ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 2 >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1) From 04b802ebbc48859f530f92049cf4661b00e71d40 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 16:08:29 +0100 Subject: [PATCH 08/12] Correct ndarray list conversion doctest --- =2.0 | 1 + schemist/features.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 =2.0 diff --git a/=2.0 b/=2.0 new file mode 100644 index 0000000..8c3d0ec --- /dev/null +++ b/=2.0 @@ -0,0 +1 @@ +Requirement already satisfied: numpy in 
/camp/home/johnsoe/.conda/envs/schemist/lib/python3.12/site-packages (1.26.4) diff --git a/schemist/features.py b/schemist/features.py index 6e4a81b..6172a08 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -190,12 +190,12 @@ def calculate_fingerprints( Examples -------- >>> bits, validity = calculate_fingerprints(strings='CCC') - >>> list(bits) + >>> bits.tolist() ['80;294;1057;1344'] >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 1 >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO']) - >>> list(bits) + >>> bits.tolist() ['80;294;1057;1344', '80;222;294;473;794;807;1057;1277'] >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE 2 From e8336a6021f9a677d072ea552677069035bbe383 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 16:23:04 +0100 Subject: [PATCH 09/12] Fix on_bits=True, return_daatframe=True --- schemist/features.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/schemist/features.py b/schemist/features.py index 6172a08..703db83 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -13,7 +13,7 @@ from .converting import _smiles2mol, _convert_input_to_smiles -def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]: +def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], Union[DataFrame, Tuple[np.ndarray, np.ndarray]]]: @wraps(f) def _f(prefix: Optional[str] = None, @@ -225,7 +225,6 @@ def calculate_fingerprints( for fp_string in fp_strings) fingerprints = [';'.join(fp) for fp in fingerprints] validity = [len(fp) > 0 for fp in fingerprints] - feature_matrix = np.asarray(fingerprints) else: @@ -234,16 +233,16 @@ def calculate_fingerprints( else (-np.ones((fp_generator.GetOptions().fpSize, ))) for fp_string in fp_strings] validity = [np.all(fp >= 0) for fp in fingerprints] - feature_matrix = np.stack(fingerprints, axis=0) + + feature_matrix = np.stack(fingerprints, axis=0) if return_dataframe: - ncol = feature_matrix.shape[-1] - 
if ncol == 1: + if feature_matrix.ndim == 1: # on_bits only feature_matrix = DataFrame(feature_matrix, columns=['fp_bits']) else: feature_matrix = DataFrame(feature_matrix, - columns=[f"fp_{i}" for i in range(ncol)]) + columns=[f"fp_{i}" for i, _ in enumerate(feature_matrix.T)]) return feature_matrix.assign(meta_feature_type=fp_type.casefold(), meta_feature_valid=validity) else: From 812fa19ea25a36ea3bc8e0a57654cb1f058e599e Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 16:28:07 +0100 Subject: [PATCH 10/12] Add SMILES index to dataframe --- schemist/features.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/schemist/features.py b/schemist/features.py index 703db83..548fffd 100644 --- a/schemist/features.py +++ b/schemist/features.py @@ -215,7 +215,8 @@ def calculate_fingerprints( fp_generator = generator_class(radius=radius, includeChirality=chiral) - mols = (_smiles2mol(s) for s in cast(strings, to=list)) + strings = cast(strings, to=list) + mols = (_smiles2mol(s) for s in strings) fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits) for mol in mols) @@ -238,8 +239,11 @@ def calculate_fingerprints( if return_dataframe: if feature_matrix.ndim == 1: # on_bits only - feature_matrix = DataFrame(feature_matrix, - columns=['fp_bits']) + feature_matrix = DataFrame( + feature_matrix, + columns=['fp_bits'], + index=strings, + ) else: feature_matrix = DataFrame(feature_matrix, columns=[f"fp_{i}" for i, _ in enumerate(feature_matrix.T)]) From 54ab3939e2736b1e68ac5eb5c986e589643e5d20 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 16:49:41 +0100 Subject: [PATCH 11/12] Update docs --- =2.0 | 1 - README.md | 37 +++++++++--- docs/source/index.md | 8 ++- docs/source/installation.md | 17 ++++++ docs/source/modules.rst | 7 +++ docs/source/schemist.rst | 109 ++++++++++++++++++++++++++++++++++++ docs/source/usage.md | 55 ++++++++++++++++++ 7 files changed, 224 insertions(+), 10 deletions(-) delete 
mode 100644 =2.0 create mode 100644 docs/source/installation.md create mode 100644 docs/source/modules.rst create mode 100644 docs/source/schemist.rst create mode 100644 docs/source/usage.md diff --git a/=2.0 b/=2.0 deleted file mode 100644 index 8c3d0ec..0000000 --- a/=2.0 +++ /dev/null @@ -1 +0,0 @@ -Requirement already satisfied: numpy in /camp/home/johnsoe/.conda/envs/schemist/lib/python3.12/site-packages (1.26.4) diff --git a/README.md b/README.md index 33032ef..435cf63 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,40 @@ pip install -e . ## Command-line usage -**schemist** provides command-line utlities to ... The tools complete specific tasks which -can be easily composed into analysis pipelines, because the TSV table output goes to -`stdout` by default so they can be piped from one tool to another. - -To get a list of commands (tools), do +**schemist** provides command-line utilities. The list of commands can be checked like so: ```bash -schemist --help +$ schemist --help +usage: schemist [-h] [--version] {clean,convert,featurize,collate,dedup,enumerate,react,split} ... + +Tools for cleaning, collating, and augmenting chemical datasets. + +options: + -h, --help show this help message and exit + --version, -v show program's version number and exit + +Sub-commands: + {clean,convert,featurize,collate,dedup,enumerate,react,split} + Use these commands to specify the tool you want to use. + clean Clean and normalize SMILES column of a table. + convert Convert between string representations of chemical structures. + featurize Convert between string representations of chemical structures. + collate Collect disparate tables or SDF files of libraries into a single table. + dedup Deduplicate chemical structures and retain references. + enumerate Enumerate bio-chemical structures within length and sequence constraints. + react React compounds in silico in indicated columns using a named reaction. 
+ split Split table based on chosen algorithm, optionally taking account of chemical structure during splits. ``` -And to get help for a specific command, do +Each command is designed to work on large data files in a streaming fashion, so that the entire file is not held in memory at once. One caveat is that the scaffold-based splits are very slow with tables of millions of rows. + +All commands (except `collate`) take from the input table a named column with a SMILES, SELFIES, amino-acid sequence, HELM, or InChI representation of compounds. + +The tools complete specific tasks which +can be easily composed into analysis pipelines, because the TSV table output goes to +`stdout` by default so they can be piped from one tool to another. + +To get help for a specific command, do ```bash schemist <command> --help diff --git a/docs/source/index.md b/docs/source/index.md index fdc8e0c..3f449d6 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -4,7 +4,7 @@ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist) ![PyPI](https://img.shields.io/pypi/v/schemist) -Cleaning, collating, and augmenting chemical datasets. +Organizing and processing tables of chemical structures. ```{toctree} :maxdepth: 2 @@ -16,6 +16,10 @@ python modules ``` +## Issues, problems, suggestions + +Add to the [issue tracker](https://github.com/scbirlab/schemist/issues). + ## Source -`GitHub `_ \ No newline at end of file +View source at [GitHub](https://github.com/scbirlab/schemist). \ No newline at end of file diff --git a/docs/source/installation.md b/docs/source/installation.md new file mode 100644 index 0000000..b6e8f6a --- /dev/null +++ b/docs/source/installation.md @@ -0,0 +1,17 @@ +# Installation + +## The easy way + +Install the pre-built package from PyPI: + +```bash +$ pip install schemist +``` + +## From source + +Clone the [repository](https://github.com/scbirlab/schemist), then `cd` into it. Then run: + +```bash +pip install -e . 
+``` \ No newline at end of file diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..7ea050d --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +schemist +======== + +.. toctree:: + :maxdepth: 4 + + schemist diff --git a/docs/source/schemist.rst b/docs/source/schemist.rst new file mode 100644 index 0000000..0a238c5 --- /dev/null +++ b/docs/source/schemist.rst @@ -0,0 +1,109 @@ +schemist package +================ + +Submodules +---------- + +schemist.cleaning module +------------------------ + +.. automodule:: schemist.cleaning + :members: + :undoc-members: + :show-inheritance: + +schemist.cli module +------------------- + +.. automodule:: schemist.cli + :members: + :undoc-members: + :show-inheritance: + +schemist.collating module +------------------------- + +.. automodule:: schemist.collating + :members: + :undoc-members: + :show-inheritance: + +schemist.converting module +-------------------------- + +.. automodule:: schemist.converting + :members: + :undoc-members: + :show-inheritance: + +schemist.features module +------------------------ + +.. automodule:: schemist.features + :members: + :undoc-members: + :show-inheritance: + +schemist.generating module +-------------------------- + +.. automodule:: schemist.generating + :members: + :undoc-members: + :show-inheritance: + +schemist.io module +------------------ + +.. automodule:: schemist.io + :members: + :undoc-members: + :show-inheritance: + +schemist.rest\_lookup module +---------------------------- + +.. automodule:: schemist.rest_lookup + :members: + :undoc-members: + :show-inheritance: + +schemist.splitting module +------------------------- + +.. automodule:: schemist.splitting + :members: + :undoc-members: + :show-inheritance: + +schemist.tables module +---------------------- + +.. automodule:: schemist.tables + :members: + :undoc-members: + :show-inheritance: + +schemist.typing module +---------------------- + +.. 
automodule:: schemist.typing + :members: + :undoc-members: + :show-inheritance: + +schemist.utils module +--------------------- + +.. automodule:: schemist.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: schemist + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/usage.md b/docs/source/usage.md new file mode 100644 index 0000000..a7916ce --- /dev/null +++ b/docs/source/usage.md @@ -0,0 +1,55 @@ +# Usage + +**schemist** has a variety of utilities which can be used through the command-line or the [Python API](#python-api). + +## Command-line usage + +**schemist** provides command-line utilities. The list of commands can be checked like so: + +```bash +$ schemist --help +usage: schemist [-h] [--version] {clean,convert,featurize,collate,dedup,enumerate,react,split} ... + +Tools for cleaning, collating, and augmenting chemical datasets. + +options: + -h, --help show this help message and exit + --version, -v show program's version number and exit + +Sub-commands: + {clean,convert,featurize,collate,dedup,enumerate,react,split} + Use these commands to specify the tool you want to use. + clean Clean and normalize SMILES column of a table. + convert Convert between string representations of chemical structures. + featurize Convert between string representations of chemical structures. + collate Collect disparate tables or SDF files of libraries into a single table. + dedup Deduplicate chemical structures and retain references. + enumerate Enumerate bio-chemical structures within length and sequence constraints. + react React compounds in silico in indicated columns using a named reaction. + split Split table based on chosen algorithm, optionally taking account of chemical structure during splits. +``` + +Each command is designed to work on large data files in a streaming fashion, so that the entire file is not held in memory at once. 
One caveat is that the scaffold-based splits are very slow with tables of millions of rows. + +All commands (except `collate`) take from the input table a named column with a SMILES, SELFIES, amino-acid sequence, HELM, or InChI representation of compounds. + +The tools complete specific tasks which +can be easily composed into analysis pipelines, because the TSV table output goes to +`stdout` by default so they can be piped from one tool to another. + +To get help for a specific command, do + +```bash +schemist <command> --help +``` + +For the Python API, [see below](#python-api). + + +## Python API + +You can access the underlying functions of `schemist` to help custom analyses or develop other tools. + +```python +>>> import schemist as sch +``` \ No newline at end of file From dcfdcecdae1693eb5025d66bff25e7039ebdac2e Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 11 Oct 2024 16:50:10 +0100 Subject: [PATCH 12/12] Update docs --- README.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/README.md b/README.md index 435cf63..f6d5c23 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ Cleaning, collating, and augmenting chemical datasets. - [Installation](#installation) - [Command-line usage](#command-line-usage) - - [Example](#example) - - [Other commands](#other-commands) - [Python API](#python-api) - [Documentation](#documentation) @@ -74,12 +72,6 @@ schemist <command> --help For the Python API, [see below](#python-api). -## Example - - -## Other commands - - ## Python API @@ -89,8 +81,6 @@ For the Python API, [see below](#python-api). >>> import schemist as sch ``` - - ## Documentation Full API documentation is at [ReadTheDocs](https://schemist.readthedocs.org). \ No newline at end of file