diff --git a/README.md b/README.md index c3152dd4..8a29dc25 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ [![PyPI version](https://badge.fury.io/py/bdi-kit.svg)](https://pypi.org/project/bdi-kit) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Documentation Status](https://readthedocs.org/projects/bdi-kit/badge/?version=latest)](https://bdi-kit.readthedocs.io/en/latest/?badge=latest) +[![Tests](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/build.yml/badge.svg)](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/build.yml) +[![Lint](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/lint.yml/badge.svg)](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/lint.yml) + # bdi-kit This project aims to assist users in performing data integration on biomedical data. It provides tools to streamline the process of integrating disparate biomedical datasets. @@ -21,7 +25,7 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel ## Documentation -See our examples [here](https://github.com/VIDA-NYU/bdi-kit/tree/devel/examples). +Documentation is available [here](https://bdi-kit.readthedocs.io/). ## Contributing We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter. diff --git a/bdikit/api.py b/bdikit/api.py index b35d1ad1..e79556dc 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -20,6 +20,9 @@ class APIManager: def __init__( self, ): + """ + Create/instantiate an APIManager object. + """ # TODO: move into database object (in data_ingestion folder) self.dataset = None # TODO: move into database object (in data_ingestion folder) @@ -32,6 +35,11 @@ def __init__( self.value_mappings = None # TODO move this to a property in value_manager def load_global_table(self, global_table_path=None): + """ + Load a global/target table. + + :param global_table_path: Path to the CSV dataset. 
+ """ if global_table_path is None: self.global_table = load_dataframe(GDC_DATA_PATH) else: @@ -39,6 +47,11 @@ def load_global_table(self, global_table_path=None): return self.global_table def load_dataset(self, dataset_path): + """ + Load the dataset. + + :param dataset_path: Path to the CSV dataset. + """ if self.global_table is None: self.load_global_table() self.dataset = load_dataframe(dataset_path) @@ -46,11 +59,21 @@ def load_dataset(self, dataset_path): return self.dataset def reduce_scope(self): + """ + Reduce the scope of the target domain. Supports only GDC format. + """ self.scope_manager = ScopeReducingManager(self.dataset, self.global_table) self.reduced_scope = self.scope_manager.reduce() plot_reduce_scope(self.reduced_scope, self.dataset) def map_columns(self, algorithm="SimFloodAlgorithm"): + """ + Map columns. + + :param algorithm: The algorithm to perform the column mappings. Supports: `SimFloodAlgorithm`, + `ComaAlgorithm`, `CupidAlgorithm`, `DistributionBasedAlgorithm`, `JaccardDistanceAlgorithm`, + and `GPTAlgorithm`. + """ self.column_manager = ColumnMappingManager( self.dataset, self.global_table, algorithm ) @@ -61,6 +84,13 @@ def map_columns(self, algorithm="SimFloodAlgorithm"): return self.column_mappings def map_values(self, algorithm="EditAlgorithm"): + """ + Map column values. + + :param algorithm: The algorithm to perform the value mappings. Supports: `EditAlgorithm`, + `TFIDFAlgorithm`, `EmbeddingAlgorithm` and `LLMAlgorithm`. + + """ self.global_table_all = get_gdc_data(self.column_mappings.values()) self.value_manager = ValueMappingManager( self.dataset, self.column_mappings, self.global_table_all, algorithm @@ -73,6 +103,13 @@ def map_values(self, algorithm="EditAlgorithm"): def update_reduced_scope( self, original_column, new_candidate_name, new_candidate_sim=1.0 ): + """ + Update the values returned by the scope reducer algorithm. + + :param original_column: Name of the original column. 
+ :param new_candidate_name: New name of the candidate column. + :param new_candidate_sim: New similarity of the candidate column. + """ for index in range(len(self.reduced_scope)): if self.reduced_scope[index]["Candidate column"] == original_column: self.reduced_scope[index]["Top k columns"].append( @@ -83,6 +120,11 @@ def update_reduced_scope( break def update_column_mappings(self, new_mappings): + """ + Update the column mappings. + + :param new_mappings: List of tuples (original_column, correct_target_column). + """ for original_column, new_target_column in new_mappings: self.column_mappings[original_column] = new_target_column @@ -92,6 +134,14 @@ def update_column_mappings(self, new_mappings): def update_value_mappings( self, original_column, original_value, new_target_value, new_similarity=1.0 ): + """ + Update the column value mappings. + + :param original_column: Name of the original column. + :param original_value: Name of the original column value. + :param new_target_value: New name of the target column value. + :param new_similarity: New similarity of the original and target column values. + """ for index in range(len(self.value_mappings[original_column]["matches"])): if ( self.value_mappings[original_column]["matches"][index][0] diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..6247f7e2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 00000000..fe4f7256 --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,7 @@ +API +========= + +.. module:: bdikit.api + +.. autoclass:: APIManager + :members: diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..3a35ed8e --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,91 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys + +sys.path.insert(0, os.path.abspath("../../")) + + +# -- Project information ----------------------------------------------------- + +project = "bdi-kit" +copyright = "2024, NYU" +author = "NYU" + +# The full version, including alpha/beta/rc tags +release = "" +master_doc = "index" + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "nbsphinx", + "nbsphinx_link", + "sphinxemoji.sphinxemoji", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+# html_static_path = ['_static'] + +# html_logo = 'images/logo.png' + +# html_theme_options = { +# 'logo_only': True, +# 'display_version': False, +# } + +autodoc_member_order = "bysource" + +autoclass_content = "both" + +add_module_names = False + +autodoc_mock_imports = [ + "sklearn", + "pandas", + "numpy", + "IPython", + "torch", + "transformers", + "matplotlib", + "openai", + "polyfuzz", + "flair", + "autofj", + "Levenshtein", + "valentine", + "altair", + "panel", +] diff --git a/docs/source/examples.rst b/docs/source/examples.rst new file mode 100644 index 00000000..5f82a39f --- /dev/null +++ b/docs/source/examples.rst @@ -0,0 +1,7 @@ +Examples +========== + +Here you can find different Jupyter notebook examples about how to use `bdi-kit`: + +- `Column and Value Mappings `__ + diff --git a/docs/source/getting-started.ipynb b/docs/source/getting-started.ipynb new file mode 100644 index 00000000..3564bbfb --- /dev/null +++ b/docs/source/getting-started.ipynb @@ -0,0 +1,2122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b6a5938c-d32e-4816-ba31-a0b54e4a9826", + "metadata": {}, + "source": [ + "# Getting Started" + ] + }, + { + "cell_type": "markdown", + "id": "1af374aa-f7b2-4329-b605-faf71c17e6b0", + "metadata": {}, + "source": [ + "First, import the class `APIManager`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f01f77a6-f349-4f71-a847-c26b69a60c41", + "metadata": {}, + "outputs": [], + "source": [ + "from bdikit import APIManager" + ] + }, + { + "cell_type": "markdown", + "id": "fb0b6170-c8e5-4ff3-9948-a55acb913563", + "metadata": {}, + "source": [ + "## Dataset Loading" + ] + }, + { + "cell_type": "markdown", + "id": "ee01a713-6cea-42a7-9a64-7aeaba7b0857", + "metadata": {}, + "source": [ + "In this example, we are mapping data from Dou et al. (https://pubmed.ncbi.nlm.nih.gov/37567170/) to the GDC format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "99afdf9a-b68a-4a30-9735-f8ef36027757", + "metadata": {}, + "outputs": [], + "source": [ + "manager = APIManager()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "34e5d030-a50d-4076-9e96-7e4e73fe7cda", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryHistologic_Grade_FIGOHistologic_typePath_Stage_Primary_Tumor-pTPath_Stage_Reg_Lymph_Nodes-pNClin_Stage_Dist_Mets-cMPath_Stage_Dist_Mets-pMtumor_Stage-PathologicalFIGO_stageBMIAgeRaceEthnicityGenderTumor_SiteTumor_FocalityTumor_Size_cm
0United StatesFIGO grade 1EndometrioidpT1a (FIGO IA)pN0cM0Staging IncompleteStage IIA38.8864.0WhiteNot-Hispanic or LatinoFemaleAnterior endometriumUnifocal2.9
1United StatesFIGO grade 1EndometrioidpT1a (FIGO IA)pNXcM0Staging IncompleteStage IVIA39.7658.0WhiteNot-Hispanic or LatinoFemalePosterior endometriumUnifocal3.5
2United StatesFIGO grade 2EndometrioidpT1a (FIGO IA)pN0cM0Staging IncompleteStage IIA51.1950.0WhiteNot-Hispanic or LatinoFemaleOther, specifyUnifocal4.5
3NaNNaNCarcinosarcomaNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4United StatesFIGO grade 2EndometrioidpT1a (FIGO IA)pNXcM0No pathologic evidence of distant metastasisStage IIA32.6975.0WhiteNot-Hispanic or LatinoFemaleOther, specifyUnifocal3.5
......................................................
99UkraineFIGO grade 3EndometrioidpT1a (FIGO IA)pNXcM0Staging IncompleteStage IIA29.4075.0NaNNaNFemaleOther, specifyUnifocal4.2
100UkraineFIGO grade 2EndometrioidpT2 (FIGO II)pN0cM0Staging IncompleteStage IIII35.4274.0NaNNaNFemaleOther, specifyUnifocal1.5
101United StatesNaNSerouspT2 (FIGO II)pN0Staging IncompleteStaging IncompleteStage IIII24.3285.0Black or African AmericanNot-Hispanic or LatinoFemaleOther, specifyUnifocal3.8
102UkraineNaNSerouspT1a (FIGO IA)pN0cM0Staging IncompleteStage IIA34.0670.0NaNNaNFemaleOther, specifyUnifocal5.0
103UkraineNaNSerousNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

104 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Country Histologic_Grade_FIGO Histologic_type \\\n", + "0 United States FIGO grade 1 Endometrioid \n", + "1 United States FIGO grade 1 Endometrioid \n", + "2 United States FIGO grade 2 Endometrioid \n", + "3 NaN NaN Carcinosarcoma \n", + "4 United States FIGO grade 2 Endometrioid \n", + ".. ... ... ... \n", + "99 Ukraine FIGO grade 3 Endometrioid \n", + "100 Ukraine FIGO grade 2 Endometrioid \n", + "101 United States NaN Serous \n", + "102 Ukraine NaN Serous \n", + "103 Ukraine NaN Serous \n", + "\n", + " Path_Stage_Primary_Tumor-pT Path_Stage_Reg_Lymph_Nodes-pN \\\n", + "0 pT1a (FIGO IA) pN0 \n", + "1 pT1a (FIGO IA) pNX \n", + "2 pT1a (FIGO IA) pN0 \n", + "3 NaN NaN \n", + "4 pT1a (FIGO IA) pNX \n", + ".. ... ... \n", + "99 pT1a (FIGO IA) pNX \n", + "100 pT2 (FIGO II) pN0 \n", + "101 pT2 (FIGO II) pN0 \n", + "102 pT1a (FIGO IA) pN0 \n", + "103 NaN NaN \n", + "\n", + " Clin_Stage_Dist_Mets-cM Path_Stage_Dist_Mets-pM \\\n", + "0 cM0 Staging Incomplete \n", + "1 cM0 Staging Incomplete \n", + "2 cM0 Staging Incomplete \n", + "3 NaN NaN \n", + "4 cM0 No pathologic evidence of distant metastasis \n", + ".. ... ... \n", + "99 cM0 Staging Incomplete \n", + "100 cM0 Staging Incomplete \n", + "101 Staging Incomplete Staging Incomplete \n", + "102 cM0 Staging Incomplete \n", + "103 NaN NaN \n", + "\n", + " tumor_Stage-Pathological FIGO_stage BMI Age \\\n", + "0 Stage I IA 38.88 64.0 \n", + "1 Stage IV IA 39.76 58.0 \n", + "2 Stage I IA 51.19 50.0 \n", + "3 NaN NaN NaN NaN \n", + "4 Stage I IA 32.69 75.0 \n", + ".. ... ... ... ... \n", + "99 Stage I IA 29.40 75.0 \n", + "100 Stage II II 35.42 74.0 \n", + "101 Stage II II 24.32 85.0 \n", + "102 Stage I IA 34.06 70.0 \n", + "103 NaN NaN NaN NaN \n", + "\n", + " Race Ethnicity Gender \\\n", + "0 White Not-Hispanic or Latino Female \n", + "1 White Not-Hispanic or Latino Female \n", + "2 White Not-Hispanic or Latino Female \n", + "3 NaN NaN NaN \n", + "4 White Not-Hispanic or Latino Female \n", + ".. 
... ... ... \n", + "99 NaN NaN Female \n", + "100 NaN NaN Female \n", + "101 Black or African American Not-Hispanic or Latino Female \n", + "102 NaN NaN Female \n", + "103 NaN NaN NaN \n", + "\n", + " Tumor_Site Tumor_Focality Tumor_Size_cm \n", + "0 Anterior endometrium Unifocal 2.9 \n", + "1 Posterior endometrium Unifocal 3.5 \n", + "2 Other, specify Unifocal 4.5 \n", + "3 NaN NaN NaN \n", + "4 Other, specify Unifocal 3.5 \n", + ".. ... ... ... \n", + "99 Other, specify Unifocal 4.2 \n", + "100 Other, specify Unifocal 1.5 \n", + "101 Other, specify Unifocal 3.8 \n", + "102 Other, specify Unifocal 5.0 \n", + "103 NaN NaN NaN \n", + "\n", + "[104 rows x 17 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset_path = '../../examples/datasets/dou.csv'\n", + "dataset = manager.load_dataset(dataset_path)\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "6d4f458b-610e-4911-b386-53a8261f5188", + "metadata": {}, + "source": [ + "## Reducing the GDC Scope" + ] + }, + { + "cell_type": "markdown", + "id": "ca7e99b8-d571-4d66-9737-1ab0fd4d6d2c", + "metadata": {}, + "source": [ + "Since the GDC contains 700+ attributes, a first step we take is to select a subset of those attributes that are likely matches to the attributes in the Dou et al. schema -- the top-k candidates for each column. We can explore the candidates for each column using the ScopeReducerExplorer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "18eedf76-fbd9-4b86-8f1e-3f8c7a222e43", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + " 0%| | 0/17 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Original ColumnTarget Column
0Countrycountry_of_birth
1Histologic_Grade_FIGOhistologic_progression_type
2Histologic_typedysplasia_type
3Path_Stage_Primary_Tumor-pTuicc_clinical_m
4Path_Stage_Reg_Lymph_Nodes-pNfigo_stage
5Clin_Stage_Dist_Mets-cMinrg_stage
6Path_Stage_Dist_Mets-pMlast_known_disease_status
7tumor_Stage-Pathologicaltumor_grade_category
8FIGO_stagefigo_stage
9BMIage_at_index
10Ageweight
11Racerace
12Ethnicityethnicity
13Gendergender
14Tumor_Sitetumor_shape
15Tumor_Focalitytumor_focality
16Tumor_Size_cmtumor_depth
\n", + "" + ], + "text/plain": [ + " Original Column Target Column\n", + "0 Country country_of_birth\n", + "1 Histologic_Grade_FIGO histologic_progression_type\n", + "2 Histologic_type dysplasia_type\n", + "3 Path_Stage_Primary_Tumor-pT uicc_clinical_m\n", + "4 Path_Stage_Reg_Lymph_Nodes-pN figo_stage\n", + "5 Clin_Stage_Dist_Mets-cM inrg_stage\n", + "6 Path_Stage_Dist_Mets-pM last_known_disease_status\n", + "7 tumor_Stage-Pathological tumor_grade_category\n", + "8 FIGO_stage figo_stage\n", + "9 BMI age_at_index\n", + "10 Age weight\n", + "11 Race race\n", + "12 Ethnicity ethnicity\n", + "13 Gender gender\n", + "14 Tumor_Site tumor_shape\n", + "15 Tumor_Focality tumor_focality\n", + "16 Tumor_Size_cm tumor_depth" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "column_mappings = manager.map_columns()" + ] + }, + { + "cell_type": "markdown", + "id": "5dc5661a-3553-4068-a85c-c9893ae4133c", + "metadata": {}, + "source": [ + "Users can change the algorithm to perform the column mappings. We provide a GPT-based algorithm (`GPTAlgorithm`). To use it, you need to add an environment variable for your OpenAI key (`export OPENAI_API_KEY='your-api-key-here'`)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2e3acf43-56d9-48db-a741-4cac3816fe62", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Original ColumnTarget Column
0Countrycountry_of_residence_at_enrollment
1Histologic_Grade_FIGOtumor_grade
2Histologic_typesample_type
3Path_Stage_Primary_Tumor-pTfigo_stage
4Path_Stage_Reg_Lymph_Nodes-pNajcc_pathologic_n
5Clin_Stage_Dist_Mets-cMajcc_clinical_m
6Path_Stage_Dist_Mets-pMajcc_pathologic_m
7tumor_Stage-Pathologicalajcc_pathologic_stage
8FIGO_stagefigo_stage
9BMIbmi
10Ageage_at_onset
11Racerace
12Ethnicityethnicity
13Genderanalyte_type
14Tumor_Siteprimary_site
15Tumor_Focalitytumor_focality
16Tumor_Size_cmbreslow_thickness
\n", + "
" + ], + "text/plain": [ + " Original Column Target Column\n", + "0 Country country_of_residence_at_enrollment\n", + "1 Histologic_Grade_FIGO tumor_grade\n", + "2 Histologic_type sample_type\n", + "3 Path_Stage_Primary_Tumor-pT figo_stage\n", + "4 Path_Stage_Reg_Lymph_Nodes-pN ajcc_pathologic_n\n", + "5 Clin_Stage_Dist_Mets-cM ajcc_clinical_m\n", + "6 Path_Stage_Dist_Mets-pM ajcc_pathologic_m\n", + "7 tumor_Stage-Pathological ajcc_pathologic_stage\n", + "8 FIGO_stage figo_stage\n", + "9 BMI bmi\n", + "10 Age age_at_onset\n", + "11 Race race\n", + "12 Ethnicity ethnicity\n", + "13 Gender analyte_type\n", + "14 Tumor_Site primary_site\n", + "15 Tumor_Focality tumor_focality\n", + "16 Tumor_Size_cm breslow_thickness" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "column_mappings = manager.map_columns(algorithm='GPTAlgorithm')" + ] + }, + { + "cell_type": "markdown", + "id": "b059686e-0039-4bac-94e2-7a18a0ba55eb", + "metadata": {}, + "source": [ + "Users can update column mappings through the `update_column_mappings` method." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "caf4e53b-64e5-4d59-81d9-2141ff58db5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Column mapping updated!\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Original ColumnTarget Column
0Countrycountry_of_residence_at_enrollment
1Histologic_Grade_FIGOtumor_grade
2Histologic_typeprimary_diagnosis
3Path_Stage_Primary_Tumor-pTajcc_pathologic_t
4Path_Stage_Reg_Lymph_Nodes-pNajcc_pathologic_n
5Clin_Stage_Dist_Mets-cMajcc_clinical_m
6Path_Stage_Dist_Mets-pMajcc_pathologic_m
7tumor_Stage-Pathologicalajcc_pathologic_stage
8FIGO_stagefigo_stage
9BMIbmi
10Ageage_at_onset
11Racerace
12Ethnicityethnicity
13Genderanalyte_type
14Tumor_Siteprimary_site
15Tumor_Focalitytumor_focality
16Tumor_Size_cmbreslow_thickness
\n", + "
" + ], + "text/plain": [ + " Original Column Target Column\n", + "0 Country country_of_residence_at_enrollment\n", + "1 Histologic_Grade_FIGO tumor_grade\n", + "2 Histologic_type primary_diagnosis\n", + "3 Path_Stage_Primary_Tumor-pT ajcc_pathologic_t\n", + "4 Path_Stage_Reg_Lymph_Nodes-pN ajcc_pathologic_n\n", + "5 Clin_Stage_Dist_Mets-cM ajcc_clinical_m\n", + "6 Path_Stage_Dist_Mets-pM ajcc_pathologic_m\n", + "7 tumor_Stage-Pathological ajcc_pathologic_stage\n", + "8 FIGO_stage figo_stage\n", + "9 BMI bmi\n", + "10 Age age_at_onset\n", + "11 Race race\n", + "12 Ethnicity ethnicity\n", + "13 Gender analyte_type\n", + "14 Tumor_Site primary_site\n", + "15 Tumor_Focality tumor_focality\n", + "16 Tumor_Size_cm breslow_thickness" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "manager.update_column_mappings([('Histologic_type', 'primary_diagnosis'), ('Path_Stage_Primary_Tumor-pT', 'ajcc_pathologic_t')])" + ] + }, + { + "cell_type": "markdown", + "id": "81d7a0fd-3077-43e3-9a91-69e6a280075a", + "metadata": {}, + "source": [ + "## Value Mapping" + ] + }, + { + "cell_type": "markdown", + "id": "5a4607e0-4f6a-4b9a-82b3-6f5ff6d184d5", + "metadata": {}, + "source": [ + "Perform value mapping. By default it uses the edit distance algorithm. In this example it will use and LLM-based algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ed0ff201-3984-4b3b-8a0f-eacd91c56bac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Histologic_Grade_FIGO:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0FIGO grade 1G11.0
1FIGO grade 2G21.0
2nanNot Reported1.0
3FIGO grade 3G31.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 FIGO grade 1 G1 1.0\n", + "1 FIGO grade 2 G2 1.0\n", + "2 nan Not Reported 1.0\n", + "3 FIGO grade 3 G3 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Histologic_type:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0EndometrioidEndometrioid adenocarcinoma, NOS1.0
1CarcinosarcomaCarcinosarcoma, NOS1.0
2SerousSerous cystadenocarcinoma, NOS0.6
3Clear cellClear cell carcinoma1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Endometrioid Endometrioid adenocarcinoma, NOS 1.0\n", + "1 Carcinosarcoma Carcinosarcoma, NOS 1.0\n", + "2 Serous Serous cystadenocarcinoma, NOS 0.6\n", + "3 Clear cell Clear cell carcinoma 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Path_Stage_Primary_Tumor-pT:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0pT1a (FIGO IA)T1a1.0
1nanUnknown1.0
2pT3a (FIGO IIIA)T3a1.0
3pT1 (FIGO I)T11.0
4pT1b (FIGO IB)T1b1.0
5pT2 (FIGO II)T21.0
6pT3b (FIGO IIIB)T3b1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 pT1a (FIGO IA) T1a 1.0\n", + "1 nan Unknown 1.0\n", + "2 pT3a (FIGO IIIA) T3a 1.0\n", + "3 pT1 (FIGO I) T1 1.0\n", + "4 pT1b (FIGO IB) T1b 1.0\n", + "5 pT2 (FIGO II) T2 1.0\n", + "6 pT3b (FIGO IIIB) T3b 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Path_Stage_Reg_Lymph_Nodes-pN:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0pN0N01.0
1pNXNX1.0
2nanUnknown1.0
3pN2 (FIGO IIIC2)N21.0
4pN1 (FIGO IIIC1)N11.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 pN0 N0 1.0\n", + "1 pNX NX 1.0\n", + "2 nan Unknown 1.0\n", + "3 pN2 (FIGO IIIC2) N2 1.0\n", + "4 pN1 (FIGO IIIC1) N1 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Clin_Stage_Dist_Mets-cM:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0cM0M00.9
1nanUnknown1.0
2Staging IncompleteUnknown0.9
3cM1M11.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 cM0 M0 0.9\n", + "1 nan Unknown 1.0\n", + "2 Staging Incomplete Unknown 0.9\n", + "3 cM1 M1 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Path_Stage_Dist_Mets-pM:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0Staging IncompleteUnknown0.9
1nanNot Reported1.0
2No pathologic evidence of distant metastasisM01.0
3pM1M11.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Staging Incomplete Unknown 0.9\n", + "1 nan Not Reported 1.0\n", + "2 No pathologic evidence of distant metastasis M0 1.0\n", + "3 pM1 M1 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column tumor_Stage-Pathological:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0Stage IStage I1.0
1Stage IVStage IV1.0
2nanUnknown1.0
3Stage IIIStage III1.0
4Stage IIStage II1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Stage I Stage I 1.0\n", + "1 Stage IV Stage IV 1.0\n", + "2 nan Unknown 1.0\n", + "3 Stage III Stage III 1.0\n", + "4 Stage II Stage II 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column FIGO_stage:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0IAStage IA1.0
1nanNot Reported1.0
2IIIAStage IIIA1.0
3IIIC2Stage IIIC21.0
4IBStage IB1.0
5IIStage II1.0
6IIIC1Stage IIIC11.0
7IVBStage IVB1.0
8IIIBStage IIIB1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 IA Stage IA 1.0\n", + "1 nan Not Reported 1.0\n", + "2 IIIA Stage IIIA 1.0\n", + "3 IIIC2 Stage IIIC2 1.0\n", + "4 IB Stage IB 1.0\n", + "5 II Stage II 1.0\n", + "6 IIIC1 Stage IIIC1 1.0\n", + "7 IVB Stage IVB 1.0\n", + "8 IIIB Stage IIIB 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Race:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0Whitewhite1.0
1nannot reported1.0
2Asianasian1.0
3Not Reportednot reported1.0
4Black or African Americanblack or african american1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 White white 1.0\n", + "1 nan not reported 1.0\n", + "2 Asian asian 1.0\n", + "3 Not Reported not reported 1.0\n", + "4 Black or African American black or african american 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Tumor_Site:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0Anterior endometriumCorpus uteri0.70
1Posterior endometriumCorpus uteri0.85
2Other, specifyUnknown1.00
3nanNot Applicable1.00
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Anterior endometrium Corpus uteri 0.70\n", + "1 Posterior endometrium Corpus uteri 0.85\n", + "2 Other, specify Unknown 1.00\n", + "3 nan Not Applicable 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Tumor_Focality:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0UnifocalUnifocal1.0
1nanUnknown1.0
2MultifocalMultifocal1.0
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Unifocal Unifocal 1.0\n", + "1 nan Unknown 1.0\n", + "2 Multifocal Multifocal 1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Country:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0United StatesUnited States1.0
1Other_specifyAndorra0.0
2UkraineUkraine1.0
3PolandPoland1.0
4nan--
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 United States United States 1.0\n", + "1 Other_specify Andorra 0.0\n", + "2 Ukraine Ukraine 1.0\n", + "3 Poland Poland 1.0\n", + "4 nan - -" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Ethnicity:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0Not-Hispanic or Latinonot hispanic or latino1.0
1nanhispanic or latino0
2Hispanic or Latinohispanic or latino1.0
3Not reported--
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 Not-Hispanic or Latino not hispanic or latino 1.0\n", + "1 nan hispanic or latino 0\n", + "2 Hispanic or Latino hispanic or latino 1.0\n", + "3 Not reported - -" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Column Gender:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Current ValueTarget ValueSimilarity
0nanDNA0.1
1Female--
\n", + "
" + ], + "text/plain": [ + " Current Value Target Value Similarity\n", + "0 nan DNA 0.1\n", + "1 Female - -" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "value_mappings = manager.map_values('LLMAlgorithm')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..f890d2d4 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,16 @@ +Overview +========= + +This project aims to assist users in performing data integration on biomedical data. It provides tools to streamline the process of integrating disparate biomedical datasets. + +You can find the source code in our `GitHub repository `__. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + installation + getting-started + examples + api diff --git a/docs/source/installation.rst b/docs/source/installation.rst new file mode 100644 index 00000000..18d6d50b --- /dev/null +++ b/docs/source/installation.rst @@ -0,0 +1,15 @@ +Installation +============ + +This package works with Python 3.8+ in Linux and Mac. 
You can install the latest stable version of this library from `PyPI <https://pypi.org/project/bdi-kit>`__: + +:: + + $ pip install bdi-kit + + +To install the latest development version: + +:: + + $ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel diff --git a/readthedocs.yml b/readthedocs.yml new file mode 100644 index 00000000..e124acca --- /dev/null +++ b/readthedocs.yml @@ -0,0 +1,21 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - requirements: docs/requirements_docs.txt \ No newline at end of file