Skip to content

Commit

Permalink
Merge pull request #42 from VIDA-NYU/docs
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez authored Jun 5, 2024
2 parents 04c62b0 + a19d48b commit d3dcbef
Show file tree
Hide file tree
Showing 11 changed files with 2,389 additions and 1 deletion.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
[![PyPI version](https://badge.fury.io/py/bdi-kit.svg)](https://pypi.org/project/bdi-kit)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Documentation Status](https://readthedocs.org/projects/bdi-kit/badge/?version=latest)](https://bdi-kit.readthedocs.io/en/latest/?badge=latest)
[![Tests](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/build.yml/badge.svg)](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/build.yml)
[![Lint](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/lint.yml/badge.svg)](https://github.com/VIDA-NYU/bdi-kit/actions/workflows/lint.yml)


# bdi-kit
This project aims to assist users in performing data integration on biomedical data. It provides tools to streamline the process of integrating disparate biomedical datasets.
Expand All @@ -21,7 +25,7 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel


## Documentation
See our examples [here](https://github.com/VIDA-NYU/bdi-kit/tree/devel/examples).
Documentation is available [here](https://bdi-kit.readthedocs.io/).

## Contributing
We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
Expand Down
50 changes: 50 additions & 0 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class APIManager:
def __init__(
self,
):
"""
Create/instantiate an APIManager object.
"""
# TODO: move into database object (in data_ingestion folder)
self.dataset = None
# TODO: move into database object (in data_ingestion folder)
Expand All @@ -32,25 +35,45 @@ def __init__(
self.value_mappings = None # TODO move this to a property in value_manager

def load_global_table(self, global_table_path=None):
    """
    Load the global/target table and keep it on the manager.

    :param global_table_path: Path to the CSV file with the target table. When it is
        ``None`` the table is read from ``GDC_DATA_PATH`` instead.
    :return: The value produced by ``load_dataframe``, which is also stored in
        ``self.global_table``.
    """
    # Only an explicit ``None`` triggers the fallback to the default GDC path.
    table_path = GDC_DATA_PATH if global_table_path is None else global_table_path
    self.global_table = load_dataframe(table_path)
    return self.global_table

def load_dataset(self, dataset_path):
    """
    Load the source dataset and keep it on the manager.

    If no global table has been loaded yet, ``load_global_table()`` is called first
    without arguments, so its default path is used.

    :param dataset_path: Path to the CSV dataset.
    :return: The value produced by ``load_dataframe``, which is also stored in
        ``self.dataset``.
    """
    global_table_missing = self.global_table is None
    if global_table_missing:
        self.load_global_table()

    loaded_dataset = load_dataframe(dataset_path)
    self.dataset = loaded_dataset
    return self.dataset

def reduce_scope(self):
    """
    Reduce the scope of the target domain and plot the result. Supports only GDC format.

    A ``ScopeReducingManager`` is built from the loaded dataset and global table and
    stored in ``self.scope_manager``. The outcome of its ``reduce()`` call is stored in
    ``self.reduced_scope`` and then handed to ``plot_reduce_scope`` together with the
    dataset.
    """
    manager = ScopeReducingManager(self.dataset, self.global_table)
    # Publish the manager before reducing so it stays reachable if reduce() fails.
    self.scope_manager = manager

    reduced = manager.reduce()
    self.reduced_scope = reduced
    plot_reduce_scope(reduced, self.dataset)

def map_columns(self, algorithm="SimFloodAlgorithm"):
"""
Map columns.
:param algorithm: The algorithm to perform the column mappings. Supports: `SimFloodAlgorithm`,
`ComaAlgorithm`, `CupidAlgorithm`, `DistributionBasedAlgorithm`, `JaccardDistanceAlgorithm`,
and `GPTAlgorithm`.
"""
self.column_manager = ColumnMappingManager(
self.dataset, self.global_table, algorithm
)
Expand All @@ -61,6 +84,13 @@ def map_columns(self, algorithm="SimFloodAlgorithm"):
return self.column_mappings

def map_values(self, algorithm="EditAlgorithm"):
"""
Map column values.
:param algorithm: The algorithm to perform the value mappings. Supports: `EditAlgorithm`,
`TFIDFAlgorithm`, `EmbeddingAlgorithm` and `LLMAlgorithm`.
"""
self.global_table_all = get_gdc_data(self.column_mappings.values())
self.value_manager = ValueMappingManager(
self.dataset, self.column_mappings, self.global_table_all, algorithm
Expand All @@ -73,6 +103,13 @@ def map_values(self, algorithm="EditAlgorithm"):
def update_reduced_scope(
self, original_column, new_candidate_name, new_candidate_sim=1.0
):
"""
Update the values returned by the scope reducer algorithm.
:param original_column: Name of the original column.
:param new_candidate_name: New name of the candidate column.
:param new_candidate_sim: New similarity of the candidate column.
"""
for index in range(len(self.reduced_scope)):
if self.reduced_scope[index]["Candidate column"] == original_column:
self.reduced_scope[index]["Top k columns"].append(
Expand All @@ -83,6 +120,11 @@ def update_reduced_scope(
break

def update_column_mappings(self, new_mappings):
"""
Update the column mappings.
:param new_mappings: List of tuples (original_column, correct_target_column).
"""
for original_column, new_target_column in new_mappings:
self.column_mappings[original_column] = new_target_column

Expand All @@ -92,6 +134,14 @@ def update_column_mappings(self, new_mappings):
def update_value_mappings(
self, original_column, original_value, new_target_value, new_similarity=1.0
):
"""
Update the column value mappings.
:param original_column: Name of the original column.
:param original_value: Name of the original column value.
:param new_target_value: New name of the target column value.
:param new_similarity: New similarity of the original and target column values.
"""
for index in range(len(self.value_mappings[original_column]["matches"])):
if (
self.value_mappings[original_column]["matches"][index][0]
Expand Down
20 changes: 20 additions & 0 deletions docs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# Every target is forwarded to sphinx-build's "make mode" (-M), so this file
# only defines the variables passed to it and two forwarding rules.

# You can set these variables from the command line, and also
# from the environment for the first two.
# SPHINXOPTS: extra options appended to every sphinx-build invocation.
SPHINXOPTS ?=
# SPHINXBUILD: the sphinx-build executable to run.
SPHINXBUILD ?= sphinx-build
# SOURCEDIR / BUILDDIR: the source and build directories passed to sphinx-build.
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is declared phony alongside "help"; it is the prerequisite of the
# catch-all rule below.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ expands to the requested target name (for example "html").
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/make.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
@ECHO OFF

REM Run from the directory that contains this script; the matching popd is at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Use sphinx-build unless the caller already set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
REM Source and build directories passed to sphinx-build.
set SOURCEDIR=source
set BUILDDIR=build

REM With no argument, show the sphinx-build help output.
if "%1" == "" goto help

REM Probe for the executable with all output discarded; the errorlevel 9009
REM check below prints installation guidance and exits with code 1.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)

REM Forward the requested target (first argument) to sphinx-build's -M mode.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
7 changes: 7 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
API
=========

.. module:: bdikit.api

.. autoclass:: APIManager
:members:
91 changes: 91 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Sphinx configuration for the bdi-kit documentation.
#
# Only a selection of the most common options is set here. The full list is
# documented at:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# Modules documented with autodoc must be importable, so the directory two
# levels up is made absolute and placed first on sys.path.
import os
import sys

_PROJECT_ROOT = os.path.abspath("../../")
sys.path.insert(0, _PROJECT_ROOT)


# -- Project information -----------------------------------------------------

project = "bdi-kit"
author = "NYU"
copyright = "2024, NYU"

# Full version string, including alpha/beta/rc tags (left empty here).
release = ""
master_doc = "index"


# -- General configuration ---------------------------------------------------

# Sphinx extension module names, either shipped with Sphinx ('sphinx.ext.*')
# or custom ones.
extensions = [
    "sphinx.ext.autodoc",
    "nbsphinx",
    "nbsphinx_link",
    "sphinxemoji.sphinxemoji",
]

# Directories containing templates, relative to this directory.
templates_path = ["_templates"]

# Patterns, relative to the source directory, of files and directories to skip
# when looking for source files. They also affect html_static_path and
# html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# Theme for HTML and HTML Help pages.
html_theme = "sphinx_rtd_theme"

# Optional settings, currently disabled:
#
# Custom static files (such as style sheets), relative to this directory. They
# are copied after the builtin static files, so a file named "default.css"
# will overwrite the builtin "default.css".
# html_static_path = ['_static']
#
# html_logo = 'images/logo.png'
#
# html_theme_options = {
#     'logo_only': True,
#     'display_version': False,
# }


# -- autodoc -----------------------------------------------------------------

autodoc_member_order = "bysource"
autoclass_content = "both"
add_module_names = False

# Modules listed here are mocked by autodoc instead of being imported.
autodoc_mock_imports = [
    "sklearn",
    "pandas",
    "numpy",
    "IPython",
    "torch",
    "transformers",
    "matplotlib",
    "openai",
    "polyfuzz",
    "flair",
    "autofj",
    "Levenshtein",
    "valentine",
    "altair",
    "panel",
]
7 changes: 7 additions & 0 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Examples
==========

Here you can find different Jupyter notebook examples of how to use `bdi-kit`:

- `Column and Value Mappings <https://github.com/VIDA-NYU/bdi-kit/blob/devel/examples/column_and_value_mapping.ipynb>`__

Loading

0 comments on commit d3dcbef

Please sign in to comment.