Skip to content

Commit

Permalink
Merge branch 'devel' into value-mapping-algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
EduardoPena authored Jun 4, 2024
2 parents 22178cb + e87c17e commit c85a9a4
Show file tree
Hide file tree
Showing 21 changed files with 456 additions and 345 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
name: Lint

on: [push, pull_request]

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
with:
src: ./bdikit/
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
all: test
SRC := ./bdikit/

PHONY: test
all: lint test

PHONY: format test lint

lint:
black --check ${SRC}

test:
python3 -m pytest

format:
black ${SRC}
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,25 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel
## Documentation
See our examples [here](https://github.com/VIDA-NYU/bdi-kit/tree/devel/examples).

## Contributing
We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs for every pull request and will fail if code is not properly formatted.
To make sure the formatting is correct, follow the steps below.

Make sure you have black installed:
```
pip install black
```

To format the code, run the following command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```

## Folder Structure

Expand Down
4 changes: 2 additions & 2 deletions bdikit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = '0.2.0.dev0'
__version__ = "0.2.0.dev0"
# To shortcut the import path
from bdikit.api import APIManager
from bdikit.api import APIManager
66 changes: 44 additions & 22 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@
from bdikit.mapping_recommendation.scope_reducing_manager import ScopeReducingManager
from bdikit.mapping_recommendation.value_mapping_manager import ValueMappingManager
from bdikit.mapping_recommendation.column_mapping_manager import ColumnMappingManager
from bdikit.visualization.mappings import plot_reduce_scope, plot_column_mappings, plot_value_mappings
from bdikit.visualization.mappings import (
plot_reduce_scope,
plot_column_mappings,
plot_value_mappings,
)
from bdikit.utils import get_gdc_data
from os.path import join, dirname
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages

GDC_DATA_PATH = join(dirname(__file__), './resource/gdc_table.csv')
GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")


class APIManager():

def __init__(self,):
class APIManager:
def __init__(
self,
):
# TODO: move into database object (in data_ingestion folder)
self.dataset = None
# TODO: move into database object (in data_ingestion folder)
Expand All @@ -23,8 +28,8 @@ def __init__(self,):
self.reduced_scope = None
self.column_manager = None
self.value_manager = None
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager

def load_global_table(self, global_table_path=None):
if global_table_path is None:
Expand All @@ -45,41 +50,58 @@ def reduce_scope(self):
self.reduced_scope = self.scope_manager.reduce()
plot_reduce_scope(self.reduced_scope, self.dataset)

def map_columns(self, algorithm='SimFloodAlgorithm'):
self.column_manager = ColumnMappingManager(self.dataset, self.global_table, algorithm)
def map_columns(self, algorithm="SimFloodAlgorithm"):
self.column_manager = ColumnMappingManager(
self.dataset, self.global_table, algorithm
)
self.column_manager.reduced_scope = self.reduced_scope
self.column_mappings = self.column_manager.map()
plot_column_mappings(self.column_mappings)

return self.column_mappings

def map_values(self, algorithm='EditAlgorithm'):
def map_values(self, algorithm="EditAlgorithm"):
self.global_table_all = get_gdc_data(self.column_mappings.values())
self.value_manager = ValueMappingManager(self.dataset, self.column_mappings, self.global_table_all, algorithm)
self.value_manager = ValueMappingManager(
self.dataset, self.column_mappings, self.global_table_all, algorithm
)
self.value_mappings = self.value_manager.map()
plot_value_mappings(self.value_mappings)

return self.value_mappings

def update_reduced_scope(self, original_column, new_candidate_name, new_candidate_sim=1.0):
def update_reduced_scope(
self, original_column, new_candidate_name, new_candidate_sim=1.0
):
for index in range(len(self.reduced_scope)):
if self.reduced_scope[index]['Candidate column'] == original_column:
self.reduced_scope[index]['Top k columns'].append((new_candidate_name, new_candidate_sim))
print('Reduced scope updated!')
if self.reduced_scope[index]["Candidate column"] == original_column:
self.reduced_scope[index]["Top k columns"].append(
(new_candidate_name, new_candidate_sim)
)
print("Reduced scope updated!")
plot_reduce_scope(self.reduced_scope)
break

def update_column_mappings(self, new_mappings):
for original_column, new_target_column in new_mappings:
self.column_mappings[original_column] = new_target_column

print('Column mapping updated!')
print("Column mapping updated!")
plot_column_mappings(self.column_mappings)

def update_value_mappings(self, original_column, original_value, new_target_value, new_similarity=1.0):
for index in range(len(self.value_mappings[original_column]['matches'])):
if self.value_mappings[original_column]['matches'][index][0] == original_value:
self.value_mappings[original_column]['matches'][index] = (original_value, new_target_value, new_similarity)
print('Value mapping updated!')
def update_value_mappings(
self, original_column, original_value, new_target_value, new_similarity=1.0
):
for index in range(len(self.value_mappings[original_column]["matches"])):
if (
self.value_mappings[original_column]["matches"][index][0]
== original_value
):
self.value_mappings[original_column]["matches"][index] = (
original_value,
new_target_value,
new_similarity,
)
print("Value mapping updated!")
plot_value_mappings(self.value_mappings)
break
34 changes: 18 additions & 16 deletions bdikit/data_ingestion/column.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
from enum import Enum


class ColumnType(Enum):
STRING = 'string'
FLOAT = 'float'
INTEGER = 'integer'
# TODO semantic types?
STRING = "string"
FLOAT = "float"
INTEGER = "integer"
# TODO semantic types?


class Column:
def __init__(self, df_name, column_name, column_type=ColumnType.STRING, domain_values=None, null_values_representations=None):
def __init__(
self,
df_name,
column_name,
column_type=ColumnType.STRING,
domain_values=None,
null_values_representations=None,
):
self.df_name = df_name
self.column_name = column_name
self.column_type = column_type

if domain_values is None:
self.domain_values = set()
else:
self.domain_values = set(domain_values)

if null_values_representations is None:
self.null_values_representations = set()
else:
self.null_values_representations = set(null_values_representations)



def __str__(self):
return f"Column(df_name={self.df_name}, column_name={self.column_name}, column_type={self.column_type}, domain_values={self.domain_values}, null_values_representations={self.null_values_representations})"

def __eq__(self, value):
if not isinstance(value, Column):
return False
return self.df_name == value.df_name and self.column_name == value.column_name

def __hash__(self):
return hash((self.df_name, self.column_name))





12 changes: 6 additions & 6 deletions bdikit/data_ingestion/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from .column import Column, ColumnType


class Database:
"""
A class representing a database that stores dataframes.
Expand All @@ -14,7 +15,7 @@ class Database:
load_data(df_name, file_path): Load data from a CSV file into a dataframe and store it in the database.
load_data_from_folder(folder_path): Load data from all CSV files in a folder.
get_dataframe(df_name): Retrieve a dataframe by its name.
get_dataframe_names(): Get the names of all dataframes stored in the database.
get_dataframe_names(): Get the names of all dataframes stored in the database.
describe_database(): Print out the names, shape, columns, and head of all dataframes stored in the database.
"""

Expand All @@ -32,7 +33,8 @@ def load_data(self, df_name, file_path):
"""
if df_name in self.dataframes:
raise ValueError(
f"Dataframe associated with file name '{df_name}' already exists in the database.")
f"Dataframe associated with file name '{df_name}' already exists in the database."
)

df = pd.read_csv(file_path)
self.dataframes[df_name] = df
Expand All @@ -42,7 +44,6 @@ def load_data(self, df_name, file_path):
column = Column(df_name, c, ColumnType.STRING)
self.columns.add(column)


def load_data_from_folder(self, folder_path):
"""
Function to load data from all CSV files in a folder using the Database class.
Expand Down Expand Up @@ -76,7 +77,7 @@ def get_dataframe_names(self):
list: A list of dataframe names.
"""
return list(self.dataframes.keys())

def get_columns(self):
"""
Get the names of all columns stored in the database.
Expand All @@ -99,7 +100,6 @@ def describe_database(self):
# print(f"\t\t- Head: \n{self.dataframes[df_name].head()}")



# def main():
# col1 = Column('df1', 'col1', ColumnType.STRING, ['a', 'b', 'c'], ['n/a', 'na'])
# col2 = Column('df1', 'col2', ColumnType.INTEGER, [1, 2, 3], ['n/a', 'na'])
Expand All @@ -112,4 +112,4 @@ def describe_database(self):
# print(col3 == col4)

# if __name__ == "__main__":
# main()
# main()
2 changes: 1 addition & 1 deletion bdikit/data_ingestion/dataset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
def load_dataframe(dataset_path):
dataset = pd.read_csv(dataset_path)

return dataset
return dataset
2 changes: 1 addition & 1 deletion bdikit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_cached_model_or_download(model_name: str):
if len(sys.argv) < 2:
print("Please provide a model_id as a command line argument.")
sys.exit(1)

model_id = sys.argv[1]
model_path = get_cached_model_or_download(model_id)
print(f"Downloaded model: {model_path}")
Loading

0 comments on commit c85a9a4

Please sign in to comment.