diff --git a/bdikit/mapping_algorithms/value_mapping/value_mappers.py b/bdikit/mapping_algorithms/value_mapping/value_mappers.py
index 5b8b635d..d6ad11d3 100644
--- a/bdikit/mapping_algorithms/value_mapping/value_mappers.py
+++ b/bdikit/mapping_algorithms/value_mapping/value_mappers.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from typing import Callable
+from typing import Any, Callable
 
 
 class ValueMapper:
@@ -52,12 +52,25 @@ class DictionaryMapper(ValueMapper):
     values stored in the provided dictionary.
     """
 
-    def __init__(self, dictionary: dict):
-        self.dictionary = dictionary
+    def __init__(self, dictionary: dict, missing_data_value: Any = None):
+        """
+        Stores the dictionary used by map() together with the value that
+        map() returns for inputs that have no entry in the dictionary.
+        """
+        # Copy the input so later changes to the caller's dict do not leak in.
+        self.dictionary = dict(dictionary)
+        self.missing_data_value = missing_data_value
 
     def map(self, input_column: pd.Series) -> pd.Series:
         """
         Transforms the values in the input_column to the values specified in the
         dictionary provided using the object constructor.
         """
-        return input_column.map(self.dictionary)
+        # Look values up with dict.get instead of a defaultdict: indexing a
+        # defaultdict inserts every unseen key, so each call would silently
+        # grow self.dictionary. Missing (NaN) inputs are skipped entirely.
+        lookup = self.dictionary.get
+        default = self.missing_data_value
+        return input_column.map(
+            lambda value: lookup(value, default), na_action="ignore"
+        )
diff --git a/tests/test_api.py b/tests/test_api.py
index b9a40c6c..1d21ae74 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -208,6 +208,42 @@ def test_value_mapping_dataframe():
     assert len(src_column_mapping["matches"]) == 3
 
 
-# TODO
-# def test_preview_value_mappings():
-#     pass
+def test_end_to_end_api_integration():
+    """Runs column matching, value matching and materialization in sequence."""
+    # given
+    df_source = pd.DataFrame(
+        {"src_column": ["Red Apple", "Banana", "Oorange", "Strawberry"]}
+    )
+    df_target = pd.DataFrame(
+        {"tgt_column": ["apple", "banana", "orange", "kiwi", "grapes"]}
+    )
+
+    # when
+    column_mappings = bdi.match_columns(df_source, df_target, method="coma")
+    # then
+    assert column_mappings is not None
+    assert not column_mappings.empty
+    assert "source" in column_mappings.columns
+    assert "target" in column_mappings.columns
+
+    # when
+    value_mappings = bdi.match_values(
+        df_source, df_target, column_mappings, method="tfidf"
+    )
+
+    # then
+    assert value_mappings is not None
+    assert "src_column" in value_mappings
+    assert value_mappings["src_column"]["matches"] is not None
+    assert value_mappings["src_column"]["target"] == "tgt_column"
+
+    src_column_mapping = value_mappings["src_column"]
+    assert len(src_column_mapping["matches"]) == 3
+
+    # when
+    harmonization_spec = bdi.update_mappings(value_mappings, [])
+    df_mapped = bdi.materialize_mapping(df_source, harmonization_spec)
+
+    # then
+    assert "tgt_column" in df_mapped.columns
+    assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", None]