Skip to content

Commit

Permalink
Added basic value mapping objects and materialization
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jun 3, 2024
1 parent 73451be commit b840fd5
Show file tree
Hide file tree
Showing 2 changed files with 187 additions and 0 deletions.
85 changes: 85 additions & 0 deletions bdikit/mapping_algorithms/value_mapping/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pandas as pd
from typing import List


class ValueMapper:
    """
    A ValueMapper represents objects that transform the values in an input
    column to the values of a new output column.

    This class only defines the interface: concrete mappers subclass it and
    override map().
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Every concrete ValueMapper should implement this method, which takes a
        pandas Series as input and returns a new pandas Series with transformed
        values.

        Raises:
            NotImplementedError: when called on a class that does not
                override this method.
        """
        # Fail loudly rather than implicitly returning None: a None result
        # contradicts the declared return type and only surfaces later as an
        # unrelated error in the code that consumes the mapped column.
        raise NotImplementedError(
            f"{type(self).__name__} must implement map(input_column)"
        )


class IdentityValueMapper(ValueMapper):
    """
    A value mapper whose output column holds the same values as its input
    column.
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns a copy of input_column, made with Series.copy() and its
        default arguments, so the output is a separate Series object.
        """
        duplicate = input_column.copy()
        return duplicate


class FunctionValueMapper(ValueMapper):
    """
    A value mapper that produces the output column by passing the values of
    the input column through a user-supplied function.
    """

    def __init__(self, function):
        # Stored untouched; it is only used when map() is called.
        self.function = function

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns the result of input_column.map(...) called with the function
        given to the constructor.
        """
        transform = self.function
        return input_column.map(transform)


class DictionaryMapper(ValueMapper):
    """
    A value mapper that produces the output column by looking up the values
    of the input column in a user-supplied dictionary.
    """

    def __init__(self, dictionary: dict):
        # Stored untouched; it is only used when map() is called.
        self.dictionary = dictionary

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns the result of input_column.map(...) called with the dictionary
        given to the constructor, so the handling of values that have no
        dictionary entry is whatever pandas' Series.map does for them.
        """
        lookup_table = self.dictionary
        return input_column.map(lookup_table)


def map_column_values(
    input_column: pd.Series, target: str, value_mapper: ValueMapper
) -> pd.Series:
    """
    Maps the values of input_column with value_mapper and returns the mapped
    column named after target.

    Args:
        input_column: the column whose values are transformed.
        target: the name given to the returned column.
        value_mapper: the object whose map() method produces the new values.

    Returns:
        A Series holding the mapped values, with its name set to target.
    """
    mapped_column = value_mapper.map(input_column)
    # Series.rename() with a scalar returns a new Series object carrying the
    # new name. Assigning to .name instead would modify the object returned by
    # the mapper in place, which renames the caller's input column whenever a
    # mapper hands back the very Series it was given.
    return mapped_column.rename(target)


def materialize_mapping(
    input_dataframe: pd.DataFrame, target: List[dict]
) -> pd.DataFrame:
    """
    Builds a new DataFrame by applying a list of column mapping
    specifications to input_dataframe.

    Each entry of target is a dict with three keys: "from" (name of the
    column read from input_dataframe), "to" (name of the column written to
    the result) and "mapper" (the value mapper passed to map_column_values).
    Entries are processed in list order.
    """
    result = pd.DataFrame()
    for spec in target:
        source_name = spec["from"]
        destination_name = spec["to"]
        mapper = spec["mapper"]
        source_column = input_dataframe[source_name]
        result[destination_name] = map_column_values(
            source_column, destination_name, mapper
        )
    return result
102 changes: 102 additions & 0 deletions tests/test_value_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import unittest
import pandas as pd
from bdikit.mapping_algorithms.value_mapping import (
map_column_values,
materialize_mapping,
FunctionValueMapper,
DictionaryMapper,
IdentityValueMapper,
)


class ValueMappingTest(unittest.TestCase):
    """
    Tests for the value mappers, map_column_values and materialize_mapping.

    Assertions compare plain Python lists with assertEqual/assertIn so that a
    failing test reports the actual and expected values instead of only
    "False is not true".
    """

    def test_identity_mapper(self):
        # given
        str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")
        identity_mapper = IdentityValueMapper()

        # when
        mapped_column = identity_mapper.map(str_column)

        # then
        self.assertEqual(mapped_column.tolist(), ["a", "b", "c", "d", "e"])

    def test_dictionary_mapper(self):
        # given
        str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")
        dict_mapper = DictionaryMapper(
            dictionary={"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
        )

        # when
        mapped_column = dict_mapper.map(str_column)

        # then
        self.assertEqual(mapped_column.tolist(), [1, 2, 3, 4, 5])

    def test_custom_function_mapper(self):
        # given
        str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")
        fn_mapper = FunctionValueMapper(function=lambda x: x + x)

        # when
        mapped_column = fn_mapper.map(str_column)

        # then
        self.assertEqual(mapped_column.tolist(), ["aa", "bb", "cc", "dd", "ee"])

    def test_map_column_values(self):
        """
        Ensures that the map_column_values function correctly maps the values of
        a column and assigns the target name.
        """
        # given
        str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")
        value_mapper = FunctionValueMapper(function=lambda x: x.upper())
        target_column_name = "string column"

        # when
        mapped_column = map_column_values(
            str_column, target=target_column_name, value_mapper=value_mapper
        )

        # then
        upper_cased_values = ["A", "B", "C", "D", "E"]
        self.assertEqual(mapped_column.name, target_column_name)
        self.assertEqual(mapped_column.tolist(), upper_cased_values)

    def test_map_dataframe_column_values(self):
        """
        Ensures that materialize_mapping creates one output column per mapping
        specification, named after its "to" entry and holding the mapped values.
        """
        # given
        str_column_1 = ["a", "b", "c", "d", "e"]
        str_column_2 = ["a", "b", "c", "d", "e"]
        df_base = pd.DataFrame(
            {"column_str_1": str_column_1, "column_str_2": str_column_2}
        )

        value_mapping_spec = [
            {
                "from": "column_str_1",
                "to": "string column 1",
                "mapper": IdentityValueMapper(),
            },
            {
                "from": "column_str_2",
                "to": "string column 2",
                "mapper": FunctionValueMapper(function=lambda x: x.upper()),
            },
        ]

        # when
        df_mapped = materialize_mapping(df_base, target=value_mapping_spec)

        # then
        self.assertEqual(len(df_mapped.columns), 2)

        self.assertIn("string column 1", df_mapped.columns)
        self.assertEqual(df_mapped["string column 1"].tolist(), str_column_1)

        self.assertIn("string column 2", df_mapped.columns)
        self.assertEqual(
            df_mapped["string column 2"].tolist(), ["A", "B", "C", "D", "E"]
        )

0 comments on commit b840fd5

Please sign in to comment.