Skip to content

Commit

Permalink
fix graph style and make the graph changes last
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Jun 10, 2024
1 parent 58227cb commit b0064d5
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 29 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@ nosetests.xml
# Model
*.pt
**/*.pt

# Scope Reducing Json
examples/*.json
16 changes: 15 additions & 1 deletion bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
from bdikit.utils import get_gdc_data
from os.path import join, dirname
import os
import logging

os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages
logger = logging.getLogger(__name__)

GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")

Expand Down Expand Up @@ -64,7 +66,19 @@ def reduce_scope(self):
"""
self.scope_manager = ScopeReducingManager(self.dataset, self.global_table)
self.reduced_scope = self.scope_manager.reduce()
plot_reduce_scope(self.reduced_scope, self.dataset)
return self.scope_manager.get_heatmap()

def update_scope(self, reduced_scope=None):
    """Refresh ``self.reduced_scope`` and return it.

    When ``reduced_scope`` is given it is stored as-is. Otherwise the value
    currently held by the scope manager's visualization manager is used.

    If no scope manager exists yet, a warning is logged and ``None`` is
    returned without touching ``self.reduced_scope``.
    """
    manager = self.scope_manager
    if manager is None:
        logger.warning("Scope manager not initialized. Please run reduce_scope() first.")
        return None

    # An explicit argument wins; fall back to the visualization manager's state.
    self.reduced_scope = (
        manager.visualization_manager.reduced_scope
        if reduced_scope is None
        else reduced_scope
    )
    return self.reduced_scope

def map_columns(self, algorithm="SimFloodAlgorithm"):
"""
Expand Down
8 changes: 8 additions & 0 deletions bdikit/mapping_recommendation/scope_reducing_manager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
from bdikit.mapping_algorithms.scope_reducing.algorithms import YurongReducer
from bdikit.visualization.scope_reducing import SRHeatMapManager


class ScopeReducingManager:
    """Run scope reduction on a dataset and expose its heatmap view."""

    def __init__(self, dataset, target_domain):
        """Store the inputs and create the reducer used by ``reduce()``."""
        self.dataset = dataset
        self.target_domain = target_domain
        self.best_method = YurongReducer()
        # Created by reduce(); stays None until then.
        self.visualization_manager = None

    def reduce(self):
        """Reduce the scope of ``self.dataset`` and return the result.

        As a side effect, a heatmap manager is built from the dataset and
        the reduction result and kept on ``self.visualization_manager``.
        """
        reduced = self.best_method.reduce_scope(self.dataset)
        self.visualization_manager = SRHeatMapManager(self.dataset, reduced)
        return reduced

    def get_heatmap(self):
        """Compute the heatmap data, then return the plotted heatmap."""
        manager = self.visualization_manager
        manager.get_heatmap()
        return manager.plot_heatmap()

11 changes: 11 additions & 0 deletions bdikit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,14 @@ def get_gdc_metadata():
metadata[key] = data

return metadata


def get_gdc_layered_metadata():
    """Map each GDC property name to ``(subschema, property_data)``.

    Walks every subschema returned by ``read_gdc_schema()`` and indexes the
    entries of its ``"properties"`` mapping by property name. If the same
    property name appears in several subschemas, the one iterated last wins.
    """
    return {
        prop_name: (subschema_name, prop_data)
        for subschema_name, subschema in read_gdc_schema().items()
        for prop_name, prop_data in subschema["properties"].items()
    }
2 changes: 1 addition & 1 deletion bdikit/visualization/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
def plot_reduce_scope(reduced_scope, dataset):
    """Build the scope-reducing heatmap for ``dataset`` and return it.

    Args:
        reduced_scope: scope-reducing result handed to ``SRHeatMapManager``.
        dataset: the dataset handed to ``SRHeatMapManager``.

    Returns:
        Whatever ``SRHeatMapManager.plot_heatmap()`` returns.
    """
    scope_explorer = SRHeatMapManager(dataset, reduced_scope)
    scope_explorer.get_heatmap()
    # Build the plot exactly once and hand it back to the caller instead of
    # display()-ing one instance and returning a second, separate one.
    return scope_explorer.plot_heatmap()


def plot_column_mappings(column_mappings):
Expand Down
88 changes: 61 additions & 27 deletions bdikit/visualization/scope_reducing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd
import panel as pn
from bdikit.utils import get_gdc_metadata, read_gdc_schema
from bdikit.utils import get_gdc_layered_metadata, get_gdc_metadata, read_gdc_schema
from Levenshtein import distance
from natsort import index_natsorted
from sklearn.cluster import AffinityPropagation
Expand All @@ -18,30 +18,38 @@


def clean_reduced_scope(reduced_scope, max_chars_samples):
    """Turn raw scope-reducing output into one candidates table per column.

    Args:
        reduced_scope: iterable of dicts, each providing a
            ``"Candidate column"`` name and a ``"Top k columns"`` iterable of
            ``(candidate_name, similarity)`` pairs.
        max_chars_samples: limit forwarded to ``truncate_text`` for the
            comma-joined sample values.

    Returns:
        dict mapping each candidate column name to a ``pandas.DataFrame`` with
        the columns ``Candidate``, ``Similarity``, ``Description``,
        ``Values (sample)`` and ``Subschema``.

    Raises:
        KeyError: if a candidate name is not present in the layered GDC
            metadata.
    """
    # Layered metadata maps property name -> (subschema, property data).
    gdc_metadata = get_gdc_layered_metadata()

    candidates_dfs = {}
    for column_data in reduced_scope:
        column_name = column_data["Candidate column"]
        recommendations = []
        for candidate_name, candidate_similarity in column_data["Top k columns"]:
            subschema, gdc_data = gdc_metadata[candidate_name]
            candidate_description = gdc_data.get("description", "")
            candidate_values = truncate_text(
                ", ".join(gdc_data.get("enum", [])), max_chars_samples
            )
            recommendations.append(
                (
                    candidate_name,
                    candidate_similarity,
                    candidate_description,
                    candidate_values,
                    subschema,
                )
            )

        candidates_dfs[column_name] = pd.DataFrame(
            recommendations,
            columns=[
                "Candidate",
                "Similarity",
                "Description",
                "Values (sample)",
                "Subschema",
            ],
        )

    return candidates_dfs
Expand Down Expand Up @@ -225,6 +233,7 @@ def get_heatmap(self):
"Value": c[1],
"Description": cadidate_info["Description"].values[0],
"Values (sample)": cadidate_info["Values (sample)"].values[0],
"Subschema": cadidate_info["Subschema"].values[0],
}
)
rec_table.append(col_dict)
Expand Down Expand Up @@ -368,26 +377,47 @@ def get_clusters(self):
clusters[exemplar] = cluster
self.clusters = clusters

def _plot_heatmap_base(self, heatmap_rec_list, show_subschema=False):
    """Build the interactive recommendation heatmap as a Panel Vega pane.

    Args:
        heatmap_rec_list: data passed to ``alt.Chart``; the encodings below
            reference the fields ``Column``, ``Recommendation``, ``Value``,
            ``Description``, ``Values (sample)`` and, when faceting,
            ``Subschema``.
        show_subschema: when true, facet the chart by ``Subschema`` in a
            single column of panels. Defaults to ``False`` so callers that
            pass only the data keep working.

    Returns:
        ``pn.pane.Vega`` wrapping the chart, which carries a point selection
        parameter named ``"single"``.
    """
    single = alt.selection_point(name="single")

    # Encodings shared by the plain and the faceted variant.
    encodings = {
        "y": alt.Y("Column:O", sort=None),
        "x": alt.X("Recommendation:O", sort=None),
        # Cells outside the current selection are greyed out.
        "color": alt.condition(single, "Value:Q", alt.value("lightgray")),
        "tooltip": [
            alt.Tooltip("Column", title="Column"),
            alt.Tooltip("Recommendation", title="Recommendation"),
            alt.Tooltip("Value", title="Correlation Score"),
            alt.Tooltip("Description", title="Description"),
            alt.Tooltip("Values (sample)", title="Values (sample)"),
        ],
    }
    if show_subschema:
        encodings["facet"] = alt.Facet("Subschema:O", columns=1)

    base = (
        alt.Chart(heatmap_rec_list)
        .mark_rect(size=100)
        .encode(**encodings)
        .add_params(single)
    )
    return pn.pane.Vega(base)

def _plot_selected_row(self, heatmap_rec_list, selection):
Expand Down Expand Up @@ -443,7 +473,6 @@ def _candidates_table(self, heatmap_rec_list, selection):

def _plot_column_histogram(self, column):
if self.dataset[column].dtype == "float64":
print(column)
chart = (
alt.Chart(self.dataset.fillna("Null"), height=300)
.mark_bar()
Expand Down Expand Up @@ -485,6 +514,7 @@ def _plot_pane(
subschemas=[],
n_similar=0,
threshold=0.5,
show_subschema=False,
acc_click=0,
rej_click=0,
):
Expand Down Expand Up @@ -522,7 +552,7 @@ def _plot_pane(
heatmap_rec_list["Recommendation"].isin(subschema_rec_cols)
]

heatmap_pane = self._plot_heatmap_base(heatmap_rec_list)
heatmap_pane = self._plot_heatmap_base(heatmap_rec_list, show_subschema)
cand_table = pn.bind(
self._candidates_table,
heatmap_rec_list,
Expand Down Expand Up @@ -557,7 +587,7 @@ def plot_heatmap(self):
name="Recommendation subschema", options=self.subschemas, width=220
)
n_similar_slider = pn.widgets.IntSlider(
name="N Similar", start=1, end=5, value=5, width=220
name="N Similar", start=0, end=5, value=0, width=220
)
thresh_slider = pn.widgets.EditableFloatSlider(
name="Threshold", start=0, end=1.0, step=0.01, value=0.1, width=220
Expand All @@ -567,6 +597,9 @@ def plot_heatmap(self):

rej_button = pn.widgets.Button(name="Decline Match", button_type="danger")

# Style
show_subschema = pn.widgets.Checkbox(name="Show subschema", value=False)

def on_click_accept_match(event):
self._accept_match()

Expand All @@ -582,13 +615,14 @@ def on_click_reject_match(event):
select_rec_groups,
n_similar_slider,
thresh_slider,
show_subschema,
acc_button.param.clicks,
rej_button.param.clicks,
)

column_left = pn.Row(
"# Column",
select_column,
show_subschema,
select_rec_groups,
n_similar_slider,
thresh_slider,
Expand Down

0 comments on commit b0064d5

Please sign in to comment.