Fix graph style and make the graph changes persist

VIDA-NYU · Jun 10, 2024 · b0064d5 · b0064d5
1 parent 58227cb
commit b0064d5
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 29 deletions.
diff --git a/.gitignore b/.gitignore
@@ -74,3 +74,6 @@ nosetests.xml
 # Model
 *.pt
 **/*.pt
+
+# Scope Reducing Json
+examples/*.json
diff --git a/bdikit/api.py b/bdikit/api.py
@@ -10,8 +10,10 @@
 from bdikit.utils import get_gdc_data
 from os.path import join, dirname
 import os
+import logging
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable huggingface messages
+logger = logging.getLogger(__name__)
 
 GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")
 
@@ -64,7 +66,19 @@ def reduce_scope(self):
         """
         self.scope_manager = ScopeReducingManager(self.dataset, self.global_table)
         self.reduced_scope = self.scope_manager.reduce()
-        plot_reduce_scope(self.reduced_scope, self.dataset)
+        return self.scope_manager.get_heatmap()
+
+    def update_scope(self, reduced_scope=None):
+        if self.scope_manager is None:
+            logger.warning("Scope manager not initialized. Please run reduce_scope() first.")
+            return
+
+        if reduced_scope is None:
+            self.reduced_scope = self.scope_manager.visualization_manager.reduced_scope
+        else:
+            self.reduced_scope = reduced_scope
+
+        return self.reduced_scope
 
     def map_columns(self, algorithm="SimFloodAlgorithm"):
         """

diff --git a/bdikit/mapping_recommendation/scope_reducing_manager.py b/bdikit/mapping_recommendation/scope_reducing_manager.py
@@ -1,12 +1,20 @@
 from bdikit.mapping_algorithms.scope_reducing.algorithms import YurongReducer
+from bdikit.visualization.scope_reducing import SRHeatMapManager
 
 
 class ScopeReducingManager:
     def __init__(self, dataset, target_domain):
         self.dataset = dataset
         self.target_domain = target_domain
         self.best_method = YurongReducer()
+        self.visualization_manager = None
 
     def reduce(self):
         reducings = self.best_method.reduce_scope(self.dataset)
+        self.visualization_manager = SRHeatMapManager(self.dataset, reducings)
         return reducings
+
+    def get_heatmap(self):
+        self.visualization_manager.get_heatmap()
+        return self.visualization_manager.plot_heatmap()
+
diff --git a/bdikit/utils.py b/bdikit/utils.py
@@ -53,3 +53,14 @@ def get_gdc_metadata():
             metadata[key] = data
 
     return metadata
+
+
+def get_gdc_layered_metadata():
+    metadata = {}
+    gdc_schema = read_gdc_schema()
+
+    for subschema, values in gdc_schema.items():
+        for key, data in values["properties"].items():
+            metadata[key] = (subschema, data)
+
+    return metadata
diff --git a/bdikit/visualization/mappings.py b/bdikit/visualization/mappings.py
@@ -10,7 +10,7 @@
 def plot_reduce_scope(reduced_scope, dataset):
     scope_explorer = SRHeatMapManager(dataset, reduced_scope)
     scope_explorer.get_heatmap()
-    display(scope_explorer.plot_heatmap())
+    return scope_explorer.plot_heatmap()
 
 
 def plot_column_mappings(column_mappings):

diff --git a/bdikit/visualization/scope_reducing.py b/bdikit/visualization/scope_reducing.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 import panel as pn
-from bdikit.utils import get_gdc_metadata, read_gdc_schema
+from bdikit.utils import get_gdc_layered_metadata, get_gdc_metadata, read_gdc_schema
 from Levenshtein import distance
 from natsort import index_natsorted
 from sklearn.cluster import AffinityPropagation
@@ -18,30 +18,38 @@
 
 
 def clean_reduced_scope(reduced_scope, max_chars_samples):
-    gdc_metadata = get_gdc_metadata()
+    gdc_metadata = get_gdc_layered_metadata()
 
     candidates_dfs = {}
 
     for column_data in reduced_scope:
         column_name = column_data["Candidate column"]
         recommendations = []
         for candidate_name, candidate_similarity in column_data["Top k columns"]:
-            candidate_description = gdc_metadata[candidate_name].get("description", "")
+            subschema, gdc_data = gdc_metadata[candidate_name]
+            candidate_description = gdc_data.get("description", "")
             candidate_description = candidate_description
-            candidate_values = ", ".join(gdc_metadata[candidate_name].get("enum", []))
+            candidate_values = ", ".join(gdc_data.get("enum", []))
             candidate_values = truncate_text(candidate_values, max_chars_samples)
             recommendations.append(
                 (
                     candidate_name,
                     candidate_similarity,
                     candidate_description,
                     candidate_values,
+                    subschema,
                 )
             )
 
         candidates_dfs[column_name] = pd.DataFrame(
             recommendations,
-            columns=["Candidate", "Similarity", "Description", "Values (sample)"],
+            columns=[
+                "Candidate",
+                "Similarity",
+                "Description",
+                "Values (sample)",
+                "Subschema",
+            ],
         )
 
     return candidates_dfs
@@ -225,6 +233,7 @@ def get_heatmap(self):
                         "Value": c[1],
                         "Description": cadidate_info["Description"].values[0],
                         "Values (sample)": cadidate_info["Values (sample)"].values[0],
+                        "Subschema": cadidate_info["Subschema"].values[0],
                     }
                 )
             rec_table.append(col_dict)
@@ -368,26 +377,47 @@ def get_clusters(self):
             clusters[exemplar] = cluster
         self.clusters = clusters
 
-    def _plot_heatmap_base(self, heatmap_rec_list):
+    def _plot_heatmap_base(self, heatmap_rec_list, show_subschema):
         single = alt.selection_point(name="single")
-        base = (
-            alt.Chart(heatmap_rec_list)
-            .mark_rect(size=100)
-            .encode(
-                y=alt.X("Column:O", sort=None),
-                x=alt.X(f"Recommendation:O", sort=None),
-                color=alt.condition(single, "Value:Q", alt.value("lightgray")),
-                # color="Value:Q",
-                tooltip=[
-                    alt.Tooltip("Column", title="Column"),
-                    alt.Tooltip("Recommendation", title="Recommendation"),
-                    alt.Tooltip("Value", title="Correlation Score"),
-                    alt.Tooltip("Description", title="Description"),
-                    alt.Tooltip("Values (sample)", title="Values (sample)"),
-                ],
+        if show_subschema:
+            base = (
+                alt.Chart(heatmap_rec_list)
+                .mark_rect(size=100)
+                .encode(
+                    y=alt.X("Column:O", sort=None),
+                    x=alt.X(f"Recommendation:O", sort=None),
+                    color=alt.condition(single, "Value:Q", alt.value("lightgray")),
+                    # color="Value:Q",
+                    tooltip=[
+                        alt.Tooltip("Column", title="Column"),
+                        alt.Tooltip("Recommendation", title="Recommendation"),
+                        alt.Tooltip("Value", title="Correlation Score"),
+                        alt.Tooltip("Description", title="Description"),
+                        alt.Tooltip("Values (sample)", title="Values (sample)"),
+                    ],
+                    facet=alt.Facet("Subschema:O", columns=1),
+                )
+                .add_params(single)
+            )
+        else:
+            base = (
+                alt.Chart(heatmap_rec_list)
+                .mark_rect(size=100)
+                .encode(
+                    y=alt.X("Column:O", sort=None),
+                    x=alt.X(f"Recommendation:O", sort=None),
+                    color=alt.condition(single, "Value:Q", alt.value("lightgray")),
+                    # color="Value:Q",
+                    tooltip=[
+                        alt.Tooltip("Column", title="Column"),
+                        alt.Tooltip("Recommendation", title="Recommendation"),
+                        alt.Tooltip("Value", title="Correlation Score"),
+                        alt.Tooltip("Description", title="Description"),
+                        alt.Tooltip("Values (sample)", title="Values (sample)"),
+                    ],
+                )
+                .add_params(single)
             )
-            .add_params(single)
-        )
         return pn.pane.Vega(base)
 
     def _plot_selected_row(self, heatmap_rec_list, selection):
@@ -443,7 +473,6 @@ def _candidates_table(self, heatmap_rec_list, selection):
 
     def _plot_column_histogram(self, column):
         if self.dataset[column].dtype == "float64":
-            print(column)
             chart = (
                 alt.Chart(self.dataset.fillna("Null"), height=300)
                 .mark_bar()
@@ -485,6 +514,7 @@ def _plot_pane(
         subschemas=[],
         n_similar=0,
         threshold=0.5,
+        show_subschema=False,
         acc_click=0,
         rej_click=0,
     ):
@@ -522,7 +552,7 @@ def _plot_pane(
                 heatmap_rec_list["Recommendation"].isin(subschema_rec_cols)
             ]
 
-        heatmap_pane = self._plot_heatmap_base(heatmap_rec_list)
+        heatmap_pane = self._plot_heatmap_base(heatmap_rec_list, show_subschema)
         cand_table = pn.bind(
             self._candidates_table,
             heatmap_rec_list,
@@ -557,7 +587,7 @@ def plot_heatmap(self):
             name="Recommendation subschema", options=self.subschemas, width=220
         )
         n_similar_slider = pn.widgets.IntSlider(
-            name="N Similar", start=1, end=5, value=5, width=220
+            name="N Similar", start=0, end=5, value=0, width=220
         )
         thresh_slider = pn.widgets.EditableFloatSlider(
             name="Threshold", start=0, end=1.0, step=0.01, value=0.1, width=220
@@ -567,6 +597,9 @@ def plot_heatmap(self):
 
         rej_button = pn.widgets.Button(name="Decline Match", button_type="danger")
 
+        # Style
+        show_subschema = pn.widgets.Checkbox(name="Show subschema", value=False)
+
         def on_click_accept_match(event):
             self._accept_match()
 
@@ -582,13 +615,14 @@ def on_click_reject_match(event):
             select_rec_groups,
             n_similar_slider,
             thresh_slider,
+            show_subschema,
             acc_button.param.clicks,
             rej_button.param.clicks,
         )
 
         column_left = pn.Row(
-            "# Column",
             select_column,
+            show_subschema,
             select_rec_groups,
             n_similar_slider,
             thresh_slider,
Original file line number | Diff line change
@@ -74,3 +74,6 @@ nosetests.xml
 # Model
 *.pt
 **/*.pt
+
+# Scope Reducing Json
+examples/*.json