Skip to content

Commit

Permalink
refactor: mallow aggregator
Browse files Browse the repository at this point in the history
  • Loading branch information
taharallouche committed Nov 21, 2023
1 parent 6c2c5ae commit 67d59db
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 64 deletions.
89 changes: 34 additions & 55 deletions size_matters/aggregation/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,12 @@
import pandas as pd
import ray

from size_matters.utils.inventory import COLUMNS, RELIABILITY_BOUNDS, RULES, Dataset
from size_matters.utils.inventory import COLUMNS, RELIABILITY_BOUNDS, RULES


@ray.remote
def apply_standard_approval_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:
alternatives = [
column
for column in annotations.columns
if column != COLUMNS.question and column != COLUMNS.voter
]
alternatives = _get_alternatives(annotations)

approvals_per_question = annotations.groupby(COLUMNS.question, sort=False)[
alternatives
Expand All @@ -32,11 +28,7 @@ def apply_standard_approval_aggregator(annotations: pd.DataFrame) -> pd.DataFram

@ray.remote
def apply_condorcet_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:
alternatives = [
column
for column in annotations.columns
if column != COLUMNS.question and column != COLUMNS.voter
]
alternatives = _get_alternatives(annotations)
questions = list(annotations[COLUMNS.question].unique())
number_of_alternatives = len(alternatives)

Expand Down Expand Up @@ -70,54 +62,41 @@ def apply_condorcet_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:

@ray.remote
def apply_mallow_aggregator(
    annotations: pd.DataFrame,
    distance: str = RULES.jaccard,
) -> pd.DataFrame:
    """Aggregate approval annotations with size-weighted (Mallows) voting.

    Each voter's ballot on a question is weighted by a function of its size
    (number of approved alternatives); the weight formula is the optimal
    Mallows noise weight for the chosen distance.

    :param annotations: dataframe of answers as binary vectors, one row per
        (question, voter) pair, with one binary column per alternative
    :param distance: distance of the noise model (RULES.euclid,
        RULES.jaccard or RULES.dice)
    :return: dataframe with one row per question: the question identifier
        plus one boolean column per alternative, True for the winner
    :raises ValueError: if ``distance`` is not a supported rule
    """
    alternatives = _get_alternatives(annotations)

    questions = list(annotations[COLUMNS.question].unique())
    aggregated_labels = pd.DataFrame(columns=[COLUMNS.question] + alternatives)

    for i, question in enumerate(questions):
        question_annotations = annotations[annotations[COLUMNS.question] == question][
            alternatives
        ].to_numpy()

        # Ballot size = number of alternatives each voter approved.
        vote_size = np.sum(question_annotations, axis=1)

        if distance == RULES.euclid:
            weights = np.sqrt(vote_size + 1) - np.sqrt(vote_size - 1)
        elif distance == RULES.jaccard:
            weights = 1 / vote_size
        elif distance == RULES.dice:
            weights = 2 / (vote_size + 1)
        else:
            # Fail fast instead of hitting a NameError on `weights` below.
            raise ValueError(f"Unknown distance rule: {distance!r}")

        # Score each alternative by the weighted sum of approvals; the
        # highest-scoring alternative is the aggregated label.
        weighted_scores = np.matmul(weights.T, question_annotations)
        most_likely_label = np.argmax(weighted_scores)

        aggregated_labels.loc[i] = [question] + [
            label == most_likely_label for label in range(len(alternatives))
        ]

    return aggregated_labels


def _get_alternatives(annotations: pd.DataFrame) -> list[str]:
    """Return the alternative (label) columns of an annotations dataframe.

    Every column that is not the question or voter identifier column is
    treated as an alternative.
    """
    identifier_columns = {COLUMNS.question, COLUMNS.voter}
    return [column for column in annotations.columns if column not in identifier_columns]
12 changes: 3 additions & 9 deletions size_matters/evaluation/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,9 @@ def compare_methods(dataset: Dataset, max_voters: int, n_batch: int) -> NDArray:
) = ray.get(
[
apply_standard_approval_aggregator.remote(annotations_batch),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.euclid
),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.jaccard
),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.dice
),
apply_mallow_aggregator.remote(annotations_batch, RULES.euclid),
apply_mallow_aggregator.remote(annotations_batch, RULES.jaccard),
apply_mallow_aggregator.remote(annotations_batch, RULES.dice),
apply_condorcet_aggregator.remote(annotations_batch),
]
)
Expand Down

0 comments on commit 67d59db

Please sign in to comment.