Skip to content

Commit

Permalink
refactor: mallow aggregator
Browse files Browse the repository at this point in the history
  • Loading branch information
taharallouche committed Nov 21, 2023
1 parent 6c2c5ae commit 67d59db
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 64 deletions.
89 changes: 34 additions & 55 deletions size_matters/aggregation/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,12 @@
import pandas as pd
import ray

from size_matters.utils.inventory import COLUMNS, RELIABILITY_BOUNDS, RULES, Dataset
from size_matters.utils.inventory import COLUMNS, RELIABILITY_BOUNDS, RULES


@ray.remote
def apply_standard_approval_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:
alternatives = [
column
for column in annotations.columns
if column != COLUMNS.question and column != COLUMNS.voter
]
alternatives = _get_alternatives(annotations)

approvals_per_question = annotations.groupby(COLUMNS.question, sort=False)[
alternatives
Expand All @@ -32,11 +28,7 @@ def apply_standard_approval_aggregator(annotations: pd.DataFrame) -> pd.DataFram

@ray.remote
def apply_condorcet_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:
alternatives = [
column
for column in annotations.columns
if column != COLUMNS.question and column != COLUMNS.voter
]
alternatives = _get_alternatives(annotations)
questions = list(annotations[COLUMNS.question].unique())
number_of_alternatives = len(alternatives)

Expand Down Expand Up @@ -70,54 +62,41 @@ def apply_condorcet_aggregator(annotations: pd.DataFrame) -> pd.DataFrame:

@ray.remote
def apply_mallow_aggregator(
    annotations: pd.DataFrame,
    distance: str = RULES.jaccard,
) -> pd.DataFrame:
    """Aggregate approval annotations with size-weighted (Mallows) voting.

    Each voter's ballot on a question is weighted by a function of its size
    (number of approved alternatives); the weight formula is the optimal
    Mallows noise weight for the chosen distance.

    :param annotations: dataframe of answers as binary vectors, one row per
        (question, voter) pair, with one binary column per alternative
    :param distance: distance of the noise model (RULES.euclid,
        RULES.jaccard or RULES.dice)
    :return: dataframe with one row per question: the question identifier
        plus one boolean column per alternative, True for the winner
    :raises ValueError: if ``distance`` is not a supported rule
    """
    alternatives = _get_alternatives(annotations)

    questions = list(annotations[COLUMNS.question].unique())
    aggregated_labels = pd.DataFrame(columns=[COLUMNS.question] + alternatives)

    for i, question in enumerate(questions):
        question_annotations = annotations[annotations[COLUMNS.question] == question][
            alternatives
        ].to_numpy()

        # Ballot size = number of alternatives each voter approved.
        vote_size = np.sum(question_annotations, axis=1)

        if distance == RULES.euclid:
            weights = np.sqrt(vote_size + 1) - np.sqrt(vote_size - 1)
        elif distance == RULES.jaccard:
            weights = 1 / vote_size
        elif distance == RULES.dice:
            weights = 2 / (vote_size + 1)
        else:
            # Fail fast instead of hitting a NameError on `weights` below.
            raise ValueError(f"Unknown distance rule: {distance!r}")

        # Score each alternative by the weighted sum of approvals; the
        # highest-scoring alternative is the aggregated label.
        weighted_scores = np.matmul(weights.T, question_annotations)
        most_likely_label = np.argmax(weighted_scores)

        aggregated_labels.loc[i] = [question] + [
            label == most_likely_label for label in range(len(alternatives))
        ]

    return aggregated_labels


def _get_alternatives(annotations: pd.DataFrame) -> list[str]:
    """Return the alternative (label) columns of an annotations dataframe.

    Every column that is not the question or voter identifier column is
    treated as an alternative.
    """
    identifier_columns = {COLUMNS.question, COLUMNS.voter}
    return [column for column in annotations.columns if column not in identifier_columns]
12 changes: 3 additions & 9 deletions size_matters/evaluation/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,9 @@ def compare_methods(dataset: Dataset, max_voters: int, n_batch: int) -> NDArray:
) = ray.get(
[
apply_standard_approval_aggregator.remote(annotations_batch),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.euclid
),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.jaccard
),
apply_mallow_aggregator.remote(
annotations_batch, dataset, RULES.dice
),
apply_mallow_aggregator.remote(annotations_batch, RULES.euclid),
apply_mallow_aggregator.remote(annotations_batch, RULES.jaccard),
apply_mallow_aggregator.remote(annotations_batch, RULES.dice),
apply_condorcet_aggregator.remote(annotations_batch),
]
)
Expand Down

0 comments on commit 67d59db

Please sign in to comment.