diff --git a/Metrics.ipynb b/Metrics.ipynb new file mode 100644 index 0000000..ca5b551 --- /dev/null +++ b/Metrics.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f0b57e26-f4fb-4eea-a8a0-060b8e00ec99", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install rank_eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0ba5720-11b7-43ba-87b9-44a13dafc4ba", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install ranx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "273145ec-bb91-4182-8a53-90816057c442", + "metadata": {}, + "outputs": [], + "source": [ + "from rank_eval import Qrels, Run, evaluate, compare\n", + "import pandas as pd\n", + "from pandas import DataFrame\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5923ed76-11a1-419f-835f-81dc36e90e4e", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('resp_bertopic_unsuperv_filter.csv')\n", + "df['indice'] = df['indice'].astype(str)\n", + "df['num_tema_cadastrado'] = df['num_tema_cadastrado'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c170ef1-096d-42e4-b5a3-ab905d603b10", + "metadata": {}, + "outputs": [], + "source": [ + "q_id = []\n", + "doc_id = []\n", + "score = []\n", + "for indice, linha in df.iterrows():\n", + " for i in range(1,7):\n", + " q_id.append(linha[0])\n", + " sug = f\"sugerido_{i}\"\n", + " doc_id.append(linha[sug])\n", + " sim = f\"similaridade_{i}\"\n", + " score.append(linha[sim])\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "165e3b90-12b9-4c5a-902b-94ac8f73c2fb", + "metadata": {}, + "outputs": [], + "source": [ + "ranx_dict ={\"q_id\": q_id,\n", + " \"doc_id\": doc_id,\n", + " \"score\" : score\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f72578a-9ad2-42bc-927e-d0cad36933a6", + "metadata": {}, + "outputs": [], + "source": [ + "qrel = Qrels.from_df(df, q_id_col='indice', doc_id_col='num_tema_cadastrado', score_col='posicao_tema_real')" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "7f61ea60-65ad-4fd5-bd03-1e5953d1e24d", + "metadata": {}, + "outputs": [], + "source": [ + "run_df = DataFrame.from_dict(ranx_dict)\n", + "run_df['q_id'] = run_df['q_id'].astype(str)\n", + "run_df['doc_id'] = run_df['doc_id'].astype(str)\n", + "run = Run.from_df(\n", + " df=run_df,\n", + " q_id_col=\"q_id\",\n", + " doc_id_col=\"doc_id\",\n", + " score_col=\"score\",\n", + " name=\"my_run\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "6cd573e2-43b6-403b-961f-08997eb945e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'map@6': 0.027846322990471534,\n", + " 'ndcg@6': 0.035352674332042214,\n", + " 'mrr': 0.027846322990471534}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate(qrel, run, [\"map@6\",\"ndcg@6\",\"mrr\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea3557d-ed84-4fdd-9bd0-bf6647aa366c", + "metadata": {}, + "outputs": [], + "source": [ + "# Access scores for each query\n", + "dict(run.scores)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a170f6e4-901f-4f9a-a5f9-d3deea092193", + "metadata": {}, + "outputs": [], + "source": [ + "# Computed metric scores are saved in the Run object\n", + "run.mean_scores\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b8f939e-8028-4bcf-a391-915edc14ff1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare different runs and perform statistical tests\n", + "#report = compare(\n", + "# qrels=qrels,\n", + "# runs=[run_1, run_2, run_3, run_4, run_5],\n", + "# metrics=[\"map@100\", \"mrr@100\", \"ndcg@10\"],\n", + "# max_p=0.01 # P-value threshold\n", + "#)\n", + "\n", + "#print(report)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "782a63b1-b2e9-44b4-91ce-7d332af1deb7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}