Update Evaluation Metric and Dataset for Tabled #28

Open · wants to merge 12 commits into base: dev
6 changes: 3 additions & 3 deletions README.md
@@ -114,13 +114,13 @@ page_results = extract_tables(images, highres_images, text_lines, det_models, la

| Avg score | Time per table (s) | Total tables |
|-------------|------------------|----------------|
| 0.847 | 0.029 | 688 |
| 0.679 | 0.038 | 1000 |

## Quality

Getting good ground truth data for tables is hard, since you're either constrained to simple layouts that can be heuristically parsed and rendered, or you need to use LLMs, which make mistakes. I chose to use GPT-4 table predictions as a pseudo-ground-truth.
Tabled was evaluated using the FinTabNet dataset, which consists of tables extracted from financial documents, with fairly complex multi-row and multi-column structures. The source HTML of the tables is the ground truth. The HTML generated by Tabled is compared with the ground truth using a [tree edit distance](https://arxiv.org/abs/1911.10683) based approach.

Tabled gets a `.847` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.
Tabled achieves a score of `0.679` using this approach. A large chunk of the inconsistency arises from the treatment of multi-row/column tables, which is actively being improved.
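As a rough sketch of what this comparison looks like in code, the snippet below assumes the `TEDS` helper from `benchmarks/scoring.py` is importable (e.g. when run from the `benchmarks/` directory); the two HTML strings are made-up examples:

```python
from scoring import TEDS  # tree-edit-distance similarity between two HTML tables

pred_html = "<table><tr><td>Revenue</td><td>100</td></tr></table>"
gt_html = "<table><tr><td>Revenue</td><td>120</td></tr></table>"

# The score is 1 - edit_distance(pred_tree, gt_tree) / max(node count of either tree),
# so an identical table scores 1.0 and a very different one scores close to 0.
print(TEDS(pred_html, gt_html))
```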

## Performance

65 changes: 35 additions & 30 deletions benchmarks/benchmark.py
@@ -1,77 +1,82 @@
import argparse
import json
import time

import click
import datasets
from surya.input.pdflines import get_table_blocks
from tabulate import tabulate
from bs4 import BeautifulSoup
from tqdm import tqdm
from scoring import score_table
from tabled.assignment import assign_rows_columns

from scoring import batched_TEDS

from tabled.assignment import assign_rows_columns
from tabled.formats import formatter
from tabled.inference.models import load_recognition_models
from tabled.inference.recognition import recognize_tables
from tabled.formats.common import replace_newlines
from tabled.inference.models import load_recognition_models, load_detection_models, load_layout_models
from tabled.inference.recognition import recognize_tables, get_cells


@click.command()
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="vikp/table_bench2", help="Dataset to use")
@click.option("--dataset", type=str, default="tarun-menta/fintabnet-html-test", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Max number of tables to process")
def main(out_file, dataset, max):
    ds = datasets.load_dataset(dataset, split="train")
    ds = ds.shuffle(seed=0)

    rec_models = load_recognition_models()
    detection_models, rec_models = load_detection_models(), load_recognition_models()

    results = []
    table_imgs = []
    table_blocks = []
    image_sizes = []
    table_bboxes = []
    text_lines = []
    iterations = len(ds)
    if max is not None:
        iterations = min(max, len(ds))
    for i in range(iterations):
    for i in tqdm(range(iterations), desc='Preparing Inputs'):
        row = ds[i]
        line_data = json.loads(row["text_lines"])
        table_bbox = row["table_bbox"]
        image_size = row["page_size"]
        table_img = row["table_image"]
        table_img = row['highres_table_img']
        line_data = row['pdftext_lines']
        image_size = row['highres_img'].size
        table_bbox = row['highres_table_bbox']

        table_block = get_table_blocks([table_bbox], line_data, image_size)[0]
        table_imgs.append(table_img)
        table_blocks.append(table_block)
        image_sizes.append(image_size)
        table_bboxes.append(table_bbox)
        text_lines.append(line_data)

    start = time.time()
    table_rec = recognize_tables(table_imgs, table_blocks, [False] * len(table_imgs), rec_models)
    table_cells, needs_ocr = get_cells(table_imgs, table_bboxes, image_sizes, text_lines, detection_models, detect_boxes=False)
    table_rec = recognize_tables(table_imgs, table_cells, needs_ocr, rec_models)
    total_time = time.time() - start
    cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

    for i in range(iterations):
        row = ds[i]
        table_cells = cells[i]
        table_bbox = row["table_bbox"]
        gpt4_table = json.loads(row["gpt_4_table"])["markdown_table"]

        table_markdown, _ = formatter("markdown", table_cells)
        gt_table_html = row['orig_html']

        marker_table_html, _ = formatter("html", table_cells, numalign=None, stralign=None, headers="")
        marker_table_soup = BeautifulSoup(marker_table_html, 'html.parser')
        marker_table_soup.find('tbody').unwrap()  # Tabulate wraps rows in a <tbody>, which the FinTabNet ground truth doesn't use
        marker_table_html = str(marker_table_soup)

        results.append({
            "score": score_table(table_markdown, gpt4_table),
            "arxiv_id": row["arxiv_id"],
            "page_idx": row["page_idx"],
            "marker_table": table_markdown,
            "gpt4_table": gpt4_table,
            "table_bbox": table_bbox
            "marker_table": marker_table_html,
            "gt_table": gt_table_html,
        })

    scores = batched_TEDS([r['gt_table'] for r in results], [r['marker_table'] for r in results])
    for result, score in zip(results, scores):
        result.update({'score': score})

    avg_score = sum([r["score"] for r in results]) / len(results)
    headers = ["Avg score", "Time per table", "Total tables"]
    data = [f"{avg_score:.3f}", f"{total_time / len(ds):.3f}", len(ds)]
    data = [f"{avg_score:.3f}", f"{total_time / iterations:.3f}", iterations]

    table = tabulate([data], headers=headers, tablefmt="github")
    print(table)
    print("Avg score computed by aligning table cell text with GPT-4 table cell text.")
    print("Avg score computed by comparing Tabled's predicted HTML with the original HTML.")

    with open(out_file, "w+") as f:
        json.dump(results, f, indent=2)
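For reference, the updated benchmark would be invoked with something like `python benchmarks/benchmark.py out.json --max 100` (the output path and cap are placeholders); it writes the per-table predicted HTML, ground-truth HTML, and TEDS score to the JSON file and prints the summary table shown in the README.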
174 changes: 133 additions & 41 deletions benchmarks/scoring.py
@@ -1,41 +1,133 @@
from rapidfuzz import fuzz
import re


def split_to_cells(table):
    table = table.strip()
    table = re.sub(r" {2,}", "", table)
    table_rows = table.split("\n")
    table_rows = [t for t in table_rows if t.strip()]
    table_cells = [[c.strip() for c in r.split("|")] for r in table_rows]
    return table_cells


def align_rows(hypothesis, ref_row):
    best_alignment = []
    best_alignment_score = 0
    for j in range(0, len(hypothesis)):
        alignments = []
        for i in range(len(ref_row)):
            if i >= len(hypothesis[j]):
                alignments.append(0)
                continue
            alignment = fuzz.ratio(hypothesis[j][i], ref_row[i], score_cutoff=30) / 100
            alignments.append(alignment)
        if len(alignments) == 0:
            continue
        alignment_score = sum(alignments) / len(alignments)
        if alignment_score >= best_alignment_score:
            best_alignment = alignments
            best_alignment_score = alignment_score
    return best_alignment


def score_table(hypothesis, reference):
    hypothesis = split_to_cells(hypothesis)
    reference = split_to_cells(reference)

    alignments = []
    for i in range(0, len(reference)):
        alignments.extend(align_rows(hypothesis, reference[i]))
    return sum(alignments) / max(len(alignments), 1)
'''
TEDS code adapted from https://github.com/ibm-aur-nlp/EDD
'''

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

from tqdm import tqdm
import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import html
from collections import deque
import numpy as np

def wrap_table_html(table_html: str) -> str:
    return f'<html><body>{table_html}</body></html>'

class TableTree(Tree):
    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag == 'td':
            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
                     (self.tag, self.colspan, self.rowspan, self.content)
        else:
            result = '"tag": %s' % self.tag
        for child in self.children:
            result += child.bracket()
        return "{{{}}}".format(result)

class CustomConfig(Config):
    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value"""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1"""
        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                return self.normalized_distance(node1.content, node2.content)
        return 0.

def tokenize(node):
    '''Tokenizes table cells'''
    global __tokens__
    __tokens__.append('<%s>' % node.tag)
    if node.text is not None:
        __tokens__ += list(node.text)
    for n in node.getchildren():
        tokenize(n)
    if node.tag != 'unk':
        __tokens__.append('</%s>' % node.tag)
    if node.tag != 'td' and node.tail is not None:
        __tokens__ += list(node.tail)

def tree_convert_html(node, convert_cell=False, parent=None):
    '''Converts HTML tree to the format required by apted'''
    global __tokens__
    if node.tag == 'td':
        if convert_cell:
            __tokens__ = []
            tokenize(node)
            cell = __tokens__[1:-1].copy()
        else:
            cell = []
        new_node = TableTree(node.tag,
                             int(node.attrib.get('colspan', '1')),
                             int(node.attrib.get('rowspan', '1')),
                             cell, *deque())
    else:
        new_node = TableTree(node.tag, None, None, None, *deque())
    if parent is not None:
        parent.children.append(new_node)
    if node.tag != 'td':
        for n in node.getchildren():
            tree_convert_html(n, convert_cell, new_node)
    if parent is None:
        return new_node

def similarity_eval_html(pred, true, structure_only=False):
    '''Computes the TEDS score between the prediction and the ground truth of a
    given sample
    '''
    if pred.xpath('body/table') and true.xpath('body/table'):
        pred = pred.xpath('body/table')[0]
        true = true.xpath('body/table')[0]
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_true = len(true.xpath(".//*"))
        tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
        tree_true = tree_convert_html(true, convert_cell=not structure_only)
        n_nodes = max(n_nodes_pred, n_nodes_true)
        edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)
    else:
        return 0.0

def TEDS(prediction, ground_truth):
    prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
    if prediction:
        return similarity_eval_html(
            html.fromstring(prediction),
            html.fromstring(ground_truth)
        )
    else:
        return 0.

def batched_TEDS(gts: List[str], preds: List[str], n_jobs: int = 16):
    with ThreadPoolExecutor(max_workers=n_jobs) as pool:
        futures = [pool.submit(TEDS, pred, gt) for pred, gt in zip(preds, gts)]

    teds_scores = []
    for future in futures:
        teds_scores.append(future.result())

    return teds_scores
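A minimal usage sketch for the batched scorer, mirroring how `benchmark.py` calls it above; the table strings here are hypothetical:

```python
from scoring import batched_TEDS

gt_tables = ["<table><tr><td>a</td><td>b</td></tr></table>"]
pred_tables = ["<table><tr><td>a</td><td>c</td></tr></table>"]

# TEDS runs in a thread pool (16 workers by default); scores come back in the
# same order as the inputs, one similarity per table pair (1.0 for a perfect match).
scores = batched_TEDS(gt_tables, pred_tables)
print(scores)
```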