Update Evaluation Metric and Dataset for Tabled #28

Open · wants to merge 12 commits into base: dev
6 changes: 3 additions & 3 deletions README.md
@@ -114,13 +114,13 @@ page_results = extract_tables(images, highres_images, text_lines, det_models, la

| Avg score | Time per table (s) | Total tables |
|-------------|------------------|----------------|
| 0.847 | 0.029 | 688 |
| 0.679 | 0.038 | 1000 |

## Quality

Getting good ground truth data for tables is hard, since you're either constrained to simple layouts that can be heuristically parsed and rendered, or you need to use LLMs, which make mistakes. I chose to use GPT-4 table predictions as a pseudo-ground-truth.
Tabled was evaluated using the FinTabNet dataset, which consists of tables extracted from financial documents, with fairly complex multi-row and multi-column structures. The source HTML of the tables is the ground truth. The HTML generated by Tabled is compared with the ground truth using a [tree edit distance](https://arxiv.org/abs/1911.10683) based approach.

Tabled gets a `.847` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.
Tabled achieves a score of `0.679` using this approach. A large chunk of the inconsistency arises from the treatment of multi-row/column tables, which is actively being improved.
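As a rough sketch of what this comparison looks like in code, the snippet below assumes the `TEDS` helper from `benchmarks/scoring.py` is importable (e.g. when run from the `benchmarks/` directory); the two HTML strings are made-up examples:

```python
from scoring import TEDS  # tree-edit-distance similarity between two HTML tables

pred_html = "<table><tr><td>Revenue</td><td>100</td></tr></table>"
gt_html = "<table><tr><td>Revenue</td><td>120</td></tr></table>"

# The score is 1 - edit_distance(pred_tree, gt_tree) / max(node count of either tree),
# so an identical table scores 1.0 and a very different one scores close to 0.
print(TEDS(pred_html, gt_html))
```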

## Performance

65 changes: 35 additions & 30 deletions benchmarks/benchmark.py
@@ -1,77 +1,82 @@
import argparse
import json
import time

import click
import datasets
from surya.input.pdflines import get_table_blocks
from tabulate import tabulate
from bs4 import BeautifulSoup
from tqdm import tqdm
from scoring import score_table
from tabled.assignment import assign_rows_columns

from scoring import batched_TEDS

from tabled.assignment import assign_rows_columns
from tabled.formats import formatter
from tabled.inference.models import load_recognition_models
from tabled.inference.recognition import recognize_tables
from tabled.formats.common import replace_newlines
from tabled.inference.models import load_recognition_models, load_detection_models, load_layout_models
from tabled.inference.recognition import recognize_tables, get_cells


@click.command()
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="vikp/table_bench2", help="Dataset to use")
@click.option("--dataset", type=str, default="tarun-menta/fintabnet-html-test", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Max number of tables to process")
def main(out_file, dataset, max):
    ds = datasets.load_dataset(dataset, split="train")
    ds = ds.shuffle(seed=0)

    rec_models = load_recognition_models()
    detection_models, rec_models = load_detection_models(), load_recognition_models()

    results = []
    table_imgs = []
    table_blocks = []
    image_sizes = []
    table_bboxes = []
    text_lines = []
    iterations = len(ds)
    if max is not None:
        iterations = min(max, len(ds))
    for i in range(iterations):
    for i in tqdm(range(iterations), desc='Preparing Inputs'):
        row = ds[i]
        line_data = json.loads(row["text_lines"])
        table_bbox = row["table_bbox"]
        image_size = row["page_size"]
        table_img = row["table_image"]
        table_img = row['highres_table_img']
        line_data = row['pdftext_lines']
        image_size = row['highres_img'].size
        table_bbox = row['highres_table_bbox']

        table_block = get_table_blocks([table_bbox], line_data, image_size)[0]
        table_imgs.append(table_img)
        table_blocks.append(table_block)
        image_sizes.append(image_size)
        table_bboxes.append(table_bbox)
        text_lines.append(line_data)

    start = time.time()
    table_rec = recognize_tables(table_imgs, table_blocks, [False] * len(table_imgs), rec_models)
    table_cells, needs_ocr = get_cells(table_imgs, table_bboxes, image_sizes, text_lines, detection_models, detect_boxes=False)
    table_rec = recognize_tables(table_imgs, table_cells, needs_ocr, rec_models)
    total_time = time.time() - start
    cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

    for i in range(iterations):
        row = ds[i]
        table_cells = cells[i]
        table_bbox = row["table_bbox"]
        gpt4_table = json.loads(row["gpt_4_table"])["markdown_table"]

        table_markdown, _ = formatter("markdown", table_cells)
        gt_table_html = row['orig_html']

        marker_table_html, _ = formatter("html", table_cells, numalign=None, stralign=None, headers="")
        marker_table_soup = BeautifulSoup(marker_table_html, 'html.parser')
        marker_table_soup.find('tbody').unwrap()  # Tabulate wraps rows in a <tbody>, which the FinTabNet ground truth doesn't use
        marker_table_html = str(marker_table_soup)

        results.append({
            "score": score_table(table_markdown, gpt4_table),
            "arxiv_id": row["arxiv_id"],
            "page_idx": row["page_idx"],
            "marker_table": table_markdown,
            "gpt4_table": gpt4_table,
            "table_bbox": table_bbox
            "marker_table": marker_table_html,
            "gt_table": gt_table_html,
        })

    scores = batched_TEDS([r['gt_table'] for r in results], [r['marker_table'] for r in results])
    for result, score in zip(results, scores):
        result.update({'score': score})

    avg_score = sum([r["score"] for r in results]) / len(results)
    headers = ["Avg score", "Time per table", "Total tables"]
    data = [f"{avg_score:.3f}", f"{total_time / len(ds):.3f}", len(ds)]
    data = [f"{avg_score:.3f}", f"{total_time / iterations:.3f}", iterations]

    table = tabulate([data], headers=headers, tablefmt="github")
    print(table)
    print("Avg score computed by aligning table cell text with GPT-4 table cell text.")
    print("Avg score computed by comparing Tabled's predicted HTML with the original HTML.")

    with open(out_file, "w+") as f:
        json.dump(results, f, indent=2)
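For reference, the updated benchmark would be invoked with something like `python benchmarks/benchmark.py out.json --max 100` (the output path and cap are placeholders); it writes the per-table predicted HTML, ground-truth HTML, and TEDS score to the JSON file and prints the summary table shown in the README.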
174 changes: 133 additions & 41 deletions benchmarks/scoring.py
@@ -1,41 +1,133 @@
from rapidfuzz import fuzz
import re


def split_to_cells(table):
    table = table.strip()
    table = re.sub(r" {2,}", "", table)
    table_rows = table.split("\n")
    table_rows = [t for t in table_rows if t.strip()]
    table_cells = [[c.strip() for c in r.split("|")] for r in table_rows]
    return table_cells


def align_rows(hypothesis, ref_row):
    best_alignment = []
    best_alignment_score = 0
    for j in range(0, len(hypothesis)):
        alignments = []
        for i in range(len(ref_row)):
            if i >= len(hypothesis[j]):
                alignments.append(0)
                continue
            alignment = fuzz.ratio(hypothesis[j][i], ref_row[i], score_cutoff=30) / 100
            alignments.append(alignment)
        if len(alignments) == 0:
            continue
        alignment_score = sum(alignments) / len(alignments)
        if alignment_score >= best_alignment_score:
            best_alignment = alignments
            best_alignment_score = alignment_score
    return best_alignment


def score_table(hypothesis, reference):
    hypothesis = split_to_cells(hypothesis)
    reference = split_to_cells(reference)

    alignments = []
    for i in range(0, len(reference)):
        alignments.extend(align_rows(hypothesis, reference[i]))
    return sum(alignments) / max(len(alignments), 1)
'''
TEDS code adapted from https://github.com/ibm-aur-nlp/EDD
'''

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

from tqdm import tqdm
import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import html
from collections import deque
import numpy as np

def wrap_table_html(table_html: str) -> str:
    return f'<html><body>{table_html}</body></html>'

class TableTree(Tree):
    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag == 'td':
            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
                     (self.tag, self.colspan, self.rowspan, self.content)
        else:
            result = '"tag": %s' % self.tag
        for child in self.children:
            result += child.bracket()
        return "{{{}}}".format(result)

class CustomConfig(Config):
    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value"""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1"""
        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                return self.normalized_distance(node1.content, node2.content)
        return 0.

def tokenize(node):
    '''Tokenizes table cells'''
    global __tokens__
    __tokens__.append('<%s>' % node.tag)
    if node.text is not None:
        __tokens__ += list(node.text)
    for n in node.getchildren():
        tokenize(n)
    if node.tag != 'unk':
        __tokens__.append('</%s>' % node.tag)
    if node.tag != 'td' and node.tail is not None:
        __tokens__ += list(node.tail)

def tree_convert_html(node, convert_cell=False, parent=None):
    '''Converts HTML tree to the format required by apted'''
    global __tokens__
    if node.tag == 'td':
        if convert_cell:
            __tokens__ = []
            tokenize(node)
            cell = __tokens__[1:-1].copy()
        else:
            cell = []
        new_node = TableTree(node.tag,
                             int(node.attrib.get('colspan', '1')),
                             int(node.attrib.get('rowspan', '1')),
                             cell, *deque())
    else:
        new_node = TableTree(node.tag, None, None, None, *deque())
    if parent is not None:
        parent.children.append(new_node)
    if node.tag != 'td':
        for n in node.getchildren():
            tree_convert_html(n, convert_cell, new_node)
    if parent is None:
        return new_node

def similarity_eval_html(pred, true, structure_only=False):
    '''Computes the TEDS score between the prediction and the ground truth of a
    given sample
    '''
    if pred.xpath('body/table') and true.xpath('body/table'):
        pred = pred.xpath('body/table')[0]
        true = true.xpath('body/table')[0]
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_true = len(true.xpath(".//*"))
        tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
        tree_true = tree_convert_html(true, convert_cell=not structure_only)
        n_nodes = max(n_nodes_pred, n_nodes_true)
        edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)
    else:
        return 0.0

def TEDS(prediction, ground_truth):
    prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
    if prediction:
        return similarity_eval_html(
            html.fromstring(prediction),
            html.fromstring(ground_truth)
        )
    else:
        return 0.

def batched_TEDS(gts: List[str], preds: List[str], n_jobs: int = 16):
    with ThreadPoolExecutor(max_workers=n_jobs) as pool:
        futures = [pool.submit(TEDS, pred, gt) for pred, gt in zip(preds, gts)]

    teds_scores = []
    for future in futures:
        teds_scores.append(future.result())

    return teds_scores
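A minimal usage sketch for the batched scorer, mirroring how `benchmark.py` calls it above; the table strings here are hypothetical:

```python
from scoring import batched_TEDS

gt_tables = ["<table><tr><td>a</td><td>b</td></tr></table>"]
pred_tables = ["<table><tr><td>a</td><td>c</td></tr></table>"]

# TEDS runs in a thread pool (16 workers by default); scores come back in the
# same order as the inputs, one similarity per table pair (1.0 for a perfect match).
scores = batched_TEDS(gt_tables, pred_tables)
print(scores)
```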