Skip to content

Commit

Permalink
Update requirements, add bench script
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 15, 2024
1 parent 859302c commit 9cde053
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 167 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/cla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Contributor License Agreement (CLA) assistant for the tabled repository.
# Fires when a PR is opened/updated, or when someone comments on an issue/PR
# (the comment path handles "recheck" and explicit CLA signing messages).
name: "Tabled CLA Assistant"
on:
  issue_comment:
    types: [created]
  pull_request_target:
    types: [opened,closed,synchronize]

# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
  actions: write
  contents: write
  pull-requests: write
  statuses: write

jobs:
  CLAAssistant:
    runs-on: ubuntu-latest
    steps:
      - name: "Tabled CLA Assistant"
        # Run for PR events unconditionally; for comments, only when the body is
        # the exact "recheck" command or the exact CLA signing sentence.
        if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
        uses: contributor-assistant/github-action@v2.3.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # the below token should have repo scope and must be manually added by you in the repository's secret
          # This token is required only if you have configured to store the signatures in a remote repository/organization
          PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        with:
          # Where signed CLAs are recorded, and the document contributors sign.
          path-to-signatures: 'signatures/version1/cla.json'
          path-to-document: 'https://github.com/VikParuchuri/tabled/blob/master/CLA.md'
          # branch should not be protected
          branch: 'master'
          # Repo owner is exempt from signing.
          allowlist: VikParuchuri
27 changes: 27 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Release workflow: build the project with Poetry and publish it to PyPI.
# Triggered only by pushing a semantic-version tag (e.g. v1.2.3).
name: Python package
on:
  push:
    tags:
      - "v*.*.*"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Build package
        run: |
          poetry build
      - name: Publish package
        env:
          # PYPI_TOKEN must be configured in the repository secrets.
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
29 changes: 29 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI integration test: runs on every push. Installs dependencies with Poetry,
# swaps in a CPU-only torch build (hosted runners have no GPU), then runs a
# small benchmark and verifies its scores against expected thresholds.
name: Integration test

on: [push]

env:
  # Force model inference onto CPU for the runner environment.
  TORCH_DEVICE: "cpu"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install apt dependencies
        run: |
          sudo apt-get update
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
          # Replace the default torch wheel with the smaller CPU-only build.
          poetry run pip uninstall torch -y
          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
      - name: Run benchmark test
        run: |
          # --max 5 keeps the benchmark fast in CI; temp.json holds the scores.
          poetry run python benchmarks/benchmark.py --max 5 temp.json
          poetry run python scripts/verify_benchmark_scores.py temp.json
23 changes: 13 additions & 10 deletions benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import time

import click
import datasets
from surya.input.pdflines import get_table_blocks
from tabulate import tabulate
Expand All @@ -14,21 +15,23 @@
from tabled.inference.recognition import recognize_tables


def main():
parser = argparse.ArgumentParser(description="Benchmark table conversion.")
parser.add_argument("out_file", help="Output filename for results")
parser.add_argument("--dataset", type=str, help="Dataset to use", default="vikp/table_bench2")
args = parser.parse_args()

ds = datasets.load_dataset(args.dataset, split="train")
@click.command()
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="vikp/table_bench2", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Max number of tables to process")
def main(out_file, dataset, max):
ds = datasets.load_dataset(dataset, split="train")

rec_models = load_recognition_models()

results = []
table_imgs = []
table_blocks = []
image_sizes = []
for i in range(len(ds)):
iterations = len(ds)
if max is not None:
iterations = min(max, len(ds))
for i in range(iterations):
row = ds[i]
line_data = json.loads(row["text_lines"])
table_bbox = row["table_bbox"]
Expand All @@ -45,7 +48,7 @@ def main():
total_time = time.time() - start
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

for i in range(len(ds)):
for i in range(iterations):
row = ds[i]
table_cells = cells[i]
table_bbox = row["table_bbox"]
Expand All @@ -70,7 +73,7 @@ def main():
print(table)
print("Avg score computed by aligning table cell text with GPT-4 table cell text.")

with open(args.out_file, "w+") as f:
with open(out_file, "w+") as f:
json.dump(results, f, indent=2)


Expand Down
2 changes: 1 addition & 1 deletion extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from tabled.inference.models import load_detection_models, load_recognition_models


@click.command()
@click.command(help="Extract tables from PDFs")
@click.argument("in_path", type=click.Path(exists=True))
@click.argument("out_folder", type=click.Path())
@click.option("--save_json", is_flag=True, help="Save row/column/cell information in json format")
Expand Down
Loading

0 comments on commit 9cde053

Please sign in to comment.