Skip to content

Commit

Permalink
Update requirements, add bench script
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 15, 2024
1 parent 859302c commit 9cde053
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 167 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/cla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Contributor License Agreement (CLA) assistant for the tabled repository.
# Fires when a PR is opened/updated, or when someone comments on an issue/PR
# (the comment path handles "recheck" and explicit CLA signing messages).
name: "Tabled CLA Assistant"
on:
  issue_comment:
    types: [created]
  pull_request_target:
    types: [opened,closed,synchronize]

# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
  actions: write
  contents: write
  pull-requests: write
  statuses: write

jobs:
  CLAAssistant:
    runs-on: ubuntu-latest
    steps:
      - name: "Tabled CLA Assistant"
        # Run for PR events unconditionally; for comments, only when the body is
        # the exact "recheck" command or the exact CLA signing sentence.
        if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
        uses: contributor-assistant/github-action@v2.3.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # the below token should have repo scope and must be manually added by you in the repository's secret
          # This token is required only if you have configured to store the signatures in a remote repository/organization
          PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        with:
          # Where signed CLAs are recorded, and the document contributors sign.
          path-to-signatures: 'signatures/version1/cla.json'
          path-to-document: 'https://github.com/VikParuchuri/tabled/blob/master/CLA.md'
          # branch should not be protected
          branch: 'master'
          # Repo owner is exempt from signing.
          allowlist: VikParuchuri
27 changes: 27 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Release workflow: build the project with Poetry and publish it to PyPI.
# Triggered only by pushing a semantic-version tag (e.g. v1.2.3).
name: Python package
on:
  push:
    tags:
      - "v*.*.*"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Build package
        run: |
          poetry build
      - name: Publish package
        env:
          # PYPI_TOKEN must be configured in the repository secrets.
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
29 changes: 29 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI integration test: runs on every push. Installs dependencies with Poetry,
# swaps in a CPU-only torch build (hosted runners have no GPU), then runs a
# small benchmark and verifies its scores against expected thresholds.
name: Integration test

on: [push]

env:
  # Force model inference onto CPU for the runner environment.
  TORCH_DEVICE: "cpu"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install apt dependencies
        run: |
          sudo apt-get update
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
          # Replace the default torch wheel with the smaller CPU-only build.
          poetry run pip uninstall torch -y
          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
      - name: Run benchmark test
        run: |
          # --max 5 keeps the benchmark fast in CI; temp.json holds the scores.
          poetry run python benchmarks/benchmark.py --max 5 temp.json
          poetry run python scripts/verify_benchmark_scores.py temp.json
23 changes: 13 additions & 10 deletions benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import time

import click
import datasets
from surya.input.pdflines import get_table_blocks
from tabulate import tabulate
Expand All @@ -14,21 +15,23 @@
from tabled.inference.recognition import recognize_tables


def main():
parser = argparse.ArgumentParser(description="Benchmark table conversion.")
parser.add_argument("out_file", help="Output filename for results")
parser.add_argument("--dataset", type=str, help="Dataset to use", default="vikp/table_bench2")
args = parser.parse_args()

ds = datasets.load_dataset(args.dataset, split="train")
@click.command()
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="vikp/table_bench2", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Max number of tables to process")
def main(out_file, dataset, max):
ds = datasets.load_dataset(dataset, split="train")

rec_models = load_recognition_models()

results = []
table_imgs = []
table_blocks = []
image_sizes = []
for i in range(len(ds)):
iterations = len(ds)
if max is not None:
iterations = min(max, len(ds))
for i in range(iterations):
row = ds[i]
line_data = json.loads(row["text_lines"])
table_bbox = row["table_bbox"]
Expand All @@ -45,7 +48,7 @@ def main():
total_time = time.time() - start
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

for i in range(len(ds)):
for i in range(iterations):
row = ds[i]
table_cells = cells[i]
table_bbox = row["table_bbox"]
Expand All @@ -70,7 +73,7 @@ def main():
print(table)
print("Avg score computed by aligning table cell text with GPT-4 table cell text.")

with open(args.out_file, "w+") as f:
with open(out_file, "w+") as f:
json.dump(results, f, indent=2)


Expand Down
2 changes: 1 addition & 1 deletion extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from tabled.inference.models import load_detection_models, load_recognition_models


@click.command()
@click.command(help="Extract tables from PDFs")
@click.argument("in_path", type=click.Path(exists=True))
@click.argument("out_folder", type=click.Path())
@click.option("--save_json", is_flag=True, help="Save row/column/cell information in json format")
Expand Down
Loading

0 comments on commit 9cde053

Please sign in to comment.