
Commit 6f6332f
Merged main

wpfl-dbt committed Dec 12, 2024
2 parents e73215a + c426882
Showing 52 changed files with 1,539 additions and 797 deletions.
18 changes: 4 additions & 14 deletions .github/workflows/pytest.yml
@@ -5,19 +5,6 @@ on:
branches: [ main ]
workflow_dispatch:

env:
MB__BATCH_SIZE: 10_000
MB__BACKEND_TYPE: postgres
MB__DATASETS_CONFIG: datasets.toml
# PostgreSQL backend settings
MB__POSTGRES__HOST: localhost
MB__POSTGRES__PORT: 5432
MB__POSTGRES__USER: matchbox_user
MB__POSTGRES__PASSWORD: matchbox_password
MB__POSTGRES__DATABASE: matchbox
MB__POSTGRES__DB_SCHEMA: mb


jobs:
run-unit-tests:
name: tests
@@ -34,8 +21,11 @@ jobs:

- name: Install the project
run: uv sync --all-extras --dev

- name: Copy environment variables
run: cp environments/dev_docker.env .env

- name: Set up PostgreSQL
- name: Run DBs and API
run: |
docker compose up -d --wait
5 changes: 5 additions & 0 deletions .gitignore
@@ -6,6 +6,8 @@ datasets.toml
scratch/
*.ipynb_checkpoints
.ruff_cache
notebooks/
*.parquet

# DuckDB
*.duckdb
@@ -177,6 +179,9 @@ dmypy.json
# Cython debug symbols
cython_debug/

# Mac things
.DS_Store

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
11 changes: 0 additions & 11 deletions .gitlab/merge_request_templates/merge_template.md

This file was deleted.

27 changes: 14 additions & 13 deletions .pre-commit-config.yaml
@@ -1,18 +1,19 @@
repos:
- repo: local
hooks:
# Run the formatter.
- id: ruff-format
name: ruff-format
description: "Run 'ruff format' for extremely fast Python formatting"
entry: ruff format
language: python
types_or: [python, pyi]
# Run the linter.
# Run the ruff linter and formatter using just command
- id: ruff
name: ruff
description: "Run 'ruff' for extremely fast Python linting"
entry: ruff check
args: [ --fix ] # Enable lint fixes.
language: python
types_or: [python, pyi]
description: "Format Python code using ruff format via just command"
entry: just format
language: system
types_or: [python, pyi]
pass_filenames: false

# Check for secrets
- id: trufflehog
name: TruffleHog
description: Detect secrets in your data.
entry: bash -c "trufflehog git file://. --since-commit HEAD --only-verified --fail"
language: system
stages: ["commit", "push"]
6 changes: 5 additions & 1 deletion README.md
@@ -61,7 +61,7 @@ If the dataset isn't already in matchbox, it'll need to be indexed.

Pipelines using this part of matchbox will:

1. Use `matchbox.query()` to retrieve source data from a particular model's perspective
1. Use `matchbox.query()` to retrieve source data from the perspective of a particular resolution point
2. Use `matchbox.process()` to clean the data with standardised processes
3. Use `matchbox.make_model()` with `matchbox.dedupers` and `matchbox.linkers` to create a new model
4. Generate probabilistic model outputs using `model.run()`
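
A minimal sketch of those four steps in Python. The call names come from the README text above, but every argument name and the deduper class are hypothetical placeholders, not matchbox's documented signatures:

```python
# Illustrative only: call names are from the README; arguments and the
# NaiveDeduper class are hypothetical placeholders.
from matchbox import dedupers, make_model, process, query

# 1. Retrieve source data from the perspective of a particular resolution point
raw = query(selector={"companieshouse.companies": ["company_name", "postcode"]})

# 2. Clean the data with standardised processes
cleaned = process(raw)

# 3. Create a new model from a deduper (or a linker, via matchbox.linkers)
model = make_model(
    model_class=dedupers.NaiveDeduper,  # hypothetical deduper name
    data=cleaned,
)

# 4. Generate probabilistic model outputs
results = model.run()
```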
@@ -80,6 +80,10 @@ This project is managed by [uv](https://docs.astral.sh/uv/), linted and formatted
uv sync --all-extras
```

Secret scanning is done with [trufflehog](https://github.com/trufflesecurity/trufflehog).

For security, use of [pre-commit](https://pre-commit.com) is expected. Ensure your hooks are installed with `pre-commit install`.

Task running is done with [just](https://just.systems/man/en/). To see all available commands:

```console
20 changes: 20 additions & 0 deletions docker-compose.yml
@@ -21,6 +21,26 @@ services:
- "5432:5432"
volumes:
- matchbox_data:/var/lib/postgresql/data
api:
build:
context: .
dockerfile: src/matchbox/server/Dockerfile
ports:
- "8000:8000"
depends_on:
- matchbox-postgres

develop:
# https://docs.docker.com/compose/file-watch/#compose-watch-versus-bind-mounts
watch:
# Sync the working directory with the `/app` directory in the container
- action: sync
path: ./src/matchbox/server
target: /code/src/matchbox/server

# Rebuild the image on changes to the `pyproject.toml`
- action: rebuild
path: ./pyproject.toml

volumes:
warehouse_data:
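
The new `api` service publishes port 8000, matching `API__ROOT=http://localhost:8000` in the dev env files, and this commit also adds `httpx` to the dependencies. A quick, hedged connectivity check once `docker compose up -d --wait` has run; the root path is a hypothetical endpoint, not one confirmed by this diff:

```python
import httpx

API_ROOT = "http://localhost:8000"  # matches API__ROOT in environments/dev_*.env

# "/" is a hypothetical route; substitute whatever the FastAPI app actually serves
response = httpx.get(f"{API_ROOT}/", timeout=10)
print(response.status_code, response.text)
```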
12 changes: 12 additions & 0 deletions environments/dev_docker.env
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=250_000
MB__BACKEND_TYPE=postgres
MB__DATASETS_CONFIG=datasets.toml

MB__POSTGRES__HOST=matchbox-postgres
MB__POSTGRES__PORT=5432
MB__POSTGRES__USER=matchbox_user
MB__POSTGRES__PASSWORD=matchbox_password
MB__POSTGRES__DATABASE=matchbox
MB__POSTGRES__DB_SCHEMA=mb

API__ROOT=http://localhost:8000
12 changes: 12 additions & 0 deletions environments/dev_local.env
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=250_000
MB__BACKEND_TYPE=postgres
MB__DATASETS_CONFIG=datasets.toml

MB__POSTGRES__HOST=localhost
MB__POSTGRES__PORT=5432
MB__POSTGRES__USER=matchbox_user
MB__POSTGRES__PASSWORD=matchbox_password
MB__POSTGRES__DATABASE=matchbox
MB__POSTGRES__DB_SCHEMA=mb

API__ROOT=http://localhost:8000
12 changes: 12 additions & 0 deletions environments/sample.env
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=
MB__BACKEND_TYPE=
MB__DATASETS_CONFIG=

MB__POSTGRES__HOST=
MB__POSTGRES__PORT=
MB__POSTGRES__USER=
MB__POSTGRES__PASSWORD=
MB__POSTGRES__DATABASE=
MB__POSTGRES__DB_SCHEMA=

API__ROOT=
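
The `MB__` prefix and double-underscore nesting (`MB__POSTGRES__HOST`) match the pattern pydantic-settings uses with `env_prefix` and `env_nested_delimiter`. Whether matchbox actually loads its config this way is not shown in this diff, so the following is an assumption about how these variables could map onto nested settings:

```python
# Hypothetical illustration only: assumes pydantic-settings; the real
# matchbox settings classes are not part of this diff.
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict


class PostgresSettings(BaseModel):
    host: str
    port: int
    user: str
    password: str
    database: str
    db_schema: str


class MatchboxSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="MB__",
        env_nested_delimiter="__",
        env_file=".env",
    )

    batch_size: int
    backend_type: str
    datasets_config: str
    postgres: PostgresSettings


# e.g. MB__POSTGRES__HOST=localhost would surface as settings.postgres.host
settings = MatchboxSettings()
```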
10 changes: 5 additions & 5 deletions justfile
@@ -12,11 +12,11 @@ format:
uv run ruff format .
uv run ruff check . --fix

# Scan for secrets
scan:
trufflehog git file://. --only-verified

# Run Python tests
test:
docker compose up -d --wait
uv run pytest

# Run development version of API
api:
uv run fastapi dev src/matchbox/server/api.py
uv run pytest
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
"click>=8.1.7",
"connectorx>=0.3.3",
"duckdb>=1.1.1",
"httpx>=0.28.0",
"matplotlib>=3.9.2",
"pandas>=2.2.3",
"psycopg2>=2.9.10",
@@ -39,6 +40,8 @@ dev = [
"pytest-env>=1.1.5",
"ruff>=0.6.8",
"docker>=7.1.0",
"tomli-w>=1.1.0",
"vcrpy>=6.0.2",
]
typing = [
"polars>=1.11.0",
@@ -90,7 +93,3 @@ log_cli = false
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"

[tool.pytest_env]
MB__POSTGRES__SCHEMA = "test"
MB__BATCH_SIZE = "400"
118 changes: 0 additions & 118 deletions references/METHODOLOGY.md

This file was deleted.

24 changes: 23 additions & 1 deletion sample.datasets.toml
@@ -11,21 +11,43 @@ database = "pg_warehouse"
db_schema = "companieshouse"
db_table = "companies"
db_pk = "id"
index = [
{ literal = "crn", alias = "crn_id", type = "VARCHAR" },
{ literal = "company_name", alias = "name" },
{ literal = "*" },
{ literal = "postcode" }
]

[datasets.data_hub_companies]
database = "pg_warehouse"
db_schema = "dbt"
db_table = "data_hub__companies"
db_pk = "id"
index = [
{ literal = "cdms", alias = "cdms_id", type = "VARCHAR" },
{ literal = "company_name", alias = "name" },
{ literal = "postcode" },
{ literal = "*" }
]

[datasets.hmrc_exporters]
database = "pg_warehouse"
db_schema = "hmrc"
db_table = "trade__exporters"
db_pk = "id"
index = [
{ literal = "company_name", alias = "name" },
{ literal = "postcode" },
]

[datasets.export_wins]
database = "pg_warehouse"
db_schema = "dbt"
db_table = "export_wins__wins_dataset"
db_pk = "id"
db_pk = "id"
index = [
{ literal = "company_name" },
{ literal = "postcode" },
{ literal = "cdms", alias = "cdms_id", type = "VARCHAR" },
{ literal = "data_hub_company_id", alias = "dh_id", type = "VARCHAR" },
]
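
A small sketch of reading these new `index` entries back out of a datasets file with the standard-library `tomllib` (Python 3.11+); how matchbox itself interprets `literal`, `alias` and `type` is not shown in this diff:

```python
# Load a datasets TOML file and list each dataset's indexed columns.
import tomllib

with open("sample.datasets.toml", "rb") as f:
    config = tomllib.load(f)

for name, dataset in config["datasets"].items():
    table = f'{dataset["db_schema"]}.{dataset["db_table"]}'
    for column in dataset.get("index", []):
        alias = column.get("alias", column["literal"])
        print(f"{name}: {table} indexes {column['literal']} as {alias}")
```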
