Skip to content

Commit

Permalink
feat: schedule Hugging Face Parquet dataset push every day
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Oct 29, 2024
1 parent 0b2f414 commit 7e525c1
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 7 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/container-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
- name: Set various variable for production deployment
if: matrix.env == 'robotoff-org'
run: |
Expand All @@ -58,6 +59,7 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
- name: Wait for container build workflow
uses: tomchv/[email protected]
id: wait-build
Expand Down Expand Up @@ -182,6 +184,8 @@ jobs:
# Secret key to secure batch job import
echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env
# Enable or not dataset push to Hugging Face
echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
- name: Create Docker volumes
uses: appleboy/ssh-action@master
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ x-robotoff-base-env:
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token
ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)

x-robotoff-worker-base:
&robotoff-worker
Expand Down
2 changes: 1 addition & 1 deletion robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,7 +1205,7 @@ def launch_normalize_barcode_job(
def push_jsonl_to_hf(
repo_id: str = "openfoodfacts/product-database",
revision: str = "main",
commit_message: str = "Database updated.",
commit_message: str = "Database updated",
output_path: Optional[str] = None,
):
"""Clean and convert the JSONL database before pushing to HF.
Expand Down
6 changes: 3 additions & 3 deletions robotoff/products.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,9 +601,9 @@ def convert_jsonl_to_parquet(

def push_data_to_hf(
data_path: str,
repo_id: str,
revision: str,
commit_message: str,
repo_id: str = "openfoodfacts/product-database",
revision: str = "main",
commit_message: str = "Database updated",
) -> None:
logger.info(f"Start pushing data to Hugging Face at {repo_id}")
if not os.path.exists(data_path):
Expand Down
28 changes: 25 additions & 3 deletions robotoff/scheduler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import os
import tempfile
import uuid
from typing import Iterable

Expand All @@ -23,9 +24,11 @@
from robotoff.models import Prediction, ProductInsight, db
from robotoff.products import (
Product,
convert_jsonl_to_parquet,
fetch_dataset,
get_min_product_store,
has_dataset_changed,
push_data_to_hf,
)
from robotoff.types import InsightType, ServerType
from robotoff.utils import get_logger
Expand Down Expand Up @@ -290,14 +293,33 @@ def update_insight_attributes(product: Product, insight: ProductInsight) -> bool


# this job does not use the database
def _update_data() -> None:
    """Refresh the product dump and, if enabled, publish it to Hugging Face.

    Download the latest version of the Product Opener product JSONL dump when
    it has changed. If the envvar ENABLE_HF_PUSH is set to 1 and a new dump
    was fetched, convert it to Parquet format and push it to Hugging Face Hub.
    A network error during the refresh is logged and aborts the job.
    """
    logger.info("Downloading new version of product dataset")
    ds_changed = False
    try:
        if ds_changed := has_dataset_changed():
            fetch_dataset()
    except requests.exceptions.RequestException:
        logger.exception("Exception during product dataset refresh")
        return

    if not settings.ENABLE_HF_PUSH:
        logger.info("HF push is disabled, skipping Parquet conversion")
        # Stop here: without this return the conversion and push below would
        # still run even though the feature is disabled.
        return

    if not ds_changed:
        logger.info("No changes in product dataset, skipping Parquet conversion")
        return

    logger.info("Starting conversion of JSONL to Parquet (to HF)")
    # The Parquet file is only needed for the upload, so it lives in a
    # temporary directory that is removed once the push is done.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = os.path.join(tmp_dir, "converted_data.parquet")
        convert_jsonl_to_parquet(output_file_path=file_path)
        push_data_to_hf(data_path=file_path)


def transform_insight_iter(insights_iter: Iterable[dict]):
Expand Down
1 change: 1 addition & 0 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,4 @@ def get_package_version() -> str:

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
# Enable the scheduled Hugging Face dataset push (0 or 1, disabled by default).
# `or 0` also covers a variable that is set but empty (e.g. `ENABLE_HF_PUSH=`
# in a .env file), which would otherwise make int() raise at import time.
ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH") or 0))

0 comments on commit 7e525c1

Please sign in to comment.