feat: ⚡ Add CLI command to convert and push JSONL to Huggingface (#1436)
* feat: ⚡ Add CLI command to convert and push JSONL to Huggingface

* feat: 🎨 Add unittests and fix bugs

* feat: 🔥 Optimize DuckDB query + Enqueue job (WIP)

* fix: 🔖 Fix enqueue job + Better logging and error handling

* refactor: 🎨 Update sql query: {feature}_tags instead of {feature}

* fix: 🐛 Remove timer on

* fix: 🐛 Remove enqueue job

New spec: we only use the CLI command for testing. The JSONL conversion will be handled in a scheduled job instead.

* feat: 🎨 Add possibility to convert the dataset locally

* fix: 🐛 poetry lock --no-update after merge conflict resolution

* fix: 🔥 Improve performance + push to hub fixed + ingredients_text_{lang} added

* fix: 🐛 Bug with threading fixed

* style: 🎨 Black
jeremyarancio authored Oct 28, 2024
1 parent ef0f861 commit d68a231
Showing 7 changed files with 245 additions and 7 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -58,6 +58,7 @@ x-robotoff-base-env:
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token

x-robotoff-worker-base:
&robotoff-worker
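The HF_TOKEN variable added above is the credential the new push path relies on: huggingface_hub resolves it implicitly from the environment when no token is passed to HfApi(). A minimal sketch of that behavior, assuming the variable is set (the whoami() call is just a hypothetical way to verify authentication):

import os

from huggingface_hub import HfApi

# HF_TOKEN comes from the docker-compose environment above; HfApi() picks it
# up implicitly when no token argument is given.
assert os.environ.get("HF_TOKEN"), "HF_TOKEN must be set"
api = HfApi()
print(api.whoami()["name"])  # raises an authentication error if the token is invalid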
12 changes: 6 additions & 6 deletions poetry.lock


33 changes: 33 additions & 0 deletions robotoff/cli/main.py
@@ -1201,5 +1201,38 @@ def launch_normalize_barcode_job(
logger.info("Updated %d images", updated)


@app.command()
def push_jsonl_to_hf(
    repo_id: str = "openfoodfacts/product-database",
    revision: str = "main",
    commit_message: str = "Database updated.",
    output_path: Optional[str] = None,
):
    """Clean and convert the JSONL database to Parquet, then push it to
    Hugging Face. Pass an `output_path` to only convert the database
    locally, without pushing to the Hub.
    """
    import os
    import tempfile

    from robotoff.products import convert_jsonl_to_parquet, push_data_to_hf
    from robotoff.utils.logger import get_logger

    logger = get_logger()
    logger.info("Start command: convert JSONL to Parquet (to HF).")
    if output_path:
        convert_jsonl_to_parquet(output_file_path=output_path)
    else:
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, "converted_data.parquet")
            convert_jsonl_to_parquet(output_file_path=file_path)
            push_data_to_hf(
                data_path=file_path,
                repo_id=repo_id,
                revision=revision,
                commit_message=commit_message,
            )
    logger.info("JSONL to Parquet conversion successfully finished.")


def main() -> None:
app()
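Per the new spec above, the command is meant for testing, and passing --output-path skips the Hugging Face push entirely. A hedged sketch of exercising it with Typer's test runner (the hyphenated command name assumes Typer's default underscore-to-hyphen conversion; the output path is hypothetical):

from typer.testing import CliRunner

from robotoff.cli.main import app

runner = CliRunner()
# --output-path short-circuits the push: the JSONL dump is only converted locally.
result = runner.invoke(
    app, ["push-jsonl-to-hf", "--output-path", "/tmp/products.parquet"]
)
assert result.exit_code == 0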
50 changes: 50 additions & 0 deletions robotoff/products.py
@@ -10,7 +10,9 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, Union

import duckdb
import requests
from huggingface_hub import HfApi
from pymongo import MongoClient

from robotoff import settings
@@ -572,3 +574,51 @@ def get_product(
:return: the product as a dict or None if it was not found
"""
return get_product_store(product_id.server_type).get_product(product_id, projection)


def convert_jsonl_to_parquet(
    output_file_path: str,
    dataset_path: Path = settings.JSONL_DATASET_PATH,
    query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY,
) -> None:
    """Convert the JSONL database dump into a Parquet file using DuckDB.

    :param output_file_path: path of the Parquet file to write
    :param dataset_path: path of the JSONL dataset
    :param query_path: path of the SQL query template
    """
    logger.info("Start JSONL to Parquet conversion process.")
    if not dataset_path.exists() or not query_path.exists():
        raise FileNotFoundError(
            f"{str(dataset_path)} or {str(query_path)} was not found."
        )
    query = (
        query_path.read_text()
        .replace("{dataset_path}", str(dataset_path))
        .replace("{output_path}", output_file_path)
    )
    try:
        duckdb.sql(query)
    except duckdb.Error as e:
        logger.error("Error executing query: %s\nError message: %s", query, e)
        raise
    logger.info("JSONL successfully converted into Parquet file.")


def push_data_to_hf(
    data_path: str,
    repo_id: str,
    revision: str,
    commit_message: str,
) -> None:
    """Push the converted Parquet file to a Hugging Face dataset repository.

    :param data_path: local path of the Parquet file to upload
    :param repo_id: Hugging Face dataset repository id
    :param revision: branch to commit to
    :param commit_message: message attached to the upload commit
    """
    logger.info("Start pushing data to Hugging Face at %s", repo_id)
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data is missing: {data_path}")
    if os.path.splitext(data_path)[-1] != ".parquet":
        raise ValueError(
            f"A Parquet file is expected. Got {os.path.splitext(data_path)[-1]} instead."
        )
    # We use the Hugging Face Hub API directly since it gives us more
    # flexibility than push_to_hub().
    HfApi().upload_file(
        path_or_fileobj=data_path,
        repo_id=repo_id,
        revision=revision,
        repo_type="dataset",
        path_in_repo="products.parquet",
        commit_message=commit_message,
    )
    logger.info("Data successfully pushed to Hugging Face at %s", repo_id)
3 changes: 3 additions & 0 deletions robotoff/settings.py
@@ -360,3 +360,6 @@ def get_package_version() -> str:

# Batch jobs
GOOGLE_PROJECT_NAME = "robotoff"

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
124 changes: 124 additions & 0 deletions robotoff/utils/sql/jsonl_to_parquet.sql
@@ -0,0 +1,124 @@
SET threads to 4;
SET preserve_insertion_order = false;
COPY (
SELECT
code,
additives_n,
additives_tags,
allergens_from_ingredients,
allergens_from_user,
allergens_tags,
brands_tags,
categories_properties_tags,
categories,
checkers_tags,
cities_tags,
compared_to_category,
complete,
completeness,
correctors_tags,
countries_imported,
countries_tags,
creator,
data_sources_tags,
ecoscore_data,
ecoscore_score,
ecoscore_tags,
editors,
emb_code,
emb_codes,
entry_dates_tags,
environment_impact_level,
food_groups_tags,
forest_footprint_data,
generic_name,
grades,
id,
images,
informers_tags,
ingredients_analysis_tags,
ingredients_from_palm_oil_n,
ingredients_n,
ingredients_text_with_allergens,
ingredients_text,
COLUMNS('ingredients_text_\w{2}$'),
ingredients_with_specified_percent_n,
ingredients_with_unspecified_percent_n,
ciqual_food_name_tags,
ingredients_percent_analysis,
ingredients_original_tags,
ingredients_without_ciqual_codes_n,
ingredients_without_ciqual_codes,
ingredients,
known_ingredients_n,
labels_tags,
lang,
languages_tags,
languages_codes,
last_edit_dates_tags,
last_editor,
last_modified_by,
last_updated_t,
link,
main_countries_tags,
manufacturing_places,
max_imgid,
misc_tags,
minerals_tags,
new_additives_n,
nova_groups_markers,
nova_groups_tags,
nucleotides_tags,
nutrient_levels_tags,
unknown_nutrients_tags,
nutriments,
nutriscore_data,
nutriscore_score,
nutriscore_tags,
nutrition_data_prepared_per,
nutrition_data,
nutrition_grades_tags,
nutrition_score_beverage,
nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients,
nutrition_score_warning_no_fiber,
nutrition_score_warning_no_fruits_vegetables_nuts,
obsolete_since_date,
obsolete,
origins,
owner_fields,
packaging_recycling_tags,
packaging_shapes_tags,
packaging_tags,
packagings_materials,
packagings_n,
photographers,
pnns_groups_1_tags,
pnns_groups_2_tags,
popularity_key,
popularity_tags,
product_name,
product_quantity_unit,
product_quantity,
purchase_places_tags,
quantity,
rev,
scans_n,
scores,
serving_quantity,
serving_size,
sources,
sources_fields,
specific_ingredients,
states_tags,
stores,
traces_tags,
unique_scans_n,
unknown_ingredients_n,
vitamins_tags,
weighers_tags,
with_non_nutritive_sweeteners,
with_sweeteners,
FROM read_ndjson('{dataset_path}', ignore_errors=True)
) TO '{output_path}' (FORMAT PARQUET)
;
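The COLUMNS('ingredients_text_\w{2}$') expression in the query is a DuckDB star expression: it expands to every column whose name matches the regex, here all the per-language ingredients_text_{lang} columns mentioned in the commit messages. A small hypothetical demonstration:

import duckdb

duckdb.sql(
    """
    CREATE TABLE products AS SELECT
        'tea, water' AS ingredients_text,
        'thé, eau' AS ingredients_text_fr,
        'tea, water' AS ingredients_text_en;
    """
)
# Expands to ingredients_text_fr and ingredients_text_en, but not to the plain
# ingredients_text column: the regex requires a two-letter suffix at the end.
print(duckdb.sql(r"SELECT COLUMNS('ingredients_text_\w{2}$') FROM products"))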
29 changes: 28 additions & 1 deletion tests/unit/test_products.py
@@ -1,9 +1,13 @@
import json
import os
import tempfile
from pathlib import Path
from typing import Optional
from unittest.mock import Mock

import pytest

from robotoff.products import is_special_image, is_valid_image
from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
from robotoff.settings import TEST_DATA_DIR
from robotoff.types import JSONType

@@ -51,3 +55,26 @@ def test_is_valid_image(
output: bool,
):
assert is_valid_image(images, image_path) is output


class TestConvertJSONLToParquet:
    def test_convert_jsonl_to_parquet(self, mocker: Mock):
        """This test doesn't check the DuckDB query itself, only the logic of
        the `convert_jsonl_to_parquet` function. The JSONL dataset schema can
        change over time, which could make such a test fail; validating the
        JSONL schema should remain out of scope of this unit test.
        """
        # Mock the DuckDB SQL query and Parquet writing
        mock_duckdb_sql = mocker.patch("duckdb.sql")
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file_path = os.path.join(tmp_dir, "test_converted.parquet")
            convert_jsonl_to_parquet(output_file_path=output_file_path)
            mock_duckdb_sql.assert_called_once()

    def test_convert_jsonl_to_parquet_data_missing(self):
        non_existing_path = Path("non/existing/dataset/path")
        with pytest.raises(FileNotFoundError):
            convert_jsonl_to_parquet(
                output_file_path="any_path",
                dataset_path=non_existing_path,
                query_path=non_existing_path,
            )
