feat: ⚡ Add CLI command to convert and push JSONL to Huggingface (#1436)
* feat: ⚡ Add CLI command to convert and push JSONL to Huggingface

* feat: 🎨 Add unittests and fix bugs

* feat: 🔥 Optimize DuckDB query + Enqueue job (WIP)

* fix: 🔖 Fix enqueue job + Better logging and error handling

* refactor: 🎨 Update sql query: {feature}_tags instead of {feature}

* fix: 🐛 Remove timer on

* fix: 🐛 Remove enqueue job

New spec: we only use the CLI command for testing. The JSONL conversion will be handled in a scheduled job instead.

* feat: 🎨 Add possibility to convert the dataset locally

* fix: 🐛 poetry lock --no-update after merge conflict resolution

* fix: 🔥 Improve performance + push to hub fixed + ingredients_text_{lang} added

* fix: 🐛 Bug with threading fixed

* style: 🎨 Black
jeremyarancio authored Oct 28, 2024
1 parent ef0f861 commit d68a231
Showing 7 changed files with 245 additions and 7 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -58,6 +58,7 @@ x-robotoff-base-env:
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token

x-robotoff-worker-base:
&robotoff-worker
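The HF_TOKEN variable added above is the credential the new push path relies on: huggingface_hub resolves it implicitly from the environment when no token is passed to HfApi(). A minimal sketch of that behavior, assuming the variable is set (the whoami() call is just a hypothetical way to verify authentication):

import os

from huggingface_hub import HfApi

# HF_TOKEN comes from the docker-compose environment above; HfApi() picks it
# up implicitly when no token argument is given.
assert os.environ.get("HF_TOKEN"), "HF_TOKEN must be set"
api = HfApi()
print(api.whoami()["name"])  # raises an authentication error if the token is invalid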
12 changes: 6 additions & 6 deletions poetry.lock


33 changes: 33 additions & 0 deletions robotoff/cli/main.py
@@ -1201,5 +1201,38 @@ def launch_normalize_barcode_job(
logger.info("Updated %d images", updated)


@app.command()
def push_jsonl_to_hf(
    repo_id: str = "openfoodfacts/product-database",
    revision: str = "main",
    commit_message: str = "Database updated.",
    output_path: Optional[str] = None,
):
    """Clean and convert the JSONL database to Parquet, then push it to
    Hugging Face. Pass an `output_path` to only convert the database
    locally, without pushing to the Hub.
    """
    import os
    import tempfile

    from robotoff.products import convert_jsonl_to_parquet, push_data_to_hf
    from robotoff.utils.logger import get_logger

    logger = get_logger()
    logger.info("Start command: convert JSONL to Parquet (to HF).")
    if output_path:
        convert_jsonl_to_parquet(output_file_path=output_path)
    else:
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, "converted_data.parquet")
            convert_jsonl_to_parquet(output_file_path=file_path)
            push_data_to_hf(
                data_path=file_path,
                repo_id=repo_id,
                revision=revision,
                commit_message=commit_message,
            )
    logger.info("JSONL to Parquet conversion successfully finished.")


def main() -> None:
app()
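Per the new spec above, the command is meant for testing, and passing --output-path skips the Hugging Face push entirely. A hedged sketch of exercising it with Typer's test runner (the hyphenated command name assumes Typer's default underscore-to-hyphen conversion; the output path is hypothetical):

from typer.testing import CliRunner

from robotoff.cli.main import app

runner = CliRunner()
# --output-path short-circuits the push: the JSONL dump is only converted locally.
result = runner.invoke(
    app, ["push-jsonl-to-hf", "--output-path", "/tmp/products.parquet"]
)
assert result.exit_code == 0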
50 changes: 50 additions & 0 deletions robotoff/products.py
@@ -10,7 +10,9 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, Union

import duckdb
import requests
from huggingface_hub import HfApi
from pymongo import MongoClient

from robotoff import settings
@@ -572,3 +574,51 @@ def get_product(
:return: the product as a dict or None if it was not found
"""
return get_product_store(product_id.server_type).get_product(product_id, projection)


def convert_jsonl_to_parquet(
    output_file_path: str,
    dataset_path: Path = settings.JSONL_DATASET_PATH,
    query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY,
) -> None:
    """Convert the JSONL database dump into a Parquet file using DuckDB.

    :param output_file_path: path of the Parquet file to write
    :param dataset_path: path of the JSONL dataset
    :param query_path: path of the SQL query template
    """
    logger.info("Start JSONL to Parquet conversion process.")
    if not dataset_path.exists() or not query_path.exists():
        raise FileNotFoundError(
            f"{str(dataset_path)} or {str(query_path)} was not found."
        )
    query = (
        query_path.read_text()
        .replace("{dataset_path}", str(dataset_path))
        .replace("{output_path}", output_file_path)
    )
    try:
        duckdb.sql(query)
    except duckdb.Error as e:
        logger.error("Error executing query: %s\nError message: %s", query, e)
        raise
    logger.info("JSONL successfully converted into Parquet file.")


def push_data_to_hf(
    data_path: str,
    repo_id: str,
    revision: str,
    commit_message: str,
) -> None:
    """Push the converted Parquet file to a Hugging Face dataset repository.

    :param data_path: local path of the Parquet file to upload
    :param repo_id: Hugging Face dataset repository id
    :param revision: branch to commit to
    :param commit_message: message attached to the upload commit
    """
    logger.info("Start pushing data to Hugging Face at %s", repo_id)
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data is missing: {data_path}")
    if os.path.splitext(data_path)[-1] != ".parquet":
        raise ValueError(
            f"A Parquet file is expected. Got {os.path.splitext(data_path)[-1]} instead."
        )
    # We use the Hugging Face Hub API directly since it gives us more
    # flexibility than push_to_hub().
    HfApi().upload_file(
        path_or_fileobj=data_path,
        repo_id=repo_id,
        revision=revision,
        repo_type="dataset",
        path_in_repo="products.parquet",
        commit_message=commit_message,
    )
    logger.info("Data successfully pushed to Hugging Face at %s", repo_id)
3 changes: 3 additions & 0 deletions robotoff/settings.py
@@ -360,3 +360,6 @@ def get_package_version() -> str:

# Batch jobs
GOOGLE_PROJECT_NAME = "robotoff"

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
124 changes: 124 additions & 0 deletions robotoff/utils/sql/jsonl_to_parquet.sql
@@ -0,0 +1,124 @@
SET threads to 4;
SET preserve_insertion_order = false;
COPY (
SELECT
code,
additives_n,
additives_tags,
allergens_from_ingredients,
allergens_from_user,
allergens_tags,
brands_tags,
categories_properties_tags,
categories,
checkers_tags,
cities_tags,
compared_to_category,
complete,
completeness,
correctors_tags,
countries_imported,
countries_tags,
creator,
data_sources_tags,
ecoscore_data,
ecoscore_score,
ecoscore_tags,
editors,
emb_code,
emb_codes,
entry_dates_tags,
environment_impact_level,
food_groups_tags,
forest_footprint_data,
generic_name,
grades,
id,
images,
informers_tags,
ingredients_analysis_tags,
ingredients_from_palm_oil_n,
ingredients_n,
ingredients_text_with_allergens,
ingredients_text,
COLUMNS('ingredients_text_\w{2}$'),
ingredients_with_specified_percent_n,
ingredients_with_unspecified_percent_n,
ciqual_food_name_tags,
ingredients_percent_analysis,
ingredients_original_tags,
ingredients_without_ciqual_codes_n,
ingredients_without_ciqual_codes,
ingredients,
known_ingredients_n,
labels_tags,
lang,
languages_tags,
languages_codes,
last_edit_dates_tags,
last_editor,
last_modified_by,
last_updated_t,
link,
main_countries_tags,
manufacturing_places,
max_imgid,
misc_tags,
minerals_tags,
new_additives_n,
nova_groups_markers,
nova_groups_tags,
nucleotides_tags,
nutrient_levels_tags,
unknown_nutrients_tags,
nutriments,
nutriscore_data,
nutriscore_score,
nutriscore_tags,
nutrition_data_prepared_per,
nutrition_data,
nutrition_grades_tags,
nutrition_score_beverage,
nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients,
nutrition_score_warning_no_fiber,
nutrition_score_warning_no_fruits_vegetables_nuts,
obsolete_since_date,
obsolete,
origins,
owner_fields,
packaging_recycling_tags,
packaging_shapes_tags,
packaging_tags,
packagings_materials,
packagings_n,
photographers,
pnns_groups_1_tags,
pnns_groups_2_tags,
popularity_key,
popularity_tags,
product_name,
product_quantity_unit,
product_quantity,
purchase_places_tags,
quantity,
rev,
scans_n,
scores,
serving_quantity,
serving_size,
sources,
sources_fields,
specific_ingredients,
states_tags,
stores,
traces_tags,
unique_scans_n,
unknown_ingredients_n,
vitamins_tags,
weighers_tags,
with_non_nutritive_sweeteners,
with_sweeteners,
FROM read_ndjson('{dataset_path}', ignore_errors=True)
) TO '{output_path}' (FORMAT PARQUET)
;
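The COLUMNS('ingredients_text_\w{2}$') expression in the query is a DuckDB star expression: it expands to every column whose name matches the regex, here all the per-language ingredients_text_{lang} columns mentioned in the commit messages. A small hypothetical demonstration:

import duckdb

duckdb.sql(
    """
    CREATE TABLE products AS SELECT
        'tea, water' AS ingredients_text,
        'thé, eau' AS ingredients_text_fr,
        'tea, water' AS ingredients_text_en;
    """
)
# Expands to ingredients_text_fr and ingredients_text_en, but not to the plain
# ingredients_text column: the regex requires a two-letter suffix at the end.
print(duckdb.sql(r"SELECT COLUMNS('ingredients_text_\w{2}$') FROM products"))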
29 changes: 28 additions & 1 deletion tests/unit/test_products.py
@@ -1,9 +1,13 @@
import json
import os
import tempfile
from pathlib import Path
from typing import Optional
from unittest.mock import Mock

import pytest

from robotoff.products import is_special_image, is_valid_image
from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
from robotoff.settings import TEST_DATA_DIR
from robotoff.types import JSONType

@@ -51,3 +55,26 @@ def test_is_valid_image(
output: bool,
):
assert is_valid_image(images, image_path) is output


class TestConvertJSONLToParquet:
    def test_convert_jsonl_to_parquet(self, mocker: Mock):
        """This test doesn't check the DuckDB query itself, only the logic of
        the `convert_jsonl_to_parquet` function. The JSONL dataset schema can
        change over time, which could make such a test fail; validating the
        JSONL schema should remain out of scope of this unit test.
        """
        # Mock the DuckDB SQL query and Parquet writing
        mock_duckdb_sql = mocker.patch("duckdb.sql")
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file_path = os.path.join(tmp_dir, "test_converted.parquet")
            convert_jsonl_to_parquet(output_file_path=output_file_path)
            mock_duckdb_sql.assert_called_once()

    def test_convert_jsonl_to_parquet_data_missing(self):
        non_existing_path = Path("non/existing/dataset/path")
        with pytest.raises(FileNotFoundError):
            convert_jsonl_to_parquet(
                output_file_path="any_path",
                dataset_path=non_existing_path,
                query_path=non_existing_path,
            )
