# Environment setup.
# Sync the project environment with uv.
uv sync
# Download the spaCy model named en_core_web_trf.
python -m spacy download en_core_web_trf
# Start the compose services in detached mode (-d).
# NOTE(review): presumably this includes the database Alembic migrates — confirm.
docker compose up -d
# Apply database migrations up to the latest revision ("head").
alembic upgrade head

# Data pipeline: the scripts are numbered and are run in that order.
# 1. Gather URLs from dataset
python core/pipeline/1_get_dataset_urls.py
# 2. Get URL Annotations
python core/pipeline/2_get_url_annotations.py
# 3. Get HTML from URLs
python core/pipeline/3_fetch_html.py
# 4. Process HTML data
python core/pipeline/4_process_data.py