Skip to content

Commit f2a2f6d

Browse files
committed
feat: added file existence policy
1 parent 13f1a26 commit f2a2f6d

File tree

2 files changed

+27
-6
lines changed

2 files changed

+27
-6
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ repos:
1212
rev: 23.9.1
1313
hooks:
1414
- id: black
15-
language_version: python3.10
15+
language_version: python3.11
1616
stages: [pre-commit]
1717
- repo: https://github.com/astral-sh/ruff-pre-commit
1818
rev: v0.0.278

src/modalities/api.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22

33
import os
4+
from enum import Enum
45
from pathlib import Path
56

67
from pydantic import FilePath
@@ -15,9 +16,18 @@
1516
from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter
1617
from modalities.registry.components import COMPONENTS
1718
from modalities.registry.registry import Registry
19+
from modalities.utils.logging import get_logger
1820

1921

20-
def create_raw_data_index(src_path: Path, index_path: Path):
22+
class FileExistencePolicy(Enum):
    """Choices for handling an output file that is already present on disk.

    Consumed by ``create_raw_data_index`` to decide what happens when the
    index file it is about to write already exists.
    """

    # Keep the existing file, log a warning and do nothing else.
    SKIP = "skip"
    # Refuse to continue: a ValueError is raised.
    ERROR = "error"
    # Log a warning, delete the existing file and create it anew.
    OVERRIDE = "override"
26+
27+
28+
def create_raw_data_index(
    src_path: Path, index_path: Path, file_existence_policy: FileExistencePolicy = FileExistencePolicy.ERROR
):
    """Build the index file for the content of a large jsonl-file.

    The index file stores the byte-offset and length of every line of the
    jsonl-file, so that the file can be processed further without loading it.

    Args:
        src_path: Path of the jsonl-file to index.
        index_path: Requested location of the index file. It is resolved via
            ``LargeFileLinesReader.default_index_path(src_path, index_path)``
            before use (presumably falling back to a default location derived
            from ``src_path`` -- confirm against that helper).
        file_existence_policy: What to do when the resolved index file already
            exists. ``SKIP`` logs a warning and returns without creating
            anything, ``OVERRIDE`` logs a warning, deletes the file and
            recreates it, ``ERROR`` (the default) raises.

    Raises:
        ValueError: If the index file already exists and the policy is
            ``ERROR``, or if the policy is not a known ``FileExistencePolicy``.
    """
    index_path = LargeFileLinesReader.default_index_path(src_path, index_path)

    if index_path.exists():
        index_location = str(index_path)

        if file_existence_policy == FileExistencePolicy.SKIP:
            get_logger(name="main").warning(f"Index already exists at {index_location}. Skipping index creation.")
            return

        if file_existence_policy == FileExistencePolicy.ERROR:
            raise ValueError("index already exists. delete it or specify different output folder.")

        if file_existence_policy != FileExistencePolicy.OVERRIDE:
            raise ValueError(f"Unknown file existence policy: {file_existence_policy}")

        # Only OVERRIDE reaches this point: drop the stale index so it can be rebuilt.
        get_logger(name="main").warning(f"Index already exists at {index_location}. Overriding it.")
        os.remove(index_path)

    source_location = str(src_path)
    target_location = str(index_path)
    get_logger(name="main").info(f"Reading raw data from {source_location} and writing index to {target_location} ...")

    # The target folder is only created once it is certain that an index will be written.
    os.makedirs(index_path.parent, exist_ok=True)

    IndexGenerator(src_path).create_index(index_path)
4364

0 commit comments

Comments
 (0)