1
1
#!/usr/bin/env python
2
2
3
3
import os
4
+ from enum import Enum
4
5
from pathlib import Path
5
6
6
7
from pydantic import FilePath
15
16
from modalities .models .huggingface_adapters .hf_adapter import HFModelAdapter
16
17
from modalities .registry .components import COMPONENTS
17
18
from modalities .registry .registry import Registry
19
+ from modalities .utils .logging import get_logger
18
20
19
21
20
- def create_raw_data_index (src_path : Path , index_path : Path ):
22
class FileExistencePolicy(Enum):
    """Action to take when an output file is already present on disk.

    Members:
        SKIP: Leave the existing file untouched and do nothing further.
        ERROR: Refuse to continue by raising an error.
        OVERRIDE: Delete the existing file and create it anew.
    """

    SKIP = "skip"
    ERROR = "error"
    OVERRIDE = "override"
26
+
27
+
28
def create_raw_data_index(
    src_path: Path, index_path: Path, file_existence_policy: FileExistencePolicy = FileExistencePolicy.ERROR
):
    """Create the index file for the content of a large jsonl-file.

    The index file contains the byte-offsets and lengths of each line in the
    jsonl-file, so that the file can be processed further without loading it.

    Args:
        src_path (Path): Path of the jsonl-file to index.
        index_path (Path): Requested location of the index file. The path that
            is actually used is the one returned by
            ``LargeFileLinesReader.default_index_path(src_path, index_path)``.
        file_existence_policy (FileExistencePolicy): What to do when the index
            file already exists. ``SKIP`` logs a warning and returns without
            creating anything, ``OVERRIDE`` logs a warning and deletes the
            existing file before re-creating it, ``ERROR`` (default) raises.

    Raises:
        ValueError: If the index file already exists and the policy is
            ``ERROR``, or if the index file already exists and the policy is
            not one of the known ``FileExistencePolicy`` members.
    """
    index_path = LargeFileLinesReader.default_index_path(src_path, index_path)
    logger = get_logger(name="main")

    if index_path.exists():
        if file_existence_policy is FileExistencePolicy.SKIP:
            logger.warning(f"Index already exists at {index_path}. Skipping index creation.")
            return
        if file_existence_policy is FileExistencePolicy.OVERRIDE:
            logger.warning(f"Index already exists at {index_path}. Overriding it.")
            os.remove(index_path)
        elif file_existence_policy is FileExistencePolicy.ERROR:
            raise ValueError("index already exists. delete it or specify different output folder.")
        else:
            raise ValueError(f"Unknown file existence policy: {file_existence_policy}")

    logger.info(f"Reading raw data from {src_path} and writing index to {index_path} ...")
    # Create the target directory only once we know an index will be written,
    # so the SKIP and ERROR paths leave the file system untouched.
    os.makedirs(index_path.parent, exist_ok=True)

    generator = IndexGenerator(src_path)
    generator.create_index(index_path)
43
64
0 commit comments