Skip to content

Commit 19f5aae

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 8818e19 commit 19f5aae

26 files changed

+31
-44
lines changed

Diff for: ac_dc/anonymization.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
3030
tag_type=tag_type,
3131
)
3232
if anonymize_condition:
33-
for (ent, start, end, tag) in ner:
33+
for ent, start, end, tag in ner:
3434
# we need to actually walk through and replace by start, end span.
3535
sentence = sentence.replace(ent, f" <{tag}> ")
3636
return sentence, ner

Diff for: ac_dc/deduplicate/self_deduplicate.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
32
# @Date : 2022-01-08 22:39:29
43
# @Author : Chenghao Mou ([email protected])
54
# @Description: Self-deduplication with `datasets`
@@ -27,8 +26,7 @@
2726

2827

2928
def main(conf: str) -> None:
30-
31-
with open(conf, "r") as f:
29+
with open(conf) as f:
3230
conf = yaml.safe_load(f.read())
3331

3432
if conf["load_from_disk"]["path"]:
@@ -201,5 +199,4 @@ def main(conf: str) -> None:
201199

202200

203201
if __name__ == "__main__":
204-
205202
typer.run(main)

Diff for: ac_dc/visualization/get_data_for_visualization.py

-2
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@ def __init__(
2121
path_kenlm_model,
2222
path_save_stats,
2323
):
24-
2524
self.ds = dataset
2625
self.num_iter = num_iter
2726

@@ -166,7 +165,6 @@ def compute_stats(self):
166165

167166

168167
if __name__ == "__main__":
169-
170168
lang_dataset_id = "en"
171169

172170
dataset_name = "oscar" # "TurkuNLP/register_oscar"

Diff for: ac_dc/visualization/visualization.py

-2
Original file line number | Diff line number | Diff line change
@@ -625,7 +625,6 @@ def filtering_of_words(self):
625625
)
626626

627627
if display_discarded_words_by_filter:
628-
629628
if "len_word" in columns:
630629
cond_filter = np.invert(conds_words["len_word"])
631630
Visualization_for_lang.display_dataset(
@@ -698,7 +697,6 @@ def is_doc_discarded(key, score):
698697
return score < key[1]
699698

700699
if personal_doc:
701-
702700
st.markdown("Statistics of the document:")
703701

704702
for key in self.keys:

Diff for: bertin/evaluation/run_glue.py

-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

Diff for: bertin/evaluation/run_ner.py

-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

Diff for: bertin/mc4/mc4.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
404404
for filepath in filepaths:
405405
logger.info("generating examples from = %s", filepath)
406406
if filepath.endswith("jsonl"):
407-
with open(filepath, "r", encoding="utf-8") as f:
407+
with open(filepath, encoding="utf-8") as f:
408408
for line in f:
409409
if line:
410410
example = json.loads(line)

Diff for: bertin/run_mlm_flax.py

-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2021 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

Diff for: bertin/run_mlm_flax_stream.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2021 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
446445
args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
447446
data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
448447

449-
with open(os.path.join(save_dir, "training_state.json"), "r") as f:
448+
with open(os.path.join(save_dir, "training_state.json")) as f:
450449
training_state = json.load(f)
451450
step = training_state["step"]
452451

Diff for: bertin/utils/dataset_perplexity.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ def get_perplexity(doc):
1717

1818

1919
with open("mc4-es-train-50M-stats.csv", "w") as csv:
20-
with open("mc4-es-train-50M-steps.jsonl", "r") as data:
20+
with open("mc4-es-train-50M-steps.jsonl") as data:
2121
for line in tqdm(data):
2222
text = json.loads(line)["text"]
2323
csv.write(f"{len(text.split())},{get_perplexity(text)}\n")

Diff for: cc_pseudo_crawl/python_scripts/download_warc.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -143,9 +143,9 @@ def get_warcs(batch):
143143
existing_compressed_warcs,
144144
)
145145

146-
batch["compressed_warc"], batch["download_exception"] = [
146+
batch["compressed_warc"], batch["download_exception"] = (
147147
list(l) for l in zip(*warcs_or_exceptions)
148-
]
148+
)
149149
return batch
150150

151151

Diff for: cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py

-1
Original file line number | Diff line number | Diff line change
@@ -431,7 +431,6 @@ def main(args: PreprocessingConfig) -> None: # Setup logging
431431
]
432432

433433
def process_file(file_name: str):
434-
435434
logger.info(config.HF_DATASETS_CACHE)
436435
processing_name = (
437436
"-".join(args.metadata_to_include)

Diff for: cc_pseudo_crawl/python_scripts/load_all_seed_ids.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ def main():
2121

2222
seed_ids = []
2323
for seed_path in args.seed_paths:
24-
with open(seed_path, "r") as fi:
24+
with open(seed_path) as fi:
2525
data = csv.reader(fi)
2626
# First line is all the headers that we remove.
2727
seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]

Diff for: kenlm_training/cc_net/execution.py

-1
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,6 @@ def get_executor(
4242
task_parallelism: int = -1,
4343
options: dict = {},
4444
) -> Executor:
45-
4645
execution_mode = execution.split(",")[0]
4746
options.update(
4847
{kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in execution.split(",")[1:]}

Diff for: kenlm_training/cc_net/jsonql.py

+3-4
Original file line number | Diff line number | Diff line change
@@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
880880
continue
881881
if "." in k or k == ALL_DOCUMENTS:
882882
continue
883-
for line in display_stats(stats, k, weights=weights, **kwargs):
884-
yield line
883+
yield from display_stats(stats, k, weights=weights, **kwargs)
885884

886885

887886
def shard(lines):
@@ -961,7 +960,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
961960
if filename.suffix == ".gz":
962961
file: TextIO = gzip.open(filename, "rt") # type: ignore
963962
else:
964-
file = open(filename, "rt")
963+
file = open(filename)
965964

966965
return _close_when_exhausted(file)
967966

@@ -1015,7 +1014,7 @@ def open_write(
10151014
if filename.suffix == ".gz":
10161015
return BlockedGzipWriter(Path(filename), mode, block_size="64M")
10171016

1018-
return open(filename, "wt")
1017+
return open(filename, "w")
10191018

10201019

10211020
def parse_size(size):

Diff for: kenlm_training/tests/test_jsonql.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -262,7 +262,7 @@ def do(self, x):
262262
def acc(values):
263263
print("acc: started")
264264
res = 0
265-
for (x, _) in values:
265+
for x, _ in values:
266266
res += int(x)
267267
print("acc: done")
268268
yield f"acc: result={res}"

Diff for: pii-manager/setup.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -27,15 +27,15 @@
2727

2828
def requirements(filename="requirements.txt"):
2929
"""Read the requirements file"""
30-
with io.open(filename, "r") as f:
30+
with open(filename) as f:
3131
return [line.strip() for line in f if line and line[0] != "#"]
3232

3333

3434
def long_description():
3535
"""
3636
Take the README and remove markdown hyperlinks
3737
"""
38-
with open("README.md", "rt", encoding="utf-8") as f:
38+
with open("README.md", encoding="utf-8") as f:
3939
desc = f.read()
4040
desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
4141
return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)

Diff for: pii-manager/src/pii_manager/api/manager.py

+3-6
Original file line number | Diff line number | Diff line change
@@ -31,22 +31,19 @@ def fetch_all_tasks(
3131
"""
3232
taskdict = get_taskdict(debug=debug)
3333
# Language-independent
34-
for task in taskdict[LANG_ANY].values():
35-
yield task
34+
yield from taskdict[LANG_ANY].values()
3635

3736
langdict = taskdict.get(lang, {})
3837
# Country-independent
39-
for task in langdict.get(COUNTRY_ANY, {}).values():
40-
yield task
38+
yield from langdict.get(COUNTRY_ANY, {}).values()
4139
# Country-specific
4240
if country:
4341
if country[0] in (COUNTRY_ANY, "all"):
4442
country = country_list(lang)
4543
for c in country:
4644
if c == COUNTRY_ANY: # already included above
4745
continue
48-
for task in langdict.get(c, {}).values():
49-
yield task
46+
yield from langdict.get(c, {}).values()
5047

5148

5249
def fetch_task(

Diff for: pii-manager/test/unit/api/test_file.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ def datafile(name: str) -> str:
1212

1313

1414
def readfile(name: str) -> str:
15-
with open(name, "rt", encoding="utf-8") as f:
15+
with open(name, encoding="utf-8") as f:
1616
return f.read().strip()
1717

1818

Diff for: pii-manager/test/unit/api/test_file_taskfile.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ def datafile(name: str) -> str:
1414

1515

1616
def readfile(name: str) -> str:
17-
with open(name, "rt", encoding="utf-8") as f:
17+
with open(name, encoding="utf-8") as f:
1818
return f.read().strip()
1919

2020

Diff for: pii-manager/test/unit/api/test_manager.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,10 @@ def test20_info():
2121
info = obj.task_info()
2222

2323
exp = {
24-
(PiiEnum.CREDIT_CARD, None,): [
24+
(
25+
PiiEnum.CREDIT_CARD,
26+
None,
27+
): [
2528
(
2629
"credit card",
2730
"Credit card numbers for most international credit cards (detect & validate)",

Diff for: pii-manager/test/unit/api/test_manager_add.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ def test110_call():
4747
obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS)
4848
obj.add_tasks([DUMMY_REGEX])
4949

50-
for (doc, exp) in TEST_REGEX:
50+
for doc, exp in TEST_REGEX:
5151
got = obj(doc)
5252
assert got == exp
5353

@@ -86,6 +86,6 @@ def test200_call():
8686
obj = PiiManager("en")
8787
obj.add_tasks([DUMMY_CLASS])
8888

89-
for (doc, exp) in TEST_CLASS:
89+
for doc, exp in TEST_CLASS:
9090
got = obj(doc)
9191
assert got == exp

Diff for: pii-manager/test/unit/api/test_manager_ctx.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ def test10_context_regex():
3838
"""
3939
obj = PiiManager("en", mode="extract")
4040
obj.add_tasks([DUMMY_REGEX])
41-
for (text, exp) in TEST:
41+
for text, exp in TEST:
4242
got = obj(text)
4343
assert list(got) == exp
4444

@@ -64,6 +64,6 @@ def test20_context_class():
6464
"""
6565
obj = PiiManager("en", mode="extract")
6666
obj.add_tasks([DUMMY_CLASS])
67-
for (text, exp) in TEST:
67+
for text, exp in TEST:
6868
got = obj(text)
6969
assert list(got) == exp

Diff for: pii-manager/test/unit/helper/test_context.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ def test10_context_true():
7474
"""
7575
Check valid contexts
7676
"""
77-
for (text, context) in TEST_TRUE:
77+
for text, context in TEST_TRUE:
7878
spec = mod.context_spec(context)
7979
assert mod.context_check(text, spec, 20) is True
8080

@@ -83,7 +83,7 @@ def test20_context_false():
8383
"""
8484
Check invalid contexts
8585
"""
86-
for (text, context) in TEST_FALSE:
86+
for text, context in TEST_FALSE:
8787
spec = mod.context_spec(context)
8888
assert mod.context_check(text, spec, 20) is False
8989

Diff for: pii-manager/test/unit/helper/test_norm.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,5 +8,5 @@ def test10_normalizer():
88
"""
99
Create base object
1010
"""
11-
for (text, exp) in TEST:
11+
for text, exp in TEST:
1212
assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp

Diff for: tokenizer/python_script/dedup_lines.py

+1
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828

2929
META_COLUMNS = ["meta"]
3030

31+
3132
# filter text to remove certain lines (e.g. menu items, copyright notice)
3233
def filter_lines(article, skip_set, used_lines):
3334
# TODO discuss the strip

0 commit comments

Comments
 (0)