Skip to content

Commit b99f6e0

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 7f96765 commit b99f6e0

File tree

26 files changed

+31
-44
lines changed

26 files changed

+31
-44
lines changed

ac_dc/anonymization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
3030
tag_type=tag_type,
3131
)
3232
if anonymize_condition:
33-
for (ent, start, end, tag) in ner:
33+
for ent, start, end, tag in ner:
3434
# we need to actually walk through and replace by start, end span.
3535
sentence = sentence.replace(ent, f" <{tag}> ")
3636
return sentence, ner

ac_dc/deduplicate/self_deduplicate.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
32
# @Date : 2022-01-08 22:39:29
43
# @Author : Chenghao Mou ([email protected])
54
# @Description: Self-deduplication with `datasets`
@@ -27,8 +26,7 @@
2726

2827

2928
def main(conf: str) -> None:
30-
31-
with open(conf, "r") as f:
29+
with open(conf) as f:
3230
conf = yaml.safe_load(f.read())
3331

3432
if conf["load_from_disk"]["path"]:
@@ -201,5 +199,4 @@ def main(conf: str) -> None:
201199

202200

203201
if __name__ == "__main__":
204-
205202
typer.run(main)

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def __init__(
2121
path_kenlm_model,
2222
path_save_stats,
2323
):
24-
2524
self.ds = dataset
2625
self.num_iter = num_iter
2726

@@ -166,7 +165,6 @@ def compute_stats(self):
166165

167166

168167
if __name__ == "__main__":
169-
170168
lang_dataset_id = "en"
171169

172170
dataset_name = "oscar" # "TurkuNLP/register_oscar"

ac_dc/visualization/visualization.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,6 @@ def filtering_of_words(self):
625625
)
626626

627627
if display_discarded_words_by_filter:
628-
629628
if "len_word" in columns:
630629
cond_filter = np.invert(conds_words["len_word"])
631630
Visualization_for_lang.display_dataset(
@@ -698,7 +697,6 @@ def is_doc_discarded(key, score):
698697
return score < key[1]
699698

700699
if personal_doc:
701-
702700
st.markdown("Statistics of the document:")
703701

704702
for key in self.keys:

bertin/evaluation/run_glue.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

bertin/evaluation/run_ner.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

bertin/mc4/mc4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
404404
for filepath in filepaths:
405405
logger.info("generating examples from = %s", filepath)
406406
if filepath.endswith("jsonl"):
407-
with open(filepath, "r", encoding="utf-8") as f:
407+
with open(filepath, encoding="utf-8") as f:
408408
for line in f:
409409
if line:
410410
example = json.loads(line)

bertin/run_mlm_flax.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2021 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");

bertin/run_mlm_flax_stream.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2021 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
446445
args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
447446
data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
448447

449-
with open(os.path.join(save_dir, "training_state.json"), "r") as f:
448+
with open(os.path.join(save_dir, "training_state.json")) as f:
450449
training_state = json.load(f)
451450
step = training_state["step"]
452451

bertin/utils/dataset_perplexity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def get_perplexity(doc):
1717

1818

1919
with open("mc4-es-train-50M-stats.csv", "w") as csv:
20-
with open("mc4-es-train-50M-steps.jsonl", "r") as data:
20+
with open("mc4-es-train-50M-steps.jsonl") as data:
2121
for line in tqdm(data):
2222
text = json.loads(line)["text"]
2323
csv.write(f"{len(text.split())},{get_perplexity(text)}\n")

0 commit comments

Comments (0)