Skip to content

Commit 738ffae

Browse files
author
Rogier van Dalen
committed
Make calls to "open" deterministic
All calls to "open" now pass encoding="utf-8" explicitly. This is the same as the old behaviour if (on Linux) the environment variable LC_ALL=C.UTF-8 was set. However, this environment variable was often not set, so the default encoding, and therefore the behaviour, varied from environment to environment. For CSV files, the calls also pass newline="".
1 parent 093c105 commit 738ffae

File tree

244 files changed

+670
-479
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

244 files changed

+670
-479
lines changed

recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def on_stage_end(self, stage, stage_loss, epoch):
166166
stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
167167
test_stats=stage_stats,
168168
)
169-
with open(self.hparams.cer_file, "w") as w:
169+
with open(self.hparams.cer_file, "w", encoding="utf-8") as w:
170170
self.cer_metric.write_stats(w)
171171

172172
def init_optimizers(self):
@@ -318,7 +318,7 @@ def text_pipeline(wrd):
318318
if __name__ == "__main__":
319319
# CLI:
320320
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
321-
with open(hparams_file) as fin:
321+
with open(hparams_file, encoding="utf-8") as fin:
322322
hparams = load_hyperpyyaml(fin, overrides)
323323

324324
# create ddp_group with the right communication protocol

recipes/AISHELL-1/ASR/seq2seq/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def on_stage_end(self, stage, stage_loss, epoch):
141141
stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
142142
test_stats=stage_stats,
143143
)
144-
with open(self.hparams.cer_file, "w") as w:
144+
with open(self.hparams.cer_file, "w", encoding="utf-8") as w:
145145
self.cer_metric.write_stats(w)
146146

147147

@@ -260,7 +260,7 @@ def text_pipeline(wrd):
260260
if __name__ == "__main__":
261261
# CLI:
262262
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
263-
with open(hparams_file) as fin:
263+
with open(hparams_file, encoding="utf-8") as fin:
264264
hparams = load_hyperpyyaml(fin, overrides)
265265

266266
# create ddp_group with the right communication protocol

recipes/AISHELL-1/ASR/transformer/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def on_stage_end(self, stage, stage_loss, epoch):
204204
stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
205205
test_stats=stage_stats,
206206
)
207-
with open(self.hparams.cer_file, "w") as w:
207+
with open(self.hparams.cer_file, "w", encoding="utf-8") as w:
208208
self.cer_metric.write_stats(w)
209209

210210
# save the averaged checkpoint at the end of the evaluation stage
@@ -382,7 +382,7 @@ def text_pipeline(wrd):
382382
if __name__ == "__main__":
383383
# CLI:
384384
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
385-
with open(hparams_file) as fin:
385+
with open(hparams_file, encoding="utf-8") as fin:
386386
hparams = load_hyperpyyaml(fin, overrides)
387387

388388
# create ddp_group with the right communication protocol

recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def on_stage_end(self, stage, stage_loss, epoch):
187187
stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
188188
test_stats=stage_stats,
189189
)
190-
with open(self.hparams.cer_file, "w") as w:
190+
with open(self.hparams.cer_file, "w", encoding="utf-8") as w:
191191
self.cer_metric.write_stats(w)
192192

193193
# save the averaged checkpoint at the end of the evaluation stage
@@ -384,7 +384,7 @@ def text_pipeline(wrd):
384384
if __name__ == "__main__":
385385
# CLI:
386386
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
387-
with open(hparams_file) as fin:
387+
with open(hparams_file, encoding="utf-8") as fin:
388388
hparams = load_hyperpyyaml(fin, overrides)
389389

390390
# create ddp_group with the right communication protocol

recipes/AISHELL-1/Tokenizer/train.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
if __name__ == "__main__":
2424
# CLI:
2525
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
26-
with open(hparams_file) as fin:
26+
with open(hparams_file, encoding="utf-8") as fin:
2727
hparams = load_hyperpyyaml(fin, overrides)
2828

2929
# create ddp_group with the right communication protocol

recipes/AISHELL-1/aishell_prepare.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def prepare_aishell(
150150
data_folder, "transcript/aishell_transcript_v0.8.txt"
151151
)
152152

153-
with open(path_to_transcript, "r") as f:
153+
with open(path_to_transcript, "r", encoding="utf-8") as f:
154154
lines = f.readlines()
155155
for line in lines:
156156
key = line.split()[0]
@@ -180,7 +180,7 @@ def prepare_aishell(
180180
total_line = 0
181181
total_duration = 0
182182
id = 0
183-
with open(tmp_csv, mode="w", encoding="utf-8") as csv_f:
183+
with open(tmp_csv, mode="w", newline="", encoding="utf-8") as csv_f:
184184
csv_writer = csv.writer(
185185
csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
186186
)

recipes/AMI/Diarization/experiment.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def prepare_subset_json(full_meta_data, rec_id, out_meta_file):
154154
if k.startswith(rec_id):
155155
subset[key] = full_meta_data[key]
156156

157-
with open(out_meta_file, mode="w") as json_f:
157+
with open(out_meta_file, mode="w", encoding="utf-8") as json_f:
158158
json.dump(subset, json_f, indent=2)
159159

160160

@@ -302,11 +302,11 @@ def diarize_dataset(full_meta, split_type, n_lambdas, pval, n_neighbors=10):
302302
# This is not needed but just staying with the standards.
303303
concate_rttm_file = out_rttm_dir + "/sys_output.rttm"
304304
logger.debug("Concatenating individual RTTM files...")
305-
with open(concate_rttm_file, "w") as cat_file:
305+
with open(concate_rttm_file, "w", encoding="utf-8") as cat_file:
306306
for f in glob.glob(out_rttm_dir + "/*.rttm"):
307307
if f == concate_rttm_file:
308308
continue
309-
with open(f, "r") as indi_rttm_file:
309+
with open(f, "r", encoding="utf-8") as indi_rttm_file:
310310
shutil.copyfileobj(indi_rttm_file, cat_file)
311311

312312
msg = "The system generated RTTM file for %s set : %s" % (
@@ -507,7 +507,7 @@ def audio_pipeline(wav):
507507
# Load hyperparameters file with command-line overrides.
508508
params_file, run_opts, overrides = sb.core.parse_arguments(sys.argv[1:])
509509

510-
with open(params_file) as fin:
510+
with open(params_file, encoding="utf-8") as fin:
511511
params = load_hyperpyyaml(fin, overrides)
512512

513513
# Dataset prep (preparing metadata files)
@@ -558,7 +558,7 @@ def audio_pipeline(wav):
558558
# AMI Dev Set: Tune hyperparams on dev set.
559559
# Read the meta-data file for dev set generated during data_prep
560560
dev_meta_file = params["dev_meta_file"]
561-
with open(dev_meta_file, "r") as f:
561+
with open(dev_meta_file, "r", encoding="utf-8") as f:
562562
meta_dev = json.load(f)
563563

564564
full_meta = meta_dev
@@ -600,7 +600,7 @@ def audio_pipeline(wav):
600600
# Load 'dev' and 'eval' metadata files.
601601
full_meta_dev = full_meta # current full_meta is for 'dev'
602602
eval_meta_file = params["eval_meta_file"]
603-
with open(eval_meta_file, "r") as f:
603+
with open(eval_meta_file, "r", encoding="utf-8") as f:
604604
full_meta_eval = json.load(f)
605605

606606
# Tag to be appended to final output DER files. Writing DER for individual files.

recipes/AMI/ami_prepare.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def prepare_segs_for_RTTM(
294294
RTTM = RTTM + rttm_per_rec
295295

296296
# Write one RTTM as groundtruth. For example, "fullref_eval.rttm"
297-
with open(out_rttm_file, "w") as f:
297+
with open(out_rttm_file, "w", encoding="utf-8") as f:
298298
for item in RTTM:
299299
f.write("%s\n" % item)
300300

@@ -406,7 +406,7 @@ def prepare_metadata(
406406

407407
# Read RTTM
408408
RTTM = []
409-
with open(rttm_file, "r") as f:
409+
with open(rttm_file, "r", encoding="utf-8") as f:
410410
for line in f:
411411
entry = line[:-1]
412412
RTTM.append(entry)
@@ -438,12 +438,12 @@ def prepare_metadata(
438438
segs_file = save_dir + "/" + filename + ".segments.rttm"
439439
subsegment_file = save_dir + "/" + filename + ".subsegments.rttm"
440440

441-
with open(segs_file, "w") as f:
441+
with open(segs_file, "w", encoding="utf-8") as f:
442442
for row in MERGED_SEGMENTS:
443443
line_str = " ".join(row)
444444
f.write("%s\n" % line_str)
445445

446-
with open(subsegment_file, "w") as f:
446+
with open(subsegment_file, "w", encoding="utf-8") as f:
447447
for row in SUBSEGMENTS:
448448
line_str = " ".join(row)
449449
f.write("%s\n" % line_str)
@@ -510,7 +510,7 @@ def prepare_metadata(
510510
}
511511

512512
out_json_file = save_dir + "/" + filename + "." + mic_type + ".subsegs.json"
513-
with open(out_json_file, mode="w") as json_f:
513+
with open(out_json_file, mode="w", encoding="utf-8") as json_f:
514514
json.dump(json_dict, json_f, indent=2)
515515

516516
msg = "%s JSON prepared" % (out_json_file)

recipes/Aishell1Mix/prepare_data.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,10 @@ def create_aishell1mix2_csv(
227227
]
228228

229229
with open(
230-
savepath + "/aishell1mix2_" + set_type + ".csv", "w"
230+
savepath + "/aishell1mix2_" + set_type + ".csv",
231+
"w",
232+
newline="",
233+
encoding="utf-8",
231234
) as csvfile:
232235
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
233236
writer.writeheader()
@@ -304,7 +307,10 @@ def create_aishell1mix3_csv(
304307
]
305308

306309
with open(
307-
savepath + "/aishell1mix3_" + set_type + ".csv", "w"
310+
savepath + "/aishell1mix3_" + set_type + ".csv",
311+
"w",
312+
newline="",
313+
encoding="utf-8",
308314
) as csvfile:
309315
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
310316
writer.writeheader()

recipes/Aishell1Mix/separation/scripts/create_aishell1_metadata.py

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def create_aishell1_metadata(aishell1_dir, md_dir):
3838
with open(
3939
os.path.join(aishell1_dir, "transcript/aishell_transcript_v0.8.txt"),
4040
"r",
41+
encoding="utf-8",
4142
) as f:
4243
lines = f.readlines()
4344
for line in lines:

recipes/Aishell1Mix/separation/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ def save_results(self, test_data):
356356
test_data, **self.hparams.dataloader_opts
357357
)
358358

359-
with open(save_file, "w") as results_csv:
359+
with open(save_file, "w", newline="", encoding="utf-8") as results_csv:
360360
writer = csv.DictWriter(results_csv, fieldnames=csv_columns)
361361
writer.writeheader()
362362

@@ -561,7 +561,7 @@ def audio_pipeline_noise(noise_wav):
561561
if __name__ == "__main__":
562562
# Load hyperparameters file with command-line overrides
563563
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
564-
with open(hparams_file) as fin:
564+
with open(hparams_file, encoding="utf-8") as fin:
565565
hparams = load_hyperpyyaml(fin, overrides)
566566

567567
# Initialize ddp (useful only for multi-GPU DDP training)

recipes/AudioMNIST/audiomnist_prepare.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def read_file_list(file_name):
278278
result: lists
279279
the file list
280280
"""
281-
with open(file_name) as list_file:
281+
with open(file_name, encoding="utf-8") as list_file:
282282
return [line.strip() for line in list_file]
283283

284284

@@ -381,7 +381,7 @@ def read_meta(file_name):
381381
result: dict
382382
raw metadata
383383
"""
384-
with open(file_name) as meta_file:
384+
with open(file_name, encoding="utf-8") as meta_file:
385385
return json.load(meta_file)
386386

387387

@@ -652,7 +652,7 @@ def convert_split(
652652
metadata[item_id].update(process_meta)
653653

654654
logger.info(f"Saving metadata to {metadata_file_path}")
655-
with open(metadata_file_path, "w") as metadata_file:
655+
with open(metadata_file_path, "w", encoding="utf-8") as metadata_file:
656656
json.dump(metadata, metadata_file, indent=2)
657657

658658

@@ -805,7 +805,7 @@ def read_digit_lookup(file_name):
805805
}
806806
807807
"""
808-
with open(file_name) as lookup_file:
808+
with open(file_name, encoding="utf-8") as lookup_file:
809809
reader = csv.DictReader(lookup_file)
810810
lookup = {row["digit"]: row for row in reader}
811811
for value in lookup.values():

recipes/AudioMNIST/diffusion/train.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1598,7 +1598,7 @@ def check_tensorboard(hparams):
15981598
sb.utils.distributed.ddp_init_group(run_opts)
15991599

16001600
# Load hyperparameters file with command-line overrides.
1601-
with open(hparams_file) as fin:
1601+
with open(hparams_file, encoding="utf-8") as fin:
16021602
hparams = load_hyperpyyaml(fin, overrides)
16031603

16041604
# Check whether Tensorboard is available and enabled

recipes/BinauralWSJ0Mix/prepare_data.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,10 @@ def create_binaural_wsj0mix2_csv(
100100
]
101101

102102
with open(
103-
os.path.join(savepath, savename + set_type + ".csv"), "w"
103+
os.path.join(savepath, savename + set_type + ".csv"),
104+
"w",
105+
newline="",
106+
encoding="utf-8",
104107
) as csvfile:
105108
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
106109
writer.writeheader()
@@ -184,7 +187,10 @@ def create_binaural_wsj0mix3_csv(
184187
]
185188

186189
with open(
187-
os.path.join(savepath, savename + set_type + ".csv"), "w"
190+
os.path.join(savepath, savename + set_type + ".csv"),
191+
"w",
192+
newline="",
193+
encoding="utf-8",
188194
) as csvfile:
189195
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
190196
writer.writeheader()
@@ -279,7 +285,10 @@ def create_binaural_wsj0mix2_noise_csv(
279285
]
280286

281287
with open(
282-
os.path.join(savepath, savename + set_type + ".csv"), "w"
288+
os.path.join(savepath, savename + set_type + ".csv"),
289+
"w",
290+
newline="",
291+
encoding="utf-8",
283292
) as csvfile:
284293
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
285294
writer.writeheader()
@@ -359,7 +368,10 @@ def create_binaural_wsj0mix2_reverb_csv(
359368
]
360369

361370
with open(
362-
os.path.join(savepath, savename + set_type + ".csv"), "w"
371+
os.path.join(savepath, savename + set_type + ".csv"),
372+
"w",
373+
newline="",
374+
encoding="utf-8",
363375
) as csvfile:
364376
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
365377
writer.writeheader()

recipes/BinauralWSJ0Mix/separation/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ def save_results(self, test_data):
490490
test_data, **self.hparams.dataloader_opts
491491
)
492492

493-
with open(save_file, "w") as results_csv:
493+
with open(save_file, "w", newline="", encoding="utf-8") as results_csv:
494494
writer = csv.DictWriter(results_csv, fieldnames=csv_columns)
495495
writer.writeheader()
496496

@@ -680,7 +680,7 @@ def audio_pipeline_noise(noise_wav):
680680
if __name__ == "__main__":
681681
# Load hyperparameters file with command-line overrides
682682
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
683-
with open(hparams_file) as fin:
683+
with open(hparams_file, encoding="utf-8") as fin:
684684
hparams = load_hyperpyyaml(fin, overrides)
685685

686686
# Initialize ddp (useful only for multi-GPU DDP training)

recipes/CVSS/S2ST/train.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ def _save_progress_sample(self, epoch):
410410
)
411411

412412
sample_path = save_folder / f"{utt_id}.txt"
413-
with open(sample_path, "w") as file:
413+
with open(sample_path, "w", encoding="utf-8") as file:
414414
file.write(f"pred: {transcript}\n")
415415
file.write(f"ref: {tgt_transcript}\n")
416416

@@ -421,7 +421,7 @@ def _save_progress_sample(self, epoch):
421421
)
422422

423423
bleu_path = save_folder / "bleu.txt"
424-
with open(bleu_path, "w") as file:
424+
with open(bleu_path, "w", encoding="utf-8") as file:
425425
file.write(
426426
f"BLEU score: {round(self.bleu_metric.summarize('BLEU'), 2)}\n"
427427
)
@@ -553,7 +553,7 @@ def unit_pipeline(utt_id):
553553
# Load hyperparameters file with command-line overrides
554554
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
555555

556-
with open(hparams_file) as fin:
556+
with open(hparams_file, encoding="utf-8") as fin:
557557
hparams = load_hyperpyyaml(fin, overrides)
558558

559559
# If distributed_launch=True then

0 commit comments

Comments
 (0)