Skip to content

Commit 29bb691

Browse files
committed
Don't make ASCII normalization the default
1 parent ce2fc2c commit 29bb691

File tree

3 files changed

+7
-2
lines changed

3 files changed

+7
-2
lines changed

config/config.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ language =
4848
# Modernize language if modernization is available for your language: currently only French is supported.
4949
modernize = yes
5050

51+
# Transliterate characters to their closest ASCII representation.
52+
ascii = no
53+
5154
# Stem words using the Porter Stemmer
5255
stemmer = yes
5356

lib/textpair/generate_ngrams.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
minimum_word_length=2,
3838
word_order=True,
3939
modernize=True,
40+
ascii=False,
4041
pos_to_keep=[],
4142
debug=False,
4243
):
@@ -55,6 +56,7 @@ def __init__(
5556
"stopwords": stopwords, # TODO: generate error if file not found
5657
"text_object_level": text_object_level,
5758
"pos_to_keep": set(pos_to_keep),
59+
"ascii": ascii,
5860
}
5961
self.debug = debug
6062
self.input_path = ""
@@ -153,7 +155,7 @@ def process_file(self, input_file):
153155
ngram_gap=self.config["gap"],
154156
text_object_type=self.config["text_object_level"],
155157
min_word_length=self.config["minimum_word_length"],
156-
ascii=True,
158+
ascii=self.config["ascii"],
157159
)
158160
doc_ngrams = []
159161
metadata = {}

lib/textpair/parse_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def parse_config(textpair_config, output_path="./output", skip_web_app=False):
2828
tei_parsing[key] = value
2929
for key, value in dict(config["PREPROCESSING"]).items():
3030
if value:
31-
if key == "skipgram" or key == "numbers" or key == "word_order" or key == "modernize":
31+
if key in ["skipgram", "numbers", "word_order", "modernize", "ascii"]:
3232
if value.lower() == "yes" or value.lower() == "true":
3333
value = True
3434
else:

0 commit comments

Comments
 (0)