Skip to content

Commit

Permalink
Don't make ASCII normalization the default
Browse files Browse the repository at this point in the history
  • Loading branch information
clovis committed Dec 5, 2018
1 parent ce2fc2c commit 29bb691
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 2 deletions.
3 changes: 3 additions & 0 deletions config/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ language =
# Modernize language if modernization is available for your language: currently only French is supported.
modernize = yes

# Transliterate characters to their closest ASCII representation.
ascii = no

# Stem words using the Porter Stemmer
stemmer = yes

Expand Down
4 changes: 3 additions & 1 deletion lib/textpair/generate_ngrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(
minimum_word_length=2,
word_order=True,
modernize=True,
ascii=False,
pos_to_keep=[],
debug=False,
):
Expand All @@ -55,6 +56,7 @@ def __init__(
"stopwords": stopwords, # TODO: generate error if file not found
"text_object_level": text_object_level,
"pos_to_keep": set(pos_to_keep),
"ascii": ascii,
}
self.debug = debug
self.input_path = ""
Expand Down Expand Up @@ -153,7 +155,7 @@ def process_file(self, input_file):
ngram_gap=self.config["gap"],
text_object_type=self.config["text_object_level"],
min_word_length=self.config["minimum_word_length"],
ascii=True,
ascii=self.config["ascii"],
)
doc_ngrams = []
metadata = {}
Expand Down
2 changes: 1 addition & 1 deletion lib/textpair/parse_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def parse_config(textpair_config, output_path="./output", skip_web_app=False):
tei_parsing[key] = value
for key, value in dict(config["PREPROCESSING"]).items():
if value:
if key == "skipgram" or key == "numbers" or key == "word_order" or key == "modernize":
if key in ["skipgram", "numbers", "word_order", "modernize", "ascii"]:
if value.lower() == "yes" or value.lower() == "true":
value = True
else:
Expand Down

0 comments on commit 29bb691

Please sign in to comment.