chbrown · davidycliao · Nov 29, 2021 · Nov 29, 2021 · Nov 29, 2021 · Nov 30, 2021
diff --git a/liwc/.ipynb_checkpoints/trie-checkpoint.py b/liwc/.ipynb_checkpoints/trie-checkpoint.py
@@ -0,0 +1,34 @@
+def build_trie(lexicon):
+    """
+    Turn the `lexicon` mapping (pattern string -> list of category names)
+    into a nested-dict character-trie.
+
+    Only the characters before a pattern's first `*` form its path. At the
+    end of that path the categories are stored under "*" when the pattern
+    has a wildcard, and under "$" in every case.
+    """
+    root = {}
+    for pattern, category_names in lexicon.items():
+        prefix, wildcard, _ = pattern.partition("*")
+        node = root
+        for char in prefix:
+            node = node.setdefault(char, {})
+        if wildcard:
+            node["*"] = category_names
+        node["$"] = category_names
+    return root
+
+
+def search_trie(trie, token, token_i=0):
+    """
+    Follow `token` from index `token_i` down the trie and return the matched
+    categories: a "*" entry on the path, or "$" at token end; otherwise [].
+    """
+    node, position = trie, token_i
+    while "*" not in node:
+        if "$" in node and position == len(token):
+            return node["$"]
+        if position >= len(token) or token[position] not in node:
+            return []
+        node, position = node[token[position]], position + 1
+    return node["*"]
diff --git a/liwc/__init__.py b/liwc/__init__.py
@@ -9,16 +9,20 @@
     __version__ = None
 
 
-def load_token_parser(filepath):
+def load_token_parser(filepath, encoding = "utf-8"):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (parse, category_names), where:
     * `parse` is a function from a token to a list of strings (potentially
       empty) of matching categories
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
+    * `encoding = "utf-8"` can be overridden with another encoding, such as "EUC-JP" for Japanese.
+    * `load_token_parser()` can now read multiple dictionaries from the distributor, such as Dutch_LIWC2015_Dictionary,
+    German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English,
+    Spanish_LIWC2007_Dictionary, as well as a Swedish dictionary provided by the user.
     """
-    lexicon, category_names = read_dic(filepath)
+    lexicon, category_names = read_dic(filepath, encoding = encoding)
     trie = build_trie(lexicon)
 
     def parse_token(token):

diff --git a/liwc/dic.py b/liwc/dic.py
@@ -26,20 +26,37 @@ def _parse_lexicon(lines, category_mapping):
         yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]
 
 
-def read_dic(filepath):
+def read_dic(filepath, encoding="utf-8"):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (lexicon, category_names), where:
     * `lexicon` is a dict mapping string patterns to lists of category names
     * `category_names` is a list of category names (as strings)
+
+    The file is decoded with `encoding` (default "utf-8"). If that raises
+    UnicodeDecodeError, it is read again as "windows-1252", the fallback
+    intended for dictionaries in Western European languages.
+
+    Raises UnicodeDecodeError if the file can be decoded with neither
+    encoding; other errors from opening or parsing the file propagate
+    unchanged.
     """
-    with open(filepath) as lines:
-        # read up to first "%" (should be very first line of file)
-        for line in lines:
-            if line.strip() == "%":
-                break
-        # read categories (a mapping from integer string to category name)
-        category_mapping = dict(_parse_categories(lines))
-        # read lexicon (a mapping from matching string to a list of category names)
-        lexicon = dict(_parse_lexicon(lines, category_mapping))
-    return lexicon, list(category_mapping.values())
+    decode_error = None
+    # try the caller's encoding first, then the windows-1252 fallback
+    for candidate in (encoding, "windows-1252"):
+        try:
+            with open(filepath, encoding=candidate) as lines:
+                # read up to first "%" (should be very first line of file)
+                for line in lines:
+                    if line.strip() == "%":
+                        break
+                # read categories (a mapping from integer string to category name)
+                category_mapping = dict(_parse_categories(lines))
+                # read lexicon (a mapping from matching string to a list of category names)
+                lexicon = dict(_parse_lexicon(lines, category_mapping))
+            return lexicon, list(category_mapping.values())
+        except UnicodeDecodeError as error:
+            # remember the failure and move on to the next candidate encoding
+            decode_error = error
+    raise decode_error
+