# Reconstructed from a whitespace-flattened git patch touching
# liwc/.ipynb_checkpoints/trie-checkpoint.py, liwc/__init__.py and liwc/dic.py.
# Only the definitions that are fully visible in that patch are reproduced.
#
# NOTE(review): the patch also adds an `encoding="utf-8"` parameter to
# `load_token_parser` in liwc/__init__.py and forwards it to `read_dic`. The
# body of that function is cut off in the patch context, so it is not
# rewritten here.

# Encoding tried when the requested one cannot decode the file. The patch
# comments name it for Danish, Dutch, English, French, German, Italian,
# Norwegian, Portuguese and Swedish dictionaries.
_FALLBACK_ENCODING = "windows-1252"


def build_trie(lexicon):
    """
    Build a character-trie from the plain pattern_string -> categories_list
    mapping provided by `lexicon`.

    Each trie node is a dict keyed by single characters. Two reserved keys
    carry the payload (the pattern's category list):

    * `"$"` marks the end of a pattern (exact match)
    * `"*"` marks a wildcard: some LIWC patterns end with a `*`, meaning any
      token that starts with the preceding characters matches

    Returns the root node (an empty dict for an empty `lexicon`).
    """
    trie = {}
    for pattern, category_names in lexicon.items():
        cursor = trie
        for char in pattern:
            if char == "*":
                # wildcard: everything after the `*` in the pattern is ignored
                cursor["*"] = category_names
                break
            if char not in cursor:
                cursor[char] = {}
            cursor = cursor[char]
        # Runs for wildcard patterns as well (after the `break`), so the node
        # holding "*" also gets a "$" entry; `search_trie` checks "*" first.
        cursor["$"] = category_names
    return trie


def search_trie(trie, token, token_i=0):
    """
    Search the given character-trie for paths that match the `token` string.

    `token_i` is the index of the next character of `token` to consume; it is
    advanced by the recursive calls and callers normally leave it at 0.

    Returns the category list stored at the first wildcard (`"*"`) node met
    along the token's path, else the list stored at the exact-match (`"$"`)
    node reached when the whole token has been consumed, else an empty list.
    """
    # a wildcard at this node matches regardless of the remaining characters
    if "*" in trie:
        return trie["*"]
    # an exact match requires that every character has been consumed
    if "$" in trie and token_i == len(token):
        return trie["$"]
    if token_i < len(token):
        char = token[token_i]
        if char in trie:
            return search_trie(trie[char], token, token_i + 1)
    return []


def _read_dic_file(filepath, encoding):
    """Parse the .dic file at `filepath`, decoding it with `encoding` only."""
    with open(filepath, encoding=encoding) as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # `_parse_categories` and `_parse_lexicon` are defined in liwc/dic.py,
        # outside the part of the patch visible here. Both read from `lines`,
        # so their results are materialised while the file is still open.
        # read categories (a mapping from integer string to category name)
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    return lexicon, list(category_mapping.values())


def read_dic(filepath, encoding="utf-8"):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)

    The file is decoded with `encoding` first (for example "EUC-JP" for a
    Japanese dictionary). If that raises `UnicodeDecodeError`, the file is
    re-read from the start as windows-1252.

    Raises `UnicodeDecodeError` if the fallback cannot decode the file either;
    errors from `open` (such as a missing file) propagate unchanged.
    """
    try:
        return _read_dic_file(filepath, encoding)
    except UnicodeDecodeError:
        # Decoding can fail part-way through the file, so the fallback parses
        # again from scratch instead of resuming.
        return _read_dic_file(filepath, _FALLBACK_ENCODING)