Skip to content

encoding for #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions liwc/.ipynb_checkpoints/trie-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
def build_trie(lexicon):
    """
    Build a character-trie from the plain pattern_string -> categories_list
    mapping provided by `lexicon`.

    Each trie node is a dict keyed by single characters, whose values are
    child nodes. Two sentinel keys hold category lists instead of nodes:

    * `"$"` marks the end of a pattern: the categories for a token that
      ends exactly at this node.
    * `"*"` marks a wildcard: some LIWC patterns end with a `*`, and any
      token that reaches this node matches, whatever follows.

    Parameters:
        lexicon: dict mapping pattern strings to lists of category names.

    Returns:
        The root node (a dict) of the trie; an empty dict if `lexicon` is
        empty.
    """
    trie = {}
    for pattern, category_names in lexicon.items():
        cursor = trie
        for char in pattern:
            if char == "*":
                # Wildcard: everything after the `*` in the pattern is ignored.
                cursor["*"] = category_names
                break
            # Descend into the child for this character, creating it if needed.
            cursor = cursor.setdefault(char, {})
        # Runs once per pattern, after the walk (also after a wildcard break),
        # so the node where the walk stopped always carries a terminal entry.
        cursor["$"] = category_names
    return trie


def search_trie(trie, token, token_i=0):
    """
    Search the given character-trie for paths that match the `token` string.

    Starting at node `trie` and character index `token_i`, walk one node per
    character. At every node visited:

    * if the node has a `"*"` (wildcard) entry, its categories are returned
      immediately, regardless of the rest of the token;
    * otherwise, if the whole token has been consumed and the node has a
      `"$"` (end-of-pattern) entry, its categories are returned.

    Parameters:
        trie: trie node (dict) to start from, normally the root.
        token: the string to look up.
        token_i: index of the next character of `token` to consume.

    Returns:
        The list of category names stored for the matching pattern, or a
        new empty list if nothing matches.
    """
    node = trie
    position = token_i
    token_length = len(token)
    while True:
        # A wildcard wins over everything else at this node.
        if "*" in node:
            return node["*"]
        if position == token_length and "$" in node:
            return node["$"]
        if position >= token_length:
            return []
        child = node.get(token[position])
        # Only descend into real child nodes. A token character that equals a
        # sentinel key ("$" or "*") would otherwise select a category list.
        if not isinstance(child, dict):
            return []
        node = child
        position += 1
8 changes: 6 additions & 2 deletions liwc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,20 @@
__version__ = None


def load_token_parser(filepath):
def load_token_parser(filepath, encoding = "utf-8"):
"""
Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
(parse, category_names), where:
* `parse` is a function from a token to a list of strings (potentially
empty) of matching categories
* `category_names` is a list of strings representing all LIWC categories in
the lexicon
* `encoding = "utf-8"` can be overridden with another encoding, such as "EUC-JP" for Japanese.
* `load_token_parser()` can now read multiple dictionaries from the distributor, such as Dutch_LIWC2015_Dictionary,
German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English and
Spanish_LIWC2007_Dictionary, as well as a Swedish dictionary supplied by the user.
"""
lexicon, category_names = read_dic(filepath)
lexicon, category_names = read_dic(filepath, encoding = encoding)
trie = build_trie(lexicon)

def parse_token(token):
Expand Down
37 changes: 27 additions & 10 deletions liwc/dic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,37 @@ def _parse_lexicon(lines, category_mapping):
yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]


def read_dic(filepath, encoding="utf-8"):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)

    The file is first decoded with `encoding`. If that raises
    UnicodeDecodeError, the whole file is re-read as windows-1252, the
    encoding used for Western European languages (Danish, Dutch, English,
    French, German, Italian, Norwegian, Portuguese, Swedish).

    Raises:
        UnicodeDecodeError: if the file cannot be decoded with `encoding`
            nor with the windows-1252 fallback.
    """

    def _load(file_encoding):
        # Parse the whole file with one encoding. Any decode error aborts the
        # attempt, so a partially parsed lexicon is never returned.
        with open(filepath, encoding=file_encoding) as lines:
            # read up to first "%" (should be very first line of file)
            for line in lines:
                if line.strip() == "%":
                    break
            # read categories (a mapping from integer string to category name)
            category_mapping = dict(_parse_categories(lines))
            # read lexicon (a mapping from matching string to a list of category names)
            lexicon = dict(_parse_lexicon(lines, category_mapping))
        return lexicon, list(category_mapping.values())

    try:
        return _load(encoding)
    except UnicodeDecodeError:
        # A failure here propagates to the caller instead of being printed
        # and swallowed, which would silently return None.
        return _load("windows-1252")