From e8f81243d13795630326c9d5793d34c898b293f2 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 19:59:04 -0400 Subject: [PATCH 01/19] Initial commit. --- .gitignore | 4 + .pre-commit-config.yaml | 6 + .style.yapf | 13 + .travis.yml | 13 + .yapfignore | 1 + MANIFEST.in | 3 + README.md | 66 +- convert_to_readable.py | 33 - format-yapf-changed.sh | 11 + format-yapf.sh | 4 + init_virtualenv.sh | 17 + pep8.sh | 2 + publish.sh | 5 + punctuator.py | 176 ---- punctuator/__init__.py | 6 + punctuator/convert_to_readable.py | 42 + data.py => punctuator/data.py | 75 +- .../demo_play_with_model.py | 34 +- .../error_calculator.py | 53 +- main.py => punctuator/main.py | 105 ++- main2.py => punctuator/main2.py | 62 +- models.py => punctuator/models.py | 134 ++-- .../play_with_model.py | 20 +- punctuator/punc.py | 269 +++++++ punctuator/tests.py | 80 ++ pylint.messages | 750 ++++++++++++++++++ pylint.rc | 355 +++++++++ requirements-test.txt | 8 + requirements.txt | 4 + setup.py | 58 ++ test.sh | 5 + tox.ini | 13 + 32 files changed, 1955 insertions(+), 472 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 .style.yapf create mode 100644 .travis.yml create mode 100644 .yapfignore create mode 100644 MANIFEST.in delete mode 100644 convert_to_readable.py create mode 100755 format-yapf-changed.sh create mode 100755 format-yapf.sh create mode 100755 init_virtualenv.sh create mode 100755 pep8.sh create mode 100755 publish.sh delete mode 100644 punctuator.py create mode 100644 punctuator/__init__.py create mode 100644 punctuator/convert_to_readable.py rename data.py => punctuator/data.py (83%) rename demo_play_with_model.py => punctuator/demo_play_with_model.py (79%) rename error_calculator.py => punctuator/error_calculator.py (72%) rename main.py => punctuator/main.py (71%) rename main2.py => punctuator/main2.py (79%) rename models.py => punctuator/models.py (74%) rename play_with_model.py => punctuator/play_with_model.py (88%) create mode 100644 
punctuator/punc.py create mode 100644 punctuator/tests.py create mode 100644 pylint.messages create mode 100644 pylint.rc create mode 100644 requirements-test.txt create mode 100644 requirements.txt create mode 100644 setup.py create mode 100755 test.sh create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index 1dbc687..151848d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,7 @@ target/ #Ipython Notebook .ipynb_checkpoints +/.env* +*.out +*.geany +/data diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..6634d8c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.28.0 + hooks: + - id: yapf + args: [--in-place, --parallel, --recursive] diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000..e5132c7 --- /dev/null +++ b/.style.yapf @@ -0,0 +1,13 @@ +[style] +BASED_ON_STYLE = pep8 +COLUMN_LIMIT = 160 +COALESCE_BRACKETS = true +DEDENT_CLOSING_BRACKETS = true +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPACES_BEFORE_COMMENT = 1 +SPLIT_COMPLEX_COMPREHENSION = true +SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false +SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 10 +CONTINUATION_INDENT_WIDTH = 4 +INDENT_WIDTH = 4 +CONTINUATION_ALIGN_STYLE = SPACE diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..290559e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,13 @@ +dist: xenial +sudo: required +language: python +python: +- "3.5" +install: +- sudo add-apt-repository -y ppa:deadsnakes/ppa +- sudo apt-get -yq update +- sudo apt-get -yq install python3.6 python3.6-dev python3.7 python3.7-dev +- pip install -r requirements-test.txt +script: +- ./pep8.sh +- tox diff --git a/.yapfignore b/.yapfignore new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.yapfignore @@ -0,0 +1 @@ + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..93c438d --- /dev/null +++ b/MANIFEST.in @@ -0,0 
+1,3 @@ +include README.md +include requirements.txt +include requirements-test.txt diff --git a/README.md b/README.md index fe4f22a..de9c9b2 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,13 @@ -**[DEMO](http://bark.phon.ioc.ee/punctuator)** and **[DEMO2](http://bark.phon.ioc.ee/punctuator/game)** - # Punctuator +[![](https://img.shields.io/pypi/v/punctuator.svg)](https://pypi.python.org/pypi/punctuator) +[![Build Status](https://img.shields.io/travis/chrisspen/punctuator.svg?branch=master)](https://travis-ci.org/chrisspen/punctuator) +[![](https://pyup.io/repos/github/chrisspen/punctuator/shield.svg)](https://pyup.io/repos/github/chrisspen/punctuator) + +This is a fork of [Ottokar Tilk's punctuator2](https://github.com/ottokart/punctuator2) cleaned up into a formal Python3 package with testing. + +**[DEMO](http://bark.phon.ioc.ee/punctuator)** and **[DEMO2](http://bark.phon.ioc.ee/punctuator/game)** + A bidirectional recurrent neural network model with attention mechanism for restoring missing inter-word punctuation in unsegmented text. The model can be trained in two stages (second stage is optional): @@ -9,6 +15,39 @@ The model can be trained in two stages (second stage is optional): 1. First stage is trained on punctuation annotated text. Here the model learns to restore puncutation based on textual features only. 2. Optional second stage can be trained on punctuation *and* pause annotated text. In this stage the model learns to combine pause durations with textual features and adapts to the target domain. If pauses are omitted then only adaptation is performed. Second stage with pause durations can be used for example for restoring punctuation in automatic speech recognition system output. +# Installation + +To install: + + virtualenv -p python3.7 .env + . .env/bin/activate + pip install punctuator + +Additionally, you'll need a trained model. 
You can create your own following the instructions below, or you can use a pre-trained model from [here](https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms?usp=sharing). + +Place these models in `PUNCTUATOR_DATA_DIR` directory, which defaults to `~/.punctuator`. + +For example, to download `Demo-Europarl-EN.pcl`, activate your virtual environment and run: + + . .env/bin/activate + mkdir -p ~/.punctuator + cd ~/.punctuator + gdown https://drive.google.com/uc?id=0B7BsN5f2F1fZd1Q0aXlrUDhDbnM + +To download other model files, find the Google Drive id via the share link, and substitute that in the command above. + +# Usage + +To use from the command line: + + cat input.txt | python punctuator.py model.pcl output.txt + +To use from Python: + + from punctuator import Punctuator + p = Punctuator('model.pcl') + print(p.punctuate('some text')) + # How well does it work? * A working demo can be seen here: http://bark.phon.ioc.ee/punctuator @@ -64,7 +103,7 @@ _Overall_ | _75.7_ | _73.9_ | _74.8_ ```to be ,COMMA or not to be ,COMMA that is the question .PERIOD``` Second phase data can also be without pause annotations to do just target domain adaptation. - + Make sure that first words of sentences don't have capitalized first letters. This would give the model unfair hints about period locations. Also, the text files you use for training and validation must be large enough (at least minibatch_size x sequence_length of words, which is 128x50=6400 words with default settings), otherwise you might get an error. # Configuration @@ -123,23 +162,12 @@ or with: if you want to see, which words the model sees as UNKs (OOVs). 
+# Development -# Citing - -The software is described in: - - @inproceedings{tilk2016, - author = {Ottokar Tilk and Tanel Alum{\"a}e}, - title = {Bidirectional Recurrent Neural Network with Attention Mechanism for Punctuation Restoration}, - booktitle = {Interspeech 2016}, - year = {2016} - } +Run all tests with: -We used the [release v1.0](https://github.com/ottokart/punctuator2/releases/tag/v1.0) in the paper. + export TESTNAME=; tox -# Alternatives +Run a specific test in a specific environment with: -* A fork from this repository that uses additional prosodic features: https://github.com/alpoktem/punkProse -* Convolutional neural network with slightly smaller accuracy but much higher speed (50x faster): https://github.com/vackosar/keras-punctuator (additional details here: https://github.com/ottokart/punctuator2/issues/14) -* A general sequence labeling model: https://github.com/marekrei/sequence-labeler that can be used for punctuation restoration with small modifications (example here: https://github.com/ottokart/sequence-labeler). Punctuator2 can be probably used for other sequence labeling problems as well. 
-* Our previous approach with unidirectional LSTM (less accurate, but useful if you don't want to use Theano): https://github.com/ottokart/punctuator + export TESTNAME=.test_punctuate; tox -e py37 diff --git a/convert_to_readable.py b/convert_to_readable.py deleted file mode 100644 index c563d7f..0000000 --- a/convert_to_readable.py +++ /dev/null @@ -1,33 +0,0 @@ -import sys -from io import open -from data import EOS_TOKENS, PUNCTUATION_VOCABULARY - -if __name__ == "__main__": - - if len(sys.argv) > 1: - input_file = sys.argv[1] - else: - sys.exit("Input file path argument missing") - - if len(sys.argv) > 2: - output_file = sys.argv[2] - else: - sys.exit("Output file path argument missing") - - with_newlines = len(sys.argv) > 3 and bool(int(sys.argv[3])) - - with open(input_file, 'r', encoding='utf-8') as in_f, open(output_file, 'w', encoding='utf-8') as out_f: - last_was_eos = True - first = True - for token in in_f.read().split(): - if token in PUNCTUATION_VOCABULARY: - out_f.write(token[:1]) - else: - out_f.write(('' if first else ' ') + (token.title() if last_was_eos else token)) - - last_was_eos = token in EOS_TOKENS - if with_newlines and last_was_eos: - out_f.write('\n') - first = True - else: - first = False diff --git a/format-yapf-changed.sh b/format-yapf-changed.sh new file mode 100755 index 0000000..35aab0b --- /dev/null +++ b/format-yapf-changed.sh @@ -0,0 +1,11 @@ +#!/bin/bash +FILES=`git status --porcelain | grep -E "*\.py$" | grep -v migration | grep -v "^D " | grep -v "^ D " | grep -v "^R " | awk '{print $2}'` +VENV=${VENV:-.env} +$VENV/bin/yapf --version +if [ -z "$FILES" ] +then + echo "No Python changes detected." +else + echo "Checking: $FILES" + $VENV/bin/yapf --in-place --recursive $FILES +fi diff --git a/format-yapf.sh b/format-yapf.sh new file mode 100755 index 0000000..9d9b4d2 --- /dev/null +++ b/format-yapf.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# Note, this should be used rarely, and instead the pre-commit hook relied upon. 
+yapf --in-place --recursive punctuator +yapf --in-place --recursive setup.py diff --git a/init_virtualenv.sh b/init_virtualenv.sh new file mode 100755 index 0000000..bfcedd9 --- /dev/null +++ b/init_virtualenv.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +cd "$(dirname "$0")" + +CACHE_DIR=/tmp/pip +REL_DIR=./ + +# Remove existing virtualenv if it exists. +[ -d $REL_DIR.env ] && rm -Rf $REL_DIR.env + +# Create virtual environment with Python 3.7 (requires python3-venv package on Ubuntu) +python3.7 -m venv $REL_DIR.env +. $REL_DIR.env/bin/activate +pip install -U pip + +pip install --cache-dir $CACHE_DIR -r requirements.txt requirements-test.txt diff --git a/pep8.sh b/pep8.sh new file mode 100755 index 0000000..be0ee1b --- /dev/null +++ b/pep8.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pylint --rcfile=pylint.rc punctuator setup.py diff --git a/publish.sh b/publish.sh new file mode 100755 index 0000000..53736c9 --- /dev/null +++ b/publish.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e +. .env/bin/activate +python setup.py sdist +twine upload dist/* diff --git a/punctuator.py b/punctuator.py deleted file mode 100644 index 5675416..0000000 --- a/punctuator.py +++ /dev/null @@ -1,176 +0,0 @@ -# coding: utf-8 -from __future__ import division, print_function - -import models -import data - -import theano -import sys -from io import open - -import theano.tensor as T -import numpy as np - -MAX_SUBSEQUENCE_LEN = 200 - -def to_array(arr, dtype=np.int32): - # minibatch of 1 sequence as column - return np.array([arr], dtype=dtype).T - -def convert_punctuation_to_readable(punct_token): - if punct_token == data.SPACE: - return " " - else: - return punct_token[0] - -def restore_with_pauses(output_file, text, pauses, word_vocabulary, reverse_punctuation_vocabulary, predict_function): - i = 0 - with open(output_file, 'w', encoding='utf-8') as f_out: - while True: - - subsequence = text[i:i+MAX_SUBSEQUENCE_LEN] - subsequence_pauses = pauses[i:i+MAX_SUBSEQUENCE_LEN] - - if len(subsequence) == 0: - break 
- - converted_subsequence = [word_vocabulary.get(w, word_vocabulary[data.UNK]) for w in subsequence] - - y = predict_function(to_array(converted_subsequence), to_array(subsequence_pauses, dtype=theano.config.floatX)) - - f_out.write(subsequence[0]) - - last_eos_idx = 0 - punctuations = [] - for y_t in y: - - p_i = np.argmax(y_t.flatten()) - punctuation = reverse_punctuation_vocabulary[p_i] - - punctuations.append(punctuation) - - if punctuation in data.EOS_TOKENS: - last_eos_idx = len(punctuations) # we intentionally want the index of next element - - if subsequence[-1] == data.END: - step = len(subsequence) - 1 - elif last_eos_idx != 0: - step = last_eos_idx - else: - step = len(subsequence) - 1 - - for j in range(step): - f_out.write(" " + punctuations[j] + " " if punctuations[j] != data.SPACE else " ") - if j < step - 1: - f_out.write(subsequence[1+j]) - - if subsequence[-1] == data.END: - break - - i += step - -def restore(output_file, text, word_vocabulary, reverse_punctuation_vocabulary, predict_function): - i = 0 - with open(output_file, 'w', encoding='utf-8') as f_out: - while True: - - subsequence = text[i:i+MAX_SUBSEQUENCE_LEN] - - if len(subsequence) == 0: - break - - converted_subsequence = [word_vocabulary.get(w, word_vocabulary[data.UNK]) for w in subsequence] - - y = predict_function(to_array(converted_subsequence)) - - f_out.write(subsequence[0]) - - last_eos_idx = 0 - punctuations = [] - for y_t in y: - - p_i = np.argmax(y_t.flatten()) - punctuation = reverse_punctuation_vocabulary[p_i] - - punctuations.append(punctuation) - - if punctuation in data.EOS_TOKENS: - last_eos_idx = len(punctuations) # we intentionally want the index of next element - - if subsequence[-1] == data.END: - step = len(subsequence) - 1 - elif last_eos_idx != 0: - step = last_eos_idx - else: - step = len(subsequence) - 1 - - for j in range(step): - f_out.write(" " + punctuations[j] + " " if punctuations[j] != data.SPACE else " ") - if j < step - 1: - 
f_out.write(subsequence[1+j]) - - if subsequence[-1] == data.END: - break - - i += step - -if __name__ == "__main__": - - if len(sys.argv) > 1: - model_file = sys.argv[1] - else: - sys.exit("Model file path argument missing") - - if len(sys.argv) > 2: - output_file = sys.argv[2] - else: - sys.exit("Output file path argument missing") - - use_pauses = len(sys.argv) > 3 and bool(int(sys.argv[3])) - - x = T.imatrix('x') - - if use_pauses: - - p = T.matrix('p') - - print("Loading model parameters...") - net, _ = models.load(model_file, 1, x, p) - - print("Building model...") - predict = theano.function( - inputs=[x, p], - outputs=net.y - ) - - else: - - print("Loading model parameters...") - net, _ = models.load(model_file, 1, x) - - print("Building model...") - predict = theano.function( - inputs=[x], - outputs=net.y - ) - - word_vocabulary = net.x_vocabulary - punctuation_vocabulary = net.y_vocabulary - - reverse_word_vocabulary = {v:k for k,v in word_vocabulary.items()} - reverse_punctuation_vocabulary = {v:k for k,v in punctuation_vocabulary.items()} - - input_text = open(sys.stdin.fileno(), 'r', encoding='utf-8').read() - - if len(input_text) == 0: - sys.exit("Input text from stdin missing.") - - text = [w for w in input_text.split() if w not in punctuation_vocabulary and w not in data.PUNCTUATION_MAPPING and not w.startswith(data.PAUSE_PREFIX)] + [data.END] - pauses = [float(s.replace(data.PAUSE_PREFIX,"").replace(">","")) for s in input_text.split() if s.startswith(data.PAUSE_PREFIX)] - - if not use_pauses: - restore(output_file, text, word_vocabulary, reverse_punctuation_vocabulary, predict) - else: - if not pauses: - pauses = [0.0 for _ in range(len(text)-1)] - restore_with_pauses(output_file, text, pauses, word_vocabulary, reverse_punctuation_vocabulary, predict) diff --git a/punctuator/__init__.py b/punctuator/__init__.py new file mode 100644 index 0000000..66596da --- /dev/null +++ b/punctuator/__init__.py @@ -0,0 +1,6 @@ +VERSION = (0, 9, 1) +__version__ = 
'.'.join(map(str, VERSION)) +try: + from punc import Punctuator +except ImportError: + pass diff --git a/punctuator/convert_to_readable.py b/punctuator/convert_to_readable.py new file mode 100644 index 0000000..ad09c20 --- /dev/null +++ b/punctuator/convert_to_readable.py @@ -0,0 +1,42 @@ +import sys +from io import open +from data import EOS_TOKENS, PUNCTUATION_VOCABULARY + + +def convert(input_text, out_f, with_newlines=False): + """ + Translates punctuation tokens to normal punctuation. + """ + last_was_eos = True + first = True + for token in input_text.split(): + if token in PUNCTUATION_VOCABULARY: + out_f.write(token[:1]) + else: + out_f.write(('' if first else ' ') + (token.title() if last_was_eos else token)) + + last_was_eos = token in EOS_TOKENS + if with_newlines and last_was_eos: + out_f.write('\n') + first = True + else: + first = False + + +if __name__ == "__main__": + + if len(sys.argv) > 1: + input_file = sys.argv[1] + else: + sys.exit("Input file path argument missing") + + if len(sys.argv) > 2: + output_file = sys.argv[2] + else: + sys.exit("Output file path argument missing") + + with_newlines = len(sys.argv) > 3 and bool(int(sys.argv[3])) + + with open(input_file, 'r', encoding='utf-8') as in_f, open(output_file, 'w', encoding='utf-8') as out_f: + input_text = in_f.read() + convert(input_text, out_f, with_newlines=with_newlines) diff --git a/data.py b/punctuator/data.py similarity index 83% rename from data.py rename to punctuator/data.py index 9a5bb4a..348e040 100644 --- a/data.py +++ b/punctuator/data.py @@ -1,18 +1,11 @@ # coding: utf-8 from __future__ import division, print_function -import random +# import random import os import sys import operator -try: - import cPickle -except ImportError: - import _pickle as cPickle -try: - input = raw_input -except NameError: - pass +import _pickle as cPickle from io import open import fnmatch import shutil @@ -29,6 +22,8 @@ UNK = "" SPACE = "_SPACE" +PERIOD = ".PERIOD" +COMMA = ",COMMA" 
MAX_WORD_VOCABULARY_SIZE = 100000 MIN_WORD_COUNT_IN_VOCAB = 2 @@ -45,7 +40,7 @@ WORD_VOCAB_FILE = os.path.join(DATA_PATH, "vocabulary") -PUNCTUATION_VOCABULARY = [SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"] +PUNCTUATION_VOCABULARY = [SPACE, COMMA, PERIOD, "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"] PUNCTUATION_MAPPING = {} # Comma, period & question mark only: @@ -56,27 +51,34 @@ CRAP_TOKENS = {"", ""} # punctuations that are not included in vocabulary nor mapping, must be added to CRAP_TOKENS PAUSE_PREFIX = "= MIN_WORD_COUNT_IN_VOCAB and wc[0] != UNK][:MAX_WORD_VOCABULARY_SIZE] # Unk will be appended to end + return [wc[0] + for wc in reversed(sorted(word_counts.items(), key=operator.itemgetter(1))) + if wc[1] >= MIN_WORD_COUNT_IN_VOCAB and wc[0] != UNK][:MAX_WORD_VOCABULARY_SIZE] # Unk will be appended to end + def write_vocabulary(vocabulary, file_name): if END not in vocabulary: @@ -89,16 +91,20 @@ def write_vocabulary(vocabulary, file_name): with open(file_name, 'w', encoding='utf-8') as f: f.write("\n".join(vocabulary)) + def iterable_to_dict(arr): return dict((x.strip(), i) for (i, x) in enumerate(arr)) + def read_vocabulary(file_name): with open(file_name, 'r', encoding='utf-8') as f: return iterable_to_dict(f.readlines()) + def write_processed_dataset(input_files, output_file): """ - data will consist of two sets of aligned subsequences (words and punctuations) of MAX_SEQUENCE_LEN tokens (actually punctuation sequence will be 1 element shorter). + data will consist of two sets of aligned subsequences (words and punctuations) of MAX_SEQUENCE_LEN tokens + (actually punctuation sequence will be 1 element shorter). 
If a sentence is cut, then it will be added to next subsequence entirely (words before the cut belong to both sequences) """ @@ -129,25 +135,24 @@ def write_processed_dataset(input_files, output_file): for token in line.split(): # First map oov punctuations to known punctuations - if token in PUNCTUATION_MAPPING: - token = PUNCTUATION_MAPPING[token] + token = PUNCTUATION_MAPPING.get(token, token) if skip_until_eos: - if token in EOS_TOKENS: skip_until_eos = False - continue - elif token in CRAP_TOKENS: + if token in CRAP_TOKENS: continue - elif token.startswith(PAUSE_PREFIX): - last_pause = float(token.replace(PAUSE_PREFIX,"").replace(">","")) + if token.startswith(PAUSE_PREFIX): + last_pause = float(token.replace(PAUSE_PREFIX, "").replace(">", "")) elif token in punctuation_vocabulary: - if last_token_was_punctuation: # if we encounter sequences like: "... !EXLAMATIONMARK .PERIOD ...", then we only use the first punctuation and skip the ones that follow + # if we encounter sequences like: "... 
!EXLAMATIONMARK .PERIOD ...", + # then we only use the first punctuation and skip the ones that follow + if last_token_was_punctuation: continue if token in EOS_TOKENS: @@ -173,12 +178,14 @@ def write_processed_dataset(input_files, output_file): num_unks += int(word == word_vocabulary[UNK]) if len(current_words) == MAX_SEQUENCE_LEN: # this also means, that last token was a word - - assert len(current_words) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations)) - assert current_pauses == [] or len(current_words) == len(current_pauses), "#words: %d; #pauses: %d" % (len(current_words), len(current_pauses)) + + assert len(current_words + ) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations)) + assert current_pauses == [] or len(current_words) == len(current_pauses + ), "#words: %d; #pauses: %d" % (len(current_words), len(current_pauses)) # Sentence did not fit into subsequence - skip it - if last_eos_idx == 0: + if last_eos_idx == 0: skip_until_eos = True current_words = [] @@ -188,18 +195,14 @@ def write_processed_dataset(input_files, output_file): last_token_was_punctuation = True # next sequence starts with a new sentence, so is preceded by eos which is punctuation else: - subsequence = [ - current_words[:-1] + [word_vocabulary[END]], - current_punctuations, - current_pauses[1:] - ] + subsequence = [current_words[:-1] + [word_vocabulary[END]], current_punctuations, current_pauses[1:]] data.append(subsequence) # Carry unfinished sentence to next subsequence - current_words = current_words[last_eos_idx+1:] - current_punctuations = current_punctuations[last_eos_idx+1:] - current_pauses = current_pauses[last_eos_idx+1:] + current_words = current_words[last_eos_idx + 1:] + current_punctuations = current_punctuations[last_eos_idx + 1:] + current_pauses = current_pauses[last_eos_idx + 1:] last_eos_idx = 0 # sequence always starts with a new sentence 
@@ -207,6 +210,7 @@ def write_processed_dataset(input_files, output_file): dump(data, output_file) + def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, train_output, dev_output, test_output, pretrained_embeddings_path=None): train_txt_files = [] @@ -215,7 +219,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra if create_vocabulary and not pretrained_embeddings_path: word_counts = dict() - + for root, dirnames, filenames in os.walk(root_path): for filename in fnmatch.filter(filenames, '*.txt'): @@ -257,6 +261,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra write_processed_dataset(dev_txt_files, dev_output) write_processed_dataset(test_txt_files, test_output) + if __name__ == "__main__": if len(sys.argv) > 1: @@ -282,7 +287,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra shutil.rmtree(DATA_PATH) os.makedirs(DATA_PATH) - + create_dev_test_train_split_and_vocabulary(path, True, TRAIN_FILE, DEV_FILE, TEST_FILE, PRETRAINED_EMBEDDINGS_PATH) # Stage 2 diff --git a/demo_play_with_model.py b/punctuator/demo_play_with_model.py similarity index 79% rename from demo_play_with_model.py rename to punctuator/demo_play_with_model.py index 5a3046f..fbc11e2 100644 --- a/demo_play_with_model.py +++ b/punctuator/demo_play_with_model.py @@ -15,24 +15,28 @@ import theano.tensor as T import numpy as np +# pylint: disable=redefined-outer-name + numbers = re.compile(r'\d') is_number = lambda x: len(numbers.sub('', x)) / len(x) < 0.6 + def to_array(arr, dtype=np.int32): # minibatch of 1 sequence as column return np.array([arr], dtype=dtype).T + def convert_punctuation_to_readable(punct_token): if punct_token == data.SPACE: return ' ' - elif punct_token.startswith('-'): + if punct_token.startswith('-'): return ' ' + punct_token[0] + ' ' - else: - return punct_token[0] + ' ' + return punct_token[0] + ' ' + def punctuate(predict, word_vocabulary, 
punctuation_vocabulary, reverse_punctuation_vocabulary, reverse_word_vocabulary, words, f_out, show_unk): - if len(words) == 0: + if not words: sys.exit("Input text from stdin missing.") if words[-1] != data.END: @@ -42,15 +46,12 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat while True: - subsequence = words[i:i+data.MAX_SEQUENCE_LEN] + subsequence = words[i:i + data.MAX_SEQUENCE_LEN] - if len(subsequence) == 0: + if not subsequence: break - converted_subsequence = [word_vocabulary.get( - "" if is_number(w) else w.lower(), - word_vocabulary[data.UNK]) - for w in subsequence] + converted_subsequence = [word_vocabulary.get("" if is_number(w) else w.lower(), word_vocabulary[data.UNK]) for w in subsequence] if show_unk: subsequence = [reverse_word_vocabulary[w] for w in converted_subsequence] @@ -83,9 +84,9 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat f_out.write(convert_punctuation_to_readable(current_punctuation)) if j < step - 1: if current_punctuation in data.EOS_TOKENS: - f_out.write(subsequence[1+j].title()) + f_out.write(subsequence[1 + j].title()) else: - f_out.write(subsequence[1+j]) + f_out.write(subsequence[1 + j]) if subsequence[-1] == data.END: break @@ -113,8 +114,8 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat predict = theano.function(inputs=[x], outputs=net.y) word_vocabulary = net.x_vocabulary punctuation_vocabulary = net.y_vocabulary - reverse_word_vocabulary = {v:k for k,v in net.x_vocabulary.items()} - reverse_punctuation_vocabulary = {v:k for k,v in net.y_vocabulary.items()} + reverse_word_vocabulary = {v: k for k, v in net.x_vocabulary.items()} + reverse_punctuation_vocabulary = {v: k for k, v in net.y_vocabulary.items()} human_readable_punctuation_vocabulary = [p[0] for p in punctuation_vocabulary if p != data.SPACE] tokenizer = word_tokenize @@ -127,8 +128,9 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, 
reverse_punctuat except NameError: text = input("\nTEXT: ") - words = [w for w in untokenizer(' '.join(tokenizer(text))).split() - if w not in punctuation_vocabulary and w not in human_readable_punctuation_vocabulary] + words = [ + w for w in untokenizer(' '.join(tokenizer(text))).split() if w not in punctuation_vocabulary and w not in human_readable_punctuation_vocabulary + ] punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuation_vocabulary, reverse_word_vocabulary, words, f_out, show_unk) f_out.flush() diff --git a/error_calculator.py b/punctuator/error_calculator.py similarity index 72% rename from error_calculator.py rename to punctuator/error_calculator.py index 075880b..1d9c88a 100644 --- a/error_calculator.py +++ b/punctuator/error_calculator.py @@ -1,5 +1,4 @@ # coding: utf-8 - """ Computes and prints the overall classification error and precision, recall, F-score over punctuations. """ @@ -11,7 +10,14 @@ import sys from io import open -MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example +# Can be used to estimate 2-class performance for example +MAPPING = { + # "!EXCLAMATIONMARK": ".PERIOD", + # "?QUESTIONMARK": ".PERIOD", + # ":COLON": ".PERIOD", + # ";SEMICOLON": ".PERIOD" +} + def compute_error(target_paths, predicted_paths): counter = 0 @@ -38,11 +44,13 @@ def compute_error(target_paths, predicted_paths): target_stream = target.read().split() predicted_stream = predicted.read().split() - + while True: if data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: - while data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: # skip multiple consecutive punctuations + while data.PUNCTUATION_MAPPING.get( + target_stream[t_i], target_stream[t_i] + ) in data.PUNCTUATION_VOCABULARY: # skip multiple consecutive punctuations 
target_punctuation = data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) target_punctuation = MAPPING.get(target_punctuation, target_punctuation) t_i += 1 @@ -57,7 +65,7 @@ def compute_error(target_paths, predicted_paths): is_correct = target_punctuation == predicted_punctuation - counter += 1 + counter += 1 total_correct += is_correct if predicted_punctuation == " " and target_punctuation != " ": @@ -86,35 +94,35 @@ def compute_error(target_paths, predicted_paths): t_i += 1 p_i += 1 - if t_i >= len(target_stream)-1 and p_i >= len(predicted_stream)-1: + if t_i >= len(target_stream) - 1 and p_i >= len(predicted_stream) - 1: break overall_tp = 0.0 overall_fp = 0.0 overall_fn = 0.0 - print("-"*46) - print("{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE')) + print("-" * 46) + print("{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION', 'PRECISION', 'RECALL', 'F-SCORE')) for p in data.PUNCTUATION_VOCABULARY: if p == data.SPACE: continue - overall_tp += true_positives.get(p,0.) - overall_fp += false_positives.get(p,0.) - overall_fn += false_negatives.get(p,0.) + overall_tp += true_positives.get(p, 0.) + overall_fp += false_positives.get(p, 0.) + overall_fn += false_negatives.get(p, 0.) punctuation = p - precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan - recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan - f_score = (2. 
* precision * recall / (precision + recall)) if (precision + recall) > 0 else nan - print(u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100).encode('utf-8')) - print("-"*46) - pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan - rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan - f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan - print("{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100)) - print("Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2)) + precision = (true_positives.get(p, 0.) / (true_positives.get(p, 0.) + false_positives[p])) if p in false_positives else nan + recall = (true_positives.get(p, 0.) / (true_positives.get(p, 0.) + false_negatives[p])) if p in false_negatives else nan + f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan + print(u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision, 3) * 100, round(recall, 3) * 100, round(f_score, 3) * 100).encode('utf-8')) + print("-" * 46) + pre = overall_tp / (overall_tp + overall_fp) if overall_fp else nan + rec = overall_tp / (overall_tp + overall_fn) if overall_fn else nan + f1 = (2. 
* pre * rec) / (pre + rec) if (pre + rec) else nan + print("{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre, 3) * 100, round(rec, 3) * 100, round(f1, 3) * 100)) + print("Err: %s%%" % round((100.0 - float(total_correct) / float(counter - 1) * 100.0), 2)) print("SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1)) @@ -130,5 +138,4 @@ def compute_error(target_paths, predicted_paths): else: sys.exit("Model predictions file path argument missing") - compute_error([target_path], [predicted_path]) - + compute_error([target_path], [predicted_path]) diff --git a/main.py b/punctuator/main.py similarity index 71% rename from main.py rename to punctuator/main.py index e1885cc..18aa37b 100644 --- a/main.py +++ b/punctuator/main.py @@ -1,37 +1,29 @@ # coding: utf-8 +""" +Bi-directional RNN with attention +For a sequence of N words, the model makes N punctuation decisions (no punctuation before the first word, +but there's a decision after the last word or before ) +""" from __future__ import division, print_function +import sys +import os.path from collections import OrderedDict from time import time -import models -import data - import theano -try: - import cPickle -except ImportError: - import _pickle as cPickle -import sys -import os.path -try: - input = raw_input -except NameError: - pass - import theano.tensor as T import numpy as np +import models +import data + MAX_EPOCHS = 50 MINIBATCH_SIZE = 128 L2_REG = 0.0 CLIPPING_THRESHOLD = 2.0 PATIENCE_EPOCHS = 1 -""" -Bi-directional RNN with attention -For a sequence of N words, the model makes N punctuation decisions (no punctuation before the first word, but there's a decision after the last word or before ) -""" def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): @@ -46,10 +38,10 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): P_batch = [] if len(dataset) < batch_size: - print("WARNING: Not enough samples in '%s'. 
Reduce mini-batch size to %d or use a dataset with at least %d words." % ( - file_name, - len(dataset), - MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN)) + print( + "WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % + (file_name, len(dataset), MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN) + ) for subsequence in dataset: @@ -57,7 +49,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): Y_batch.append(subsequence[1]) if with_pauses: P_batch.append(subsequence[2]) - + if len(X_batch) == batch_size: # Transpose, because the model assumes the first axis is time @@ -65,7 +57,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): Y = np.array(Y_batch, dtype=np.int32).T if with_pauses: P = np.array(P_batch, dtype=theano.config.floatX).T - + if with_pauses: yield X, Y, P else: @@ -76,8 +68,9 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): if with_pauses: P_batch = [] + if __name__ == "__main__": - + if len(sys.argv) > 1: model_name = sys.argv[1] else: @@ -108,7 +101,11 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): if os.path.isfile(model_file_name): while True: - resp = input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name) + resp = input( + ("Found an existing model with the name %s. 
Do you want to:\n[c]ontinue training the existing model?\n" \ + "[r]eplace the existing model and train a new one?\n[e]xit?\n>") + % model_file_name + ) resp = resp.lower().strip() if resp not in ('c', 'r', 'e'): continue @@ -130,19 +127,12 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): rng.seed(1) print("Building model...") - net = models.GRU( - rng=rng, - x=x, - minibatch_size=MINIBATCH_SIZE, - n_hidden=num_hidden, - x_vocabulary=word_vocabulary, - y_vocabulary=punctuation_vocabulary - ) + net = models.GRU(rng=rng, x=x, minibatch_size=MINIBATCH_SIZE, n_hidden=num_hidden, x_vocabulary=word_vocabulary, y_vocabulary=punctuation_vocabulary) starting_epoch = 0 best_ppl = np.inf validation_ppl_history = [] - + gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params] cost = net.cost(y) + L2_REG * net.L2_sqr @@ -151,44 +141,33 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): updates = OrderedDict() # Compute norm of gradients - norm = T.sqrt(T.sum( - [T.sum(gparam ** 2) for gparam in gparams] - )) + norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams])) - - # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011) + # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011) for gparam, param, gsum in zip(gparams, net.params, gsums): - gparam = T.switch( - T.ge(norm, CLIPPING_THRESHOLD), - gparam / norm * CLIPPING_THRESHOLD, - gparam - ) # Clipping of gradients - updates[gsum] = gsum + (gparam ** 2) + gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD), gparam / norm * CLIPPING_THRESHOLD, gparam) # Clipping of gradients + updates[gsum] = gsum + (gparam**2) updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6))) - train_model = theano.function( - inputs=[x, y, lr], - outputs=cost, - updates=updates - ) + train_model = theano.function(inputs=[x, y, lr], outputs=cost, updates=updates) - 
validate_model = theano.function( - inputs=[x, y], - outputs=net.cost(y) - ) + validate_model = theano.function(inputs=[x, y], outputs=net.cost(y)) print("Training...") for epoch in range(starting_epoch, MAX_EPOCHS): t0 = time() total_neg_log_likelihood = 0 total_num_output_samples = 0 - iteration = 0 + iteration = 0 for X, Y in get_minibatch(data.TRAIN_FILE, MINIBATCH_SIZE, shuffle=True): total_neg_log_likelihood += train_model(X, Y, learning_rate) total_num_output_samples += np.prod(Y.shape) iteration += 1 if iteration % 100 == 0: - sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100))) + sys.stdout.write( + "PPL: %.4f; Speed: %.2f sps\n" % + (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100)) + ) sys.stdout.flush() print("Total number of training labels: %d" % total_num_output_samples) @@ -198,7 +177,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): total_neg_log_likelihood += validate_model(X, Y) total_num_output_samples += np.prod(Y.shape) print("Total number of validation labels: %d" % total_num_output_samples) - + ppl = np.exp(total_neg_log_likelihood / total_num_output_samples) validation_ppl_history.append(ppl) @@ -206,7 +185,15 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False): if ppl <= best_ppl: best_ppl = ppl - net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state()) + net.save( + model_file_name, + gsums=gsums, + learning_rate=learning_rate, + validation_ppl_history=validation_ppl_history, + best_validation_ppl=best_ppl, + epoch=epoch, + random_state=rng.get_state() + ) elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]: print("Finished!") print("Best validation perplexity was %s" % best_ppl) 
diff --git a/main2.py b/punctuator/main2.py similarity index 79% rename from main2.py rename to punctuator/main2.py index c966943..530bd7e 100644 --- a/main2.py +++ b/punctuator/main2.py @@ -1,24 +1,18 @@ # coding: utf-8 from __future__ import division, print_function +import sys from collections import OrderedDict from time import time - -import models -import data - -import theano -import sys import os.path -try: - input = raw_input -except NameError: - pass +import theano import theano.tensor as T import numpy as np -from main import get_minibatch +from punctuator import models +from punctuator import data +from punctuator.main import get_minibatch MAX_EPOCHS = 50 MINIBATCH_SIZE = 128 @@ -26,10 +20,6 @@ CLIPPING_THRESHOLD = 2.0 PATIENCE_EPOCHS = 1 -""" -Second stage training -""" - if __name__ == "__main__": if len(sys.argv) > 1: @@ -68,7 +58,8 @@ if os.path.isfile(model_file_name): while True: - resp = input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name) + resp = input("Found an existing model with the name %s. 
Do you want to:\n[c]ontinue training the existing model?\n" \ + "[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name) resp = resp.lower().strip() if resp not in ('c', 'r', 'e'): continue @@ -96,7 +87,7 @@ y_vocabulary=punctuation_vocabulary, stage1_model_file_name=stage1_model_file_name, p=p - ) + ) starting_epoch = 0 best_ppl = np.inf @@ -110,31 +101,17 @@ updates = OrderedDict() # Compute norm of gradients - norm = T.sqrt(T.sum( - [T.sum(gparam ** 2) for gparam in gparams] - )) - + norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams])) # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011) for gparam, param, gsum in zip(gparams, net.params, gsums): - gparam = T.switch( - T.ge(norm, CLIPPING_THRESHOLD), - gparam / norm * CLIPPING_THRESHOLD, - gparam - ) # Clipping of gradients - updates[gsum] = gsum + (gparam ** 2) + gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD), gparam / norm * CLIPPING_THRESHOLD, gparam) # Clipping of gradients + updates[gsum] = gsum + (gparam**2) updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6))) - train_model = theano.function( - inputs=[x, p, y, lr], - outputs=cost, - updates=updates - ) + train_model = theano.function(inputs=[x, p, y, lr], outputs=cost, updates=updates) - validate_model = theano.function( - inputs=[x, p, y], - outputs=net.cost(y) - ) + validate_model = theano.function(inputs=[x, p, y], outputs=net.cost(y)) print("Training...") for epoch in range(starting_epoch, MAX_EPOCHS): @@ -147,7 +124,8 @@ total_num_output_samples += np.prod(Y.shape) iteration += 1 if iteration % 100 == 0: - sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100))) + sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" \ + % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100))) 
sys.stdout.flush() print("Total number of training labels: %d" % total_num_output_samples) @@ -165,7 +143,15 @@ if ppl <= best_ppl: best_ppl = ppl - net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state()) + net.save( + model_file_name, + gsums=gsums, + learning_rate=learning_rate, + validation_ppl_history=validation_ppl_history, + best_validation_ppl=best_ppl, + epoch=epoch, + random_state=rng.get_state() + ) elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]: print("Finished!") break diff --git a/models.py b/punctuator/models.py similarity index 74% rename from models.py rename to punctuator/models.py index dd92376..9ad11f6 100644 --- a/models.py +++ b/punctuator/models.py @@ -1,47 +1,50 @@ # coding: utf-8 from __future__ import division, print_function +import os +import logging +import _pickle as cPickle import theano -try: - import cPickle - cpickle_options = {} -except ImportError: - import _pickle as cPickle - cpickle_options = { 'encoding': 'latin-1' } -import os import theano.tensor as T import numpy as np +cpickle_options = {'encoding': 'latin-1'} + + def PReLU(a, x): return T.maximum(0.0, x) + a * T.minimum(0.0, x) + def ReLU(x): return T.maximum(0.0, x) + def _get_shape(i, o, keepdims): if (i == 1 or o == 1) and not keepdims: - return (max(i,o),) - else: - return (i, o) + return (max(i, o),) + return (i, o) + def _slice(tensor, size, i): """Gets slice of columns of the tensor""" if tensor.ndim == 2: - return tensor[:, i*size:(i+1)*size] - elif tensor.ndim == 1: - return tensor[i*size:(i+1)*size] - else: - raise NotImplementedError("Tensor should be 1 or 2 dimensional") + return tensor[:, i * size:(i + 1) * size] + if tensor.ndim == 1: + return tensor[i * size:(i + 1) * size] + raise NotImplementedError("Tensor should be 1 or 2 dimensional") + def weights_const(i, o, name, const, keepdims=False): W_values = 
np.ones(_get_shape(i, o, keepdims)).astype(theano.config.floatX) * const return theano.shared(value=W_values, name=name, borrow=True) + def weights_identity(i, o, name, const, keepdims=False): #"A Simple Way to Initialize Recurrent Networks of Rectified Linear Units" (2015) (http://arxiv.org/abs/1504.00941) W_values = np.eye(*_get_shape(i, o, keepdims)).astype(theano.config.floatX) * const return theano.shared(value=W_values, name=name, borrow=True) + def weights_Glorot(i, o, name, rng, is_logistic_sigmoid=False, keepdims=False): #http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf d = np.sqrt(6. / (i + o)) @@ -50,19 +53,22 @@ def weights_Glorot(i, o, name, rng, is_logistic_sigmoid=False, keepdims=False): W_values = rng.uniform(low=-d, high=d, size=_get_shape(i, o, keepdims)).astype(theano.config.floatX) return theano.shared(value=W_values, name=name, borrow=True) + def load(file_path, minibatch_size, x, p=None): - from . import models - try: - import cPickle - except ImportError: - import _pickle as cPickle - import theano - import numpy as np + #import models + # try: + # import cPickle + # except ImportError: + # import _pickle as cPickle + # import theano + # import numpy as np with open(file_path, 'rb') as f: state = cPickle.load(f, **cpickle_options) - Model = getattr(models, state["type"]) + logging.info('Looking up %s.', state["type"]) + # Model = getattr(models, state["type"]) + Model = globals()[state["type"]] rng = np.random rng.set_state(state["random_state"]) @@ -76,7 +82,7 @@ def load(file_path, minibatch_size, x, p=None): y_vocabulary=state["y_vocabulary"], stage1_model_file_name=state.get("stage1_model_file_name", None), p=p - ) + ) for net_param, state_param in zip(net.params, state["params"]): net_param.set_value(state_param, borrow=True) @@ -85,7 +91,8 @@ def load(file_path, minibatch_size, x, p=None): return net, (gsums, state["learning_rate"], state["validation_ppl_history"], state["epoch"], rng) -class GRULayer(object): + +class 
GRULayer: def __init__(self, rng, n_in, n_out, minibatch_size): super(GRULayer, self).__init__() @@ -98,9 +105,9 @@ def __init__(self, rng, n_in, n_out, minibatch_size): self.h0 = theano.shared(value=np.zeros((minibatch_size, n_out)).astype(theano.config.floatX), name='h0', borrow=True) # Gate parameters: - self.W_x = weights_Glorot(n_in, n_out*2, 'W_x', rng) - self.W_h = weights_Glorot(n_out, n_out*2, 'W_h', rng) - self.b = weights_const(1, n_out*2, 'b', 0) + self.W_x = weights_Glorot(n_in, n_out * 2, 'W_x', rng) + self.W_h = weights_Glorot(n_out, n_out * 2, 'W_h', rng) + self.b = weights_const(1, n_out * 2, 'b', 0) # Input parameters self.W_x_h = weights_Glorot(n_in, n_out, 'W_x_h', rng) self.W_h_h = weights_Glorot(n_out, n_out, 'W_h_h', rng) @@ -120,7 +127,8 @@ def step(self, x_t, h_tm1): return h_t -class GRU(object): + +class GRU: def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, stage1_model_file_name=None, p=None): @@ -136,12 +144,12 @@ def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, # input model pretrained_embs_path = "We.pcl" if os.path.exists(pretrained_embs_path): - print("Found pretrained embeddings in '%s'. Using them..." % pretrained_embs_path) + logging.info("Found pretrained embeddings in '%s'. 
Using them...", pretrained_embs_path) with open(pretrained_embs_path, 'rb') as f: We = cPickle.load(f, **cpickle_options) n_emb = len(We[0]) - We.append([0.1]*n_emb) # END - We.append([0.0]*n_emb) # UNK - both quite arbitrary initializations + We.append([0.1] * n_emb) # END + We.append([0.0] * n_emb) # UNK - both quite arbitrary initializations We = np.array(We).astype(theano.config.floatX) self.We = theano.shared(value=We, name="We", borrow=True) @@ -153,7 +161,7 @@ def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, self.GRU_b = GRULayer(rng=rng, n_in=n_emb, n_out=n_hidden, minibatch_size=minibatch_size) # output model - self.GRU = GRULayer(rng=rng, n_in=n_hidden*2, n_out=n_hidden, minibatch_size=minibatch_size) + self.GRU = GRULayer(rng=rng, n_in=n_hidden * 2, n_out=n_hidden, minibatch_size=minibatch_size) self.Wy = weights_const(n_hidden, y_vocabulary_size, 'Wy', 0) self.by = weights_const(1, y_vocabulary_size, 'by', 0) @@ -170,10 +178,7 @@ def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, self.Wf_f = weights_const(n_hidden, n_hidden, 'Wf_f', 0) self.bf = weights_const(1, n_hidden, 'by', 0) - self.params = [self.We, - self.Wy, self.by, - self.Wa_h, self.Wa_c, self.ba, self.Wa_y, - self.Wf_h, self.Wf_c, self.Wf_f, self.bf] + self.params = [self.We, self.Wy, self.by, self.Wa_h, self.Wa_c, self.ba, self.Wa_y, self.Wf_h, self.Wf_c, self.Wf_f, self.bf] self.params += self.GRU.params + self.GRU_f.params + self.GRU_b.params @@ -190,7 +195,7 @@ def output_recurrence(x_t, h_tm1, Wa_h, Wa_y, Wf_h, Wf_c, Wf_f, bf, Wy, by, cont alphas = T.exp(T.dot(h_a, Wa_y)) alphas = alphas.reshape((alphas.shape[0], alphas.shape[1])) # drop 2-axis (sized 1) alphas = alphas / alphas.sum(axis=0, keepdims=True) - weighted_context = (context * alphas[:,:,None]).sum(axis=0) + weighted_context = (context * alphas[:, :, None]).sum(axis=0) h_t = self.GRU.step(x_t=x_t, h_tm1=h_tm1) @@ -206,46 +211,46 @@ def output_recurrence(x_t, h_tm1, 
Wa_h, Wa_y, Wf_h, Wf_c, Wf_f, bf, Wy, by, cont x_emb = self.We[x.flatten()].reshape((x.shape[0], minibatch_size, n_emb)) - [h_f_t, h_b_t], _ = theano.scan(fn=input_recurrence, + [h_f_t, h_b_t], _ = theano.scan( + fn=input_recurrence, sequences=[x_emb, x_emb[::-1]], # forward and backward sequences - outputs_info=[self.GRU_f.h0, self.GRU_b.h0]) + outputs_info=[self.GRU_f.h0, self.GRU_b.h0] + ) # 0-axis is time steps, 1-axis is batch size and 2-axis is hidden layer size context = T.concatenate([h_f_t, h_b_t[::-1]], axis=2) projected_context = T.dot(context, self.Wa_c) + self.ba - [_, self.last_hidden_states, self.y, self.alphas], _ = theano.scan(fn=output_recurrence, + [_, self.last_hidden_states, self.y, self.alphas], _ = theano.scan( + fn=output_recurrence, sequences=[context[1:]], # ignore the 1st word in context, because there's no punctuation before that non_sequences=[self.Wa_h, self.Wa_y, self.Wf_h, self.Wf_c, self.Wf_f, self.bf, self.Wy, self.by, context, projected_context], - outputs_info=[self.GRU.h0, None, None, None]) + outputs_info=[self.GRU.h0, None, None, None] + ) - print("Number of parameters is %d" % sum(np.prod(p.shape.eval()) for p in self.params)) + logging.info("Number of parameters is %d", sum(np.prod(p.shape.eval()) for p in self.params)) self.L1 = sum(abs(p).sum() for p in self.params) self.L2_sqr = sum((p**2).sum() for p in self.params) def cost(self, y): - num_outputs = self.y.shape[0]*self.y.shape[1] # time steps * number of parallel sequences in batch + num_outputs = self.y.shape[0] * self.y.shape[1] # time steps * number of parallel sequences in batch output = self.y.reshape((num_outputs, self.y.shape[2])) return -T.sum(T.log(output[T.arange(num_outputs), y.flatten()])) def save(self, file_path, gsums=None, learning_rate=None, validation_ppl_history=None, best_validation_ppl=None, epoch=None, random_state=None): - try: - import cPickle - except ImportError: - import _pickle as cPickle state = { - "type": self.__class__.__name__, - 
"n_hidden": self.n_hidden, - "x_vocabulary": self.x_vocabulary, - "y_vocabulary": self.y_vocabulary, - "stage1_model_file_name": self.stage1_model_file_name if hasattr(self, "stage1_model_file_name") else None, - "params": [p.get_value(borrow=True) for p in self.params], - "gsums": [s.get_value(borrow=True) for s in gsums] if gsums else None, - "learning_rate": learning_rate, - "validation_ppl_history": validation_ppl_history, - "epoch": epoch, - "random_state": random_state + "type": self.__class__.__name__, + "n_hidden": self.n_hidden, + "x_vocabulary": self.x_vocabulary, + "y_vocabulary": self.y_vocabulary, + "stage1_model_file_name": self.stage1_model_file_name if hasattr(self, "stage1_model_file_name") else None, + "params": [p.get_value(borrow=True) for p in self.params], + "gsums": [s.get_value(borrow=True) for s in gsums] if gsums else None, + "learning_rate": learning_rate, + "validation_ppl_history": validation_ppl_history, + "epoch": epoch, + "random_state": random_state } with open(file_path, 'wb') as f: @@ -255,6 +260,7 @@ def save(self, file_path, gsums=None, learning_rate=None, validation_ppl_history class GRUstage2(GRU): def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, stage1_model_file_name, p=None): + # pylint: disable=super-init-not-called y_vocabulary_size = len(y_vocabulary) @@ -282,13 +288,11 @@ def recurrence(x_t, p_t, h_tm1, Wy, by): return [h_t, y_t] - [_, self.y], _ = theano.scan(fn=recurrence, - sequences=[self.stage1.last_hidden_states, p], - non_sequences=[self.Wy, self.by], - outputs_info=[self.GRU.h0, None]) + [_, self.y + ], _ = theano.scan(fn=recurrence, sequences=[self.stage1.last_hidden_states, p], non_sequences=[self.Wy, self.by], outputs_info=[self.GRU.h0, None]) - print("Number of parameters is %d" % sum(np.prod(p.shape.eval()) for p in self.params)) - print("Number of parameters with stage1 params is %d" % sum(np.prod(p.shape.eval()) for p in self.params + self.stage1.params)) + 
logging.info("Number of parameters is %d", sum(np.prod(p.shape.eval()) for p in self.params)) + logging.info("Number of parameters with stage1 params is %d", sum(np.prod(p.shape.eval()) for p in self.params + self.stage1.params)) self.L1 = sum(abs(p).sum() for p in self.params) self.L2_sqr = sum((p**2).sum() for p in self.params) diff --git a/play_with_model.py b/punctuator/play_with_model.py similarity index 88% rename from play_with_model.py rename to punctuator/play_with_model.py index 45300c1..20c9bff 100644 --- a/play_with_model.py +++ b/punctuator/play_with_model.py @@ -12,19 +12,23 @@ import theano.tensor as T import numpy as np +# pylint: disable=redefined-outer-name + + def to_array(arr, dtype=np.int32): # minibatch of 1 sequence as column return np.array([arr], dtype=dtype).T + def convert_punctuation_to_readable(punct_token): if punct_token == data.SPACE: return " " - else: - return punct_token[0] + return punct_token[0] + def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuation_vocabulary, reverse_word_vocabulary, text, f_out, show_unk): - if len(text) == 0: + if not text: sys.exit("Input text from stdin missing.") text = [w for w in text.split() if w not in punctuation_vocabulary] + [data.END] @@ -33,9 +37,9 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat while True: - subsequence = text[i:i+data.MAX_SEQUENCE_LEN] + subsequence = text[i:i + data.MAX_SEQUENCE_LEN] - if len(subsequence) == 0: + if not subsequence: break converted_subsequence = [word_vocabulary.get(w, word_vocabulary[data.UNK]) for w in subsequence] @@ -68,7 +72,7 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat for j in range(step): f_out.write(" " + punctuations[j] + " " if punctuations[j] != data.SPACE else " ") if j < step - 1: - f_out.write(subsequence[1+j]) + f_out.write(subsequence[1 + j]) if subsequence[-1] == data.END: break @@ -96,8 +100,8 @@ def punctuate(predict, word_vocabulary, 
punctuation_vocabulary, reverse_punctuat predict = theano.function(inputs=[x], outputs=net.y) word_vocabulary = net.x_vocabulary punctuation_vocabulary = net.y_vocabulary - reverse_word_vocabulary = {v:k for k,v in net.x_vocabulary.items()} - reverse_punctuation_vocabulary = {v:k for k,v in net.y_vocabulary.items()} + reverse_word_vocabulary = {v: k for k, v in net.x_vocabulary.items()} + reverse_punctuation_vocabulary = {v: k for k, v in net.y_vocabulary.items()} with open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False) as f_out: while True: diff --git a/punctuator/punc.py b/punctuator/punc.py new file mode 100644 index 0000000..e78b1e4 --- /dev/null +++ b/punctuator/punc.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import division, print_function + +import os +import sys +import logging +import pickle +from io import open, StringIO + +import theano +import theano.tensor as T +import numpy as np + +import gdown + +import models +import data +from convert_to_readable import convert + +PUNCTUATOR_DATA_DIR = os.path.expanduser(os.environ.get('PUNCTUATOR_DATA_DIR', '~/.punctuator')) + +MAX_SUBSEQUENCE_LEN = 200 + +# pylint: disable=redefined-outer-name + +DEMO_DATA_GID = '0B7BsN5f2F1fZd1Q0aXlrUDhDbnM' # Demo-Europarl-EN.pcl + + +def download_model(gid=DEMO_DATA_GID): + _cwd = os.getcwd() + try: + os.makedirs(PUNCTUATOR_DATA_DIR, exist_ok=True) + os.chdir(PUNCTUATOR_DATA_DIR) + logging.info('Downloading %s...', gid) + fn = gdown.download(url=f'https://drive.google.com/uc?id={gid}', output=None, quiet=False) + return os.path.join(PUNCTUATOR_DATA_DIR, fn) + finally: + os.chdir(_cwd) + + +def to_array(arr, dtype=np.int32): + # minibatch of 1 sequence as column + return np.array([arr], dtype=dtype).T + + +def convert_punctuation_to_readable(punct_token): + if punct_token == data.SPACE: + return " " + return punct_token[0] + + +def restore_with_pauses(output_file, text, pauses, word_vocabulary, reverse_punctuation_vocabulary, 
predict_function): + i = 0 + if isinstance(output_file, str): + f_out = open(output_file, 'w', encoding='utf-8') + f_callback = f_out.close + else: + f_out = output_file + f_callback = None + try: + while True: + + subsequence = text[i:i + MAX_SUBSEQUENCE_LEN] + subsequence_pauses = pauses[i:i + MAX_SUBSEQUENCE_LEN] + + if not subsequence: + break + + converted_subsequence = [word_vocabulary.get(w, word_vocabulary[data.UNK]) for w in subsequence] + + y = predict_function(to_array(converted_subsequence), to_array(subsequence_pauses, dtype=theano.config.floatX)) + + f_out.write(subsequence[0]) + + last_eos_idx = 0 + punctuations = [] + for y_t in y: + + p_i = np.argmax(y_t.flatten()) + punctuation = reverse_punctuation_vocabulary[p_i] + + punctuations.append(punctuation) + + if punctuation in data.EOS_TOKENS: + last_eos_idx = len(punctuations) # we intentionally want the index of next element + + if subsequence[-1] == data.END: + step = len(subsequence) - 1 + elif last_eos_idx != 0: + step = last_eos_idx + else: + step = len(subsequence) - 1 + + for j in range(step): + f_out.write(" " + punctuations[j] + " " if punctuations[j] != data.SPACE else " ") + if j < step - 1: + f_out.write(subsequence[1 + j]) + + if subsequence[-1] == data.END: + break + + i += step + finally: + if callable(f_callback): + f_callback() + + +def restore(output_file, text, word_vocabulary, reverse_punctuation_vocabulary, predict_function): + i = 0 + if isinstance(output_file, str): + f_out = open(output_file, 'w', encoding='utf-8') + f_callback = f_out.close + else: + f_out = output_file + f_callback = None + try: + while True: + + subsequence = text[i:i + MAX_SUBSEQUENCE_LEN] + + if not subsequence: + break + + converted_subsequence = [word_vocabulary.get(w, word_vocabulary[data.UNK]) for w in subsequence] + + y = predict_function(to_array(converted_subsequence)) + + f_out.write(subsequence[0]) + + last_eos_idx = 0 + punctuations = [] + for y_t in y: + + p_i = np.argmax(y_t.flatten()) + 
punctuation = reverse_punctuation_vocabulary[p_i] + + punctuations.append(punctuation) + + if punctuation in data.EOS_TOKENS: + last_eos_idx = len(punctuations) # we intentionally want the index of next element + + if subsequence[-1] == data.END: + step = len(subsequence) - 1 + elif last_eos_idx != 0: + step = last_eos_idx + else: + step = len(subsequence) - 1 + + for j in range(step): + f_out.write(" " + punctuations[j] + " " if punctuations[j] != data.SPACE else " ") + if j < step - 1: + f_out.write(subsequence[1 + j]) + + if subsequence[-1] == data.END: + break + + i += step + finally: + if callable(f_callback): + f_callback() + + +class Punctuator: + + def __init__(self, model_file, use_pauses=False): + self.model_file = model_file + self.use_pauses = use_pauses + + x = T.imatrix('x') + + if use_pauses: + + p = T.matrix('p') + + logging.info("Loading model parameters...") + net, _ = models.load(model_file, 1, x, p) + + logging.info("Building model...") + self.predict = theano.function(inputs=[x, p], outputs=net.y) + + else: + + logging.info("Loading model parameters...") + net, _ = models.load(model_file, 1, x) + + logging.info("Building model...") + self.predict = theano.function(inputs=[x], outputs=net.y) + + self.net = net + self.word_vocabulary = net.x_vocabulary + self.punctuation_vocabulary = net.y_vocabulary + + self.reverse_word_vocabulary = {v: k for k, v in self.word_vocabulary.items()} + self.reverse_punctuation_vocabulary = {v: k for k, v in self.punctuation_vocabulary.items()} + + def save(self, fn): + assert isinstance(fn, str) + with open(fn, 'wb') as fout: + pickle.dump(self, fout) + + @classmethod + def load(cls, fn): + assert isinstance(fn, str) + with open(fn, 'rb') as fin: + return pickle.load(fin) + + def punctuate(self, input_text, escape=True): + + text = [ + w for w in input_text.split() if w not in self.punctuation_vocabulary and w not in data.PUNCTUATION_MAPPING and not w.startswith(data.PAUSE_PREFIX) + ] + [data.END] + pauses = 
[float(s.replace(data.PAUSE_PREFIX, "").replace(">", "")) for s in input_text.split() if s.startswith(data.PAUSE_PREFIX)] + + fout = StringIO() + if self.use_pauses: + if not pauses: + pauses = [0.0 for _ in range(len(text) - 1)] + restore_with_pauses(fout, text, pauses, self.word_vocabulary, self.reverse_punctuation_vocabulary, self.predict) + else: + restore(fout, text, self.word_vocabulary, self.reverse_punctuation_vocabulary, self.predict) + + # Convert tokenize punctuation to normal punctuation. + if escape: + fout2 = StringIO() + convert(fout.getvalue(), fout2) + + # if isinstance(output_file, str): + # with open(output_file, 'w', encoding='utf-8') as fout_final: + # fout_final.write(fout2.getvalue()) + # else: + # output_file.write(fout2.getvalue()) + + output_text = fout2.getvalue() + if output_text and not output_text.endswith('.'): + output_text += '.' + + return output_text + + +def command_line_runner(): + + if len(sys.argv) > 1: + model_file = sys.argv[1] + else: + sys.exit("Model file path argument missing") + if model_file[0] not in ('.', '..', '/'): + model_file = os.path.join(PUNCTUATOR_DATA_DIR, model_file) + assert os.path.isfile(model_file), 'Specified model file does not exist: %s' % model_file + + if len(sys.argv) > 2: + output_file = sys.argv[2] + else: + sys.exit("Output file path argument missing") + + input_text = open(sys.stdin.fileno(), 'r', encoding='utf-8').read().strip() + if not input_text: + sys.exit("Input text from stdin missing.") + + use_pauses = len(sys.argv) > 3 and bool(int(sys.argv[3])) + + p = Punctuator(model_file, use_pauses) + output_text = p.punctuate(input_text) + with open(output_file, 'w', encoding='utf-8') as fout: + fout.write(output_text) + + +if __name__ == '__main__': + command_line_runner() diff --git a/punctuator/tests.py b/punctuator/tests.py new file mode 100644 index 0000000..29366ad --- /dev/null +++ b/punctuator/tests.py @@ -0,0 +1,80 @@ +from __future__ import absolute_import + +import time +import os 
class Tests(unittest.TestCase):
    """End-to-end test of ``Punctuator`` using the model returned by ``download_model``."""

    def _assert_punctuation(self, punctuator, samples):
        """Assert that ``punctuator.punctuate`` maps each sample input to its expected output."""
        for input_text, expect_output_text in samples:
            actual_output_text = punctuator.punctuate(input_text)
            print('expect_output_text:', expect_output_text)
            print('actual_output_text:', actual_output_text)
            self.assertEqual(actual_output_text, expect_output_text)

    def test_punctuate(self):
        """Punctuate sample sentences, then repeat after a save/load round trip."""

        samples = [
            (
                'mary had a little lamb its fleece was white as snow and anywhere that mary went the lamb was sure to go',
                'Mary had a little lamb, its fleece was white as snow and anywhere that mary went, the lamb was sure to go.'
            ),
            (
                "they say it's only as cold as it feels in your mind i don't buy into that theory much what do you think",
                "They say it's only as cold as it feels in your mind. I don't buy into that theory much. What do you think."
            ),
        ]

        # Remember the global state this test overrides so it can be restored
        # afterwards; otherwise the process would be left inside a deleted
        # working directory and with a dangling data-dir setting.
        original_cwd = os.getcwd()
        original_data_dir = punc.PUNCTUATOR_DATA_DIR

        # Create temp directory for downloading data.
        d = tempfile.mkdtemp()
        print('Temp dir:', d)
        try:
            punc.PUNCTUATOR_DATA_DIR = d
            os.chdir(d)

            # Download pre-trained model.
            model_file = download_model()
            print('Model file:', model_file)

            # Create punctuator.
            t0 = time.time()
            p = Punctuator(model_file=model_file)
            td = time.time() - t0
            print('Loaded in %s seconds.' % td)

            # Add punctuation.
            self._assert_punctuation(p, samples)

            # Serialize the entire punctuator, not just the model.
            # The relative file name lands in the temp dir because of the chdir above.
            print('Writing...')
            t0 = time.time()
            fn = 'data.pickle'
            p.save(fn)
            td = time.time() - t0
            print('Wrote in %s seconds.' % td)

            # Load punctuator.
            print('Loading...')
            t0 = time.time()
            p2 = Punctuator.load(fn)
            td = time.time() - t0
            print('Loaded in %s seconds.' % td)

            # Confirm punctuations match previous.
            self._assert_punctuation(p2, samples)

        finally:
            # Leave the temp dir before deleting it, then undo the override.
            os.chdir(original_cwd)
            punc.PUNCTUATOR_DATA_DIR = original_data_dir
            shutil.rmtree(d)
+:consider-using-enumerate (C0200): *Consider using enumerate instead of iterating with range and len* + Emitted when code that iterates with range and len is encountered. Such code + can be simplified by using the enumerate builtin. +:consider-iterating-dictionary (C0201): *Consider iterating the dictionary directly instead of calling .keys()* + Emitted when the keys of a dictionary are iterated through the .keys() method. + It is enough to just iterate through the dictionary itself, as in "for key in + dictionary". +:bad-classmethod-argument (C0202): *Class method %s should have %s as first argument* + Used when a class method has a first argument named differently than the value + specified in valid-classmethod-first-arg option (default to "cls"), + recommended to easily differentiate them from regular instance methods. +:bad-mcs-method-argument (C0203): *Metaclass method %s should have %s as first argument* + Used when a metaclass method has a first agument named differently than the + value specified in valid-classmethod-first-arg option (default to "cls"), + recommended to easily differentiate them from regular instance methods. +:bad-mcs-classmethod-argument (C0204): *Metaclass class method %s should have %s as first argument* + Used when a metaclass class method has a first argument named differently than + the value specified in valid-metaclass-classmethod-first-arg option (default + to "mcs"), recommended to easily differentiate them from regular instance + methods. +:line-too-long (C0301): *Line too long (%s/%s)* + Used when a line is longer than a given number of characters. +:too-many-lines (C0302): *Too many lines in module (%s/%s)* + Used when a module has too much lines, reducing its readability. +:trailing-whitespace (C0303): *Trailing whitespace* + Used when there is whitespace between the end of a line and the newline. +:missing-final-newline (C0304): *Final newline missing* + Used when the last line in a file is missing a newline. 
+:trailing-newlines (C0305): *Trailing newlines* + Used when there are trailing blank lines in a file. +:multiple-statements (C0321): *More than one statement on a single line* + Used when more than on statement are found on the same line. +:superfluous-parens (C0325): *Unnecessary parens after %r keyword* + Used when a single item in parentheses follows an if, for, or other keyword. +:bad-whitespace (C0326): *%s space %s %s %s* + Used when a wrong number of spaces is used around an operator, bracket or + block opener. +:mixed-line-endings (C0327): *Mixed line endings LF and CRLF* + Used when there are mixed (LF and CRLF) newline signs in a file. +:unexpected-line-ending-format (C0328): *Unexpected line ending format. There is '%s' while it should be '%s'.* + Used when there is different newline than expected. +:bad-continuation (C0330): *Wrong %s indentation%s%s.* + TODO +:wrong-spelling-in-comment (C0401): *Wrong spelling of a word '%s' in a comment:* + Used when a word in comment is not spelled correctly. +:wrong-spelling-in-docstring (C0402): *Wrong spelling of a word '%s' in a docstring:* + Used when a word in docstring is not spelled correctly. +:invalid-characters-in-docstring (C0403): *Invalid characters %r in a docstring* + Used when a word in docstring cannot be checked by enchant. +:multiple-imports (C0410): *Multiple imports on one line (%s)* + Used when import statement importing multiple modules is detected. 
+:wrong-import-order (C0411): *%s comes before %s* + Used when PEP8 import order is not respected (standard imports first, then + third-party libraries, then local imports) +:ungrouped-imports (C0412): *Imports from package %s are not grouped* + Used when imports are not grouped by packages +:wrong-import-position (C0413): *Import "%s" should be placed at the top of the module* + Used when code and imports are mixed +:old-style-class (C1001): *Old-style class defined.* + Used when a class is defined that does not inherit from anotherclass and does + not inherit explicitly from "object". This message can't be emitted when using + Python >= 3.0. +:syntax-error (E0001): + Used when a syntax error is raised for a module. +:unrecognized-inline-option (E0011): *Unrecognized file option %r* + Used when an unknown inline option is encountered. +:bad-option-value (E0012): *Bad option value %r* + Used when a bad value for an inline option is encountered. +:init-is-generator (E0100): *__init__ method is a generator* + Used when the special class method __init__ is turned into a generator by a + yield in its body. +:return-in-init (E0101): *Explicit return in __init__* + Used when the special class method __init__ has an explicit return value. +:function-redefined (E0102): *%s already defined line %s* + Used when a function / class / method is redefined. +:not-in-loop (E0103): *%r not properly in loop* + Used when break or continue keywords are used outside a loop. +:return-outside-function (E0104): *Return outside function* + Used when a "return" statement is found outside a function or method. +:yield-outside-function (E0105): *Yield outside function* + Used when a "yield" statement is found outside a function or method. +:return-arg-in-generator (E0106): *Return with argument inside generator* + Used when a "return" statement with an argument is found outside in a + generator function or method (e.g. with some "yield" statements). 
This message + can't be emitted when using Python >= 3.3. +:nonexistent-operator (E0107): *Use of the non-existent %s operator* + Used when you attempt to use the C-style pre-increment orpre-decrement + operator -- and ++, which doesn't exist in Python. +:duplicate-argument-name (E0108): *Duplicate argument name %s in function definition* + Duplicate argument names in function definitions are syntax errors. +:abstract-class-instantiated (E0110): *Abstract class %r with abstract methods instantiated* + Used when an abstract class with `abc.ABCMeta` as metaclass has abstract + methods and is instantiated. +:bad-reversed-sequence (E0111): *The first reversed() argument is not a sequence* + Used when the first argument to reversed() builtin isn't a sequence (does not + implement __reversed__, nor __getitem__ and __len__ +:continue-in-finally (E0116): *'continue' not supported inside 'finally' clause* + Emitted when the `continue` keyword is found inside a finally clause, which is + a SyntaxError. +:method-hidden (E0202): *An attribute defined in %s line %s hides this method* + Used when a class defines a method which is hidden by an instance attribute + from an ancestor class or set by some client code. +:access-member-before-definition (E0203): *Access to member %r before its definition line %s* + Used when an instance member is accessed before it's actually assigned. +:no-method-argument (E0211): *Method has no argument* + Used when a method which should have the bound instance as first argument has + no argument defined. +:no-self-argument (E0213): *Method should have "self" as first argument* + Used when a method has an attribute different the "self" as first argument. + This is considered as an error since this is a so common convention that you + shouldn't break it! +:invalid-slots-object (E0236): *Invalid object %r in __slots__, must contain only non empty strings* + Used when an invalid (non-string) object occurs in __slots__. 
+:assigning-non-slot (E0237): *Assigning to attribute %r not defined in class slots* + Used when assigning to an attribute not defined in the class slots. +:invalid-slots (E0238): *Invalid __slots__ object* + Used when an invalid __slots__ is found in class. Only a string, an iterable + or a sequence is permitted. +:inherit-non-class (E0239): *Inheriting %r, which is not a class.* + Used when a class inherits from something which is not a class. +:inconsistent-mro (E0240): *Inconsistent method resolution order for class %r* + Used when a class has an inconsistent method resolutin order. +:duplicate-bases (E0241): *Duplicate bases for class %r* + Used when a class has duplicate bases. +:non-iterator-returned (E0301): *__iter__ returns non-iterator* + Used when an __iter__ method returns something which is not an iterable (i.e. + has no `next` method) +:unexpected-special-method-signature (E0302): *The special method %r expects %s param(s), %d %s given* + Emitted when a special method was defined with an invalid number of + parameters. If it has too few or too many, it might not work at all. +:invalid-length-returned (E0303): *__len__ does not return non-negative integer* + Used when an __len__ method returns something which is not a non-negative + integer +:import-error (E0401): *Unable to import %s* + Used when pylint has been unable to import a module. +:used-before-assignment (E0601): *Using variable %r before assignment* + Used when a local variable is accessed before it's assignment. +:undefined-variable (E0602): *Undefined variable %r* + Used when an undefined variable is accessed. +:undefined-all-variable (E0603): *Undefined variable name %r in __all__* + Used when an undefined variable name is referenced in __all__. +:invalid-all-object (E0604): *Invalid object %r in __all__, must contain only strings* + Used when an invalid (non-string) object occurs in __all__. 
+:no-name-in-module (E0611): *No name %r in module %r* + Used when a name cannot be found in a module. +:unbalanced-tuple-unpacking (E0632): *Possible unbalanced tuple unpacking with sequence%s: left side has %d label(s), right side has %d value(s)* + Used when there is an unbalanced tuple unpacking in assignment +:unpacking-non-sequence (E0633): *Attempting to unpack a non-sequence%s* + Used when something which is not a sequence is used in an unpack assignment +:bad-except-order (E0701): *Bad except clauses order (%s)* + Used when except clauses are not in the correct order (from the more specific + to the more generic). If you don't fix the order, some exceptions may not be + catched by the most specific handler. +:raising-bad-type (E0702): *Raising %s while only classes or instances are allowed* + Used when something which is neither a class, an instance or a string is + raised (i.e. a `TypeError` will be raised). +:misplaced-bare-raise (E0704): *The raise statement is not inside an except clause* + Used when a bare raise is not used inside an except clause. This generates an + error, since there are no active exceptions to be reraised. An exception to + this rule is represented by a bare raise inside a finally clause, which might + work, as long as an exception is raised inside the try block, but it is + nevertheless a code smell that must not be relied upon. +:raising-non-exception (E0710): *Raising a new style class which doesn't inherit from BaseException* + Used when a new style class which doesn't inherit from BaseException is + raised. +:notimplemented-raised (E0711): *NotImplemented raised - should raise NotImplementedError* + Used when NotImplemented is raised instead of NotImplementedError +:catching-non-exception (E0712): *Catching an exception which doesn't inherit from BaseException: %s* + Used when a class which doesn't inherit from BaseException is used as an + exception in an except clause. 
+:slots-on-old-class (E1001): *Use of __slots__ on an old style class* + Used when an old style class uses the __slots__ attribute. This message can't + be emitted when using Python >= 3.0. +:super-on-old-class (E1002): *Use of super on an old style class* + Used when an old style class uses the super builtin. This message can't be + emitted when using Python >= 3.0. +:bad-super-call (E1003): *Bad first argument %r given to super()* + Used when another argument than the current class is given as first argument + of the super builtin. +:missing-super-argument (E1004): *Missing argument to super()* + Used when the super builtin didn't receive an argument. This message can't be + emitted when using Python >= 3.0. +:no-member (E1101): *%s %r has no %r member* + Used when a variable is accessed for an unexistent member. +:not-callable (E1102): *%s is not callable* + Used when an object being called has been inferred to a non callable object +:assignment-from-no-return (E1111): *Assigning to function call which doesn't return* + Used when an assignment is done on a function call but the inferred function + doesn't return anything. +:no-value-for-parameter (E1120): *No value for argument %s in %s call* + Used when a function call passes too few arguments. +:too-many-function-args (E1121): *Too many positional arguments for %s call* + Used when a function call passes too many positional arguments. +:unexpected-keyword-arg (E1123): *Unexpected keyword argument %r in %s call* + Used when a function call passes a keyword argument that doesn't correspond to + one of the function's parameter names. +:redundant-keyword-arg (E1124): *Argument %r passed by position and keyword in %s call* + Used when a function call would result in assigning multiple values to a + function parameter, one value from a positional argument and one from a + keyword argument. 
+:invalid-sequence-index (E1126): *Sequence index is not an int, slice, or instance with __index__* + Used when a sequence type is indexed with an invalid type. Valid types are + ints, slices, and objects with an __index__ method. +:invalid-slice-index (E1127): *Slice index is not an int, None, or instance with __index__* + Used when a slice index is not an integer, None, or an object with an + __index__ method. +:assignment-from-none (E1128): *Assigning to function call which only returns None* + Used when an assignment is done on a function call but the inferred function + returns nothing but None. +:not-context-manager (E1129): *Context manager '%s' doesn't implement __enter__ and __exit__.* + Used when an instance in a with statement doesn't implement the context + manager protocol (__enter__/__exit__). +:invalid-unary-operand-type (E1130): + Emitted when a unary operand is used on an object which does not support this + type of operation +:unsupported-binary-operation (E1131): + Emitted when a binary arithmetic operation between two operands is not + supported. +:repeated-keyword (E1132): *Got multiple values for keyword argument %r in function call* + Emitted when a function call got multiple values for a keyword. +:not-an-iterable (E1133): *Non-iterable value %s is used in an iterating context* + Used when a non-iterable value is used in place where iterable is expected +:not-a-mapping (E1134): *Non-mapping value %s is used in a mapping context* + Used when a non-mapping value is used in place where mapping is expected +:unsupported-membership-test (E1135): *Value '%s' doesn't support membership test* + Emitted when an instance in membership test expression doesn't implement + membership protocol (__contains__/__iter__/__getitem__) +:unsubscriptable-object (E1136): *Value '%s' is unsubscriptable* + Emitted when a subscripted value doesn't support subscription (i.e.
doesn't + define __getitem__ method) +:logging-unsupported-format (E1200): *Unsupported logging format character %r (%#02x) at index %d* + Used when an unsupported format character is used in a logging statement + format string. +:logging-format-truncated (E1201): *Logging format string ends in middle of conversion specifier* + Used when a logging statement format string terminates before the end of a + conversion specifier. +:logging-too-many-args (E1205): *Too many arguments for logging format string* + Used when a logging format string is given too many arguments. +:logging-too-few-args (E1206): *Not enough arguments for logging format string* + Used when a logging format string is given too few arguments +:bad-format-character (E1300): *Unsupported format character %r (%#02x) at index %d* + Used when an unsupported format character is used in a format string. +:truncated-format-string (E1301): *Format string ends in middle of conversion specifier* + Used when a format string terminates before the end of a conversion specifier. +:mixed-format-string (E1302): *Mixing named and unnamed conversion specifiers in format string* + Used when a format string contains both named (e.g. '%(foo)d') and unnamed + (e.g. '%d') conversion specifiers. This is also used when a named conversion + specifier contains * for the minimum field width and/or precision. +:format-needs-mapping (E1303): *Expected mapping for format string, not %s* + Used when a format string that uses named conversion specifiers is used with + an argument that is not a mapping. +:missing-format-string-key (E1304): *Missing key %r in format string dictionary* + Used when a format string that uses named conversion specifiers is used with a + dictionary that doesn't contain all the keys required by the format string. +:too-many-format-args (E1305): *Too many arguments for format string* + Used when a format string that uses unnamed conversion specifiers is given too + many arguments.
+:too-few-format-args (E1306): *Not enough arguments for format string* + Used when a format string that uses unnamed conversion specifiers is given too + few arguments +:bad-str-strip-call (E1310): *Suspicious argument in %s.%s call* + The argument to a str.{l,r,}strip call contains a duplicate character, +:print-statement (E1601): *print statement used* + Used when a print statement is used (`print` is a function in Python 3) This + message can't be emitted when using Python >= 3.0. +:parameter-unpacking (E1602): *Parameter unpacking specified* + Used when parameter unpacking is specified for a function(Python 3 doesn't + allow it) This message can't be emitted when using Python >= 3.0. +:unpacking-in-except (E1603): *Implicit unpacking of exceptions is not supported in Python 3* + Python3 will not allow implicit unpacking of exceptions in except clauses. See + http://www.python.org/dev/peps/pep-3110/ This message can't be emitted when + using Python >= 3.0. +:old-raise-syntax (E1604): *Use raise ErrorClass(args) instead of raise ErrorClass, args.* + Used when the alternate raise syntax 'raise foo, bar' is used instead of + 'raise foo(bar)'. This message can't be emitted when using Python >= 3.0. +:backtick (E1605): *Use of the `` operator* + Used when the deprecated "``" (backtick) operator is used instead of the str() + function. This message can't be emitted when using Python >= 3.0. +:long-suffix (E1606): *Use of long suffix* + Used when "l" or "L" is used to mark a long integer. This will not work in + Python 3, since `int` and `long` types have merged. This message can't be + emitted when using Python >= 3.0. +:old-ne-operator (E1607): *Use of the <> operator* + Used when the deprecated "<>" operator is used instead of "!=". This is + removed in Python 3. This message can't be emitted when using Python >= 3.0. +:old-octal-literal (E1608): *Use of old octal literal* + Usen when encountering the old octal syntax, removed in Python 3. 
To use the + new syntax, prepend 0o on the number. This message can't be emitted when using + Python >= 3.0. +:import-star-module-level (E1609): *Import * only allowed at module level* + Used when the import star syntax is used somewhere else than the module level. + This message can't be emitted when using Python >= 3.0. +:fatal (F0001): + Used when an error occurred preventing the analysis of a module (unable to + find it for instance). +:astroid-error (F0002): *%s: %s* + Used when an unexpected error occurred while building the Astroid + representation. This is usually accompanied by a traceback. Please report such + errors ! +:parse-error (F0010): *error while code parsing: %s* + Used when an exception occured while building the Astroid representation which + could be handled by astroid. +:method-check-failed (F0202): *Unable to check methods signature (%s / %s)* + Used when Pylint has been unable to check methods signature compatibility for + an unexpected reason. Please report this kind if you don't make sense of it. +:raw-checker-failed (I0001): *Unable to run raw checkers on built-in module %s* + Used to inform that a built-in module has not been checked using the raw + checkers. +:bad-inline-option (I0010): *Unable to consider inline option %r* + Used when an inline option is either badly formatted or can't be used inside + modules. +:locally-disabled (I0011): *Locally disabling %s (%s)* + Used when an inline option disables a message or a messages category. +:locally-enabled (I0012): *Locally enabling %s (%s)* + Used when an inline option enables a message or a messages category. +:file-ignored (I0013): *Ignoring entire file* + Used to inform that the file will not be checked +:suppressed-message (I0020): *Suppressed %s (from line %d)* + A message was triggered on a line, but suppressed explicitly by a disable= + comment in the file. This message is not generated for messages that are + ignored due to configuration settings. 
+:useless-suppression (I0021): *Useless suppression of %s* + Reported when a message is explicitly disabled for a line or a block of code, + but never triggered. +:deprecated-pragma (I0022): *Pragma "%s" is deprecated, use "%s" instead* + Some inline pylint options have been renamed or reworked, only the most recent + form should be used. NOTE:skip-all is only available with pylint >= 0.26 +:too-many-nested-blocks (R0101): *Too many nested blocks (%s/%s)* + Used when a function or a method has too many nested blocks. This makes the + code less understandable and maintainable. +:simplifiable-if-statement (R0102): *The if statement can be replaced with %s* + Used when an if statement can be replaced with 'bool(test)'. +:no-self-use (R0201): *Method could be a function* + Used when a method doesn't use its bound instance, and so could be written as + a function. +:no-classmethod-decorator (R0202): *Consider using a decorator instead of calling classmethod* + Used when a class method is defined without using the decorator syntax. +:no-staticmethod-decorator (R0203): *Consider using a decorator instead of calling staticmethod* + Used when a static method is defined without using the decorator syntax. +:redefined-variable-type (R0204): *Redefinition of %s type from %s to %s* + Used when the type of a variable changes inside a method or a function. +:cyclic-import (R0401): *Cyclic import (%s)* + Used when a cyclic import between two or more modules is detected. +:duplicate-code (R0801): *Similar lines in %s files* + Indicates that a set of similar lines has been detected among multiple file. + This usually means that the code should be refactored to avoid this + duplication. +:too-many-ancestors (R0901): *Too many ancestors (%s/%s)* + Used when class has too many parent classes, try to reduce this to get a + simpler (and so easier to use) class. 
+:too-many-instance-attributes (R0902): *Too many instance attributes (%s/%s)* + Used when class has too many instance attributes, try to reduce this to get a + simpler (and so easier to use) class. +:too-few-public-methods (R0903): *Too few public methods (%s/%s)* + Used when class has too few public methods, so be sure it's really worth it. +:too-many-public-methods (R0904): *Too many public methods (%s/%s)* + Used when class has too many public methods, try to reduce this to get a + simpler (and so easier to use) class. +:too-many-return-statements (R0911): *Too many return statements (%s/%s)* + Used when a function or method has too many return statement, making it hard + to follow. +:too-many-branches (R0912): *Too many branches (%s/%s)* + Used when a function or method has too many branches, making it hard to + follow. +:too-many-arguments (R0913): *Too many arguments (%s/%s)* + Used when a function or method takes too many arguments. +:too-many-locals (R0914): *Too many local variables (%s/%s)* + Used when a function or method has too many local variables. +:too-many-statements (R0915): *Too many statements (%s/%s)* + Used when a function or method has too many statements. You should then split + it in smaller functions / methods. +:too-many-boolean-expressions (R0916): *Too many boolean expressions in if statement (%s/%s)* + Used when a if statement contains too many boolean expressions +:unreachable (W0101): *Unreachable code* + Used when there is some code behind a "return" or "raise" statement, which + will never be accessed. +:dangerous-default-value (W0102): *Dangerous default value %s as argument* + Used when a mutable value as list or dictionary is detected in a default value + for an argument. +:pointless-statement (W0104): *Statement seems to have no effect* + Used when a statement doesn't have (or at least seems to) any effect. 
+:pointless-string-statement (W0105): *String statement has no effect* + Used when a string is used as a statement (which of course has no effect). + This is a particular case of W0104 with its own message so you can easily + disable it if you're using those strings as documentation, instead of + comments. +:expression-not-assigned (W0106): *Expression "%s" is assigned to nothing* + Used when an expression that is not a function call is assigned to nothing. + Probably something else was intended. +:unnecessary-pass (W0107): *Unnecessary pass statement* + Used when a "pass" statement that can be avoided is encountered. +:unnecessary-lambda (W0108): *Lambda may not be necessary* + Used when the body of a lambda expression is a function call on the same + argument list as the lambda itself; such lambda expressions are in all but a + few cases replaceable with the function being called in the body of the + lambda. +:duplicate-key (W0109): *Duplicate key %r in dictionary* + Used when a dictionary expression binds the same key multiple times. +:deprecated-lambda (W0110): *map/filter on lambda could be replaced by comprehension* + Used when a lambda is the first argument to "map" or "filter". It could be + clearer as a list comprehension or generator expression. This message can't be + emitted when using Python >= 3.0. +:useless-else-on-loop (W0120): *Else clause on loop without a break statement* + Loops should only have an else clause if they can exit early with a break + statement, otherwise the statements under else should be on the same scope as + the loop itself. +:exec-used (W0122): *Use of exec* + Used when you use the "exec" statement (function for Python 3), to discourage + its usage. That doesn't mean you can not use it ! +:eval-used (W0123): *Use of eval* + Used when you use the "eval" function, to discourage its usage. Consider using + `ast.literal_eval` for safely evaluating strings containing Python expressions + from untrusted sources. 
+:confusing-with-statement (W0124): *Following "as" with another context manager looks like a tuple.* + Emitted when a `with` statement component returns multiple values and uses + name binding with `as` only for a part of those values, as in with ctx() as a, + b. This can be misleading, since it's not clear if the context manager returns + a tuple or if the node without a name binding is another context manager. +:using-constant-test (W0125): *Using a conditional statement with a constant value* + Emitted when a conditional statement (If or ternary if) uses a constant value + for its test. This might not be what the user intended to do. +:lost-exception (W0150): *%s statement in finally block may swallow exception* + Used when a break or a return statement is found inside the finally clause of + a try...finally block: the exceptions raised in the try clause will be + silently swallowed instead of being re-raised. +:assert-on-tuple (W0199): *Assert called on a 2-uple. Did you mean 'assert x,y'?* + A call of assert on a tuple will always evaluate to true if the tuple is not + empty, and will always evaluate to false if it is. +:attribute-defined-outside-init (W0201): *Attribute %r defined outside __init__* + Used when an instance attribute is defined outside the __init__ method. +:bad-staticmethod-argument (W0211): *Static method with %r as first argument* + Used when a static method has "self" or a value specified in valid- + classmethod-first-arg option or valid-metaclass-classmethod-first-arg option + as first argument. +:protected-access (W0212): *Access to a protected member %s of a client class* + Used when a protected member (i.e. class member with a name beginning with an + underscore) is access outside the class or a descendant of the class where + it's defined. +:arguments-differ (W0221): *Arguments number differs from %s %r method* + Used when a method has a different number of arguments than in the implemented + interface or in an overridden method. 
+:signature-differs (W0222): *Signature differs from %s %r method* + Used when a method signature is different than in the implemented interface or + in an overridden method. +:abstract-method (W0223): *Method %r is abstract in class %r but is not overridden* + Used when an abstract method (i.e. raise NotImplementedError) is not + overridden in concrete class. +:super-init-not-called (W0231): *__init__ method from base class %r is not called* + Used when an ancestor class method has an __init__ method which is not called + by a derived class. +:no-init (W0232): *Class has no __init__ method* + Used when a class has no __init__ method, neither its parent classes. +:non-parent-init-called (W0233): *__init__ method from a non direct base class %r is called* + Used when an __init__ method is called on a class which is not in the direct + ancestors for the analysed class. +:unnecessary-semicolon (W0301): *Unnecessary semicolon* + Used when a statement is ended by a semi-colon (";"), which isn't necessary + (that's python, not C ;). +:bad-indentation (W0311): *Bad indentation. Found %s %s, expected %s* + Used when an unexpected number of indentation's tabulations or spaces has been + found. +:mixed-indentation (W0312): *Found indentation with %ss instead of %ss* + Used when there are some mixed tabs and spaces in a module. +:lowercase-l-suffix (W0332): *Use of "l" as long integer identifier* + Used when a lower case "l" is used to mark a long integer. You should use a + upper case "L" since the letter "l" looks too much like the digit "1" This + message can't be emitted when using Python >= 3.0. +:wildcard-import (W0401): *Wildcard import %s* + Used when `from module import *` is detected. +:deprecated-module (W0402): *Uses of a deprecated module %r* + Used a module marked as deprecated is imported. +:relative-import (W0403): *Relative import %r, should be %r* + Used when an import relative to the package directory is detected. 
This + message can't be emitted when using Python >= 3.0. +:reimported (W0404): *Reimport %r (imported line %s)* + Used when a module is reimported multiple times. +:import-self (W0406): *Module import itself* + Used when a module is importing itself. +:misplaced-future (W0410): *__future__ import is not the first non docstring statement* + Python 2.5 and greater require __future__ import to be the first non docstring + statement in the module. +:fixme (W0511): + Used when a warning note as FIXME or XXX is detected. +:invalid-encoded-data (W0512): *Cannot decode using encoding "%s", unexpected byte at position %d* + Used when a source line cannot be decoded using the specified source file + encoding. This message can't be emitted when using Python >= 3.0. +:global-variable-undefined (W0601): *Global variable %r undefined at the module level* + Used when a variable is defined through the "global" statement but the + variable is not defined in the module scope. +:global-variable-not-assigned (W0602): *Using global for %r but no assignment is done* + Used when a variable is defined through the "global" statement but no + assignment to this variable is done. +:global-statement (W0603): *Using the global statement* + Used when you use the "global" statement to update a global variable. Pylint + just try to discourage this usage. That doesn't mean you can not use it ! +:global-at-module-level (W0604): *Using the global statement at the module level* + Used when you use the "global" statement at the module level since it has no + effect +:unused-import (W0611): *Unused %s* + Used when an imported module or variable is not used. +:unused-variable (W0612): *Unused variable %r* + Used when a variable is defined but not used. +:unused-argument (W0613): *Unused argument %r* + Used when a function or method argument is not used. 
+:unused-wildcard-import (W0614): *Unused import %s from wildcard import* + Used when an imported module or variable is not used from a `'from X import + *'` style import. +:redefined-outer-name (W0621): *Redefining name %r from outer scope (line %s)* + Used when a variable's name hide a name defined in the outer scope. +:redefined-builtin (W0622): *Redefining built-in %r* + Used when a variable or function override a built-in. +:redefine-in-handler (W0623): *Redefining name %r from %s in exception handler* + Used when an exception handler assigns the exception to an existing name +:undefined-loop-variable (W0631): *Using possibly undefined loop variable %r* + Used when an loop variable (i.e. defined by a for loop or a list comprehension + or a generator expression) is used outside the loop. +:cell-var-from-loop (W0640): *Cell variable %s defined in loop* + A variable used in a closure is defined in a loop. This will result in all + closures using the same value for the closed-over variable. +:bare-except (W0702): *No exception type(s) specified* + Used when an except clause doesn't specify exceptions type to catch. +:broad-except (W0703): *Catching too general exception %s* + Used when an except catches a too general exception, possibly burying + unrelated errors. +:duplicate-except (W0705): *Catching previously caught exception type %s* + Used when an except catches a type that was already caught by a previous + handler. +:nonstandard-exception (W0710): *Exception doesn't inherit from standard "Exception" class* + Used when a custom exception class is raised but doesn't inherit from the + builtin "Exception" class. This message can't be emitted when using Python >= + 3.0. +:binary-op-exception (W0711): *Exception to catch is the result of a binary "%s" operation* + Used when the exception to catch is of the form "except A or B:". 
If intending + to catch multiple, rewrite as "except (A, B):" +:property-on-old-class (W1001): *Use of "property" on an old style class* + Used when Pylint detect the use of the builtin "property" on an old style + class while this is relying on new style classes features. This message can't + be emitted when using Python >= 3.0. +:logging-not-lazy (W1201): *Specify string format arguments as logging function parameters* + Used when a logging statement has a call form of "logging.(format_string % (format_args...))". Such calls should leave string + interpolation to the logging method itself and be written "logging.(format_string, format_args...)" so that the program may avoid + incurring the cost of the interpolation in those cases in which no message + will be logged. For more, see http://www.python.org/dev/peps/pep-0282/. +:logging-format-interpolation (W1202): *Use % formatting in logging functions and pass the % parameters as arguments* + Used when a logging statement has a call form of "logging.(format_string.format(format_args...))". Such calls should use % + formatting instead, but leave interpolation to the logging function by passing + the parameters as arguments. +:bad-format-string-key (W1300): *Format string dictionary key should be a string, not %s* + Used when a format string that uses named conversion specifiers is used with a + dictionary whose keys are not all strings. +:unused-format-string-key (W1301): *Unused key %r in format string dictionary* + Used when a format string that uses named conversion specifiers is used with a + dictionary that contains keys not required by the format string. +:bad-format-string (W1302): *Invalid format string* + Used when a PEP 3101 format string is invalid. This message can't be emitted + when using Python < 2.7. +:missing-format-argument-key (W1303): *Missing keyword argument %r for format string* + Used when a PEP 3101 format string that uses named fields doesn't receive one + or more required keywords. 
This message can't be emitted when using Python < + 2.7. +:unused-format-string-argument (W1304): *Unused format argument %r* + Used when a PEP 3101 format string that uses named fields is used with an + argument that is not required by the format string. This message can't be + emitted when using Python < 2.7. +:format-combined-specification (W1305): *Format string contains both automatic field numbering and manual field specification* + Used when a PEP 3101 format string contains both automatic field numbering + (e.g. '{}') and manual field specification (e.g. '{0}'). This message can't be + emitted when using Python < 2.7. +:missing-format-attribute (W1306): *Missing format attribute %r in format specifier %r* + Used when a PEP 3101 format string uses an attribute specifier ({0.length}), + but the argument passed for formatting doesn't have that attribute. This + message can't be emitted when using Python < 2.7. +:invalid-format-index (W1307): *Using invalid lookup key %r in format specifier %r* + Used when a PEP 3101 format string uses a lookup specifier ({a[1]}), but the + argument passed for formatting doesn't contain or doesn't have that key as an + attribute. This message can't be emitted when using Python < 2.7. +:anomalous-backslash-in-string (W1401): *Anomalous backslash in string: '%s'. String constant might be missing an r prefix.* + Used when a backslash is in a literal string but not as an escape. +:anomalous-unicode-escape-in-string (W1402): *Anomalous Unicode escape in byte string: '%s'. String constant might be missing an r or u prefix.* + Used when an escape like \u is encountered in a byte string where it has no + effect. +:bad-open-mode (W1501): *"%s" is not a valid mode for open.* + Python supports: r, w, a[, x] modes with b, +, and U (only with r) options. 
+ See http://docs.python.org/2/library/functions.html#open +:boolean-datetime (W1502): *Using datetime.time in a boolean context.* + Using datetime.time in a boolean context can hide subtle bugs when the time + they represent matches midnight UTC. This behaviour was fixed in Python 3.5. + See http://bugs.python.org/issue13936 for reference. This message can't be + emitted when using Python >= 3.5. +:redundant-unittest-assert (W1503): *Redundant use of %s with constant value %r* + The first argument of assertTrue and assertFalse is a condition. If a constant + is passed as parameter, that condition will be always true. In this case a + warning should be emitted. +:deprecated-method (W1505): *Using deprecated method %s()* + The method is marked as deprecated and will be removed in a future version of + Python. Consider looking for an alternative in the documentation. +:apply-builtin (W1601): *apply built-in referenced* + Used when the apply built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:basestring-builtin (W1602): *basestring built-in referenced* + Used when the basestring built-in function is referenced (missing from Python + 3) This message can't be emitted when using Python >= 3.0. +:buffer-builtin (W1603): *buffer built-in referenced* + Used when the buffer built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:cmp-builtin (W1604): *cmp built-in referenced* + Used when the cmp built-in function is referenced (missing from Python 3) This + message can't be emitted when using Python >= 3.0. +:coerce-builtin (W1605): *coerce built-in referenced* + Used when the coerce built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. 
+:execfile-builtin (W1606): *execfile built-in referenced* + Used when the execfile built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:file-builtin (W1607): *file built-in referenced* + Used when the file built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:long-builtin (W1608): *long built-in referenced* + Used when the long built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:raw_input-builtin (W1609): *raw_input built-in referenced* + Used when the raw_input built-in function is referenced (missing from Python + 3) This message can't be emitted when using Python >= 3.0. +:reduce-builtin (W1610): *reduce built-in referenced* + Used when the reduce built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:standarderror-builtin (W1611): *StandardError built-in referenced* + Used when the StandardError built-in function is referenced (missing from + Python 3) This message can't be emitted when using Python >= 3.0. +:unicode-builtin (W1612): *unicode built-in referenced* + Used when the unicode built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:xrange-builtin (W1613): *xrange built-in referenced* + Used when the xrange built-in function is referenced (missing from Python 3) + This message can't be emitted when using Python >= 3.0. +:coerce-method (W1614): *__coerce__ method defined* + Used when a __coerce__ method is defined (method is not used by Python 3) This + message can't be emitted when using Python >= 3.0. +:delslice-method (W1615): *__delslice__ method defined* + Used when a __delslice__ method is defined (method is not used by Python 3) + This message can't be emitted when using Python >= 3.0. 
+:getslice-method (W1616): *__getslice__ method defined* + Used when a __getslice__ method is defined (method is not used by Python 3) + This message can't be emitted when using Python >= 3.0. +:setslice-method (W1617): *__setslice__ method defined* + Used when a __setslice__ method is defined (method is not used by Python 3) + This message can't be emitted when using Python >= 3.0. +:no-absolute-import (W1618): *import missing `from __future__ import absolute_import`* + Used when an import is not accompanied by ``from __future__ import + absolute_import`` (default behaviour in Python 3) This message can't be + emitted when using Python >= 3.0. +:old-division (W1619): *division w/o __future__ statement* + Used for non-floor division w/o a float literal or ``from __future__ import + division`` (Python 3 returns a float for int division unconditionally) This + message can't be emitted when using Python >= 3.0. +:dict-iter-method (W1620): *Calling a dict.iter*() method* + Used for calls to dict.iterkeys(), itervalues() or iteritems() (Python 3 lacks + these methods) This message can't be emitted when using Python >= 3.0. +:dict-view-method (W1621): *Calling a dict.view*() method* + Used for calls to dict.viewkeys(), viewvalues() or viewitems() (Python 3 lacks + these methods) This message can't be emitted when using Python >= 3.0. +:next-method-called (W1622): *Called a next() method on an object* + Used when an object's next() method is called (Python 3 uses the next() built- + in function) This message can't be emitted when using Python >= 3.0. +:metaclass-assignment (W1623): *Assigning to a class's __metaclass__ attribute* + Used when a metaclass is specified by assigning to __metaclass__ (Python 3 + specifies the metaclass as a class statement argument) This message can't be + emitted when using Python >= 3.0. +:indexing-exception (W1624): *Indexing exceptions will not work on Python 3* + Indexing exceptions will not work on Python 3. 
Use `exception.args[index]` + instead. This message can't be emitted when using Python >= 3.0. +:raising-string (W1625): *Raising a string exception* + Used when a string exception is raised. This will not work on Python 3. This + message can't be emitted when using Python >= 3.0. +:reload-builtin (W1626): *reload built-in referenced* + Used when the reload built-in function is referenced (missing from Python 3). + You can use instead imp.reload or importlib.reload. This message can't be + emitted when using Python >= 3.0. +:oct-method (W1627): *__oct__ method defined* + Used when a __oct__ method is defined (method is not used by Python 3) This + message can't be emitted when using Python >= 3.0. +:hex-method (W1628): *__hex__ method defined* + Used when a __hex__ method is defined (method is not used by Python 3) This + message can't be emitted when using Python >= 3.0. +:nonzero-method (W1629): *__nonzero__ method defined* + Used when a __nonzero__ method is defined (method is not used by Python 3) + This message can't be emitted when using Python >= 3.0. +:cmp-method (W1630): *__cmp__ method defined* + Used when a __cmp__ method is defined (method is not used by Python 3) This + message can't be emitted when using Python >= 3.0. +:input-builtin (W1632): *input built-in referenced* + Used when the input built-in is referenced (backwards-incompatible semantics + in Python 3) This message can't be emitted when using Python >= 3.0. +:round-builtin (W1633): *round built-in referenced* + Used when the round built-in is referenced (backwards-incompatible semantics + in Python 3) This message can't be emitted when using Python >= 3.0. +:intern-builtin (W1634): *intern built-in referenced* + Used when the intern built-in is referenced (Moved to sys.intern in Python 3) + This message can't be emitted when using Python >= 3.0. 
+:unichr-builtin (W1635): *unichr built-in referenced* + Used when the unichr built-in is referenced (Use chr in Python 3) This message + can't be emitted when using Python >= 3.0. +:map-builtin-not-iterating (W1636): *map built-in referenced when not iterating* + Used when the map built-in is referenced in a non-iterating context (returns + an iterator in Python 3) This message can't be emitted when using Python >= + 3.0. +:zip-builtin-not-iterating (W1637): *zip built-in referenced when not iterating* + Used when the zip built-in is referenced in a non-iterating context (returns + an iterator in Python 3) This message can't be emitted when using Python >= + 3.0. +:range-builtin-not-iterating (W1638): *range built-in referenced when not iterating* + Used when the range built-in is referenced in a non-iterating context (returns + an iterator in Python 3) This message can't be emitted when using Python >= + 3.0. +:filter-builtin-not-iterating (W1639): *filter built-in referenced when not iterating* + Used when the filter built-in is referenced in a non-iterating context + (returns an iterator in Python 3) This message can't be emitted when using + Python >= 3.0. +:using-cmp-argument (W1640): *Using the cmp argument for list.sort / sorted* + Using the cmp argument for list.sort or the sorted builtin should be avoided, + since it was removed in Python 3. Using either `key` or `functools.cmp_to_key` + should be preferred. This message can't be emitted when using Python >= 3.0. + diff --git a/pylint.rc b/pylint.rc new file mode 100644 index 0000000..ce99dec --- /dev/null +++ b/pylint.rc @@ -0,0 +1,355 @@ +# lint Python modules using external checkers. +# +# This is the main checker controlling the other ones and the reports +# generation. It is itself both a raw checker and an astng checker in order +# to: +# * handle message activation / deactivation at the module level +# * handle some basic but necessary stats'data (number of classes, methods...) 
+# +[MASTER] + +# Specify a configuration file. +#rcfile= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Profiled execution. +profile=no + +# Add to the black list. It should be a base name, not a +# path. You may set this option multiple times. +# Ignore all auto-generated South migration directories. +ignore=migrations,south_migrations + +# Pickle collected data for later comparisons. +persistent=yes + +# Set the cache size for astng objects. +cache-size=500 + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +[MESSAGES CONTROL] + +# Enable only checker(s) with the given id(s). This option conflicts with the +# disable-checker option +#enable-checker= + +# Enable all checker(s) except those with the given id(s). This option +# conflicts with the enable-checker option +#disable-checker= + +# Enable all messages in the listed categories (IRCWEF). +#enable-msg-cat= + +# Disable all messages in the listed categories (IRCWEF). +disable-msg-cat=I + +# Enable the message(s) with the given id(s). +#enable-msg= + +#http://docs.pylint.org/features.html +#http://pylint-messages.wikidot.com/all-codes +#pylint --list-msgs > pylint.messages + +# All these are disabled below. +# C1001: old-style class defined (Django uses these for Meta options) +# C0103: variable regex check. +# C0111: missing docstring check. It's too vague. Complains about no docstrings in __init__ and other places we don't care about. +# C0330: bad-continuation +# E1101: member check...this is usually wrong. +# E1103: type inference...this is usually wrong. +# F0401: unable to import +# R0201: method should be function check. +# R0401: cyclic import check...because sometimes it's wrong. +# R0902: too many instance attributes check. +# R0903: too few public methods check...makes no sense with Django. +# R0904: too many public method check. 
+# R0913: too many argument check. +# R0921: abstract class not referenced check. +# W0104: no effect check. +# W0142: magic check. +# W0212: protected data check. +# W0232: __init__ check. +# W0311: bad-indentation +# W0401: wildcard import. +# W0404: reimport check...this is sometimes wrong. +# W0511: TODO check. +# W0612: unused variable check. +# W0613: unused argument check. Too vague. +# W0614: wildcard import usage check. +# W0704: empty except check. +# E1002: Use of super on an old style class +# E1120: No value for argument +# R0901: Too many ancestors +# E1123: Unexpected keyword argument %r in %s call +# C0302: *Too many lines in module (%s)* +# R0801: *Similar lines in %s files* +# R0914: *Too many local variables (%s/%s)* +# R0912: *Too many branches (%s/%s)* +# R0915: *Too many statements (%s/%s)* +# W0703: *Catching too general exception %s* +# E1003: *Bad first argument %r given to super()* +# E0202: *An attribute defined in %s line %s hides this method* +# W0201: *Attribute %r defined outside __init__* +# W0221: *Arguments number differs from %s method* +# C0325: *Unnecessary parens after %r keyword* +# R0916: too-many-boolean-expressions +# R0204: *Redefinition of %s type from %s to %s* +# R0101: *Too many nested blocks (%s/%s)* +# I0011: *Locally disabling %s (%s)* +# W1001: *Use of "property" on an old style class* +disable=C1001,C0103,R0201,W0212,W0614,W0401,W0704,E1101,W0142,R0904,R0913,W0404,R0903,W0232,C0111,W0613,W0612,W0511,W0104,R0902,R0921,R0401,E1103,W0311,C0330,F0401,E1002,E1120,R0901,E1123,C0302,R0801,R0914,R0912,R0915,W0703,E1003,E0202,W0201,W0221,C0325,R0916,R0204,R0101,I0011,W1001,consider-using-ternary,unsubscriptable-object,inconsistent-return-statements,keyword-arg-before-vararg,wrong-import-order,redefined-outer-name + +[REPORTS] + +# Set the output format. 
Available formats are text, parseable, colorized, msvs +# (visual studio) and html +output-format=text + +# Include message's id in output +include-ids=yes + +# Put messages in a separate file for each module / package specified on the +# command line instead of printing them on stdout. Reports (if any) will be +# written in a file named "pylint_global.[txt|html]". +files-output=no + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note of at most 10 (10 is the highest +# note). You have access to the variables error, warning, refactor, convention +# and statement, which respectively contain the number of messages in each of +# those categories and the total number of statements analyzed. This is used by +# the global evaluation report (R0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Add a comment according to your evaluation note. This is used by the global +# evaluation report (R0004). +comment=no + +# Enable the report(s) with the given id(s). +#enable-report= + +# Disable the report(s) with the given id(s). 
+#disable-report= + + +# checks for : +# * doc strings +# * modules / classes / functions / methods / arguments / variables name +# * number of arguments, local variables, branches, returns and statements in +# functions, methods +# * required module attributes +# * dangerous default values as arguments +# * redefinition of function / method / class +# * uses of the global statement +# +[BASIC] + +# Regular expression which should only match functions or classes name which do +# not require a docstring +no-docstring-rgx=__.*__ + +# Regular expression which should only match correct module names +module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + +# Regular expression which should only match correct module level names +const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Regular expression which should only match correct class names +class-rgx=[A-Z_][a-zA-Z0-9]+$ + +# Regular expression which should only match correct function names +function-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match correct method names +method-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match correct instance attribute names +attr-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match correct argument names +argument-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match correct variable names +variable-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match correct list comprehension / +# generator expression variable names +inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ + +# Good variable names which should always be accepted, separated by a comma +good-names=i,j,k,ex,Run,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo,bar,baz,toto,tutu,tata + +# List of builtins function names that should not be used, separated by a comma +bad-functions=map,filter,apply,input + + +# try to find bugs in the code using type inference +# +[TYPECHECK] + +# Tells 
whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# List of classes names for which member attributes should not be checked +# (useful for classes with attributes dynamically set). +ignored-classes=SQLObject + +# When zope mode is activated, add a predefined set of Zope acquired attributes +# to generated-members. +zope=no + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E0201 when accessed. +generated-members=REQUEST,acl_users,aq_parent + + +# checks for +# * unused variables / imports +# * undefined variables +# * redefinition of variable from builtins or from an outer scope +# * use of variable before assignment +# +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching names used for dummy variables (i.e. not used). +dummy-variables-rgx=_|dummy + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + + +# checks for +# * external modules dependencies +# * relative / wildcard imports +# * cyclic imports +# * uses of deprecated modules +# +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=regsub,string,TERMIOS,Bastion,rexec + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report R0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report R0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report R0402 must +# not be disabled) +int-import-graph= + + +# checks for sign of poor/misdesign: +# * number of methods, attributes, local variables... 
+# * size, complexity of functions, methods +# +[DESIGN] + +# Maximum number of arguments for function / method +max-args=5 + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of branches for function / method body +max-branchs=12 + +# Maximum number of statements in function / method body +max-statements=50 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + + +# checks for : +# * methods without self as first argument +# * overridden methods signature +# * access only to existent members via self +# * attributes not defined in the __init__ method +# * supported interfaces implementation +# * unreachable code +# +[CLASSES] + +# List of interface methods to ignore, separated by a comma. This is used for +# instance to not check methods defined in Zope's Interface base class. +#ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__,__new__,setUp + + +# checks for similarities and duplicated code. This computation may be +# memory / CPU intensive, so you should disable it if you experience +# problems. +# +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. 
+ignore-docstrings=yes + + +# checks for : +# * unauthorized constructions +# * strict indentation +# * line length +# * use of <> instead of != +# +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=160 + +# Maximum number of lines in a module +max-module-lines=1000 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + + +# checks for: +# * warning notes in the code like FIXME, XXX +# * PEP 263: source code with non ascii character but no encoding declaration +# +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,XXX,TODO diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..3920993 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,8 @@ +tox>=2.0.0 +# Update pylint/astroid after recursion bug is fixed? +# https://github.com/PyCQA/pylint/issues/2388 +astroid<=2.2.5 +pylint<=2.3.1 +yapf>=0.28.0 +pre-commit>=1.14.4 +coverage>=4.5.4 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7c0a96 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +nltk>=3.4.5 +numpy>=1.17.2 +theano>=1.0.4 +gdown>=3.8.3 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a38082e --- /dev/null +++ b/setup.py @@ -0,0 +1,58 @@ +from __future__ import print_function +import os + +from setuptools import setup, find_packages + +import punctuator + +CURRENT_DIR = os.path.abspath(os.path.dirname(__file__)) + +try: + with open(os.path.join(CURRENT_DIR, 'README.md'), encoding='utf-8') as f: + long_description = f.read() +except TypeError: + with open(os.path.join(CURRENT_DIR, 'README.md')) as f: + long_description = f.read() + + +def get_reqs(*fns): + lst = [] + for fn in fns: + for package in open(os.path.join(CURRENT_DIR, fn)).readlines(): + package = package.strip() + if not package: + continue + lst.append(package.strip()) + return lst + + +setup( + name="punctuator", + 
version=punctuator.__version__, + packages=find_packages(), + author="Chris Spencer", + author_email="chrisspen@gmail.com", + description="Adds punctuation to a block of text.", + long_description=long_description, + long_description_content_type='text/markdown', + license="MIT", + url="https://github.com/chrisspen/punctuator", + # https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # 'Development Status :: 6 - Mature', + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + entry_points={'console_scripts': [ + 'punctuator.py = punctuator.punc:command_line_runner', + ]}, + zip_safe=False, + install_requires=get_reqs('requirements.txt'), + tests_require=get_reqs('requirements-test.txt'), +) diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..bb139e4 --- /dev/null +++ b/test.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Runs all tests. +set -e +./pep8.sh +export TESTNAME=; tox diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..80c75a7 --- /dev/null +++ b/tox.ini @@ -0,0 +1,13 @@ +[tox] +envlist = py{35,36,37} +recreate = True + +[testenv] +basepython = + py35: python3.5 + py36: python3.6 + py37: python3.7 +deps = + -r{toxinidir}/requirements.txt + -r{toxinidir}/requirements-test.txt +commands = python punctuator/tests.py Tests{env:TESTNAME:} From d7f7eb730cf54b4873c317ac65a3b00281d4b1d3 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:01:35 -0400 Subject: [PATCH 02/19] Bumped version. 
--- punctuator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/punctuator/__init__.py b/punctuator/__init__.py index 66596da..8dcca04 100644 --- a/punctuator/__init__.py +++ b/punctuator/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 9, 1) +VERSION = (0, 9, 2) __version__ = '.'.join(map(str, VERSION)) try: from punc import Punctuator From 4058f739d69b091b004d4793df766c8be1508012 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:11:37 -0400 Subject: [PATCH 03/19] Dropped py35 support. --- setup.py | 1 - tox.ini | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index a38082e..1e943cd 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ def get_reqs(*fns): 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], diff --git a/tox.ini b/tox.ini index 80c75a7..d968d50 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,9 @@ [tox] -envlist = py{35,36,37} +envlist = py{36,37} recreate = True [testenv] basepython = - py35: python3.5 py36: python3.6 py37: python3.7 deps = From b65ba4691c6cfdf60fb481f6bc89f745e7561dff Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:17:49 -0400 Subject: [PATCH 04/19] Dropped py35 support. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 290559e..1e7c7ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ dist: xenial sudo: required language: python python: -- "3.5" +- "3.6" install: - sudo add-apt-repository -y ppa:deadsnakes/ppa - sudo apt-get -yq update From 686653549c849ebb3860ca7c55ba2d058042a12f Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:23:59 -0400 Subject: [PATCH 05/19] Dropped py35 support. 
--- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index de9c9b2..3f3d097 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ [![](https://img.shields.io/pypi/v/punctuator.svg)](https://pypi.python.org/pypi/punctuator) [![Build Status](https://img.shields.io/travis/chrisspen/punctuator.svg?branch=master)](https://travis-ci.org/chrisspen/punctuator) -[![](https://pyup.io/repos/github/chrisspen/punctuator/shield.svg)](https://pyup.io/repos/github/chrisspen/punctuator) This is a fork of [Ottokar Tilk's punctuator2](https://github.com/ottokart/punctuator2) cleaned up into a formal Python3 package with testing. From b0067709b977a942df414bc7be94510cf9d4bd62 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:25:42 -0400 Subject: [PATCH 06/19] Dropped py35 support. --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index 3920993..ad768a4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,3 +6,4 @@ pylint<=2.3.1 yapf>=0.28.0 pre-commit>=1.14.4 coverage>=4.5.4 +twine>=2.0.0 From 0c4fb9d8ea2c9d8af443ae1f0bea3ccc75f4cc15 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 26 Sep 2019 20:29:07 -0400 Subject: [PATCH 07/19] Dropped py35 support. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f3d097..7db28d9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Punctuator [![](https://img.shields.io/pypi/v/punctuator.svg)](https://pypi.python.org/pypi/punctuator) -[![Build Status](https://img.shields.io/travis/chrisspen/punctuator.svg?branch=master)](https://travis-ci.org/chrisspen/punctuator) +[![Build Status](https://img.shields.io/travis/chrisspen/punctuator2.svg?branch=master)](https://travis-ci.org/chrisspen/punctuator) This is a fork of [Ottokar Tilk's punctuator2](https://github.com/ottokart/punctuator2) cleaned up into a formal Python3 package with testing. 
From 545f741078adba3aceb24eebd05849c142ac3633 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Sun, 29 Sep 2019 01:25:17 -0400 Subject: [PATCH 08/19] Fixed test runner call. Fixed imports. Added case fixes to output. --- punctuator/__init__.py | 9 +++---- punctuator/convert_to_readable.py | 3 ++- punctuator/models.py | 7 ------ punctuator/punc.py | 33 ++++++++++++++++++------- punctuator/tests.py | 40 +++++++++++++++++++++---------- tox.ini | 2 +- 6 files changed, 60 insertions(+), 34 deletions(-) diff --git a/punctuator/__init__.py b/punctuator/__init__.py index 8dcca04..9bf4a44 100644 --- a/punctuator/__init__.py +++ b/punctuator/__init__.py @@ -1,6 +1,7 @@ -VERSION = (0, 9, 2) +VERSION = (0, 9, 3) __version__ = '.'.join(map(str, VERSION)) try: - from punc import Punctuator -except ImportError: - pass + from .punc import Punctuator +except ImportError as exc: + import traceback + traceback.print_exc() diff --git a/punctuator/convert_to_readable.py b/punctuator/convert_to_readable.py index ad09c20..a944099 100644 --- a/punctuator/convert_to_readable.py +++ b/punctuator/convert_to_readable.py @@ -1,6 +1,7 @@ import sys from io import open -from data import EOS_TOKENS, PUNCTUATION_VOCABULARY + +from .data import EOS_TOKENS, PUNCTUATION_VOCABULARY def convert(input_text, out_f, with_newlines=False): diff --git a/punctuator/models.py b/punctuator/models.py index 9ad11f6..31493a4 100644 --- a/punctuator/models.py +++ b/punctuator/models.py @@ -55,13 +55,6 @@ def weights_Glorot(i, o, name, rng, is_logistic_sigmoid=False, keepdims=False): def load(file_path, minibatch_size, x, p=None): - #import models - # try: - # import cPickle - # except ImportError: - # import _pickle as cPickle - # import theano - # import numpy as np with open(file_path, 'rb') as f: state = cPickle.load(f, **cpickle_options) diff --git a/punctuator/punc.py b/punctuator/punc.py index e78b1e4..2cc6ca3 100644 --- a/punctuator/punc.py +++ b/punctuator/punc.py @@ -4,6 +4,7 @@ import os import sys 
+import re import logging import pickle from io import open, StringIO @@ -14,9 +15,9 @@ import gdown -import models -import data -from convert_to_readable import convert +from . import models +from . import data +from .convert_to_readable import convert PUNCTUATOR_DATA_DIR = os.path.expanduser(os.environ.get('PUNCTUATOR_DATA_DIR', '~/.punctuator')) @@ -163,7 +164,18 @@ def restore(output_file, text, word_vocabulary, reverse_punctuation_vocabulary, class Punctuator: + def model_exists(self, fn): + if os.path.isfile(fn): + return fn + _fn = os.path.join(PUNCTUATOR_DATA_DIR, fn) + if os.path.isfile(_fn): + return _fn + def __init__(self, model_file, use_pauses=False): + + model_file = self.model_exists(model_file) + assert model_file, 'Model %s does not exist.' % model_file + self.model_file = model_file self.use_pauses = use_pauses @@ -225,16 +237,19 @@ def punctuate(self, input_text, escape=True): fout2 = StringIO() convert(fout.getvalue(), fout2) - # if isinstance(output_file, str): - # with open(output_file, 'w', encoding='utf-8') as fout_final: - # fout_final.write(fout2.getvalue()) - # else: - # output_file.write(fout2.getvalue()) - output_text = fout2.getvalue() + if output_text and not output_text.endswith('.'): output_text += '.' + # Correct "'s" capitalization. + output_text = re.sub(r"'[a-zA-Z]+\b", lambda m: m.group(0).lower(), output_text) + + # Correct I capitalizations. + output_text = re.sub(r"\bi'm\b", "I'm", output_text) + output_text = re.sub(r"\bi've\b", "I've", output_text) + output_text = re.sub(r"\bi\b", "I", output_text) + return output_text diff --git a/punctuator/tests.py b/punctuator/tests.py index 29366ad..f122eab 100644 --- a/punctuator/tests.py +++ b/punctuator/tests.py @@ -7,32 +7,48 @@ import shutil from io import StringIO -import punc -from punc import Punctuator, download_model +from . 
import punc +from .punc import Punctuator, download_model class Tests(unittest.TestCase): def test_punctuate(self): - samples = [( - 'mary had a little lamb its fleece was white as snow and anywhere that mary went the lamb was sure to go', - 'Mary had a little lamb, its fleece was white as snow and anywhere that mary went, the lamb was sure to go.' - ), - ( - "they say it's only as cold as it feels in your mind i don't buy into that theory much what do you think", - "They say it's only as cold as it feels in your mind. I don't buy into that theory much. What do you think." - )] + samples = [ + ( + 'mary had a little lamb its fleece was white as snow and anywhere that mary went the lamb was sure to go', + 'Mary had a little lamb, its fleece was white as snow and anywhere that mary went, the lamb was sure to go.' + ), + ( + "they say it's only as cold as it feels in your mind i don't buy into that theory much what do you think", + "They say it's only as cold as it feels in your mind. I don't buy into that theory much. What do you think." + ), + ( + "he's a do me a favor go home to your wife", + "He's a do me: a favor go home to your wife.", + ), + ( + "they'll even negotiate your rate with the insurance company", + "They'll even negotiate your rate with the insurance company.", + ), + ( + "for me i wanted to get into commentary some sort of way i didn't know how to do that so i left the firm and i started a business", + "For me, I wanted to get into commentary some sort of way. I didn't know how to do that. So I left the firm and I started a business." + ), + ] # Create temp directory for downloading data. d = tempfile.mkdtemp() - punc.PUNCTUATOR_DATA_DIR = d + os.makedirs(punc.PUNCTUATOR_DATA_DIR, exist_ok=True) + model_file = os.path.join(punc.PUNCTUATOR_DATA_DIR, 'Demo-Europarl-EN.pcl') print('Temp dir:', d) os.chdir(d) try: # Download pre-trained model. 
- model_file = download_model() + if not os.path.isfile(model_file): + model_file = download_model() print('Model file:', model_file) # Create punctuator. diff --git a/tox.ini b/tox.ini index d968d50..66043f0 100644 --- a/tox.ini +++ b/tox.ini @@ -9,4 +9,4 @@ basepython = deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements-test.txt -commands = python punctuator/tests.py Tests{env:TESTNAME:} +commands = python -m punctuator.tests Tests{env:TESTNAME:} From 70ebc1a741d90c2c76c058e501ca921a899250b4 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 3 Oct 2019 20:46:07 -0400 Subject: [PATCH 09/19] Updated venv script. --- init_virtualenv.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/init_virtualenv.sh b/init_virtualenv.sh index bfcedd9..4921e46 100755 --- a/init_virtualenv.sh +++ b/init_virtualenv.sh @@ -13,5 +13,6 @@ REL_DIR=./ python3.7 -m venv $REL_DIR.env . $REL_DIR.env/bin/activate pip install -U pip +pip install -U setuptools pip install --cache-dir $CACHE_DIR -r requirements.txt requirements-test.txt From fe0dfcd2601da32e10890b77da5633ae81fb3df3 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 3 Oct 2019 20:46:59 -0400 Subject: [PATCH 10/19] Updated venv script. --- punctuator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/punctuator/__init__.py b/punctuator/__init__.py index 9bf4a44..f8c9879 100644 --- a/punctuator/__init__.py +++ b/punctuator/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 9, 3) +VERSION = (0, 9, 4) __version__ = '.'.join(map(str, VERSION)) try: from .punc import Punctuator From 16ccaf0a2400a76fb99d5d355ea4d35267fd8fd6 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 3 Oct 2019 20:53:45 -0400 Subject: [PATCH 11/19] Updated venv script. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1e943cd..c762584 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def get_reqs(*fns): long_description=long_description, long_description_content_type='text/markdown', license="MIT", - url="https://github.com/chrisspen/punctuator", + url="https://github.com/chrisspen/punctuator2", # https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers=[ # 'Development Status :: 6 - Mature', From 52ae2ba2a93647753329b32e909bbed454408651 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Thu, 3 Oct 2019 20:54:22 -0400 Subject: [PATCH 12/19] Updated version, fixed homepage link. --- punctuator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/punctuator/__init__.py b/punctuator/__init__.py index f8c9879..9ee9816 100644 --- a/punctuator/__init__.py +++ b/punctuator/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 9, 4) +VERSION = (0, 9, 5) __version__ = '.'.join(map(str, VERSION)) try: from .punc import Punctuator From 1d86accf53cdd6b4778e2112f18865dcf7eb1b16 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Fri, 7 Aug 2020 00:14:43 -0400 Subject: [PATCH 13/19] Refactored and cleaned up dataset conversion scripts. 
--- example/run.sh | 19 ---------- {example => scripts}/README.md | 0 .../convert_europarl.py | 6 ++++ scripts/convert_europarl.sh | 36 +++++++++++++++++++ scripts/convert_ted.py | 33 +++++++++++++++++ scripts/convert_ted.sh | 6 ++++ scripts/run.sh | 4 +++ 7 files changed, 85 insertions(+), 19 deletions(-) delete mode 100644 example/run.sh rename {example => scripts}/README.md (100%) rename example/dont_run_me_run_the_other_script_instead.py => scripts/convert_europarl.py (95%) mode change 100644 => 100755 create mode 100755 scripts/convert_europarl.sh create mode 100755 scripts/convert_ted.py create mode 100755 scripts/convert_ted.sh create mode 100755 scripts/run.sh diff --git a/example/run.sh b/example/run.sh deleted file mode 100644 index 42b2d1b..0000000 --- a/example/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -wget -qO- http://hltshare.fbk.eu/IWSLT2012/training-monolingual-europarl.tgz | tar xvz -rm -rf ./out -echo "Step 1/3" -mkdir ./out -grep -v " '[^ ]" ./training-monolingual-europarl/europarl-v7.en | \ -grep -v \'\ s\ | \ -grep -v \'\ ll\ | \ -grep -v \'\ ve\ | \ -grep -v \'\ m\ > step1.txt -echo "Step 2/3" -python dont_run_me_run_the_other_script_instead.py step1.txt step2.txt -echo "Step 3/3" -head -n -400000 step2.txt > ./out/ep.train.txt -tail -n 400000 step2.txt > step3.txt -head -n -200000 step3.txt > ./out/ep.dev.txt -tail -n 200000 step3.txt > ./out/ep.test.txt -echo "Cleaning up..." -rm -f step1.txt step2.txt step3.txt -echo "Preprocessing done. 
Now you can give the produced ./out dir as argument to data.py script for conversion and continue as described in the main README.md" diff --git a/example/README.md b/scripts/README.md similarity index 100% rename from example/README.md rename to scripts/README.md diff --git a/example/dont_run_me_run_the_other_script_instead.py b/scripts/convert_europarl.py old mode 100644 new mode 100755 similarity index 95% rename from example/dont_run_me_run_the_other_script_instead.py rename to scripts/convert_europarl.py index cfee1d8..9f42424 --- a/example/dont_run_me_run_the_other_script_instead.py +++ b/scripts/convert_europarl.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding: utf-8 from __future__ import division, print_function @@ -22,9 +23,11 @@ is_number = lambda x: len(numbers.sub("", x)) / len(x) < 0.6 + def untokenize(line): return line.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot") + def skip(line): if line.strip() == '': @@ -39,6 +42,7 @@ def skip(line): return False + def process_line(line): tokens = word_tokenize(line) @@ -57,8 +61,10 @@ def process_line(line): return untokenize(" ".join(output_tokens) + " ") + skipped = 0 +print('Reading %s and outputting to %s.' % (sys.argv[1], sys.argv[2])) with open(sys.argv[2], 'w', encoding='utf-8') as out_txt: with open(sys.argv[1], 'r', encoding='utf-8') as text: diff --git a/scripts/convert_europarl.sh b/scripts/convert_europarl.sh new file mode 100755 index 0000000..ed2e989 --- /dev/null +++ b/scripts/convert_europarl.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +HOME_DIR="$(dirname "$(readlink -f "$0")")" +DEST=../data/raw/training-monolingual-europarl.tgz +WORKING_DIR=../data/raw/training-monolingual-europarl +POSTPROCESSED_DIR=../data/postprocessed + +if [ -f "$DEST" ]; then + echo "$DEST exists. Skipping download." +else + echo "$DEST does not exist. Downloading..." 
+ wget -qO- http://hltshare.fbk.eu/IWSLT2012/training-monolingual-europarl.tgz --output-document $DEST + tar xvz $DEST +fi + +echo "Step 1/3" +mkdir -p $WORKING_DIR/out +grep -v " '[^ ]" $WORKING_DIR/europarl-v7.en | \ +grep -v \'\ s\ | \ +grep -v \'\ ll\ | \ +grep -v \'\ ve\ | \ +grep -v \'\ m\ > $WORKING_DIR/out/step1.txt + +echo "Step 2/3" +python convert_europarl.py $WORKING_DIR/out/step1.txt $WORKING_DIR/out/step2.txt + +echo "Step 3/3" +head -n -400000 $WORKING_DIR/out/step2.txt > $POSTPROCESSED_DIR/europarl.train.txt +tail -n 400000 $WORKING_DIR/out/step2.txt > $WORKING_DIR/out/step3.txt +head -n -200000 $WORKING_DIR/out/step3.txt > $POSTPROCESSED_DIR/europarl.dev.txt +tail -n 200000 $WORKING_DIR/out/step3.txt > $POSTPROCESSED_DIR/europarl.test.txt + +#echo "Cleaning up..." +#rm -f step1.txt step2.txt step3.txt +echo "Preprocessing done. Now you can give the produced $POSTPROCESSED_DIR dir as argument to data.py script for conversion and continue as described in the main README.md" diff --git a/scripts/convert_ted.py b/scripts/convert_ted.py new file mode 100755 index 0000000..ebca664 --- /dev/null +++ b/scripts/convert_ted.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +import sys +import codecs + +mapping = {"COMMA": ",COMMA", "PERIOD": ".PERIOD", "QUESTION": "?QUESTIONMARK", "O": ""} +counts = dict((p, 0) for p in mapping) + +print('Reading %s and outputting to %s.' 
% (sys.argv[1], sys.argv[2])) +with codecs.open(sys.argv[1], 'r', 'utf-8', 'ignore') as f_in, \ + codecs.open(sys.argv[2], 'w', 'utf-8') as f_out: + + for i, line in enumerate(f_in): + + line = line.replace('?', '') + + parts = line.split() + + if len(parts) == 0: + continue + + if len(parts) == 1: + word = "" + punct = parts[0] + else: + word, punct = parts + + counts[punct] += 1 + + f_out.write("%s %s " % (word, mapping[punct])) + +print("Counts:") +for p, c in counts.items(): + print("%s: %d" % (p, c)) diff --git a/scripts/convert_ted.sh b/scripts/convert_ted.sh new file mode 100755 index 0000000..bc22fcb --- /dev/null +++ b/scripts/convert_ted.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e +python convert_ted.py ../data/raw/LREC/dev2012 ../data/postprocessed/ted.dev.txt +python convert_ted.py ../data/raw/LREC/test2011 ../data/postprocessed/ted.test.txt +python convert_ted.py ../data/raw/LREC/test2011asr ../data/postprocessed/ted.test-asr.txt +python convert_ted.py ../data/raw/LREC/train2012 ../data/postprocessed/ted.train.txt diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100755 index 0000000..cea09e6 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +./convert_europarl.sh +./convert_ted.sh From d231fbebe8f925f556c4470b84e2173d54d1dcb7 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Fri, 7 Aug 2020 11:26:29 -0400 Subject: [PATCH 14/19] Moved raw data folder. 
--- .gitignore | 1 + scripts/convert_europarl.sh | 4 ++-- scripts/convert_ted.sh | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 151848d..638b34f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ target/ *.out *.geany /data +/raw diff --git a/scripts/convert_europarl.sh b/scripts/convert_europarl.sh index ed2e989..a80e526 100755 --- a/scripts/convert_europarl.sh +++ b/scripts/convert_europarl.sh @@ -2,8 +2,8 @@ set -e HOME_DIR="$(dirname "$(readlink -f "$0")")" -DEST=../data/raw/training-monolingual-europarl.tgz -WORKING_DIR=../data/raw/training-monolingual-europarl +DEST=../raw/training-monolingual-europarl.tgz +WORKING_DIR=../raw/training-monolingual-europarl POSTPROCESSED_DIR=../data/postprocessed if [ -f "$DEST" ]; then diff --git a/scripts/convert_ted.sh b/scripts/convert_ted.sh index bc22fcb..b34bf00 100755 --- a/scripts/convert_ted.sh +++ b/scripts/convert_ted.sh @@ -1,6 +1,6 @@ #!/bin/bash set -e -python convert_ted.py ../data/raw/LREC/dev2012 ../data/postprocessed/ted.dev.txt -python convert_ted.py ../data/raw/LREC/test2011 ../data/postprocessed/ted.test.txt -python convert_ted.py ../data/raw/LREC/test2011asr ../data/postprocessed/ted.test-asr.txt -python convert_ted.py ../data/raw/LREC/train2012 ../data/postprocessed/ted.train.txt +python convert_ted.py ../raw/LREC/dev2012 ../raw/postprocessed/ted.dev.txt +python convert_ted.py ../raw/LREC/test2011 ../raw/postprocessed/ted.test.txt +python convert_ted.py ../raw/LREC/test2011asr ../raw/postprocessed/ted-asr.test.txt +python convert_ted.py ../raw/LREC/train2012 ../raw/postprocessed/ted.train.txt From fe9c78328d736a428c6279fb45ebec32c0d0f2c7 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Fri, 7 Aug 2020 13:16:11 -0400 Subject: [PATCH 15/19] Fixed pickle attribute error. 
--- punctuator/data.py | 2 +- punctuator/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/punctuator/data.py b/punctuator/data.py index 348e040..d3a31e3 100644 --- a/punctuator/data.py +++ b/punctuator/data.py @@ -252,7 +252,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra embeddings.append(e) with open("We.pcl", 'wb') as f: - cPickle.dump(embeddings, f, cPickle.HIGHEST_PROTOCOL) + cPickle.dump(embeddings, f) else: vocabulary = build_vocabulary(word_counts) write_vocabulary(vocabulary, WORD_VOCAB_FILE) diff --git a/punctuator/models.py b/punctuator/models.py index 31493a4..f33a1f1 100644 --- a/punctuator/models.py +++ b/punctuator/models.py @@ -247,7 +247,7 @@ def save(self, file_path, gsums=None, learning_rate=None, validation_ppl_history } with open(file_path, 'wb') as f: - cPickle.dump(state, f, protocol=cPickle.HIGHEST_PROTOCOL) + cPickle.dump(state, f) class GRUstage2(GRU): From 7939f803493d7dbfdcfbee08f32d1726ce2187da Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Fri, 7 Aug 2020 14:09:02 -0400 Subject: [PATCH 16/19] Fixed venv init script. --- init_virtualenv.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init_virtualenv.sh b/init_virtualenv.sh index 4921e46..5c3f6e2 100755 --- a/init_virtualenv.sh +++ b/init_virtualenv.sh @@ -14,5 +14,6 @@ python3.7 -m venv $REL_DIR.env . 
$REL_DIR.env/bin/activate pip install -U pip pip install -U setuptools +pip install pypandoc -pip install --cache-dir $CACHE_DIR -r requirements.txt requirements-test.txt +pip install --cache-dir $CACHE_DIR -r requirements.txt -r requirements-test.txt From a2f0c595049ebbc0747c07020b31025d6d6a8a8d Mon Sep 17 00:00:00 2001 From: noahnewberger Date: Fri, 7 Aug 2020 14:15:46 -0400 Subject: [PATCH 17/19] Adding bytes compatability to new_branch --- punctuator/models.py | 30 ++++++++++++++++++++++++++++++ punctuator/punc.py | 14 ++++++++++---- punctuator/tests.py | 10 ++++++++-- 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/punctuator/models.py b/punctuator/models.py index 31493a4..8932508 100644 --- a/punctuator/models.py +++ b/punctuator/models.py @@ -85,6 +85,36 @@ def load(file_path, minibatch_size, x, p=None): return net, (gsums, state["learning_rate"], state["validation_ppl_history"], state["epoch"], rng) +def loads(file_bytes, minibatch_size, x, p=None): + + state = cPickle.loads(file_bytes, **cpickle_options) + + logging.info('Looking up %s.', state["type"]) + # Model = getattr(models, state["type"]) + Model = globals()[state["type"]] + + rng = np.random + rng.set_state(state["random_state"]) + + net = Model( + rng=rng, + x=x, + minibatch_size=minibatch_size, + n_hidden=state["n_hidden"], + x_vocabulary=state["x_vocabulary"], + y_vocabulary=state["y_vocabulary"], + stage1_model_file_name=state.get("stage1_model_file_name", None), + p=p + ) + + for net_param, state_param in zip(net.params, state["params"]): + net_param.set_value(state_param, borrow=True) + + gsums = [theano.shared(gsum) for gsum in state["gsums"]] if state["gsums"] else None + + return net, (gsums, state["learning_rate"], state["validation_ppl_history"], state["epoch"], rng) + + class GRULayer: def __init__(self, rng, n_in, n_out, minibatch_size): diff --git a/punctuator/punc.py b/punctuator/punc.py index 2cc6ca3..2313232 100644 --- a/punctuator/punc.py +++ b/punctuator/punc.py @@ 
-165,6 +165,8 @@ def restore(output_file, text, word_vocabulary, reverse_punctuation_vocabulary, class Punctuator: def model_exists(self, fn): + if isinstance(fn, bytes): + return fn if os.path.isfile(fn): return fn _fn = os.path.join(PUNCTUATOR_DATA_DIR, fn) @@ -186,16 +188,20 @@ def __init__(self, model_file, use_pauses=False): p = T.matrix('p') logging.info("Loading model parameters...") - net, _ = models.load(model_file, 1, x, p) - + if isinstance(model_file, bytes): + net, _ = models.loads(model_file, 1, x, p) + else: + net, _ = models.load(model_file, 1, x, p) logging.info("Building model...") self.predict = theano.function(inputs=[x, p], outputs=net.y) else: logging.info("Loading model parameters...") - net, _ = models.load(model_file, 1, x) - + if isinstance(model_file, bytes): + net, _ = models.loads(model_file, 1, x) + else: + net, _ = models.load(model_file, 1, x) logging.info("Building model...") self.predict = theano.function(inputs=[x], outputs=net.y) diff --git a/punctuator/tests.py b/punctuator/tests.py index f122eab..9d1788b 100644 --- a/punctuator/tests.py +++ b/punctuator/tests.py @@ -50,12 +50,18 @@ def test_punctuate(self): if not os.path.isfile(model_file): model_file = download_model() print('Model file:', model_file) - + # Check if file can be read in as bytes + infile = open(model_file, 'rb') + data = infile.read() + t0 = time.time() + p = Punctuator(data) + td = time.time() - t0 + print('Loaded in %s seconds as bytes.' % td) # Create punctuator. t0 = time.time() p = Punctuator(model_file=model_file) td = time.time() - t0 - print('Loaded in %s seconds.' % td) + print('Loaded in %s seconds from path.' % td) # Add punctuation. for input_text, expect_output_text in samples: From 7e8463aa37ef2ba0600191b820a2ac72c818a478 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Sat, 8 Aug 2020 12:36:17 -0400 Subject: [PATCH 18/19] Added NUM token. 
--- punctuator/__init__.py | 2 +- punctuator/data.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/punctuator/__init__.py b/punctuator/__init__.py index 9ee9816..c7d0205 100644 --- a/punctuator/__init__.py +++ b/punctuator/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 9, 5) +VERSION = (0, 9, 6) __version__ = '.'.join(map(str, VERSION)) try: from .punc import Punctuator diff --git a/punctuator/data.py b/punctuator/data.py index d3a31e3..9d7a0d1 100644 --- a/punctuator/data.py +++ b/punctuator/data.py @@ -20,6 +20,7 @@ END = "" UNK = "" +NUM = "" SPACE = "_SPACE" PERIOD = ".PERIOD" From 91b0fb0db65cc378b38b1739f0708f2fca1a7426 Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Mon, 10 Aug 2020 09:23:53 -0400 Subject: [PATCH 19/19] Moved samples. --- punctuator/tests.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/punctuator/tests.py b/punctuator/tests.py index 9d1788b..35fa369 100644 --- a/punctuator/tests.py +++ b/punctuator/tests.py @@ -13,30 +13,30 @@ class Tests(unittest.TestCase): - def test_punctuate(self): + samples = [ + ( + 'mary had a little lamb its fleece was white as snow and anywhere that mary went the lamb was sure to go', + 'Mary had a little lamb, its fleece was white as snow and anywhere that mary went, the lamb was sure to go.' + ), + ( + "they say it's only as cold as it feels in your mind i don't buy into that theory much what do you think", + "They say it's only as cold as it feels in your mind. I don't buy into that theory much. What do you think." 
+ ), + ( + "he's a do me a favor go home to your wife", + "He's a do me: a favor go home to your wife.", + ), + ( + "they'll even negotiate your rate with the insurance company", + "They'll even negotiate your rate with the insurance company.", + ), + ( + "for me i wanted to get into commentary some sort of way i didn't know how to do that so i left the firm and i started a business", + "For me, I wanted to get into commentary some sort of way. I didn't know how to do that. So I left the firm and I started a business." + ), + ] - samples = [ - ( - 'mary had a little lamb its fleece was white as snow and anywhere that mary went the lamb was sure to go', - 'Mary had a little lamb, its fleece was white as snow and anywhere that mary went, the lamb was sure to go.' - ), - ( - "they say it's only as cold as it feels in your mind i don't buy into that theory much what do you think", - "They say it's only as cold as it feels in your mind. I don't buy into that theory much. What do you think." - ), - ( - "he's a do me a favor go home to your wife", - "He's a do me: a favor go home to your wife.", - ), - ( - "they'll even negotiate your rate with the insurance company", - "They'll even negotiate your rate with the insurance company.", - ), - ( - "for me i wanted to get into commentary some sort of way i didn't know how to do that so i left the firm and i started a business", - "For me, I wanted to get into commentary some sort of way. I didn't know how to do that. So I left the firm and I started a business." - ), - ] + def test_punctuate(self): # Create temp directory for downloading data. d = tempfile.mkdtemp() @@ -64,7 +64,7 @@ def test_punctuate(self): print('Loaded in %s seconds from path.' % td) # Add punctuation. 
- for input_text, expect_output_text in samples: + for input_text, expect_output_text in self.samples: fout = StringIO() actual_output_text = p.punctuate(input_text) print('expect_output_text:', expect_output_text) @@ -87,7 +87,7 @@ def test_punctuate(self): print('Loaded in %s seconds.' % td) # Confirm punctuations match previous. - for input_text, expect_output_text in samples: + for input_text, expect_output_text in self.samples: fout = StringIO() actual_output_text = p2.punctuate(input_text) print('expect_output_text:', expect_output_text)