ngram.py
90 lines (76 loc) · 4.44 KB
# The companion of ngram_count.py. This script uses the model generated by ngram_count.py to compute log probabilities, perplexities and other derived scores.
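#
# Example invocation (the file names below are hypothetical and only for illustration;
# the language model is assumed to be a JSON file produced by ngram_count.py):
#
#   python ngram.py -lm model.json \
#                   -test corpus_a.txt corpus_b.txt \
#                   -r scores_a.csv scores_b.csv \
#                   --order 3 --keep_sentences
#
# One results file must be given for every test corpus.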
import argparse
import json
from bncpylib.ngrams import KneserNeySmoothingModel, ContextNgramKneserNey, TagBasedKneserNeySmoothingModel, LaplaceSmoothingModel, StingyKneserNeySmoothingModel
from bncpylib.util import safe_open, safe_open_with_encoding, Timer, collect_ngrams, flush_and_close
def process_arguments():
    parser = argparse.ArgumentParser(description='The script loads a language model generated using ngram_count.py and applies it to a number of corpora, reporting for each sentence in each corpus a number of scores, including its log prob, its perplexity and other derived scores.')
    parser.add_argument('-test', type=str, metavar='CORPORA',
                        help='the test corpora; the script will calculate logprobs, perplexity and so on', required=True, nargs='+')
    parser.add_argument('-lm', type=argparse.FileType('r'), metavar='LANGUAGE_MODEL',
                        help='read the language model from LANGUAGE_MODEL (must be a file)', required=True)
    parser.add_argument('-o', '--order', type=int, metavar='ORDER',
                        help='the order of the generated model. It defaults to 3 (trigrams).',
                        default=3)
    parser.add_argument('-k', '--keep_sentences', action='store_true',
                        help='store the sentences together with the measurements in the csv file')
    parser.add_argument('-r', '--results', type=str, metavar='RESULTS_FILES',
                        help='the results are saved to RESULTS_FILES (one for each test corpus)', nargs='+', required=True)
    parser.add_argument('-t', '--tell_me_when', action='store_true',
                        help='when passed this option the script tries to estimate the time it will take to finish scoring the test corpora')
    # parser.add_argument('-s', '--selective_loading', action='store_true', help='when passed this option the script loads into memory only the ngrams that actually appear in the test data')
    args = parser.parse_args()
    return args
def main(args):
    # load the language model and pick the smoothing scheme it was trained with
    attrs = json.load(args.lm)
    if attrs['smoothing'] == 'kneser-ney':
        lm = KneserNeySmoothingModel.load(attrs)
    elif attrs['smoothing'] == 'context-kneser-ney':
        lm = ContextNgramKneserNey.load(attrs)
    elif attrs['smoothing'] == 'stingy-kneser-ney':
        lm = StingyKneserNeySmoothingModel.load(attrs)
    elif attrs['smoothing'] == 'tag-kneser-ney':
        lm = TagBasedKneserNeySmoothingModel.load(attrs)
    elif attrs['smoothing'] == 'laplace':
        lm = LaplaceSmoothingModel.load(attrs)
    else:
        raise NameError('Not implemented yet')
    # open every test corpus for reading and every results file for writing
    test_corpora = [safe_open_with_encoding(corpus, 'r') for corpus in args.test]
    results_files = [safe_open(results, 'w') for results in args.results]
    if len(test_corpora) != len(results_files):
        raise NameError('The number of output files does not match the number of input files')
    # write the csv header to each results file
    header = 'id,ppl,sent_length,' + lm.combined_scores_csv_header()
    if args.keep_sentences:
        header += ',sentence'
    for results_file in results_files:
        results_file.write(header)
        results_file.write('\n')
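    # Each csv row holds: the sentence id within its corpus, the perplexity of the sentence,
    # its length in tokens, the model-specific combined scores (whatever columns
    # lm.combined_scores_csv_header() declares) and, with --keep_sentences, the raw sentence.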
    if args.tell_me_when:
        # count the total number of lines so the timer can estimate the remaining time
        tot_lines = 0
        for test_corpus in test_corpora:
            for line in test_corpus:
                tot_lines += 1
            test_corpus.seek(0)
        timer = Timer(tot_lines)
    # score every sentence of every test corpus and write one csv row per sentence
    for i in range(len(test_corpora)):
        test_corpus = test_corpora[i]
        results_file = results_files[i]
        counter = 0
        for line in test_corpus:
            if line != '\n':
                tokens = lm.tokenize_sentence(line, args.order)
                scores = lm.combined_scores(tokens)
                n_tokens = len(tokens)
                perplexity = lm.perplexity(line)
                row = '{id},{perpl},{sent_length},{scores}'.format(id=counter, perpl=perplexity, sent_length=n_tokens, scores=','.join(map(str, scores)))
                counter += 1
                if args.keep_sentences:
                    row += ',' + line[:-1]  # drop the trailing newline before appending the sentence
                results_file.write(row)
                results_file.write('\n')
                if args.tell_me_when:
                    timer.advance()
        flush_and_close(results_file)
    if args.tell_me_when:
        timer.done()
if __name__ == '__main__':
    args = process_arguments()
    main(args)