Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
bookworm/paceofchange
bookworm/unigrams.txt
bookworm/jsoncatalog.txt
__pycache__
mainmodelpredictions.coefs.csv
mainmodelpredictions.csv
nationalpredictions.coefs.csv
nationalpredictions.csv

78 changes: 78 additions & 0 deletions SRT-transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import glob
import sys
import os
import numpy as np

from SRP import SRP

hasher = SRP(6400)

def standardize_word(word):
    """
    Normalize a token from the source files' pipe-bracket conventions.

    Tokens wrapped in pipes (e.g. "|arabic|") are placeholder classes:
    the pipes are stripped and the class names "arabic", "+digit" and
    "digit" are collapsed to digit strings. The bare pipe token "|" and
    words containing no pipe pass through unchanged.
    """
    if word.find("|") == -1:
        return word
    if word == "|":
        return word

    word = word.strip("|")
    # str.replace returns a NEW string; the original code discarded the
    # return values, so these substitutions silently never took effect.
    word = word.replace("arabic", "1")
    word = word.replace("+digit", "11")  # must run before the plain "digit" rule
    word = word.replace("digit", "1")
    return word

# Create the output directories for each flavor of transform.
# exist_ok=True tolerates re-runs without masking unrelated OSErrors
# (permissions, bad paths) the way the previous bare `except: pass` did.
os.makedirs("SRPs", exist_ok=True)
os.makedirs("logSRPs", exist_ok=True)
os.makedirs("littleLogSRPs", exist_ok=True)
os.makedirs("littleSRPs", exist_ok=True)


def _write_vector(path, vector):
    """Write one component per line as "V<i>\t<value>" to *path*."""
    with open(path, "w") as f:
        for i, value in enumerate(vector):
            f.write("V%i\t%s\n" % (i, str(value)))


# Read each per-poem word/count file, hash it into a stable random
# projection, and write three variants of the resulting vector.
for file in glob.glob("poems/*"):
    words = []
    counts = []
    # `with` guarantees the input handle is closed; the original code
    # opened each file and never closed it.
    with open(file) as doc:
        for line in doc:
            (word, count) = line.split("\t")
            word = standardize_word(word)
            try:
                words.append(word.decode("utf-8"))
            except AttributeError:
                # Python 3 strings have no .decode; they are already unicode.
                words.append(word)
            counts.append(int(count))

    # Default (log-weighted) transform, L2-normalized.
    scores = hasher.stable_transform(words, counts)
    scores = scores.astype(np.float64)
    scores = np.divide(scores, np.linalg.norm(scores))
    _write_vector(file.replace("poems/", "SRPs/"), scores)

    # Raw-count transform, L2-normalized.
    scores = hasher.stable_transform(words, counts, log=False)
    scores = scores.astype(np.float64)
    scores = np.divide(scores, np.linalg.norm(scores))
    _write_vector(file.replace("poems/", "littleSRPs/"), scores)

    # Log-weighted transform, left unnormalized (matches original behavior).
    log_scores = hasher.stable_transform(words, counts, log=True)
    _write_vector(file.replace("poems/", "littleLogSRPs/"), log_scores)

11 changes: 11 additions & 0 deletions bookworm/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

# Build everything the bookworm needs: the JSON metadata catalog,
# the unigram counts, and a checkout of the BookwormDB code.
all: jsoncatalog.txt unigrams.txt paceofchange

# Clone the BookwormDB project into ./paceofchange ($@ is the target name).
paceofchange:
git clone git@github.com:Bookworm-Project/BookwormDB $@

# Regenerate the per-volume unigram stream (reads ../poems via the script).
unigrams.txt:
python makeUnigrams.py > $@

# Regenerate the JSON metadata catalog (reads ../poemeta.csv via the script).
jsoncatalog.txt:
python makeJSONcatalog.py > $@
16 changes: 16 additions & 0 deletions bookworm/field_descriptions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[ {"datatype": "categorical", "field": "canon", "unique": true, "type": "character"},
{"datatype": "etc", "field": "imprint", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "nationality", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "author", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "pubname", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "inferreddate", "unique": true, "type": "integer"},
{"datatype": "categorical", "field": "recept", "unique": true, "type": "character"},
{"datatype": "time", "field": "birth", "unique": true, "type":"character","derived":[{"resolution":"year"}]}, {"field":"searchstring","datatype":"searchstring","type":"text","unique":true},

{"datatype": "etc", "field": "judge", "unique": true, "type": "character"},
{"datatype": "time", "field": "yrrev", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "categorical", "field": "gender", "unique": true, "type": "character"},
{"datatype": "time", "field": "firstpub", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "time", "field": "actualdate", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "categorical", "field": "title", "unique": true, "type": "character"},
{"datatype": "time", "field": "pubrev", "unique": true, "type": "character","derived":[{"resolution":"year"}]}]
23 changes: 23 additions & 0 deletions bookworm/makeJSONcatalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd
import json
import pandas
import numpy as np

# Emit one JSON object per row of the poem metadata table, one per
# line on stdout -- the jsoncatalog.txt format Bookworm expects.
df = pd.read_csv("../poemeta.csv")

# Row-to-dict conversion adapted from https://gist.github.com/mikedewar/1486027
d = [
    dict(zip(df.columns, row))
    for row in df.values
]

for row in d:
    # Bookworm keys each document by "filename"; reuse the docid.
    row['filename'] = row['docid']
    row['searchstring'] = "'%(title)s,' by %(author)s (%(imprint)s)" % row
    # print() -- the original py2-only `print x` statement is a
    # SyntaxError under Python 3, which the rest of this repo targets.
    print(json.dumps(row))


10 changes: 10 additions & 0 deletions bookworm/makeUnigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import sys

# Concatenate every per-poem unigram file into a single Bookworm
# unigram stream on stdout, prefixing each line with its docid.
for file in os.listdir("../poems"):
    if file.endswith("tsv"):
        # The docid is the filename minus its ".poe.tsv" extension.
        identifier = file.replace(".poe.tsv", "")
        # `with` closes each handle; the original left every file open.
        with open("../poems/" + file) as poem:
            for line in poem:
                line = line.rstrip("\n")
                # print() -- the py2-only `print x` form is a
                # SyntaxError under Python 3.
                print(identifier + "\t" + line)

2 changes: 1 addition & 1 deletion modelingprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def model_one_volume(data5tuple):
newmodel.fit(trainingset, yvals)

testset = (testset - means) / stdevs
prediction = newmodel.predict_proba(testset)[0][1]
prediction = newmodel.predict_proba(testset.values.reshape(1,-1))[0][1]
if i % 50 == 0:
print(i)
# print(str(i) + " - " + str(len(listtoexclude)))
Expand Down
21 changes: 15 additions & 6 deletions parallel_crossvalidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def forceint(astring):

return intval


def get_features(wordcounts, wordlist):
numwords = len(wordlist)
wordvec = np.zeros(numwords)
Expand Down Expand Up @@ -265,14 +266,18 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
continue
word = fields[0]
if len(word) > 0 and word[0].isalpha():
count = int(fields[1])
count = float(fields[1])
wordcounts[word] += 1
# for initial feature selection we use the number of
# *documents* that contain a given word,
# so it's just +=1.

vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

if sourcefolder=="poems/":
vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]
else:
            # In an SRT, we can just take them arbitrarily. The top ten features are [V0,V1,...,V9].
vocablist = ["V" + str(i) for i in range(numfeatures)]

# vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
# Feature selection is deprecated. There are cool things
# we could do with feature selection,
Expand Down Expand Up @@ -334,7 +339,7 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
continue

word = fields[0]
count = int(fields[1])
count = float(fields[1])
voldict[word] = count
totalcount += count

Expand All @@ -348,9 +353,13 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
voldata.append(features)
else:
features = get_features(voldict, vocablist)
voldata.append(features / (totalcount + 0.001))


if sourcefolder=="poems/":
voldata.append(features / (totalcount + 0.001))
else:
# For SRT transformations, normalization is already handled
voldata.append(features)

volsizes[volid] = totalcount
classflag = classdictionary[volid]
classvector.append(classflag)
Expand Down
56 changes: 43 additions & 13 deletions replicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import parallel_crossvalidate as pc
import sys
import os

allowable = {"full", "quarters", "nations", "genders", "canon", "halves"}

Expand All @@ -30,11 +31,23 @@ def instructions():

assert command in allowable

import traceback
import warnings
import sys

def warn_with_traceback(message, category, filename, lineno, file=None, line=None):

log = file if hasattr(file,'write') else sys.stderr
traceback.print_stack(file=log)
log.write(warnings.formatwarning(message, category, filename, lineno, line))

warnings.showwarning = warn_with_traceback

if command == 'full':

## PATHS.

sourcefolder = 'poems/'
sourcefolder = 'SRTs/'
extension = '.poe.tsv'
classpath = 'poemeta.csv'
outputpath = 'mainmodelpredictions.csv'
Expand Down Expand Up @@ -77,18 +90,35 @@ def instructions():
numfeatures = 3200
regularization = .00007

paths = (sourcefolder, extension, classpath, outputpath)
exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
thresholds = (pastthreshold, futurethreshold)
classifyconditions = (category2sorton, positive_class, datetype, numfeatures, regularization)

rawaccuracy, allvolumes, coefficientuples = pc.create_model(paths, exclusions, thresholds, classifyconditions)

tiltaccuracy = pc.diachronic_tilt(allvolumes, 'linear', [])

print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
# I added some dims later. This lets me just append to the existing file.
if os.path.exists("model-results.tsv"):
already_done = set([int(a.split("\t")[0]) for a in open("model-results.tsv").readlines()[1:]])
f = open("model-results.tsv","a")
else:
already_done = set([])
f = open("model-results.tsv","w")
f.write("dims\tfeatures\twith_tilt\taccuracy\n")


for numfeatures in [20,40,80,160,320,800,1200,1600,2400,3200,6400]:
if numfeatures in already_done:
continue
for sourcefolder in ["poems/","logSRTs/", "SRTs"]:
paths = (sourcefolder, extension, classpath, outputpath)
classifyconditions = (category2sorton, positive_class, datetype, numfeatures, regularization)
exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
thresholds = (pastthreshold, futurethreshold)
rawaccuracy, allvolumes, coefficientuples = pc.create_model(paths, exclusions, thresholds, classifyconditions)
tiltaccuracy = pc.diachronic_tilt(allvolumes, 'linear', [])
f.write("%i\t%s\tF\t%s\n" % (numfeatures,sourcefolder,str(rawaccuracy)))
f.write("%i\t%s\tT\t%s\n" % (numfeatures,sourcefolder,str(tiltaccuracy)))
f.flush()
results = """
%s features
Using %s for features
If we divide the dataset with a horizontal line at 0.5, accuracy is: %s
Divided with a line fit to the data trend, it's %s""" %(str(numfeatures), str(sourcefolder), str(rawaccuracy), str(tiltaccuracy))
print(results)

elif command == 'quarters':
## PATHS.
Expand Down