Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
bookworm/paceofchange
bookworm/unigrams.txt
bookworm/jsoncatalog.txt
__pycache__
mainmodelpredictions.coefs.csv
mainmodelpredictions.csv
nationalpredictions.coefs.csv
nationalpredictions.csv

78 changes: 78 additions & 0 deletions SRT-transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import glob
import sys
import os
import numpy as np

from SRP import SRP

hasher = SRP(6400)

def standardize_word(word):
    """
    Normalize a token from the source files' pipe-bracket conventions.

    Tokens wrapped in pipes (e.g. "|arabic|") are placeholder classes:
    the pipes are stripped and the class names "arabic", "+digit" and
    "digit" are collapsed to digit strings. The bare pipe token "|" and
    words containing no pipe pass through unchanged.
    """
    if word.find("|") == -1:
        return word
    if word == "|":
        return word

    word = word.strip("|")
    # str.replace returns a NEW string; the original code discarded the
    # return values, so these substitutions silently never took effect.
    word = word.replace("arabic", "1")
    word = word.replace("+digit", "11")  # must run before the plain "digit" rule
    word = word.replace("digit", "1")
    return word

# Create the output directories for each flavor of transform.
# exist_ok=True tolerates re-runs without masking unrelated OSErrors
# (permissions, bad paths) the way the previous bare `except: pass` did.
os.makedirs("SRPs", exist_ok=True)
os.makedirs("logSRPs", exist_ok=True)
os.makedirs("littleLogSRPs", exist_ok=True)
os.makedirs("littleSRPs", exist_ok=True)


def _write_vector(path, vector):
    """Write one component per line as "V<i>\t<value>" to *path*."""
    with open(path, "w") as f:
        for i, value in enumerate(vector):
            f.write("V%i\t%s\n" % (i, str(value)))


# Read each per-poem word/count file, hash it into a stable random
# projection, and write three variants of the resulting vector.
for file in glob.glob("poems/*"):
    words = []
    counts = []
    # `with` guarantees the input handle is closed; the original code
    # opened each file and never closed it.
    with open(file) as doc:
        for line in doc:
            (word, count) = line.split("\t")
            word = standardize_word(word)
            try:
                words.append(word.decode("utf-8"))
            except AttributeError:
                # Python 3 strings have no .decode; they are already unicode.
                words.append(word)
            counts.append(int(count))

    # Default (log-weighted) transform, L2-normalized.
    scores = hasher.stable_transform(words, counts)
    scores = scores.astype(np.float64)
    scores = np.divide(scores, np.linalg.norm(scores))
    _write_vector(file.replace("poems/", "SRPs/"), scores)

    # Raw-count transform, L2-normalized.
    scores = hasher.stable_transform(words, counts, log=False)
    scores = scores.astype(np.float64)
    scores = np.divide(scores, np.linalg.norm(scores))
    _write_vector(file.replace("poems/", "littleSRPs/"), scores)

    # Log-weighted transform, left unnormalized (matches original behavior).
    log_scores = hasher.stable_transform(words, counts, log=True)
    _write_vector(file.replace("poems/", "littleLogSRPs/"), log_scores)

11 changes: 11 additions & 0 deletions bookworm/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

# Build everything the bookworm needs: the JSON metadata catalog,
# the unigram counts, and a checkout of the BookwormDB code.
all: jsoncatalog.txt unigrams.txt paceofchange

# Clone the BookwormDB project into ./paceofchange ($@ is the target name).
paceofchange:
git clone git@github.com:Bookworm-Project/BookwormDB $@

# Regenerate the per-volume unigram stream (reads ../poems via the script).
unigrams.txt:
python makeUnigrams.py > $@

# Regenerate the JSON metadata catalog (reads ../poemeta.csv via the script).
jsoncatalog.txt:
python makeJSONcatalog.py > $@
16 changes: 16 additions & 0 deletions bookworm/field_descriptions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[ {"datatype": "categorical", "field": "canon", "unique": true, "type": "character"},
{"datatype": "etc", "field": "imprint", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "nationality", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "author", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "pubname", "unique": true, "type": "character"},
{"datatype": "categorical", "field": "inferreddate", "unique": true, "type": "integer"},
{"datatype": "categorical", "field": "recept", "unique": true, "type": "character"},
{"datatype": "time", "field": "birth", "unique": true, "type":"character","derived":[{"resolution":"year"}]}, {"field":"searchstring","datatype":"searchstring","type":"text","unique":true},

{"datatype": "etc", "field": "judge", "unique": true, "type": "character"},
{"datatype": "time", "field": "yrrev", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "categorical", "field": "gender", "unique": true, "type": "character"},
{"datatype": "time", "field": "firstpub", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "time", "field": "actualdate", "unique": true, "type": "character","derived":[{"resolution":"year"}]},
{"datatype": "categorical", "field": "title", "unique": true, "type": "character"},
{"datatype": "time", "field": "pubrev", "unique": true, "type": "character","derived":[{"resolution":"year"}]}]
23 changes: 23 additions & 0 deletions bookworm/makeJSONcatalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd
import json
import pandas
import numpy as np

# Emit one JSON object per row of the poem metadata table, one per
# line on stdout -- the jsoncatalog.txt format Bookworm expects.
df = pd.read_csv("../poemeta.csv")

# Row-to-dict conversion adapted from https://gist.github.com/mikedewar/1486027
d = [
    dict(zip(df.columns, row))
    for row in df.values
]

for row in d:
    # Bookworm keys each document by "filename"; reuse the docid.
    row['filename'] = row['docid']
    row['searchstring'] = "'%(title)s,' by %(author)s (%(imprint)s)" % row
    # print() -- the original py2-only `print x` statement is a
    # SyntaxError under Python 3, which the rest of this repo targets.
    print(json.dumps(row))


10 changes: 10 additions & 0 deletions bookworm/makeUnigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import sys

# Concatenate every per-poem unigram file into a single Bookworm
# unigram stream on stdout, prefixing each line with its docid.
for file in os.listdir("../poems"):
    if file.endswith("tsv"):
        # The docid is the filename minus its ".poe.tsv" extension.
        identifier = file.replace(".poe.tsv", "")
        # `with` closes each handle; the original left every file open.
        with open("../poems/" + file) as poem:
            for line in poem:
                line = line.rstrip("\n")
                # print() -- the py2-only `print x` form is a
                # SyntaxError under Python 3.
                print(identifier + "\t" + line)

2 changes: 1 addition & 1 deletion modelingprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def model_one_volume(data5tuple):
newmodel.fit(trainingset, yvals)

testset = (testset - means) / stdevs
prediction = newmodel.predict_proba(testset)[0][1]
prediction = newmodel.predict_proba(testset.values.reshape(1,-1))[0][1]
if i % 50 == 0:
print(i)
# print(str(i) + " - " + str(len(listtoexclude)))
Expand Down
21 changes: 15 additions & 6 deletions parallel_crossvalidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def forceint(astring):

return intval


def get_features(wordcounts, wordlist):
numwords = len(wordlist)
wordvec = np.zeros(numwords)
Expand Down Expand Up @@ -265,14 +266,18 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
continue
word = fields[0]
if len(word) > 0 and word[0].isalpha():
count = int(fields[1])
count = float(fields[1])
wordcounts[word] += 1
# for initial feature selection we use the number of
# *documents* that contain a given word,
# so it's just +=1.

vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

if sourcefolder=="poems/":
vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]
else:
            # In an SRT, we can just take them arbitrarily. The top ten features are [V0,V1,...,V9].
vocablist = ["V" + str(i) for i in range(numfeatures)]

# vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
# Feature selection is deprecated. There are cool things
# we could do with feature selection,
Expand Down Expand Up @@ -334,7 +339,7 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
continue

word = fields[0]
count = int(fields[1])
count = float(fields[1])
voldict[word] = count
totalcount += count

Expand All @@ -348,9 +353,13 @@ def create_model(paths, exclusions, thresholds, classifyconditions):
voldata.append(features)
else:
features = get_features(voldict, vocablist)
voldata.append(features / (totalcount + 0.001))


if sourcefolder=="poems/":
voldata.append(features / (totalcount + 0.001))
else:
# For SRT transformations, normalization is already handled
voldata.append(features)

volsizes[volid] = totalcount
classflag = classdictionary[volid]
classvector.append(classflag)
Expand Down
56 changes: 43 additions & 13 deletions replicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import parallel_crossvalidate as pc
import sys
import os

allowable = {"full", "quarters", "nations", "genders", "canon", "halves"}

Expand All @@ -30,11 +31,23 @@ def instructions():

assert command in allowable

import traceback
import warnings
import sys

def warn_with_traceback(message, category, filename, lineno, file=None, line=None):

log = file if hasattr(file,'write') else sys.stderr
traceback.print_stack(file=log)
log.write(warnings.formatwarning(message, category, filename, lineno, line))

warnings.showwarning = warn_with_traceback

if command == 'full':

## PATHS.

sourcefolder = 'poems/'
sourcefolder = 'SRTs/'
extension = '.poe.tsv'
classpath = 'poemeta.csv'
outputpath = 'mainmodelpredictions.csv'
Expand Down Expand Up @@ -77,18 +90,35 @@ def instructions():
numfeatures = 3200
regularization = .00007

paths = (sourcefolder, extension, classpath, outputpath)
exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
thresholds = (pastthreshold, futurethreshold)
classifyconditions = (category2sorton, positive_class, datetype, numfeatures, regularization)

rawaccuracy, allvolumes, coefficientuples = pc.create_model(paths, exclusions, thresholds, classifyconditions)

tiltaccuracy = pc.diachronic_tilt(allvolumes, 'linear', [])

print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
# I added some dims later. This lets me just append to the existing file.
if os.path.exists("model-results.tsv"):
already_done = set([int(a.split("\t")[0]) for a in open("model-results.tsv").readlines()[1:]])
f = open("model-results.tsv","a")
else:
already_done = set([])
f = open("model-results.tsv","w")
f.write("dims\tfeatures\twith_tilt\taccuracy\n")


for numfeatures in [20,40,80,160,320,800,1200,1600,2400,3200,6400]:
if numfeatures in already_done:
continue
for sourcefolder in ["poems/","logSRTs/", "SRTs"]:
paths = (sourcefolder, extension, classpath, outputpath)
classifyconditions = (category2sorton, positive_class, datetype, numfeatures, regularization)
exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
thresholds = (pastthreshold, futurethreshold)
rawaccuracy, allvolumes, coefficientuples = pc.create_model(paths, exclusions, thresholds, classifyconditions)
tiltaccuracy = pc.diachronic_tilt(allvolumes, 'linear', [])
f.write("%i\t%s\tF\t%s\n" % (numfeatures,sourcefolder,str(rawaccuracy)))
f.write("%i\t%s\tT\t%s\n" % (numfeatures,sourcefolder,str(tiltaccuracy)))
f.flush()
results = """
%s features
Using %s for features
If we divide the dataset with a horizontal line at 0.5, accuracy is: %s
Divided with a line fit to the data trend, it's %s""" %(str(numfeatures), str(sourcefolder), str(rawaccuracy), str(tiltaccuracy))
print(results)

elif command == 'quarters':
## PATHS.
Expand Down