-
Notifications
You must be signed in to change notification settings - Fork 0
/
bring_me_emails.py
195 lines (154 loc) · 6.47 KB
/
bring_me_emails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python
##
# bring_me_emails.py
###
"""bring_me_emails
It seemed like email subject lines have been getting longer, and I wanted to
create a database of the subject line lengths in all my emails.
Part of my historical email archive is stored as (gzipped) mbox files (see
<https://en.wikipedia.org/wiki/Mbox>). The rest is stored in a notmuchmail
database (see <https://notmuchmail.org/>).
This program has four functions that each yield a tuple of message-id, subject,
and date headers. One is for parsing mbox directories, one scans notmuch
database, one reads a single Maildir folder, and one parses a local copy of the
Enron Email Dataset (see <https://www.cs.cmu.edu/~./enron/>.)
These are chained together to feed to a main function that stores these values
in a sqlite3 database for later analysis.
The schema for this database is a unique id field (uses the message-id, but can
be cleaned using strip_ids.py), a date field, and the length of the subject line
as an int.
Analysis takes place elsewhere (for example, in this ipython notebook: Subject
line growth-sqlite-scatter.ipynb).
"""
__author__ = "Danny O'Brien <http://www.spesh.com/danny/>"
__copyright__ = "Copyright Danny O'Brien"
__contributors__ = None
__license__ = "GPL v3"
import os
import sys
import gzip
import mailbox
import email.utils
from email.parser import Parser
import datetime
import logging
try:
import notmuch
except ImportError:
pass
import sqlite3
from itertools import chain
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger('bring_me_emails')
try:
import coloredlogs
coloredlogs.install(level=logging.DEBUG, logger=log)
except ImportError:
pass
def init_database(f):
    """Open the statistics database and make sure its table exists.

    NOTE(review): ``f`` is accepted but never used -- the database is always
    opened at the fixed relative path 'emailsubjectlinelengths.db'.  Confirm
    with callers whether ``f`` was meant to be honoured.

    The ``email_stats`` table is created on first use with three columns:
    ``id`` (TEXT primary key), ``date`` (TEXT) and ``subject`` (INT).

    Returns the open sqlite3 connection.
    """
    schema = "create table if not exists email_stats(id TEXT PRIMARY KEY, date TEXT, subject INT)"
    database = sqlite3.connect('emailsubjectlinelengths.db')
    database.execute(schema)
    return database
def _open_mbox(filename):
    """Return a mailbox.mbox for filename, reading through gzip if it ends in '.gz'."""
    if not filename.endswith(".gz"):
        return mailbox.mbox(filename)
    # The mailbox module has no hook for compressed files, so its module-level
    # ``open`` is shadowed with gzip.open -- but only while the mbox object
    # opens its file.  Leaving the override in place would make every later
    # mailbox read (for example a Maildir scan) go through gzip and fail.
    missing = object()
    previous = vars(mailbox).get('open', missing)
    mailbox.open = gzip.open
    try:
        return mailbox.mbox(filename)
    finally:
        if previous is missing:
            del mailbox.open
        else:
            mailbox.open = previous


def bring_me_mboxen(path):
    """Walk down a directory looking for mboxes, gzipped and uncompressed.

    Only feeds on files ending in '.mbx' or '.mbox' or '.mbx.gz'.  Within each
    directory the matching files are visited in sorted order.

    Yields (message-id, subject, date) header strings; a header that a message
    does not carry is yielded as None.
    """
    suffixes = (".mbox", ".mbx", ".mbx.gz")
    for root, _dirs, files in os.walk(path):
        matches = sorted(os.path.join(root, name) for name in files if name.endswith(suffixes))
        for mbox_path in matches:
            box = _open_mbox(mbox_path)
            log.debug("Scanning: {}".format(mbox_path))
            try:
                for em in box:
                    yield (em['message-id'], em['subject'], em['date'])
            finally:
                # Release the file handle before moving on to the next mbox.
                box.close()
def bring_me_a_maildir(path):
    """Scan every message stored in a single Maildir folder.

    Yields (message-id, subject, date) header strings; a header that a message
    does not carry is yielded as None.
    """
    # factory=None selects the mailbox module's own MaildirMessage
    # representation for the messages handed back during iteration.
    folder = mailbox.Maildir(path, factory=None)
    log.debug("Scanning maildir: {}".format(path))
    for em in folder:
        yield (em['message-id'], em['subject'], em['date'])
def bring_me_notmuchmail(path):
    """Scan all emails from the notmuchmail database found at ``path``.

    Message-ids that do not already start with '<' are wrapped in angle
    brackets before being handed on.

    Yields (message-id, subject, date) header strings.
    """
    database = notmuch.Database(path)
    # First ten characters of the timestamp are the YYYY-MM-DD date.
    today = str(datetime.datetime.now())[:10]
    query = database.create_query('date:..' + today)  # until today
    log.debug("Scanning notmuch: {} mails".format(query.count_messages()))
    for message in query.search_messages():
        message_id = message.get_message_id()
        date_header = message.get_header('date')
        subject_header = message.get_header('subject')
        if not message_id.startswith('<'):
            message_id = ''.join(('<', message_id, '>'))
        yield (message_id, subject_header, date_header)
def bring_me_enron(path):
    """Scan all emails from the Enron Email Dataset, a freely available corpus
    of c515K emails.

    Used successfully with the May 7, 2015 version of the dataset, available from:
    https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz

    Based on code originally by Bryan Nehl
    <http://soloso.blogspot.com/2011/07/getting-enron-mail-database-into.html>
    and the author of <http://mongodb-enron-email.s3-website-us-east-1.amazonaws.com/>

    Every file below ``path`` is treated as one cp1252-encoded message.

    Yields (message-id, subject, date) header strings; a header that a message
    does not carry is yielded as None.
    """
    parser = Parser()
    for root, _dirs, files in os.walk(path, topdown=False):
        for filename in files:
            with open(os.path.join(root, filename), 'rb') as data_file:
                raw_contents = data_file.read()
            # 'replace' keeps one odd byte (cp1252 leaves a few code points
            # undefined) from aborting the scan of the entire corpus.
            contents = raw_contents.decode('cp1252', 'replace')
            if not isinstance(contents, str):
                # Python 2: decode() produced ``unicode``; hand the parser a
                # UTF-8 byte string, as this code has always done there.
                contents = contents.encode('utf-8')
            # Only top-level headers are consumed, so skip parsing the body.
            msg = parser.parsestr(contents, headersonly=True)
            yield (msg['message-id'], msg['subject'], msg['date'])
conn = init_database('email.db')
c = conn.cursor()

notmuch_path = '/home/mailuser/mynotmuchpath'
mailbox_path = '/home/mailuser/mymailboxesarehere'
maildir_path = '/home/mailuser/a_single_Maildir_folder/'
# enron_path = '/home/enronuser/maildir/'

# The scanners are generators, so each source is only read once the previous
# one has been exhausted.
all_mails = chain(bring_me_mboxen(mailbox_path), bring_me_a_maildir(maildir_path), bring_me_notmuchmail(notmuch_path))

cnt = 0
earliest = '2038-01-01'
latest = '1970-01-01'

# main email -> database storage routine
try:
    for (mail_id, mail_subject, mail_date) in all_mails:
        try:
            # a message lacking any of the three headers cannot be recorded
            if mail_id is None or mail_subject is None or mail_date is None:
                continue

            # returns a struct_time 9-tuple, last 3 values are useless though
            date_s = email.utils.parsedate(mail_date)
            if date_s is None:
                continue

            # convert the useful first 6 entries in struct_time into a
            # datetime object, stringify, extract out first 10 chars to get YYYY-MM-DD
            day = str(datetime.datetime(*date_s[:6]))[:10]
            if day < '1990-01-01' or day > '2020-01-01':  # FIXME simple outlier remover.
                continue

            # maintain a record of the current range, for display purposes
            if day < earliest:
                earliest = day
            if day > latest:
                latest = day

            # normalize message-id storage to '<id@host>'
            if not mail_id.startswith('<'):
                mail_id = '<' + mail_id + '>'

            subject_len = len(mail_subject)
            c.execute("INSERT OR IGNORE INTO email_stats VALUES (?, ?, ?)", (mail_id, day, subject_len))

            # display and commit SQL transaction every 1000 emails
            cnt += 1
            if cnt % 1000 == 0:
                log.info("{} emails scanned, from {}..{}. Committing.".format(cnt, earliest, latest))
                conn.commit()
        except Exception as err:
            # Best effort: one malformed message must not stop the run.  Only
            # Exception is caught so that Ctrl-C still interrupts the scan.
            log.warning("Happily ignoring error: %r", err)
finally:
    # Without this the rows inserted since the last multiple of 1000 would
    # never be committed.
    conn.commit()
    log.info("{} emails scanned, from {}..{}. Committing.".format(cnt, earliest, latest))