
Commit ec3796c

Added support for database schema migrations (see #26). Added FK cascade behavior (see #63). Added Entry.content_type.
1 parent 844b6b7 · commit ec3796c
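Only two of the five changed files are reproduced below; the FK cascade behavior (#63) and the new Entry.content_type column live in the model definitions outside this excerpt. As a rough, assumed sketch of what those Peewee declarations could look like (field names and defaults are illustrative, not taken from the commit):

# Hypothetical model sketch -- the real models file is not shown in this diff.
from peewee import CharField, ForeignKeyField, Model, SqliteDatabase

db = SqliteDatabase(':memory:')  # stand-in database

class Feed(Model):
    self_link = CharField()
    class Meta:
        database = db

class Entry(Model):
    feed = ForeignKeyField(Feed, on_delete='CASCADE')  # FK cascade, see #63
    content_type = CharField(default='text/html')      # new in this commit
    class Meta:
        database = db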

5 files changed (+157, -125 lines)


coldsweat/commands.py (+16 -7)
@@ -119,13 +119,8 @@ def command_export(parser, options, args):
         f.write(render_template(path.join(template_dir, 'export.xml'), locals()))
 
     print "%d feeds exported for user %s" % (feeds.count(), username)
-
-# @command('upgrade')
-# def command_refresh(parser, options, ags):
-#     '''Upgrade database from a previous version'''
-#     migrate_schema()
-#     print 'Upgrade completed. See log file for more information'
 
+
 @command('setup')
 def command_setup(parser, options, args):
     '''Sets up a working database'''
@@ -159,6 +154,20 @@ def command_setup(parser, options, args):
     print "Setup for user %s completed." % username
 
 
+@command('update')
+def command_update(parser, options, args):
+    '''Update Coldsweat internals from a previous version'''
+
+    try:
+        if migrate_database_schema():
+            print 'Update completed.'
+        else:
+            print 'Database is already up-to-date.'
+    except OperationalError, ex:
+        logger.error('caught exception updating database schema: (%s)' % ex)
+        print 'Error while running database update. See log file for more information.'
+
+
 def pre_command(test_connection=False):
     #try
     connect()
@@ -170,7 +179,7 @@ def run():
 
     default_username, _ = User.DEFAULT_CREDENTIALS
 
-    epilog = "Available commands are: %s" % ', '.join([name for name in COMMANDS])
+    epilog = "Available commands are: %s" % ', '.join(sorted([name for name in COMMANDS]))
     usage='%prog command [options] [args]'
 
     available_options = [
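The migrate_database_schema() helper that the new update command calls is defined in one of the three changed files not shown in this excerpt. Purely as an assumption about the mechanics, a Peewee-based implementation could lean on the playhouse migrator to add the new Entry.content_type column; the database path and return convention below are hypothetical:

# Speculative sketch of migrate_database_schema() -- the real helper lives
# outside this excerpt. Assumes SQLite plus Peewee's playhouse migration extras.
from peewee import CharField, SqliteDatabase
from playhouse.migrate import SqliteMigrator, migrate

def migrate_database_schema():
    db = SqliteDatabase('data/coldsweat.db')   # hypothetical path
    migrator = SqliteMigrator(db)
    migrate(
        # Add the Entry.content_type column introduced by this commit
        migrator.add_column('entry', 'content_type',
                            CharField(default='text/html')),
    )
    return True  # True signals command_update that a migration ran

Whatever the real implementation does, command_update's try/except spells out the contract: return a truthy value when a migration was applied, a falsy one when the schema is already current, and raise OperationalError on failure.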

coldsweat/fetcher.py (+53 -53)
@@ -25,7 +25,6 @@
 MAX_TITLE_LENGTH = 255
 POSITIVE_STATUS_CODES = 200, 302, 304 # Other redirects are handled by Requests
 
-
 # ------------------------------------------------------
 # Entry data
 # ------------------------------------------------------
@@ -42,7 +41,15 @@ def get_feed_timestamp(soup_feed, default):
     logger.debug('no feed timestamp found, using default')
     return default
 
-def get_entry_timestamp(entry, default=None):
+def get_entry_id(entry, default):
+    """
+    Get a useful id from a feed entry
+    """
+    if ('id' in entry) and entry.id:
+        return entry.id
+    return default
+
+def get_entry_timestamp(entry, default):
     """
     Select the best timestamp for an entry
     """
@@ -54,27 +61,40 @@ def get_entry_timestamp(entry, default=None):
     logger.debug('no entry timestamp found, using default')
     return default
 
-def get_entry_title(entry):
+def get_entry_title(entry, default):
     if 'title' in entry:
         return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
-    return 'Untitled'
+    return default
+
+def get_entry_content(entry, default):
+    """
+    Select the best content from an entry
+    """
+
+    candidates = entry.get('content', [])
+    if 'summary_detail' in entry:
+        #logger.debug('summary found for entry %s' % entry.link)
+        candidates.append(entry.summary_detail)
+    for c in candidates:
+        # Match text/html, application/xhtml+xml
+        if 'html' in c.type:
+            return c.type, c.value
+    # Return first result, regardless of MIME type
+    if candidates:
+        return candidates[0].type, candidates[0].value
+
+    logger.debug('no content found for entry %s' % entry.link)
+    return default
+
+# Nullable fields
 
 def get_entry_link(entry):
-    # Special case for Feedburner entries, see: http://bit.ly/1gRAvJv
+    # Special case for FeedBurner entries, see: http://bit.ly/1gRAvJv
     if 'feedburner_origlink' in entry:
-        return entry.feedburner_origlink
+        return scrub_url(entry.feedburner_origlink)
     if 'link' in entry:
-        return entry.link
+        return scrub_url(entry.link)
     return None
-
-
-def get_entry_id(entry, default=None):
-    """
-    Get a useful id from a feed entry
-    """
-    if ('id' in entry) and entry.id:
-        return entry.id
-    return default
 
 def get_entry_author(entry, feed):
     """
@@ -87,26 +107,6 @@ def get_entry_author(entry, feed):
         return feed.author_detail.name
     return None
 
-def get_entry_content(entry):
-    """
-    Select the best content from an entry
-    """
-
-    candidates = entry.get('content', [])
-    if candidates:
-        logger.debug('content found for entry %s' % entry.link)
-    if 'summary_detail' in entry:
-        logger.debug('summary found for entry %s' % entry.link)
-        candidates.append(entry.summary_detail)
-    for c in candidates:
-        if 'html' in c.type: # Match text/html, application/xhtml+xml
-            return c.type, c.value
-        else:
-            # If the content is declared to be (or is determined to be) text/plain,
-            # it will not be sanitized by Feedparser. This is to avoid data loss.
-            return c.type, escape_html(c.value)
-    logger.debug('no content found for entry %s' % entry.link)
-    return 'text/plain', ''
 
 # ------------------------------------------------------
 # Add feed and subscription
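Note the semantics that changed between the removed get_entry_content above and its replacement earlier in this diff: HTML candidates now win outright, a non-HTML first candidate is returned verbatim (the escape_html call for text/plain is gone), and the no-content default comes from the caller. A small illustration, not part of the commit, using feedparser's FeedParserDict since the function relies on its attribute-style access:

# Illustrative only. FeedParserDict allows both entry['link'] and entry.link,
# which is what get_entry_content expects from feedparser output.
from feedparser import FeedParserDict
from coldsweat.fetcher import get_entry_content

entry = FeedParserDict(
    link='http://example.com/post',
    content=[FeedParserDict(type='text/plain', value='plain text version')],
    summary_detail=FeedParserDict(type='text/html', value='<p>rich version</p>'),
)
# The HTML summary wins over the earlier text/plain candidate
print get_entry_content(entry, default=('text/plain', ''))
# -> ('text/html', '<p>rich version</p>')

# With no candidates at all, the caller-supplied default comes back
print get_entry_content(FeedParserDict(link='http://example.com/empty'),
                        default=('text/plain', ''))
# -> ('text/plain', '')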
@@ -188,8 +188,6 @@ def fetch_url(url, timeout=None, etag=None, modified_since=None):
         request_headers['If-None-Match'] = etag
         request_headers['If-Modified-Since'] = format_http_datetime(modified_since)
 
-    timeout = timeout if timeout else config.getint('fetcher', 'timeout')
-
     try:
         response = requests.get(url, timeout=timeout, headers=request_headers)
         logger.debug("got status %d" % response.status_code)
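One behavioral consequence of dropping the config fallback: fetch_url no longer reads the timeout itself, so a call that omits the argument passes timeout=None through to requests.get, which waits indefinitely. The value is now resolved once per fetch_feed run (see the hunk at new line 249 below) and passed explicitly, which any direct caller should also do:

# After this change, supply the timeout at the call site (value illustrative)
response = fetch_url(url, timeout=10, etag=None, modified_since=None)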
@@ -200,7 +198,7 @@
 
 
 
-def add_synthesized_entry(feed, title, content):
+def add_synthesized_entry(feed, title, content_type, content):
     '''
     Create an HTML entry for the given feed.
     '''
@@ -224,7 +222,7 @@ def add_synthesized_entry(feed, title, content):
         title           = title,
         author          = 'Coldsweat',
         content         = content,
-        #@@TODO: mime_type='text/html',
+        content_type    = content_type,
         last_updated_on = now
     )
     entry.save()
@@ -236,7 +234,7 @@ def fetch_feed(feed, add_entries=False):
 
     def synthesize_entry(reason):
         title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
-        return add_synthesized_entry(feed, title, content)
+        return add_synthesized_entry(feed, title, 'text/html', content)
 
     def post_fetch(status, error=False):
         if status:
@@ -251,14 +249,16 @@ def post_fetch(status, error=False):
             synthesize_entry('Feed has accomulated too many errors (last was %s).' % status_title(status))
         feed.save()
 
+    max_history = config.getint('fetcher', 'max_history')
+    interval = config.getint('fetcher', 'min_interval')
+    timeout = config.getint('fetcher', 'timeout')
+
     logger.debug("fetching %s" % feed.self_link)
 
     schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)
 
     now = datetime.utcnow()
-
-    interval = config.getint('fetcher', 'min_interval')
-
+
     # Check freshness
     for fieldname in ['last_checked_on', 'last_updated_on']:
         value = getattr(feed, fieldname)
@@ -270,7 +270,7 @@ def post_fetch(status, error=False):
             logger.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
             return
 
-    response = fetch_url(feed.self_link, etag=feed.etag, modified_since=feed.last_updated_on)
+    response = fetch_url(feed.self_link, timeout=timeout, etag=feed.etag, modified_since=feed.last_updated_on)
     if not response:
         # Record as "503 Service unavailable"
         post_fetch(503, error=True)
@@ -291,7 +291,7 @@ def post_fetch(status, error=False):
             feed.is_enabled = False
             logger.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
             synthesize_entry('Feed has a duplicated web address.')
-            post_fetch(DuplicatedFeedError.code)
+            post_fetch(DuplicatedFeedError.code, error=True)
             return
 
     if response.status_code == 304: # Not modified
@@ -302,7 +302,7 @@ def post_fetch(status, error=False):
         feed.is_enabled = False
         logger.warn("%s is gone, disabled" % netloc)
         synthesize_entry('Feed has been removed from the origin server.')
-        post_fetch(response.status_code)
+        post_fetch(response.status_code, error=True)
         return
     elif response.status_code not in POSITIVE_STATUS_CODES: # No good
         logger.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
@@ -339,13 +339,13 @@
             logger.warn('could not find guid for entry from %s, skipped' % netloc)
             continue
 
-        title                   = get_entry_title(parsed_entry)
-        mime_type, content      = get_entry_content(parsed_entry)
-        timestamp               = get_entry_timestamp(parsed_entry, default=now)
-        author                  = get_entry_author(parsed_entry, soup.feed)
+        author                  = get_entry_author(parsed_entry, soup.feed)
+
+        title                   = get_entry_title(parsed_entry, default='Untitled')
+        content_type, content   = get_entry_content(parsed_entry, default=('text/plain', ''))
+        timestamp               = get_entry_timestamp(parsed_entry, default=now)
 
         # Skip ancient feed items
-        max_history = config.getint('fetcher', 'max_history')
         if max_history and ((now - timestamp).days > max_history):
             logger.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
             continue
@@ -364,7 +364,7 @@
             title           = title,
             author          = author,
             content         = content,
-            #@@TODO: add mime_type too
+            content_type    = content_type,
             link            = link,
             last_updated_on = timestamp
         )
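Taken together with the removal of the escape_html fallback in get_entry_content, the two content_type hunks shift MIME handling from fetch time to display time: content is stored verbatim alongside its type. A speculative sketch of the rendering-side counterpart, which is not part of this commit:

# Hypothetical display-time counterpart (none of this is in the diff above):
# with content_type persisted, plain text can be escaped when rendered.
def render_entry_body(entry):
    if 'html' in entry.content_type:
        return entry.content            # feedparser already sanitized HTML
    return escape_html(entry.content)   # escape text/plain before display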
