 MAX_TITLE_LENGTH = 255
 POSITIVE_STATUS_CODES = 200, 302, 304 # Other redirects are handled by Requests

-
 # ------------------------------------------------------
 # Entry data
 # ------------------------------------------------------
@@ -42,7 +41,15 @@ def get_feed_timestamp(soup_feed, default):
     logger.debug('no feed timestamp found, using default')
     return default

-def get_entry_timestamp(entry, default=None):
+def get_entry_id(entry, default):
+    """
+    Get a useful id from a feed entry
+    """
+    if ('id' in entry) and entry.id:
+        return entry.id
+    return default
+
+def get_entry_timestamp(entry, default):
     """
     Select the best timestamp for an entry
     """
@@ -54,27 +61,40 @@ def get_entry_timestamp(entry, default=None):
     logger.debug('no entry timestamp found, using default')
     return default

-def get_entry_title(entry):
+def get_entry_title(entry, default):
     if 'title' in entry:
         return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
-    return 'Untitled'
+    return default
+
+def get_entry_content(entry, default):
+    """
+    Select the best content from an entry
+    """
+
+    candidates = entry.get('content', [])
+    if 'summary_detail' in entry:
+        #logger.debug('summary found for entry %s' % entry.link)
+        candidates.append(entry.summary_detail)
+    for c in candidates:
+        # Match text/html, application/xhtml+xml
+        if 'html' in c.type:
+            return c.type, c.value
+    # Return first result, regardless of MIME type
+    if candidates:
+        return candidates[0].type, candidates[0].value
+
+    logger.debug('no content found for entry %s' % entry.link)
+    return default
+
+# Nullable fields

 def get_entry_link(entry):
-    # Special case for Feedburner entries, see: http://bit.ly/1gRAvJv
+    # Special case for FeedBurner entries, see: http://bit.ly/1gRAvJv
     if 'feedburner_origlink' in entry:
-        return entry.feedburner_origlink
+        return scrub_url(entry.feedburner_origlink)
     if 'link' in entry:
-        return entry.link
+        return scrub_url(entry.link)
     return None
-
-
-def get_entry_id(entry, default=None):
-    """
-    Get a useful id from a feed entry
-    """
-    if ('id' in entry) and entry.id:
-        return entry.id
-    return default

 def get_entry_author(entry, feed):
     """
@@ -87,26 +107,6 @@ def get_entry_author(entry, feed):
         return feed.author_detail.name
     return None

-def get_entry_content(entry):
-    """
-    Select the best content from an entry
-    """
-
-    candidates = entry.get('content', [])
-    if candidates:
-        logger.debug('content found for entry %s' % entry.link)
-    if 'summary_detail' in entry:
-        logger.debug('summary found for entry %s' % entry.link)
-        candidates.append(entry.summary_detail)
-    for c in candidates:
-        if 'html' in c.type: # Match text/html, application/xhtml+xml
-            return c.type, c.value
-        else:
-            # If the content is declared to be (or is determined to be) text/plain,
-            # it will not be sanitized by Feedparser. This is to avoid data loss.
-            return c.type, escape_html(c.value)
-    logger.debug('no content found for entry %s' % entry.link)
-    return 'text/plain', ''

 # ------------------------------------------------------
 # Add feed and subscription
@@ -188,8 +188,6 @@ def fetch_url(url, timeout=None, etag=None, modified_since=None):
         request_headers['If-None-Match'] = etag
         request_headers['If-Modified-Since'] = format_http_datetime(modified_since)

-    timeout = timeout if timeout else config.getint('fetcher', 'timeout')
-
     try:
         response = requests.get(url, timeout=timeout, headers=request_headers)
         logger.debug("got status %d" % response.status_code)
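Note: with the config fallback removed, fetch_url relies on the caller to pass the timeout explicitly. A minimal sketch of the intended call, with illustrative values for the conditional-GET validators:

```python
from datetime import datetime, timedelta

# etag and modified_since are the validators saved from the previous fetch;
# fetch_url turns them into If-None-Match / If-Modified-Since headers, so the
# server may answer 304 Not Modified and the caller can skip parsing.
response = fetch_url(
    'http://example.com/feed.xml',          # illustrative URL
    timeout=10,                             # now always supplied by the caller
    etag='"abc123"',
    modified_since=datetime.utcnow() - timedelta(hours=1),
)
if response and response.status_code == 304:
    pass  # nothing new since the last fetch
```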
@@ -200,7 +198,7 @@ def fetch_url(url, timeout=None, etag=None, modified_since=None):



-def add_synthesized_entry(feed, title, content):
+def add_synthesized_entry(feed, title, content_type, content):
     '''
     Create an HTML entry for the given feed.
     '''
@@ -224,7 +222,7 @@ def add_synthesized_entry(feed, title, content):
         title=title,
         author='Coldsweat',
         content=content,
-        #@@TODO: mime_type='text/html',
+        content_type=content_type,
         last_updated_on=now
     )
     entry.save()
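Note: storing content_type on the entry presupposes a matching column on the Entry model; that schema change is outside this diff, so the following is only a hypothetical peewee-style sketch of what the model would need (field names other than content_type are guesses for illustration):

```python
# Hypothetical sketch only -- not the actual Coldsweat model definition.
from peewee import CharField, DateTimeField, Model, TextField

class Entry(Model):
    guid = CharField()
    title = CharField()
    author = CharField(null=True)
    content = TextField()
    content_type = CharField(default='text/html')  # column implied by content_type=content_type above
    link = CharField(null=True)
    last_updated_on = DateTimeField()
```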
@@ -236,7 +234,7 @@ def fetch_feed(feed, add_entries=False):

     def synthesize_entry(reason):
         title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
-        return add_synthesized_entry(feed, title, content)
+        return add_synthesized_entry(feed, title, 'text/html', content)

     def post_fetch(status, error=False):
         if status:
@@ -251,14 +249,16 @@ def post_fetch(status, error=False):
             synthesize_entry('Feed has accomulated too many errors (last was %s).' % status_title(status))
         feed.save()

+    max_history = config.getint('fetcher', 'max_history')
+    interval = config.getint('fetcher', 'min_interval')
+    timeout = config.getint('fetcher', 'timeout')
+
     logger.debug("fetching %s" % feed.self_link)

     schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

     now = datetime.utcnow()
-
-    interval = config.getint('fetcher', 'min_interval')
-
+
     # Check freshness
     for fieldname in ['last_checked_on', 'last_updated_on']:
         value = getattr(feed, fieldname)
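Note: the three fetcher settings are now read once per fetch_feed call rather than scattered through the function (max_history was previously fetched inside the per-entry loop, see the @@ -339 hunk below). A sketch of the configuration keys involved; the parser class, file path, and values are assumptions, since the module-level config object is loaded elsewhere in Coldsweat:

```python
# Illustrative Python 2 ConfigParser equivalent of the three lookups above.
from ConfigParser import SafeConfigParser

# Assumed file contents, only to show the keys read by fetch_feed:
#
#   [fetcher]
#   min_interval = 900    # minimum interval between fetches of the same feed
#   max_history = 30      # entry age cutoff, compared in days; 0 disables it
#   timeout = 10          # seconds, passed straight to requests.get()

config = SafeConfigParser()
config.read('etc/config')  # path is an assumption

max_history = config.getint('fetcher', 'max_history')
interval = config.getint('fetcher', 'min_interval')
timeout = config.getint('fetcher', 'timeout')
```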
@@ -270,7 +270,7 @@ def post_fetch(status, error=False):
             logger.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
             return

-    response = fetch_url(feed.self_link, etag=feed.etag, modified_since=feed.last_updated_on)
+    response = fetch_url(feed.self_link, timeout=timeout, etag=feed.etag, modified_since=feed.last_updated_on)
     if not response:
         # Record as "503 Service unavailable"
         post_fetch(503, error=True)
@@ -291,7 +291,7 @@ def post_fetch(status, error=False):
             feed.is_enabled = False
             logger.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
             synthesize_entry('Feed has a duplicated web address.')
-            post_fetch(DuplicatedFeedError.code)
+            post_fetch(DuplicatedFeedError.code, error=True)
             return

     if response.status_code == 304: # Not modified
@@ -302,7 +302,7 @@ def post_fetch(status, error=False):
         feed.is_enabled = False
         logger.warn("%s is gone, disabled" % netloc)
         synthesize_entry('Feed has been removed from the origin server.')
-        post_fetch(response.status_code)
+        post_fetch(response.status_code, error=True)
         return
     elif response.status_code not in POSITIVE_STATUS_CODES: # No good
         logger.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
@@ -339,13 +339,13 @@ def post_fetch(status, error=False):
             logger.warn('could not find guid for entry from %s, skipped' % netloc)
             continue

-        title = get_entry_title(parsed_entry)
-        mime_type, content = get_entry_content(parsed_entry)
-        timestamp = get_entry_timestamp(parsed_entry, default=now)
-        author = get_entry_author(parsed_entry, soup.feed)
+        author = get_entry_author(parsed_entry, soup.feed)
+
+        title = get_entry_title(parsed_entry, default='Untitled')
+        content_type, content = get_entry_content(parsed_entry, default=('text/plain', ''))
+        timestamp = get_entry_timestamp(parsed_entry, default=now)

         # Skip ancient feed items
-        max_history = config.getint('fetcher', 'max_history')
         if max_history and ((now - timestamp).days > max_history):
             logger.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
             continue
@@ -364,7 +364,7 @@ def post_fetch(status, error=False):
             title=title,
             author=author,
             content=content,
-            #@@TODO: add mime_type too
+            content_type=content_type,
             link=link,
             last_updated_on=timestamp
         )