Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ A MongoDB backend for HTTP cache storage. It stores responses using GridFS.
To use it, set the following Scrapy setting in your project::

HTTPCACHE_STORAGE = 'scmongo.httpcache.MongoCacheStorage'

To enable zlib compression, set the following Scrapy setting in your project::

HTTPCACHE_COMPRESSION = 'zlib'
26 changes: 23 additions & 3 deletions scmongo/httpcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class MongoCacheStorage(object):

def __init__(self, settings=conf.settings):
    """Set up the MongoDB-backed HTTP cache from Scrapy settings."""
    # Cache lifetime in seconds; 0 conventionally means "never expire"
    # in Scrapy's HTTPCACHE_EXPIRATION_SECS — TODO confirm handling here.
    self.expire = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    # Validated compression algorithm name: 'zlib' or None (disabled).
    self.compression = self._get_compression_algorithm(settings)
    # Shared MongoDB database handle for all spiders.
    self.db = get_database(settings)
    # Per-spider GridFS instances, populated lazily elsewhere.
    self.fs = {}

Expand All @@ -33,7 +34,8 @@ def retrieve_response(self, spider, request):
url = str(gf.url)
status = str(gf.status)
headers = Headers([(x, map(str, y)) for x, y in gf.headers.iteritems()])
body = gf.read()
compression = gf._file.get('compression')
body = self._decompress(gf.read(), compression)
respcls = responsetypes.from_args(headers=headers, url=url)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
Expand All @@ -46,12 +48,14 @@ def store_response(self, spider, request, response):
'status': response.status,
'url': response.url,
'headers': dict(response.headers),
'compression': self.compression,
}
body = self._compress(response.body)
try:
self.fs[spider].put(response.body, **kwargs)
self.fs[spider].put(body, **kwargs)
except errors.FileExists:
self.fs[spider].delete(key)
self.fs[spider].put(response.body, **kwargs)
self.fs[spider].put(body, **kwargs)

def _get_file(self, spider, request):
key = spider.name + '/' + self._request_key(request)
Expand All @@ -65,3 +69,19 @@ def _get_file(self, spider, request):

def _request_key(self, request):
    """Return the cache key for *request*, via Scrapy's request fingerprint."""
    return request_fingerprint(request)

def _get_compression_algorithm(self, settings):
compression_algorithm = settings.get('HTTPCACHE_COMPRESSION')
if not (compression_algorithm == 'zlib' or compression_algorithm is None):
raise ValueError("Compression algorithm %s not supported" % compression_algorithm)
return compression_algorithm

def _compress(self, response_body):
if (self.compression == 'zlib'):
return response_body.encode('zlib')
return response_body

def _decompress(self, compressed_body, compression):
if compression == 'zlib':
return compressed_body.decode('zlib')
return compressed_body