diff --git a/README.rst b/README.rst index 5e13305..3f05973 100644 --- a/README.rst +++ b/README.rst @@ -29,3 +29,7 @@ A MongoDB backend for HTTP cache storage. It stores responses using GridFS. To use it, set the following Scrapy setting in your project:: HTTPCACHE_STORAGE = 'scmongo.httpcache.MongoCacheStorage' + +To enable zlib compression, set the following Scrapy setting in your project:: + + HTTPCACHE_COMPRESSION = 'zlib' diff --git a/scmongo/httpcache.py b/scmongo/httpcache.py index 75f8cfe..1125ff8 100644 --- a/scmongo/httpcache.py +++ b/scmongo/httpcache.py @@ -17,6 +17,7 @@ class MongoCacheStorage(object): def __init__(self, settings=conf.settings): self.expire = settings.getint('HTTPCACHE_EXPIRATION_SECS') + self.compression = self._get_compression_algorithm(settings) self.db = get_database(settings) self.fs = {} @@ -33,7 +34,8 @@ def retrieve_response(self, spider, request): url = str(gf.url) status = str(gf.status) headers = Headers([(x, map(str, y)) for x, y in gf.headers.iteritems()]) - body = gf.read() + compression = gf._file.get('compression') + body = self._decompress(gf.read(), compression) respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response @@ -46,12 +48,14 @@ def store_response(self, spider, request, response): 'status': response.status, 'url': response.url, 'headers': dict(response.headers), + 'compression': self.compression, } + body = self._compress(response.body) try: - self.fs[spider].put(response.body, **kwargs) + self.fs[spider].put(body, **kwargs) except errors.FileExists: self.fs[spider].delete(key) - self.fs[spider].put(response.body, **kwargs) + self.fs[spider].put(body, **kwargs) def _get_file(self, spider, request): key = spider.name + '/' + self._request_key(request) @@ -65,3 +69,19 @@ def _get_file(self, spider, request): def _request_key(self, request): return request_fingerprint(request) + + def _get_compression_algorithm(self, settings): + compression_algorithm = settings.get('HTTPCACHE_COMPRESSION') + if not (compression_algorithm == 'zlib' or compression_algorithm is None): + raise ValueError("Compression algorithm %s not supported" % compression_algorithm) + return compression_algorithm + + def _compress(self, response_body): + if (self.compression == 'zlib'): + return response_body.encode('zlib') + return response_body + + def _decompress(self, compressed_body, compression): + if compression == 'zlib': + return compressed_body.decode('zlib') + return compressed_body