diff --git a/processing/load_languages.py b/processing/load_languages.py index 5ee4308..93d1fc1 100644 --- a/processing/load_languages.py +++ b/processing/load_languages.py @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) SENTRY_DSN = os.environ.get('SENTRY_DSN', None) LOGGING_LEVEL = os.environ.get('LOGGING_LEVEL', 'DEBUG') +STATIC_CATALOG_FALLBACK_DOMAIN = os.environ.get('STATIC_CATALOG_FALLBACK_DOMAIN', None) LOGGING = { 'version': 1, @@ -181,8 +182,9 @@ def load_documents(collection, articlemeta_db, all_records=False): class StaticCatalog(object): - def __init__(self, collection): + def __init__(self, collection, fallback_domain=None): self.catalog = {} + self.fallback_domain = fallback_domain self._load_static_catalog(collection, 'pdf') self._load_static_catalog(collection, 'html') self._load_static_catalog(collection, 'xml') @@ -225,7 +227,19 @@ def _load_static_catalog(self, source, tipe): url = '/'.join(['http:/', source, filename]) - content = do_request(url, json=False).iter_lines(decode_unicode='utf-8') + response = do_request(url, json=False) + + # If primary domain fails and fallback domain is configured, try fallback + if response is None and self.fallback_domain: + logger.warning(u'Failed to load from %s, trying fallback domain %s', source, self.fallback_domain) + fallback_url = '/'.join(['http:/', self.fallback_domain, filename]) + response = do_request(fallback_url, json=False) + + if response is None: + logger.error(u'Failed to load static catalog from %s (and fallback if configured)', source) + return + + content = response.iter_lines(decode_unicode='utf-8') for line in sorted([i for i in content]): splitedline = line.lower().split('/')[1:] @@ -382,7 +396,7 @@ def fulltexts(self, document): return ldata -def run(collections, articlemeta_db, all_records=False, forced_url=None): +def run(collections, articlemeta_db, all_records=False, forced_url=None, fallback_domain=None): if not isinstance(collections, list): logger.error('Collections must be a list o collection acronym') @@ -395,8 +409,13 @@ def run(collections, articlemeta_db, all_records=False, forced_url=None): collection_domain = forced_url if forced_url else coll_info['domain'] logger.info(u'Loading languages for %s', collection_domain) logger.info(u'Using mode all_records %s', str(all_records)) + + # Use environment variable fallback if not provided as parameter + effective_fallback_domain = fallback_domain if fallback_domain else STATIC_CATALOG_FALLBACK_DOMAIN + if effective_fallback_domain: + logger.info(u'Using fallback domain: %s', effective_fallback_domain) - static_catalogs = StaticCatalog(collection_domain) + static_catalogs = StaticCatalog(collection_domain, fallback_domain=effective_fallback_domain) for document in load_documents(collection, articlemeta_db, all_records=all_records): @@ -485,6 +504,12 @@ def main(): help='Collection domain to get Static catalog' ) + parser.add_argument( + '--fallback_domain', + '-f', + help='Fallback domain to try if primary domain fails (e.g., antigo.scielo.br)' + ) + args = parser.parse_args() LOGGING['handlers']['console']['level'] = args.logging_level for lg, content in LOGGING['loggers'].items(): @@ -494,7 +519,7 @@ def main(): collections = [args.collection] if args.collection else _collections_acronyms - run(collections, articlemeta_db, args.all_records, args.domain) + run(collections, articlemeta_db, args.all_records, args.domain, args.fallback_domain) if __name__ == '__main__': diff --git a/tests/test_load_languages.py b/tests/test_load_languages.py index c64a8de..7470a2a 100644 --- a/tests/test_load_languages.py +++ b/tests/test_load_languages.py @@ -10,7 +10,7 @@ from articlemeta import controller -def mock_static_catalog_init_method(self, collection): +def mock_static_catalog_init_method(self, collection, fallback_domain=None): self.catalog = { "rsp": { "v52": { @@ -114,6 +114,74 @@ def test_run(self): document['fulltexts']['html']) self.assertIsNotNone(document['fulltexts'].get('pdf')) + def test_static_catalog_fallback(self): + """Test that StaticCatalog tries fallback domain when primary fails""" + + # Mock do_request to simulate primary domain failure with actual catalog data + def mock_do_request_primary_fails(url, json=True): + if 'www.scielo.br' in url: + return None # Primary fails + elif 'antigo.scielo.br' in url: + # Fallback succeeds with catalog data + class MockResponse: + def iter_lines(self, decode_unicode=None): + # Return sample catalog entries + return [ + 'serial/rsp/v52/0034-8910-rsp-s1518-87872018052000131.pdf', + 'serial/rsp/v52/0034-8910-rsp-s1518-87872018052000131.xml', + ] + return MockResponse() + return None + + with patch.object(load_languages, 'do_request', side_effect=mock_do_request_primary_fails): + # Test with fallback domain - should populate catalog + catalog = load_languages.StaticCatalog('www.scielo.br', fallback_domain='antigo.scielo.br') + self.assertIsInstance(catalog.catalog, dict) + # Verify catalog was populated from fallback + self.assertIn('rsp', catalog.catalog) + self.assertIn('v52', catalog.catalog['rsp']) + + # Test without fallback domain - should log error and have empty catalog + def mock_do_request_always_fails(url, json=True): + return None # Both primary and fallback fail + + with patch.object(load_languages, 'do_request', side_effect=mock_do_request_always_fails): + with patch.object(load_languages, 'logger') as mock_logger: + catalog_no_fallback = load_languages.StaticCatalog('www.scielo.br', fallback_domain=None) + self.assertIsInstance(catalog_no_fallback.catalog, dict) + # Verify error was logged for each file type (pdf, html, xml) + error_calls = [call for call in mock_logger.error.call_args_list + if 'Failed to load static catalog' in str(call)] + self.assertGreaterEqual(len(error_calls), 3) # At least one for each file type + + @patch.object( + load_languages.StaticCatalog, "__init__", mock_static_catalog_init_method + ) + def test_run_with_fallback_domain(self): + """Test run function with fallback_domain parameter""" + mocked_articlemeta_db = mongomock.MongoClient().db + mocked_articlemeta_db['collections'].insert_many([ + { + "acron": "scl", + "code": "scl", + "domain": "www.scielo.br" + }, + ]) + mocked_articlemeta_db['articles'].insert_one(self._raw_json) + + # Test with fallback_domain parameter + load_languages.run(['scl'], + mocked_articlemeta_db, + all_records=True, + forced_url='www.scielo.br', + fallback_domain='antigo.scielo.br') + + document = mocked_articlemeta_db['articles'].find_one( + {'code': self._raw_json['code']}, + {'_id': 0, 'citations': 0} + ) + self.assertIsNotNone(document) + if __name__ == '__main__': main()