opds_import.py
from nose.tools import set_trace
from StringIO import StringIO
from collections import (
defaultdict,
Counter,
)
import datetime
import feedparser
import logging
import traceback
import urllib
from urlparse import urlparse, urljoin
from sqlalchemy.orm.session import Session
from lxml import builder, etree
from monitor import Monitor
from util import LanguageCodes
from util.xmlparser import XMLParser
from config import (
Configuration,
CannotLoadConfiguration,
)
from metadata_layer import (
CirculationData,
Metadata,
IdentifierData,
ContributorData,
LinkData,
MeasurementData,
SubjectData,
ReplacementPolicy,
)
from model import (
get_one,
get_one_or_create,
CoverageRecord,
DataSource,
Edition,
Hyperlink,
Identifier,
LicensePool,
Measurement,
Subject,
RightsStatus,
)
from coverage import CoverageFailure
from util.http import HTTP
from util.opds_writer import (
OPDSFeed,
OPDSMessage,
)
from s3 import S3Uploader
class AccessNotAuthenticated(Exception):
"""No authentication is configured for this service"""
pass
class SimplifiedOPDSLookup(object):
"""Tiny integration class for the Simplified 'lookup' protocol."""
LOOKUP_ENDPOINT = "lookup"
CANONICALIZE_ENDPOINT = "canonical-author-name"
UPDATES_ENDPOINT = "updates"
REMOVAL_ENDPOINT = "remove"
@classmethod
def from_config(cls, integration='Metadata Wrangler'):
url = Configuration.integration_url(integration)
if not url:
return None
return cls(url)
def __init__(self, base_url):
if not base_url.endswith('/'):
base_url += "/"
self.base_url = base_url
self._set_auth()
def _set_auth(self):
"""Sets client authentication details for the Metadata Wrangler"""
metadata_wrangler_url = Configuration.integration_url(
Configuration.METADATA_WRANGLER_INTEGRATION
)
self.client_id = self.client_secret = None
if (metadata_wrangler_url
and self.base_url.startswith(metadata_wrangler_url)):
values = Configuration.integration(Configuration.METADATA_WRANGLER_INTEGRATION)
self.client_id = values.get(Configuration.METADATA_WRANGLER_CLIENT_ID)
self.client_secret = values.get(Configuration.METADATA_WRANGLER_CLIENT_SECRET)
details = [self.client_id, self.client_secret]
if len([d for d in details if not d]) == 1:
# Raise an error if one is set, but not the other.
raise CannotLoadConfiguration("Metadata Wrangler improperly configured.")
@property
def authenticated(self):
return bool(self.client_id and self.client_secret)
def _get(self, url, **kwargs):
"""Make an HTTP request. This method is overridden in the mock class."""
return HTTP.get_with_timeout(url, **kwargs)
def opds_get(self, url):
"""Make the sort of HTTP request that's normal for an OPDS feed.
Long timeout, raise error on anything but 2xx or 3xx.
"""
kwargs = dict(timeout=120, allowed_response_codes=['2xx', '3xx'])
if self.client_id and self.client_secret:
kwargs['auth'] = (self.client_id, self.client_secret)
return self._get(url, **kwargs)
def lookup(self, identifiers):
"""Retrieve an OPDS feed with metadata for the given identifiers."""
args = "&".join(set(["urn=%s" % i.urn for i in identifiers]))
url = self.base_url + self.LOOKUP_ENDPOINT + "?" + args
logging.info("Lookup URL: %s", url)
return self.opds_get(url)
def canonicalize_author_name(self, identifier, working_display_name):
"""Attempt to find the canonical name for the author of a book.
:param identifier: an ISBN-type Identifier.
:param working_display_name: The display name of the author
(i.e. the name in the format a human would use, as opposed to the name
that goes into library records).
"""
args = "display_name=%s" % (
urllib.quote(
working_display_name.encode("utf8"))
)
if identifier:
args += "&urn=%s" % urllib.quote(identifier.urn)
url = self.base_url + self.CANONICALIZE_ENDPOINT + "?" + args
logging.info("GET %s", url)
return self._get(url, timeout=120)
def remove(self, identifiers):
"""Remove items from an authenticated Metadata Wrangler collection"""
if not self.authenticated:
raise AccessNotAuthenticated("Metadata Wrangler Collection not authenticated.")
args = "&".join(set(["urn=%s" % i.urn for i in identifiers]))
url = self.base_url + self.REMOVAL_ENDPOINT + "?" + args
logging.info("Metadata Wrangler Removal URL: %s", url)
return self._get(url)
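# A minimal usage sketch (illustrative only, not part of the original module).
# Assuming a Metadata Wrangler integration is configured, the lookup client
# might be driven roughly like this:
#
#     lookup = SimplifiedOPDSLookup.from_config()
#     if lookup and lookup.authenticated:
#         response = lookup.lookup(identifiers)
#         feed = response.content  # an OPDS feed describing the identifiers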
class MockSimplifiedOPDSLookup(SimplifiedOPDSLookup):
def __init__(self, *args, **kwargs):
self.responses = []
super(MockSimplifiedOPDSLookup, self).__init__(*args, **kwargs)
def queue_response(self, status_code, headers={}, content=None):
from testing import MockRequestsResponse
self.responses.insert(
0, MockRequestsResponse(status_code, headers, content)
)
def _get(self, url, *args, **kwargs):
response = self.responses.pop()
return HTTP._process_response(
url, response, kwargs.get('allowed_response_codes'),
kwargs.get('disallowed_response_codes')
)
class OPDSXMLParser(XMLParser):
NAMESPACES = { "simplified": "http://librarysimplified.org/terms/",
"app" : "http://www.w3.org/2007/app",
"dcterms" : "http://purl.org/dc/terms/",
"dc" : "http://purl.org/dc/elements/1.1/",
"opds": "http://opds-spec.org/2010/catalog",
"schema" : "http://schema.org/",
"atom" : "http://www.w3.org/2005/Atom",
}
class OPDSImporter(object):
""" Imports editions and license pools from an OPDS feed.
Creates Edition, LicensePool and Work rows in the database, if those
don't already exist.
Should be used when a circulation server asks for data from
our internal content server, and also when our content server asks for data
from external content servers.
:param mirror: Use this MirrorUploader object to mirror all
incoming open-access books and cover images.
"""
COULD_NOT_CREATE_LICENSE_POOL = (
"No existing license pool for this identifier and no way of creating one.")
def __init__(self, _db, data_source_name=DataSource.METADATA_WRANGLER,
identifier_mapping=None, mirror=None, http_get=None):
self._db = _db
self.log = logging.getLogger("OPDS Importer")
self.data_source_name = data_source_name
self.identifier_mapping = identifier_mapping
self.metadata_client = SimplifiedOPDSLookup.from_config()
self.mirror = mirror
self.http_get = http_get
def import_from_feed(self, feed, even_if_no_author=False,
immediately_presentation_ready=False,
feed_url=None):
# Keep track of editions that were imported. Pools and works
# for those editions may be looked up or created.
imported_editions = {}
pools = {}
works = {}
# CoverageFailures that note business logic errors and non-success download statuses
failures = {}
# If parsing the overall feed throws an exception, we should address that before
# moving on. Let the exception propagate.
metadata_objs, failures = self.extract_feed_data(feed, feed_url)
# Make editions. If there's a problem with an item, make sure its associated pool and work aren't created.
for key, metadata in metadata_objs.iteritems():
# key is identifier.urn here
# If there's a status message about this item, don't try to import it.
if key in failures:
continue
try:
# Create an edition. This will also create a pool if there's circulation data.
edition = self.import_edition_from_metadata(
metadata, even_if_no_author, immediately_presentation_ready
)
if edition:
imported_editions[key] = edition
except Exception, e:
# Rather than scrap the whole import, treat this as a failure that applies
# only to this item.
self.log.error("Error importing an OPDS item", exc_info=e)
identifier, ignore = Identifier.parse_urn(self._db, key)
data_source = DataSource.lookup(self._db, self.data_source_name)
failure = CoverageFailure(identifier, traceback.format_exc(), data_source=data_source, transient=False)
failures[key] = failure
# Clean up any edition we might have created.
if key in imported_editions:
del imported_editions[key]
# Move on to the next item, don't create a work.
continue
try:
pool, work = self.update_work_for_edition(
edition, even_if_no_author, immediately_presentation_ready
)
if pool:
pools[key] = pool
if work:
works[key] = work
except Exception, e:
identifier, ignore = Identifier.parse_urn(self._db, key)
data_source = DataSource.lookup(self._db, self.data_source_name)
failure = CoverageFailure(identifier, traceback.format_exc(), data_source=data_source, transient=False)
failures[key] = failure
return imported_editions.values(), pools.values(), works.values(), failures
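# A rough usage sketch (illustrative only, not part of the original module):
# given the raw text of an OPDS feed, the importer is typically driven like this:
#
#     importer = OPDSImporter(_db)
#     editions, pools, works, failures = importer.import_from_feed(feed_text)
#
# where `failures` maps identifier URNs to CoverageFailure objects for the
# items that could not be imported.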
def import_edition_from_metadata(
self, metadata, even_if_no_author, immediately_presentation_ready
):
""" For the passed-in Metadata object, see if can find or create an Edition
in the database. Do not set the edition's pool or work, yet.
"""
# Locate or create an Edition for this book.
edition, is_new_edition = metadata.edition(self._db)
policy = ReplacementPolicy(
subjects=True,
links=True,
contributions=True,
rights=True,
link_content=True,
even_if_not_apparently_updated=True,
mirror=self.mirror,
http_get=self.http_get,
)
metadata.apply(
edition, self.metadata_client, replace=policy
)
return edition
def update_work_for_edition(self, edition, even_if_no_author=False, immediately_presentation_ready=False):
work = None
# Find a pool for this edition. If we have CirculationData, a pool was created
# when we imported the edition. If there was already a pool from a different data
# source, that's fine too.
pool = get_one(self._db, LicensePool, identifier=edition.primary_identifier)
if pool:
# Note: pool.calculate_work will call self.set_presentation_edition(),
# which will find editions attached to same Identifier.
work, is_new_work = pool.calculate_work(even_if_no_author=even_if_no_author)
# Note: if pool.calculate_work found or made a work, it already called work.calculate_presentation()
if work:
if immediately_presentation_ready:
# We want this book to be presentation-ready
# immediately upon import. As long as no crucial
# information is missing (like language or title),
# this will do it.
work.set_presentation_ready_based_on_content()
return pool, work
@classmethod
def extract_next_links(cls, feed):
parsed = feedparser.parse(feed)
feed = parsed['feed']
next_links = []
if feed and 'links' in feed:
next_links = [
link['href'] for link in feed['links']
if link['rel'] == 'next'
]
return next_links
@classmethod
def extract_last_update_dates(cls, feed):
parsed_feed = feedparser.parse(feed)
return [
cls.last_update_date_for_feedparser_entry(entry)
for entry in parsed_feed['entries']
]
def extract_feed_data(self, feed, feed_url=None):
"""Turn an OPDS feed into lists of Metadata and CirculationData objects,
with associated messages and next_links.
"""
data_source = DataSource.lookup(self._db, self.data_source_name)
fp_metadata, fp_failures = self.extract_data_from_feedparser(feed=feed, data_source=data_source)
# gets: medium, measurements, links, contributors, etc.
xml_data_meta, xml_failures = self.extract_metadata_from_elementtree(
feed, data_source=data_source, feed_url=feed_url
)
# translate the id in failures to identifier.urn
identified_failures = {}
for id, failure in fp_failures.items() + xml_failures.items():
external_identifier, ignore = Identifier.parse_urn(self._db, id)
if self.identifier_mapping:
internal_identifier = self.identifier_mapping.get(
external_identifier, external_identifier)
else:
internal_identifier = external_identifier
identified_failures[internal_identifier.urn] = failure
# Use one loop for both, since the id will be the same for both dictionaries.
metadata = {}
circulationdata = {}
for id, m_data_dict in fp_metadata.items():
external_identifier, ignore = Identifier.parse_urn(self._db, id)
if self.identifier_mapping:
internal_identifier = self.identifier_mapping.get(
external_identifier, external_identifier)
else:
internal_identifier = external_identifier
# Don't process this item if there was already an error
if internal_identifier.urn in identified_failures:
continue
identifier_obj = IdentifierData(
type=internal_identifier.type,
identifier=internal_identifier.identifier
)
# form the Metadata object
xml_data_dict = xml_data_meta.get(id, {})
combined_meta = self.combine(m_data_dict, xml_data_dict)
if combined_meta.get('data_source') is None:
combined_meta['data_source'] = self.data_source_name
combined_meta['primary_identifier'] = identifier_obj
metadata[internal_identifier.urn] = Metadata(**combined_meta)
# form the CirculationData that would correspond to this Metadata
c_data_dict = m_data_dict.get('circulation')
if c_data_dict:
circ_links_dict = {}
# extract just the links to pass to CirculationData constructor
if 'links' in xml_data_dict:
circ_links_dict['links'] = xml_data_dict['links']
combined_circ = self.combine(c_data_dict, circ_links_dict)
if combined_circ.get('data_source') is None:
combined_circ['data_source'] = self.data_source_name
combined_circ['primary_identifier'] = identifier_obj
circulation = CirculationData(**combined_circ)
if circulation.formats:
metadata[internal_identifier.urn].circulation = circulation
else:
# If the CirculationData has no formats, it
# doesn't really offer any way to actually get the
# book, and we don't want to create a
# LicensePool. All the circulation data is
# useless.
#
# TODO: This will need to be revisited when we add
# ODL support.
metadata[internal_identifier.urn].circulation = None
return metadata, identified_failures
@classmethod
def combine(cls, d1, d2):
"""Combine two dictionaries that can be used as keyword arguments to
the Metadata constructor.
"""
if not d1 and not d2:
return dict()
if not d1:
return dict(d2)
if not d2:
return dict(d1)
new_dict = dict(d1)
for k, v in d2.items():
if k in new_dict and isinstance(v, list):
new_dict[k].extend(v)
elif k not in new_dict or v is not None:
new_dict[k] = v
return new_dict
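# For illustration (not part of the original module), combine() keeps values
# from the first dictionary, extends shared list values, and lets non-None
# values from the second dictionary win:
#
#     OPDSImporter.combine(
#         dict(title="A", links=[link1]),
#         dict(links=[link2], language="eng"),
#     )
#     # -> {'title': 'A', 'links': [link1, link2], 'language': 'eng'}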
@classmethod
def extract_data_from_feedparser(cls, feed, data_source):
feedparser_parsed = feedparser.parse(feed)
values = {}
failures = {}
for entry in feedparser_parsed['entries']:
identifier, detail, failure = cls.data_detail_for_feedparser_entry(entry=entry, data_source=data_source)
if identifier:
if failure:
failures[identifier] = failure
else:
if detail:
values[identifier] = detail
else:
# That's bad. We can't make an item-specific error message, but we can
# log that something went very wrong.
logging.error("Tried to parse an element without a valid identifier. feed=%s" % feed)
return values, failures
@classmethod
def extract_metadata_from_elementtree(cls, feed, data_source, feed_url=None):
"""Parse the OPDS as XML and extract all author and subject
information, as well as ratings and medium.
All the stuff that feedparser can't handle, so we have to use lxml.
:return: a dictionary mapping IDs to dictionaries. The inner
dictionary can be used as keyword arguments to the Metadata
constructor.
"""
values = {}
failures = {}
parser = OPDSXMLParser()
root = etree.parse(StringIO(feed))
# Some OPDS feeds (eg Standard Ebooks) contain relative urls,
# so we need the feed's self URL to extract links. If none was
# passed in, we still might be able to guess.
#
# TODO: Section 2 of RFC 4287 says we should check xml:base
# for this, so if anyone actually uses that we'll get around
# to checking it.
if not feed_url:
links = [child.attrib for child in root.getroot() if 'link' in child.tag]
self_links = [link['href'] for link in links if link.get('rel') == 'self']
if self_links:
feed_url = self_links[0]
# First, turn Simplified <message> tags into CoverageFailure
# objects.
for failure in cls.coveragefailures_from_messages(
data_source, parser, root
):
failures[failure.obj.urn] = failure
# Then turn Atom <entry> tags into Metadata objects.
for entry in parser._xpath(root, '/atom:feed/atom:entry'):
identifier, detail, failure = cls.detail_for_elementtree_entry(parser, entry, data_source, feed_url)
if identifier:
if failure:
failures[identifier] = failure
if detail:
values[identifier] = detail
return values, failures
@classmethod
def _datetime(cls, entry, key):
value = entry.get(key, None)
if not value:
return value
return datetime.datetime(*value[:6])
@classmethod
def last_update_date_for_feedparser_entry(cls, entry):
identifier = entry.get('id')
updated = cls._datetime(entry, 'updated_parsed')
return (identifier, updated)
@classmethod
def data_detail_for_feedparser_entry(cls, entry, data_source):
"""Turn an entry dictionary created by feedparser into dictionaries of data
that can be used as keyword arguments to the Metadata and CirculationData constructors.
:return: A 3-tuple (identifier, kwargs for Metadata constructor, failure)
"""
identifier = entry.get('id')
if not identifier:
return None, None, None
# At this point we can assume that we successfully got some
# metadata, and possibly a link to the actual book.
try:
kwargs_meta = cls._data_detail_for_feedparser_entry(entry, data_source)
return identifier, kwargs_meta, None
except Exception, e:
_db = Session.object_session(data_source)
identifier_obj, ignore = Identifier.parse_urn(_db, identifier)
failure = CoverageFailure(
identifier_obj, traceback.format_exc(), data_source,
transient=True
)
return identifier, None, failure
@classmethod
def _data_detail_for_feedparser_entry(cls, entry, metadata_data_source):
"""Helper method that extracts metadata and circulation data from a feedparser
entry. This method can be overridden in tests to check that callers handle things
properly when it throws an exception.
"""
title = entry.get('title', None)
if title == OPDSFeed.NO_TITLE:
title = None
subtitle = entry.get('schema_alternativeheadline', None)
# Generally speaking, a data source will provide either
# metadata (e.g. the Simplified metadata wrangler) or both
# metadata and circulation data (e.g. a publisher's ODL feed).
#
# However there is at least one case (the Simplified
# open-access content server) where one server provides
# circulation data from a _different_ data source
# (e.g. Project Gutenberg).
#
# In this case we want the data source of the LicensePool to
# be Project Gutenberg, but the data source of the pool's
# presentation to be the open-access content server.
#
# The open-access content server uses a
# <bibframe:distribution> tag to keep track of which data
# source provides the circulation data.
circulation_data_source = metadata_data_source
circulation_data_source_tag = entry.get('bibframe_distribution')
if circulation_data_source_tag:
circulation_data_source_name = circulation_data_source_tag.get(
'bibframe:providername'
)
if circulation_data_source_name:
_db = Session.object_session(metadata_data_source)
circulation_data_source = DataSource.lookup(
_db, circulation_data_source_name
)
if not circulation_data_source:
raise ValueError(
"Unrecognized circulation data source: %s" % (
circulation_data_source_name
)
)
last_opds_update = cls._datetime(entry, 'updated_parsed')
added_to_collection_time = cls._datetime(entry, 'published_parsed')
publisher = entry.get('publisher', None)
if not publisher:
publisher = entry.get('dcterms_publisher', None)
language = entry.get('language', None)
if not language:
language = entry.get('dcterms_language', None)
links = []
def summary_to_linkdata(detail):
if not detail:
return None
if 'value' not in detail or not detail['value']:
return None
content = detail['value']
media_type = detail.get('type', 'text/plain')
return LinkData(
rel=Hyperlink.DESCRIPTION,
media_type=media_type,
content=content
)
summary_detail = entry.get('summary_detail', None)
link = summary_to_linkdata(summary_detail)
if link:
links.append(link)
for content_detail in entry.get('content', []):
link = summary_to_linkdata(content_detail)
if link:
links.append(link)
rights = entry.get('rights', "")
rights_uri = RightsStatus.rights_uri_from_string(rights)
kwargs_meta = dict(
title=title,
subtitle=subtitle,
language=language,
publisher=publisher,
links=links,
# Refers to when the entry was updated in the OPDS feed, not in our database.
data_source_last_updated=last_opds_update,
)
# Only add circulation data if both the book's distributor *and*
# the source of the OPDS feed are lendable data sources.
if (circulation_data_source and circulation_data_source.offers_licenses
and metadata_data_source.offers_licenses):
kwargs_circ = dict(
data_source=circulation_data_source.name,
links=list(links),
default_rights_uri=rights_uri,
)
kwargs_meta['circulation'] = kwargs_circ
return kwargs_meta
@classmethod
def extract_messages(cls, parser, feed_tag):
"""Extract <simplified:message> tags from an OPDS feed and convert
them into OPDSMessage objects.
"""
path = '/atom:feed/simplified:message'
for message_tag in parser._xpath(feed_tag, path):
# First thing to do is determine which Identifier we're
# talking about.
identifier_tag = parser._xpath1(message_tag, 'atom:id')
if identifier_tag is None:
urn = None
else:
urn = identifier_tag.text
# What status code is associated with the message?
status_code_tag = parser._xpath1(message_tag, 'simplified:status_code')
if status_code_tag is None:
status_code = None
else:
try:
status_code = int(status_code_tag.text)
except ValueError:
status_code = None
# What is the human-readable message?
description_tag = parser._xpath1(message_tag, 'schema:description')
if description_tag is None:
description = ''
else:
description = description_tag.text
yield OPDSMessage(urn, status_code, description)
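# For reference (hypothetical example, not part of the original module), a
# message tag of the shape handled above looks roughly like:
#
#     <simplified:message>
#       <atom:id>urn:isbn:9780000000000</atom:id>
#       <simplified:status_code>404</simplified:status_code>
#       <schema:description>Book not found.</schema:description>
#     </simplified:message>
#
# which extract_messages() turns into OPDSMessage(urn, 404, "Book not found.").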
@classmethod
def coveragefailures_from_messages(cls, data_source, parser, feed_tag):
"""Extract CoverageFailure objects from a parsed OPDS document. This
allows us to determine the fate of books which could not
become <entry> tags.
"""
for message in cls.extract_messages(parser, feed_tag):
failure = cls.coveragefailure_from_message(data_source, message)
if failure:
yield failure
@classmethod
def coveragefailure_from_message(cls, data_source, message):
"""Turn a <simplified:message> tag into a CoverageFailure."""
_db = Session.object_session(data_source)
# First thing to do is determine which Identifier we're
# talking about. If we can't do that, we can't create a
# CoverageFailure object.
urn = message.urn
try:
identifier, ignore = Identifier.parse_urn(_db, urn)
except ValueError, e:
identifier = None
if not identifier:
# We can't associate this message with any particular
# Identifier so we can't turn it into a CoverageFailure.
return None
if message.status_code == 200:
# This message is telling us that nothing went wrong. It
# shouldn't become a CoverageFailure.
return None
description = message.message
status_code = message.status_code
if description and status_code:
exception = u"%s: %s" % (status_code, description)
elif status_code:
exception = unicode(status_code)
elif description:
exception = description
else:
exception = 'No detail provided.'
# All these CoverageFailures are transient because, at the moment, we can
# only assume that the server will eventually have the data.
return CoverageFailure(
identifier, exception, data_source, transient=True
)
@classmethod
def detail_for_elementtree_entry(cls, parser, entry_tag, data_source, feed_url=None):
"""Turn an <atom:entry> tag into a dictionary of metadata that can be
used as keyword arguments to the Metadata constructor.
:return: A 3-tuple (identifier, kwargs, failure)
"""
identifier = parser._xpath1(entry_tag, 'atom:id')
if identifier is None or not identifier.text:
# This <entry> tag doesn't identify a book so we
# can't derive any information from it.
return None, None, None
identifier = identifier.text
try:
data = cls._detail_for_elementtree_entry(parser, entry_tag, feed_url)
return identifier, data, None
except Exception, e:
_db = Session.object_session(data_source)
identifier_obj, ignore = Identifier.parse_urn(_db, identifier)
failure = CoverageFailure(identifier_obj, traceback.format_exc(), data_source, transient=True)
return identifier, None, failure
@classmethod
def _detail_for_elementtree_entry(cls, parser, entry_tag, feed_url=None):
"""Helper method that extracts metadata and circulation data from an elementtree
entry. This method can be overridden in tests to check that callers handle things
properly when it throws an exception.
"""
# We will fill this dictionary with all the information
# we can find.
data = dict()
alternate_identifiers = []
for id_tag in parser._xpath(entry_tag, "dcterms:identifier"):
v = cls.extract_identifier(id_tag)
if v:
alternate_identifiers.append(v)
data['identifiers'] = alternate_identifiers
data['medium'] = cls.extract_medium(entry_tag)
data['contributors'] = []
for author_tag in parser._xpath(entry_tag, 'atom:author'):
contributor = cls.extract_contributor(parser, author_tag)
if contributor is not None:
data['contributors'].append(contributor)
data['subjects'] = [
cls.extract_subject(parser, category_tag)
for category_tag in parser._xpath(entry_tag, 'atom:category')
]
ratings = []
for rating_tag in parser._xpath(entry_tag, 'schema:Rating'):
v = cls.extract_measurement(rating_tag)
if v:
ratings.append(v)
data['measurements'] = ratings
data['links'] = cls.consolidate_links([
cls.extract_link(link_tag, feed_url)
for link_tag in parser._xpath(entry_tag, 'atom:link')
])
return data
@classmethod
def extract_identifier(cls, identifier_tag):
"""Turn a <dcterms:identifier> tag into an IdentifierData object."""
try:
type, identifier = Identifier.type_and_identifier_for_urn(identifier_tag.text.lower())
return IdentifierData(type, identifier)
except ValueError:
return None
@classmethod
def extract_medium(cls, entry_tag):
"""Derive a value for Edition.medium from <atom:entry
schema:additionalType>.
"""
# If no additionalType is given, assume we're talking about an
# ebook.
default_additional_type = Edition.medium_to_additional_type[
Edition.BOOK_MEDIUM
]
additional_type = entry_tag.get('{http://schema.org/}additionalType',
default_additional_type)
return Edition.additional_type_to_medium.get(additional_type)
@classmethod
def extract_contributor(cls, parser, author_tag):
"""Turn an <atom:author> tag into a ContributorData object."""
subtag = parser.text_of_optional_subtag
sort_name = subtag(author_tag, 'simplified:sort_name')
display_name = subtag(author_tag, 'atom:name')
family_name = subtag(author_tag, "simplified:family_name")
wikipedia_name = subtag(author_tag, "simplified:wikipedia_name")
# TODO: we need a way of conveying roles. I believe Bibframe
# has the answer.
# TODO: Also collect VIAF and LC numbers if present. This
# requires parsing the URIs. Only the metadata wrangler will
# provide this information.
viaf = None
if sort_name or display_name or viaf:
return ContributorData(
sort_name=sort_name, display_name=display_name,
family_name=family_name,
wikipedia_name=wikipedia_name,
roles=None
)
logging.info("Refusing to create ContributorData for contributor with no sort name, display name, or VIAF.")
return None
@classmethod
def extract_subject(cls, parser, category_tag):
"""Turn an <atom:category> tag into a SubjectData object."""
attr = category_tag.attrib
# Retrieve the type of this subject - FAST, Dewey Decimal,
# etc.
scheme = attr.get('scheme')
subject_type = Subject.by_uri.get(scheme)
if not subject_type:
# We can't represent this subject because we don't
# know its scheme. Just treat it as a tag.
subject_type = Subject.TAG
# Retrieve the term (e.g. "827") and human-readable name
# (e.g. "English Satire & Humor") for this subject.
term = attr.get('term')
name = attr.get('label')
default_weight = 1
if subject_type in (
Subject.FREEFORM_AUDIENCE, Subject.AGE_RANGE
):
default_weight = 100
weight = attr.get('{http://schema.org/}ratingValue', default_weight)
try:
weight = int(weight)
except ValueError, e:
weight = 1
return SubjectData(
type=subject_type,
identifier=term,
name=name,
weight=weight
)
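# A hypothetical example (not part of the original module): a category tag like
#
#     <category scheme="http://example.com/subjects" term="827"
#               label="English Satire &amp; Humor"/>
#
# uses a scheme the code above doesn't recognize, so it falls back to
# SubjectData(type=Subject.TAG, identifier="827",
#             name="English Satire & Humor", weight=1).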
@classmethod
def extract_link(cls, link_tag, feed_url=None):
attr = link_tag.attrib
rel = attr.get('rel')
media_type = attr.get('type')
href = attr.get('href')
if not href or not rel:
# The link exists but has no destination, or no specified
# relationship to the entry.
return None
rights = attr.get('{%s}rights' % OPDSXMLParser.NAMESPACES["dcterms"])
if rights:
rights_uri = RightsStatus.rights_uri_from_string(rights)
else:
rights_uri = None
if feed_url and not urlparse(href).netloc:
# This link is relative, so we need to get the absolute url
href = urljoin(feed_url, href)
return LinkData(rel=rel, href=href, media_type=media_type, rights_uri=rights_uri)
@classmethod
def consolidate_links(cls, links):
"""Try to match up links with their thumbnails.
If link n is an image and link n+1 is a thumbnail, then the
thumbnail is assumed to be the thumbnail of the image.
Similarly if link n is a thumbnail and link n+1 is an image.
"""
# Strip out any links that didn't get turned into LinkData objects
# due to missing `href` or whatever.
new_links = [x for x in links if x]
# Make a new list of links from that list, to iterate over --
# we'll be modifying new_links in place so we can't iterate
# over it.
links = list(new_links)
next_link_already_handled = False
for i, link in enumerate(links):
if link.rel not in (Hyperlink.THUMBNAIL_IMAGE, Hyperlink.IMAGE):
# This is not any kind of image. Ignore it.
continue
if next_link_already_handled:
# This link and the previous link were part of an
# image-thumbnail pair.
next_link_already_handled = False
continue
if i == len(links)-1:
# This is the last link. Since there is no next link
# there's nothing to do here.
continue
# Peek at the next link.
next_link = links[i+1]
if (link.rel == Hyperlink.THUMBNAIL_IMAGE
and next_link.rel == Hyperlink.IMAGE):
# This link is a thumbnail and the next link is
# (presumably) the corresponding image.
thumbnail_link = link
image_link = next_link
elif (link.rel == Hyperlink.IMAGE
and next_link.rel == Hyperlink.THUMBNAIL_IMAGE):
thumbnail_link = next_link
image_link = link
else:
# This link and the next link do not form an
# image-thumbnail pair. Do nothing.
continue
image_link.thumbnail = thumbnail_link
new_links.remove(thumbnail_link)
next_link_already_handled = True
return new_links
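# Illustrative behavior (not part of the original module): given LinkData
# objects where a full-size image and its thumbnail are adjacent,
# consolidate_links() folds the thumbnail into the image:
#
#     consolidate_links([image, thumbnail, epub])
#     # -> [image, epub], with image.thumbnail set to thumbnail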
@classmethod
def extract_measurement(cls, rating_tag):
type = rating_tag.get('{http://schema.org/}additionalType')
value = rating_tag.get('{http://schema.org/}ratingValue')
if not value:
value = rating_tag.attrib.get('{http://schema.org}ratingValue')
if not type:
type = Measurement.RATING
try:
value = float(value)
return MeasurementData(