Skip to content

Commit 713f70b

Browse files
authored
Merge pull request #115 from scrapinghub/opengraph_uniform_reversed_precedence
Reverse priorities for repeated properties in uniform format for opengraph
2 parents de219cb + f987d9a commit 713f70b

File tree

5 files changed

+36
-2
lines changed

5 files changed

+36
-2
lines changed

extruct/uniform.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
def _uopengraph(extracted):
55
out = []
66
for obj in extracted:
7-
flattened = dict(obj['properties'])
7+
flattened = dict(reversed(obj['properties']))
88
t = flattened.pop('og:type', None)
99
if t:
1010
flattened['@type'] = t

tests/samples/songkick/elysianfields.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
<meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
3131
<meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
3232
<meta property="og:image" content="http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg">
33+
<meta property="og:image" content="http://images.sk-static.com/SECONDARY_IMAGE.jpg">
3334
</head>
3435
<body>
3536
<script>

tests/samples/songkick/elysianfields.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,10 @@
202202
[
203203
"og:image",
204204
"http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"
205+
],
206+
[
207+
"og:image",
208+
"http://images.sk-static.com/SECONDARY_IMAGE.jpg"
205209
]
206210
]
207211
}
@@ -233,6 +237,9 @@
233237
"http://ogp.me/ns#image": [
234238
{
235239
"@value": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"
240+
},
241+
{
242+
"@value": "http://images.sk-static.com/SECONDARY_IMAGE.jpg"
236243
}
237244
],
238245
"http://ogp.me/ns#site_name": [

tests/test_extruct.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
import extruct
8+
from extruct import SYNTAXES
89
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
910

1011

@@ -16,6 +17,21 @@ def test_all(self):
1617
body = get_testdata('songkick', 'elysianfields.html')
1718
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
1819
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
20+
# See test_rdfa_not_preserving_order()
21+
del data['rdfa'][0]['http://ogp.me/ns#image']
22+
del expected['rdfa'][0]['http://ogp.me/ns#image']
23+
self.assertEqual(jsonize_dict(data), expected)
24+
25+
@pytest.mark.xfail
26+
def test_rdfa_not_preserving_order(self):
27+
# See https://github.com/scrapinghub/extruct/issues/116
28+
# RDFa does not preserve the ordering of duplicated properties, so this
29+
# test sometimes fails for property 'http://ogp.me/ns#image'
30+
body = get_testdata('songkick', 'elysianfields.html')
31+
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
32+
data = extruct.extract(body,
33+
base_url='http://www.songkick.com/artists/236156-elysian-fields',
34+
syntaxes=['rdfa'])
1935
self.assertEqual(jsonize_dict(data), expected)
2036

2137
def test_microdata_custom_url(self):

tests/test_uniform.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import unittest
22

33
import extruct
4-
from extruct.uniform import _flatten, infer_context, flatten_dict
4+
from extruct.uniform import _flatten, infer_context, flatten_dict, _uopengraph
55
from tests import get_testdata
66

77

@@ -27,6 +27,16 @@ def test_uopengraph(self):
2727
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
2828
self.assertEqual(data['opengraph'], expected)
2929

30+
def test_uopengraph_duplicated_priorities(self):
31+
# Ensures that the first-seen value of a duplicated property is kept when flattening
32+
data = _uopengraph([{'properties':
33+
[('prop_{}'.format(k), 'value_{}'.format(v))
34+
for k in range(5)
35+
for v in range(5)],
36+
'namespace': 'namespace'}])
37+
for k in range(5):
38+
assert data[0]['prop_{}'.format(k)] == 'value_0'
39+
3040
def test_umicroformat(self):
3141
expected = [ { '@context': 'http://microformats.org/wiki/',
3242
'@type': ['h-hidden-phone', 'h-hidden-tablet'],

0 commit comments

Comments
 (0)