-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathveto_html_parser.py
96 lines (76 loc) · 3.56 KB
/
veto_html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from bs4 import BeautifulSoup
import logging
import re
logger = logging.getLogger("unsc_db_filler")
class VetoHTMLParser:
def __init__(self, html: str):
self.html = html
def create_resolution_to_veto_mapping_table(self) -> dict:
"""
Reads the Veto table, and creates a mapping dict:
{
'Draft Resolution': ['Permanent member casting negative vote']
}
:return: A Dictionary which maps a draft resolution to a list of veto voters
"""
"""
The structure of the page is like this:
<tr>
<th width="15%">Date</th>
<th width="15%">Draft</th>
<th width="20%">Written<br>
Record</th>
<th width="30%">Agenda Item</th>
<th width="20%">Permanent<br>
Member Casting<br>
Negative Vote</th>
</tr>
In other words, we are interested in cell[1] (Draft Resolution) and cell[4] (Caster)
The column of the draft resoltion (cell[1]) has a hyperlink to the draft resolution,
sometimes some other text next to it. We'll want to take the <a>-tag's InnerText to
only get the resolution id, and not other text.
"""
soup = BeautifulSoup(self.html, "html5lib")
rows = soup.table.find_all("tr")
records = {}
for row in rows:
cells = row.find_all("td")
# First few rows are about the Library..no real data.
if len(cells) <= 1:
continue
resolution = cells[1].a.text
veto_caster = cells[4].contents
# This is interesting..and tricky..and annoying!!
# If we'd use .text on cells[4], beautifulsoup converts the html in the <td> to a string.
# <br/> tags are then ignored. This makes it impossible to sometimes recognize a country.
# For example S/2020/667 is vetoed by China and Russian Federation.
# The UN DAG Library has put a newline there using a <br>-tag.
# .text gives then ChinaRussian Federation. Not reliable.
# .contents on the other hand gives a list of elements found in the <td> tag:
# ['China', <br/>, 'Russian Federation']. That middle element is not a string, it's a bs4.element.Tag.
# And ideally we filter it out.
# The only reliable way I have found thus far is to loop over the elements in the list,
# find out the element's name, and if it's set to 'br', to decompose() it, as in: remove from the list.
for el in veto_caster:
if el.name == "br":
el.decompose()
# Now normalize the entries
veto_caster = [self.normalize_state_name(state) for state in veto_caster]
records[resolution] = veto_caster
return records
def normalize_state_name(self, state_name: str) -> str:
"""
Takes a state name and normalizes it. This is used as over the years for example
Russia was referenced as both Russian Federation and as USSR.
:param state_name: The name of the state
:return: The normalized name of the state
"""
normalization_table = {
"Russian Federation": "Russia",
"USSR": "Russia",
}
# If the passed state_name is found in our mapping table,
# we need to normalize it. Normalize and then return that result
if state_name in normalization_table.keys():
return normalization_table.get(state_name.strip())
return state_name.strip()