-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_dblp.py
142 lines (118 loc) · 4.58 KB
/
parse_dblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import argparse
import os
import time
import json
from lxml import html
from utils import readlines,iter_files
class InvalidElementName(Exception):
    """Error raised for an unexpected element name inside a tag.

    The offending name, the tag it was found in, and the enclosing element
    are kept as attributes and all three are reported by ``str()``.
    """

    def __init__(self, invalid_element_name, tag_name, parent_name):
        self.invalid_element_name, self.tag_name, self.parent_name = (
            invalid_element_name,
            tag_name,
            parent_name,
        )

    def __str__(self):
        # ``!r`` renders each value with repr(), so strings appear quoted.
        template = 'Invalid name {!r} found in tag {!r} within element {!r}'
        return template.format(self.invalid_element_name,
                               self.tag_name,
                               self.parent_name)
def existing_file(filename: str) -> str:
    """Validate that *filename* points at an existing regular file.

    Intended for use as an argparse ``type=`` converter.

    :param filename: path supplied on the command line.
    :return: *filename* unchanged when ``os.path.isfile`` accepts it.
    :raises argparse.ArgumentTypeError: when the path is not a file.
    """
    if not os.path.isfile(filename):
        raise argparse.ArgumentTypeError('%s is not a valid input file!' % filename)
    return filename
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the DBLP converter.

    Two positional arguments are accepted, both optional:

    * ``xml_filename`` -- the DBLP XML dump (default ``./dblp/dblp.xml``)
    * ``dtd_filename`` -- the matching DTD (default ``./dblp/dblp.dtd``)

    Each value, including a default that is used because the argument was
    omitted, is validated by ``existing_file``; argparse reports an error and
    exits when the path is not an existing file.

    :return: namespace with ``xml_filename`` and ``dtd_filename`` attributes.
    """
    parser = argparse.ArgumentParser(description='Parse the DBLP XML file and convert it to JSON')
    # nargs='?' is what makes ``default`` effective for a positional argument;
    # without it argparse treats the positional as required and silently
    # ignores the default.
    parser.add_argument('xml_filename', action='store', nargs='?', type=existing_file,
                        help='The XML file that will be parsed',
                        metavar='xml_filename', default='./dblp/dblp.xml')
    parser.add_argument('dtd_filename', action='store', nargs='?', type=existing_file,
                        help='The DTD file used to parse the XML file',
                        metavar='dtd_filename', default='./dblp/dblp.dtd')
    return parser.parse_args()
def get_elements(dtd_file) -> set:
    """Return the names of the DTD declarations whose type is ``'element'``.

    The DTD is loaded from *dtd_file* through lxml. The name ``'dblp'`` is
    removed from the result before returning, so a DTD without such a
    declaration raises ``KeyError``.

    :param dtd_file: value handed straight to ``etree.DTD``.
    :return: set of element names, excluding ``'dblp'``.
    """
    declarations = html.etree.DTD(dtd_file).iterelements()
    names = {decl.name for decl in declarations if decl.type == 'element'}
    # NOTE(review): 'dblp' is presumably the document root -- confirm against the DTD.
    names.remove('dblp')
    return names
def parse_xml(xml_file, elements: set):
    """Stream-parse *xml_file* and append one JSON line per record.

    For every name in *elements* a file ``./dblp/<name>.json`` is opened in
    append mode. The XML is read incrementally with DTD validation. When an
    element whose tag is in *elements* starts, its attributes seed a new
    record; the text of elements that end while the record is open is added
    under their tag name (``author`` values are collected into a list, any
    other tag keeps the last value seen). When an element from *elements*
    ends, the record receives a running integer ``id`` and is written as one
    JSON line to the file belonging to the tag that opened the record.

    :param xml_file: file object or path accepted by ``etree.iterparse``.
    :param elements: tag names that delimit a record.
    """
    writers = dict()
    try:
        for each in elements:
            writers[each] = open('./dblp/' + each + '.json', 'a+')
        context = html.etree.iterparse(xml_file, dtd_validation=True, events=('start', 'end'))
        # turn it into an iterator
        context = iter(context)
        # get the root element
        _, root = next(context)
        data = dict()
        current_tag = None
        unique_id = 0
        for event, elem in context:
            if current_tag is None and event == 'start' and elem.tag in elements:
                current_tag = elem.tag
                data.clear()
                data.update(elem.attrib)
            elif current_tag is not None and event == 'end' and elem.tag in elements:
                data['id'] = unique_id
                writers[current_tag].write(json.dumps(data) + '\n')
                writers[current_tag].flush()
                # Progress marker once every 10000 records.
                if unique_id % 10000 == 0:
                    print(unique_id)
                unique_id += 1
                current_tag = None
                # The record has been serialised; drop the parsed subtree so
                # memory does not grow with the size of the input.
                root.clear()
            elif event == 'end' and elem.tag is not None and elem.text is not None:
                # Only 'end' events are used: an element's text is not
                # guaranteed to be available at 'start', and handling both
                # events would append every author twice.
                if elem.tag == "author":
                    data.setdefault("author", []).append(elem.text)
                else:
                    data[elem.tag] = elem.text
    finally:
        # Close whatever was opened, even if opening or parsing failed midway.
        for writer in writers.values():
            writer.close()
def remove_old(path, min_year: int = 2000):
    """Copy records dated *min_year* or later into ``./final``.

    Every file yielded by ``iter_files(path)`` is read line by line, each line
    being parsed as a JSON object. Records that have no ``year`` key, or whose
    year is below *min_year*, are skipped. Kept records have duplicate
    ``author`` entries removed (first occurrence wins, order preserved) and are
    appended as JSON lines to ``./final/<basename of the input file>``.
    A per-file summary of skipped and saved counts is printed.

    :param path: location passed to ``iter_files``.
    :param min_year: smallest year that is kept; defaults to 2000.
    :raises ValueError: if a record's ``year`` cannot be converted to ``int``.
    """
    for file in iter_files(path):
        filename = os.path.basename(file)
        print(filename)
        saved = 0
        skipped = 0
        # The context manager guarantees the output handle is closed for every
        # input file, including when a malformed line raises.
        with open(os.path.join('./final', filename), 'a+') as out:
            for line in readlines(file):
                article = json.loads(line)
                if 'year' not in article or int(article['year']) < min_year:
                    skipped += 1
                    continue
                if "author" in article:
                    # dict.fromkeys dedupes while keeping the original author
                    # order, so the output is deterministic across runs.
                    article['author'] = list(dict.fromkeys(article['author']))
                out.write(json.dumps(article) + '\n')
                out.flush()
                saved += 1
        print('%s skip:%d, save %d' % (filename, skipped, saved))
def parse_dblp():
    """Run the conversion: read the DTD, then parse the XML into JSON files.

    Command-line arguments are obtained from ``parse_args``. The element names
    are read from the DTD file first, then the XML file is handed to
    ``parse_xml`` together with those names. Elapsed wall-clock time is
    printed at the end.

    :raises SystemExit: with status 1 when either filename is missing.
    """
    args = parse_args()
    if args.xml_filename is None or args.dtd_filename is None:
        print('Invalid input arguments.')
        # ``exit`` is a helper injected by the ``site`` module for interactive
        # use; raising SystemExit works regardless of how Python was started.
        raise SystemExit(1)

    start_time = time.time()
    print('Start!')
    with open(args.dtd_filename, 'rb') as dtd_file:
        print('Reading elements from DTD file...')
        elements = get_elements(dtd_file)
    with open(args.xml_filename, 'rb') as xml_file:
        print('Parsing XML and writing to JSON files...')
        parse_xml(xml_file, elements)
    end_time = time.time()
    print('Done after %f seconds' % (end_time - start_time))
# Script entry point. As written it performs no work: the only candidate
# call below is commented out and the body is a bare ``pass``.
if __name__ == "__main__":
    # remove_old('./dblp')
    pass