make_datafiles_tex.py
import os
import json
from os.path import exists
from tqdm import tqdm

from utils import iter_files
from make_datafiles_html import clean_abs, clean_text


def extract_json(src, des):
    """Extract abstract/introduction/conclusion triples from parsed arXiv
    papers under `src` and write one cleaned JSON file per paper into `des`."""
    written = 0
    files = list(iter_files(src))
    for file in tqdm(files):
        with open(file) as f:
            tmp = json.load(f)['paper']
        paper = dict()
        if "abstract" in tmp:
            conclusion = ''
            # Skip papers whose abstract is longer than 210 words.
            abs_len = len(' '.join(tmp["abstract"]).split())
            if abs_len > 210:
                continue
            flag = False
            for sec in tmp['sections']:
                if "introduction" in sec.lower():
                    # Keep only introductions of at most 1000 words that are
                    # longer than the abstract.
                    int_len = len(' '.join(tmp["sections"][sec]).split())
                    if int_len > 1000 or abs_len > int_len:
                        break
                    abstract = clean_abs(' '.join(tmp["abstract"]))
                    if len(abstract) < 2:
                        break
                    introduction = clean_text(' '.join(tmp["sections"][sec]))
                    if len(introduction) < 2:
                        break
                    flag = True
                # Only take a conclusion after an introduction has been
                # accepted, and only if it is at most 800 words.
                if "conclusion" in sec.lower() and flag:
                    con_len = len(' '.join(tmp["sections"][sec]).split())
                    if con_len > 800:
                        conclusion = ''
                        break
                    conclusion = clean_text(' '.join(tmp["sections"][sec]))
                    break
            if flag:
                paper["abstract"] = abstract
                paper["article"] = introduction
                paper["conclusion"] = conclusion
                # Use the source file name (without extension) as the paper id.
                name = os.path.basename(file)
                name, _ = os.path.splitext(name)
                paper["id"] = name
                with open(os.path.join(des, "%s.json" % name), 'w') as out:
                    json.dump(paper, out, indent=4)
                written += 1
    print("wrote %d papers to %s" % (written, des))


if __name__ == "__main__":
    path = r'F:\Dataset\json_v1'
    des = r"E:\DATASET\arxiv_tex"
    if not exists(des):
        os.makedirs(des)
    extract_json(path, des)
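
# Illustrative output record (hypothetical values): each file written to `des`
# is a single JSON object with the four fields produced above, e.g.
#
#   {
#       "abstract": "We propose ...",
#       "article": "Neural abstractive summarization ...",
#       "conclusion": "We presented ...",
#       "id": "1705.04304"
#   }
#
# The keys match what extract_json writes; the values are placeholders only.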