'''First install the dependencies with `pip install zstandard requests beautifulsoup4 colorama`. The download step also shells out to `wget`, which must be available on your PATH.'''
import os
import json
import subprocess

import zstandard
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# URLs discovered so far, split by whether they belong to the crawled domain.
internal_urls = set()
external_urls = set()


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs found on `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        # print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
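# Example (hypothetical URL): get_all_website_links("https://example.com/") returns
# the set of same-domain links found on that single page; it does not recurse.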


def loadJsonL(fname):
    """Load a JSON Lines file into a list of dicts, one per line."""
    data = []
    with open(fname) as fp:
        for line in fp:
            data.append(json.loads(line))
    return data
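# Example (hypothetical filename): loadJsonL("shard.jsonl") returns a list of
# dicts, one per line, each expected to carry the "meta" mapping used below.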


def processZSTLink(url, download_dir, output_dir):
    zstfile = url.split('/')[-1]
    print("downloading: ", url)
    download_path = download_dir + zstfile
    subprocess.run(
        f"wget -O {download_path} -q {url}", shell=True, stdout=subprocess.DEVNULL)
    # strip the ".zst" suffix to name the decompressed and output files
    output_path = output_dir + zstfile[:-4]
    with open(download_path, 'rb') as compressed:
        decomp = zstandard.ZstdDecompressor()
        with open(download_path[:-4], 'wb') as destination:
            decomp.copy_stream(compressed, destination)
    print("now starting to process the data...")
    data = loadJsonL(download_path[:-4])
    with open(output_path, 'w') as newjson:
        for jsonline in data:
            # extract the filename of the current line,
            # which is stored under meta > file_name
            ext = jsonline['meta']['file_name']
            # ignore the filename and keep only the extension
            if ext != '':
                ext = ext.split('.')[-1].lower()
            # keep only Python files: copy the line through to the output file
            if ext == 'py':
                newjson.write(json.dumps(jsonline))
                newjson.write('\n')
    print("done!")
    os.remove(download_path)
    os.remove(download_path[:-4])


if __name__ == "__main__":
    # the directories that wget and open() write into must already exist
    os.makedirs('./zsts', exist_ok=True)
    os.makedirs('./pyjsons', exist_ok=True)
    urls = get_all_website_links(
        "https://the-eye.eu/public/AI/training_data/code_clippy_data/code_clippy_dedup_data/train/")
    for i in urls:
        # str.find returns -1 (truthy) on a miss, so test membership instead
        if "_default.jsonl.zst" in i:
            try:
                processZSTLink(i, './zsts/', './pyjsons/')
            except Exception as e:
                print(f"{YELLOW}Error happened: {e}{RESET}")
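
# A successful run leaves ./pyjsons/ holding one <shard>_default.jsonl file per
# downloaded shard, containing only the entries whose file_name extension is .py;
# the intermediate files in ./zsts/ are deleted as each shard finishes.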