Skip to content

Commit 7a07a85

Browse files
added scraper for google
1 parent 45e70b1 commit 7a07a85

9 files changed

+42
-0
lines changed

.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.pythonPath": "D:\\open-source\\joble\\jobleenv\\Scripts\\python.exe"
3+
}
135 Bytes
Binary file not shown.
239 Bytes
Binary file not shown.

joble/scraper/google.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import argparse
2+
import requests
3+
from bs4 import BeautifulSoup
4+
5+
# function to remove random characters from the end of the url, if any exist
6+
def fix_url(url_list):
    """Strip trailing '&...' parameters from the last segment of a split URL.

    Args:
        url_list: The URL already split on '/', e.g.
            ['https:', '', 'example.com', 'careers&sa=U'].

    Returns:
        The same list object. Its last element is truncated at the first
        '&' when that element is longer than 8 characters; otherwise the
        list is returned untouched. An empty list is returned as-is.
    """
    # Nothing to clean up; also avoids an IndexError on url_list[-1].
    if not url_list:
        return url_list

    last_segment = url_list[-1]
    # Short trailing segments are left alone; only long ones are cut at '&'.
    if len(last_segment) > 8:
        url_list[-1] = last_segment.split('&')[0]
    return url_list
10+
11+
# return the link for the careers page from google
12+
def get_carrer_page(name):
    """Search Google for "<name> careers" and return the first matching link.

    A result link matches when the company name (the part of ``name`` before
    the first '.', lower-cased and with spaces removed) occurs in the host
    part of the link.

    Args:
        name: Company name or domain, e.g. "example" or "example.com".

    Returns:
        The matching URL as a string, or None when the request does not
        return HTTP 200 or no result link matches.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not complete within the timeout.
    """
    # Let requests build the query string so that spaces and special
    # characters in the name ('&', '#', ...) are percent-encoded correctly.
    resp = requests.get(
        "https://google.com/search",
        params={"q": f"{name} careers"},
        timeout=10,  # do not hang forever on an unresponsive connection
    )
    if resp.status_code != 200:
        return None

    # Host names are case-insensitive and cannot contain spaces, so normalise
    # the company name the same way before comparing it with the host.
    company = name.split('.')[0].replace(' ', '').lower()

    # Result links are wrapped as '/url?q=<target>&<tracking parameters>'.
    prefix = '/url?q='
    soup = BeautifulSoup(resp.content, "html.parser")

    for a in soup.find_all('a', href=True):
        href = a['href']
        if not href.startswith(prefix):
            continue

        url_list = href[len(prefix):].split('/')
        # An absolute URL splits into [scheme, '', host, ...]; anything
        # shorter has no host segment to compare against.
        if len(url_list) > 2 and company in url_list[2].lower():
            return '/'.join(fix_url(url_list))

    return None
30+
31+
if __name__ == '__main__':
    # Command-line entry point: read the company name from the single
    # positional argument and print whatever get_carrer_page returns for it.
    cli = argparse.ArgumentParser()
    cli.add_argument("name", type=str, help="name of the company")
    company = cli.parse_args().name

    print(get_carrer_page(company))
143 Bytes
Binary file not shown.
1.41 KB
Binary file not shown.
Binary file not shown.
3.29 KB
Binary file not shown.

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
attrs==19.3.0
22
Automat==20.2.0
3+
beautifulsoup4==4.9.2
34
cffi==1.14.0
45
constantly==15.1.0
56
cryptography==2.9.2
@@ -19,6 +20,7 @@ PyDispatcher==2.0.5
1920
PyHamcrest==2.0.2
2021
pyOpenSSL==19.1.0
2122
queuelib==1.5.0
23+
requests==2.24.0
2224
Scrapy==2.2.0
2325
service-identity==18.1.0
2426
six==1.15.0

0 commit comments

Comments
 (0)