Binary file modified __pycache__/indeed.cpython-38.pyc
Binary file added __pycache__/save.cpython-38.pyc
Binary file added __pycache__/so.cpython-38.pyc
76 changes: 54 additions & 22 deletions indeed.py
@@ -4,26 +4,58 @@
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))

    max_page = pages[-1]
    return max_page

def extract_indeed_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            title = result.find("h2", {"class": "title"}).find('a')["title"]
            print(title)
    return jobs

def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # drop the trailing "Next" link
        pages.append(int(link.string))

    max_page = pages[-1]
    return max_page


def extract_job(html):
    title = html.find("h2", {"class": "title"}).find('a')["title"]
    company = html.find("span", {"class": "company"})
    if company:
        company_anchor = company.find("a")
        if company_anchor is not None:
            company = str(company_anchor.string)
        else:
            company = str(company.string)
        company = company.strip()
    else:
        company = None
    # html.find("div") -> picks out the div element itself!
    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        'location': location,
        'link': f"https://www.indeed.com/viewjob?jk={job_id}"
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)

    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
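
A quick usage sketch of the refactored module (hypothetical values; assumes indeed.com still serves this markup and does not block the request):

# Hypothetical usage of the new indeed.py API:
from indeed import get_jobs

indeed_jobs = get_jobs()
print(len(indeed_jobs), "jobs scraped")
print(indeed_jobs[0])  # {'title': ..., 'company': ..., 'location': ..., 'link': ...}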
1,272 changes: 1,272 additions & 0 deletions jobs.csv

16 changes: 12 additions & 4 deletions main.py
@@ -1,6 +1,14 @@
from indeed import extract_indeed_pages, extract_indeed_jobs
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

last_indeed_page = extract_indeed_pages()

indeed_jobs = extract_indeed_jobs(last_indeed_page)
so_jobs = get_so_jobs()
indeed_jobs = get_indeed_jobs()
jobs = so_jobs + indeed_jobs
print(jobs)

# csv -> usable everywhere: Windows, Mac, Google Drive, etc.
# Comma Separated Values
# columns are separated by commas
# rows are separated by new lines
save_to_file(jobs)
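
To illustrate the CSV comments above, the first rows of a generated jobs.csv might look like this (values are made up; note how the csv module quotes a location containing a comma):

title,company,location,link
Python Developer,ACME Corp,"Austin, TX",https://www.indeed.com/viewjob?jk=abc123
Backend Engineer,Example Inc,Remote,https://www.stackoverflow.com/jobs/12345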
10 changes: 10 additions & 0 deletions save.py
@@ -0,0 +1,10 @@
import csv

def save_to_file(jobs):
    # Context manager closes the file; newline="" avoids blank rows on Windows.
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "company", "location", "link"])
        for job in jobs:
            writer.writerow(list(job.values()))
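
save_to_file depends on each dict's insertion order lining up with the hardcoded header. A csv.DictWriter variant that writes by key would be more robust; a minimal sketch, assuming every job dict uses exactly the lowercase keys title, company, location, and link as normalized above:

import csv

def save_to_file_by_key(jobs):
    # DictWriter maps each row by field name, so key order no longer matters.
    fieldnames = ["title", "company", "location", "link"]
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(jobs)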
38 changes: 38 additions & 0 deletions so.py
@@ -0,0 +1,38 @@
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python"

def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    last_page = pages[-3].get_text(strip=True)  # last numbered link before "next"
    return int(last_page)

def extract_job(html):
    title = html.find("div", {"class": "fl1"}).find("h2").find('a')["title"]
    company, location = html.find("div", {"class": "fl1"}).find("h3").find_all("span")
    company = company.string.strip()
    location = location.string.strip()
    job_id = html["data-jobid"]
    return {
        'title': title,
        'company': company,
        'location': location,
        'link': f"https://www.stackoverflow.com/jobs/{job_id}"
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping SO: page {page}")
        result = requests.get(f"{URL}&pg={page+1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)

    return jobs

def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
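
get_last_page assumes pages[-3] is always the highest page number, which breaks if Stack Overflow renders fewer pagination links. A more defensive variant (a sketch, living alongside the functions above in so.py and reusing its imports and URL; assumes the same s-pagination markup):

def get_last_page_safe():
    # Collect only the numeric pagination links and take the largest;
    # fall back to a single page if no pagination is rendered.
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    if pagination is None:
        return 1
    numbers = [int(a.get_text(strip=True))
               for a in pagination.find_all("a")
               if a.get_text(strip=True).isdigit()]
    return max(numbers) if numbers else 1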