Binary file modified __pycache__/indeed.cpython-38.pyc
Binary file added __pycache__/save.cpython-38.pyc
Binary file added __pycache__/so.cpython-38.pyc
76 changes: 54 additions & 22 deletions indeed.py
@@ -4,26 +4,58 @@
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))

    max_page = pages[-1]
    return max_page

def extract_indeed_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            title = result.find("h2", {"class": "title"}).find('a')["title"]
            print(title)
    return jobs

def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # drop the trailing "Next" link
        pages.append(int(link.string))

    max_page = pages[-1]
    return max_page


def extract_job(html):
    title = html.find("h2", {"class": "title"}).find('a')["title"]
    company = html.find("span", {"class": "company"})
    if company:
        company_anchor = company.find("a")
        if company_anchor is not None:
            company = str(company_anchor.string)
        else:
            company = str(company.string)
        company = company.strip()
    else:
        company = None
    # html.find("div") -> picks out the div element itself!
    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        'location': location,
        'link': f"https://www.indeed.com/viewjob?jk={job_id}"
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)

    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
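
A quick usage sketch of the refactored module (hypothetical values; assumes indeed.com still serves this markup and does not block the request):

# Hypothetical usage of the new indeed.py API:
from indeed import get_jobs

indeed_jobs = get_jobs()
print(len(indeed_jobs), "jobs scraped")
print(indeed_jobs[0])  # {'title': ..., 'company': ..., 'location': ..., 'link': ...}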
1,272 changes: 1,272 additions & 0 deletions jobs.csv

16 changes: 12 additions & 4 deletions main.py
@@ -1,6 +1,14 @@
from indeed import extract_indeed_pages, extract_indeed_jobs
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

last_indeed_page = extract_indeed_pages()

indeed_jobs = extract_indeed_jobs(last_indeed_page)
so_jobs = get_so_jobs()
indeed_jobs = get_indeed_jobs()
jobs = so_jobs + indeed_jobs
print(jobs)

# csv -> usable everywhere: Windows, Mac, Google Drive, etc.
# Comma Separated Values
# columns are separated by commas
# rows are separated by new lines
save_to_file(jobs)
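
To illustrate the CSV comments above, the first rows of a generated jobs.csv might look like this (values are made up; note how the csv module quotes a location containing a comma):

title,company,location,link
Python Developer,ACME Corp,"Austin, TX",https://www.indeed.com/viewjob?jk=abc123
Backend Engineer,Example Inc,Remote,https://www.stackoverflow.com/jobs/12345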
10 changes: 10 additions & 0 deletions save.py
@@ -0,0 +1,10 @@
import csv

def save_to_file(jobs):
    # Context manager closes the file; newline="" avoids blank rows on Windows.
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "company", "location", "link"])
        for job in jobs:
            writer.writerow(list(job.values()))
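
save_to_file depends on each dict's insertion order lining up with the hardcoded header. A csv.DictWriter variant that writes by key would be more robust; a minimal sketch, assuming every job dict uses exactly the lowercase keys title, company, location, and link as normalized above:

import csv

def save_to_file_by_key(jobs):
    # DictWriter maps each row by field name, so key order no longer matters.
    fieldnames = ["title", "company", "location", "link"]
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(jobs)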
38 changes: 38 additions & 0 deletions so.py
@@ -0,0 +1,38 @@
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python"

def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    last_page = pages[-3].get_text(strip=True)  # last numbered link before "next"
    return int(last_page)

def extract_job(html):
    title = html.find("div", {"class": "fl1"}).find("h2").find('a')["title"]
    company, location = html.find("div", {"class": "fl1"}).find("h3").find_all("span")
    company = company.string.strip()
    location = location.string.strip()
    job_id = html["data-jobid"]
    return {
        'title': title,
        'company': company,
        'location': location,
        'link': f"https://www.stackoverflow.com/jobs/{job_id}"
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping SO: page {page}")
        result = requests.get(f"{URL}&pg={page+1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)

    return jobs

def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
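
get_last_page assumes pages[-3] is always the highest page number, which breaks if Stack Overflow renders fewer pagination links. A more defensive variant (a sketch, living alongside the functions above in so.py and reusing its imports and URL; assumes the same s-pagination markup):

def get_last_page_safe():
    # Collect only the numeric pagination links and take the largest;
    # fall back to a single page if no pagination is rendered.
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    if pagination is None:
        return 1
    numbers = [int(a.get_text(strip=True))
               for a in pagination.find_all("a")
               if a.get_text(strip=True).isdigit()]
    return max(numbers) if numbers else 1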