added 2024 salary

AlgoETS · Sep 29, 2023 · 5e1fffd · 5e1fffd
1 parent d0f6d60
commit 5e1fffd
Show file tree

Hide file tree

Showing 6 changed files with 690 additions and 191 deletions.
diff --git a/README.md b/README.md
@@ -19,7 +19,12 @@
 ![Jupyter Notebook](https://img.shields.io/badge/jupyter-%23FA0F00.svg?style=for-the-badge&logo=jupyter&logoColor=white)
 
 
-
+## PDF SALARY SCRAPER
+1. Exécuter le script d'extraction :
+    ```bash
+    python src/pdf_salary_scraper_2024.py
+    ```
+2. Exécuter le notebook `Salary_2023/sandbox/salary_exploration.ipynb`.
 
 ## Installation
 

diff --git a/data/2024 Salary Guide _PDF.pdf b/data/2024 Salary Guide _PDF.pdf
diff --git a/data/Guide salarial 2024_PDF.pdf b/data/Guide salarial 2024_PDF.pdf
diff --git a/data/salary_guide.pkl b/data/salary_guide.pkl
diff --git a/sandbox/salary_exploration.ipynb b/sandbox/salary_exploration.ipynb
diff --git a/src/pdf_salary_scraper_2024.py b/src/pdf_salary_scraper_2024.py
@@ -0,0 +1,102 @@
+"""Extract job titles and salary ranges from the 2024 salary guide PDF.
+
+Each page is scanned for a known province and region name, every line that
+holds exactly three "low-high" salary ranges is parsed into a job title plus
+entry/mid/senior ranges, and the collected columns are pickled to
+data/salary_guide.pkl.
+"""
+# importing required modules
+from PyPDF2 import PdfReader
+import re
+from pathlib import Path
+import pandas as pd
+
+# creating a pdf reader object
+reader = PdfReader("data/2024 Salary Guide _PDF.pdf")
+provinces = ["ontario", "québec", "british columbia", "alberta", "manitoba", "saskatchewan", "nova scotia", "new brunswick", "newfoundland & labrador", "prince edward island"]
+regions=['calgary metropolitan region', 'edmonton metropolitan region', 'northern alberta', 'red deer', 'fort st. john', 'fraser valley', 'greater vancouver', 'greater victoria', 'kelowna', 'prince george', 'surrey/delta', 'winnipeg metropolitan region', 'greater moncton', 'saint john', 'york region', 'conception bay - st. john’s', 'greater halifax', 'brant county', 'cambridge', 'durham region', 'frontenac county', 'greater hamilton area', 'greater sudbury area', 'greater toronto area', 'halton region', 'hastings county', 'london area', 'niagara region', 'ottawa metropolitan region', 'oxford county', 'peel region', 'peterborough county', 'simcoe county', 'waterloo region', 'wellington county', 'windsor-essex county', 'york region', 'queens - charlottetown', 'centre-du-québec', 'chaudière-appalaches', 'estrie', 'lanaudière', 'laurentides', 'laval', 'mauricie', 'montérégie - agglomeration ', 'montérégie - brome-missisquoi', 'montérégie - la haute-yamaska', 'montérégie - les maskoutains', 'montérégie - vaudreuil-soulanges', 
+'montréal',
+'national capital',
+'outaouais',
+'saguenay-lac-saint-jean',
+'moose jaw',
+'regina metropolitan area',
+'saskatoon metropolitan area']
+
+# Column-oriented result: one list per output column, appended to in lockstep.
+data = {'province': [],'region':[],'job':[],'entry':[],'mid':[],'senior':[]}
+
+# Page banner such as "2024 salary guide  |61" that can be glued to a job line.
+HEADER_PATTERN = re.compile(r"\d{4}\s\S+\s\S+\s+\|\d+")
+# Leading non-digit text (the job title) followed by the rest of the line.
+TITLE_PATTERN = re.compile(r"^(\D+)(.*)$")
+# One "low-high" range written with comma decimals, e.g. "47,7-60,4".
+RANGE_PATTERN = re.compile(r'(\d+,\d+)\s*-\s*(\d+,\d+)')
+# NOTE(review): the reader opens the English PDF, yet the banner test below
+# looks for the French "guide salarial 2024" and RANGE_PATTERN expects comma
+# decimals ("47,7"); confirm which edition this script is meant to parse.
+
+
+def _first_mentioned(names, text):
+    """Return the first entry of *names* found in *text*, or None."""
+    return next((name for name in names if name in text), None)
+
+
+print(len(reader.pages))
+
+current_region=""
+current_province=""
+
+for page_number, page in enumerate(reader.pages, start=1):
+    print(page_number)
+
+    text = (page.extract_text() or "").lower()
+
+    # Province and region are searched in the whole page text, so look them up
+    # once per page rather than once per line; keep the previous value when
+    # the page mentions none.
+    current_province = _first_mentioned(provinces, text) or current_province
+    current_region = _first_mentioned(regions, text) or current_region
+
+    for line in text.splitlines():
+        #2023 salary guide  |61director of operations 110.0 -140.0 186.0 -190.0 210.0 -230.0
+        if "guide salarial 2024" in line:
+            # drop the banner so only the job title and salary ranges remain
+            line = HEADER_PATTERN.sub("", line)
+
+        # only lines containing a dash can hold a salary range
+        if "-" not in line:
+            continue
+
+        #administrative assistant43.6 - 56.449.9 - 61.355.2 - 66.7
+        #inventory control manager 68.9 -87.5 78.9 -99.6 91.2-119.8
+        match = TITLE_PATTERN.match(line)
+        if match is None:
+            print(f"No match found on this line: {line}")
+            continue
+
+        #47,7-60,4 53,5 -68,2 59,7 -75,7
+        ranges = RANGE_PATTERN.findall(match.group(2))
+
+        # Rows with four ranges (finance department) are not handled, and any
+        # other count is not a salary row; skipping both avoids re-appending
+        # the ranges parsed from an earlier line.
+        if len(ranges) != 3:
+            continue
+
+        entry_range, mid_range, senior_range = (
+            f"{low}-{high}" for low, high in ranges
+        )
+
+        data['job'].append(match.group(1).strip())
+        data['entry'].append(entry_range)
+        data['mid'].append(mid_range)
+        data['senior'].append(senior_range)
+        data['province'].append(current_province)
+        data['region'].append(current_region)
+
+#save data to pickle file
+output_file = 'data/salary_guide.pkl'
+pd.to_pickle(data,output_file)
+df =  pd.DataFrame(data)
+print(df.head())