Skip to content

Commit

Permalink
added 2024 salary
Browse files Browse the repository at this point in the history
  • Loading branch information
mrBlackHat777 committed Sep 29, 2023
1 parent d0f6d60 commit 5e1fffd
Show file tree
Hide file tree
Showing 6 changed files with 690 additions and 191 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
![Jupyter Notebook](https://img.shields.io/badge/jupyter-%23FA0F00.svg?style=for-the-badge&logo=jupyter&logoColor=white)



## PDF SALARY SCRAPER
1. Run the scraper:
```bash
python src/pdf_salary_scraper_2024.py
```
2. Run the notebook `Salary_2023/sandbox/salary_exploration.ipynb`.

## Installation

Expand Down
Binary file added data/2024 Salary Guide _PDF.pdf
Binary file not shown.
Binary file added data/Guide salarial 2024_PDF.pdf
Binary file not shown.
Binary file modified data/salary_guide.pkl
Binary file not shown.
769 changes: 579 additions & 190 deletions sandbox/salary_exploration.ipynb

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions src/pdf_salary_scraper_2024.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# importing required modules
from PyPDF2 import PdfReader
import re
from pathlib import Path
import pandas as pd

# Province names (lower-case) searched for in each page's text.
# Order matters: the first name found on a page wins.
provinces = [
    "ontario",
    "québec",
    "british columbia",
    "alberta",
    "manitoba",
    "saskatchewan",
    "nova scotia",
    "new brunswick",
    "newfoundland & labrador",
    "prince edward island",
]

# Region names (lower-case) searched for in each page's text.
# Order matters: the first name found on a page wins.
regions = [
    'calgary metropolitan region',
    'edmonton metropolitan region',
    'northern alberta',
    'red deer',
    'fort st. john',
    'fraser valley',
    'greater vancouver',
    'greater victoria',
    'kelowna',
    'prince george',
    'surrey/delta',
    'winnipeg metropolitan region',
    'greater moncton',
    'saint john',
    'york region',
    'conception bay - st. john’s',
    'greater halifax',
    'brant county',
    'cambridge',
    'durham region',
    'frontenac county',
    'greater hamilton area',
    'greater sudbury area',
    'greater toronto area',
    'halton region',
    'hastings county',
    'london area',
    'niagara region',
    'ottawa metropolitan region',
    'oxford county',
    'peel region',
    'peterborough county',
    'simcoe county',
    'waterloo region',
    'wellington county',
    'windsor-essex county',
    'york region',
    'queens - charlottetown',
    'centre-du-québec',
    'chaudière-appalaches',
    'estrie',
    'lanaudière',
    'laurentides',
    'laval',
    'mauricie',
    'montérégie - agglomeration ',
    'montérégie - brome-missisquoi',
    'montérégie - la haute-yamaska',
    'montérégie - les maskoutains',
    'montérégie - vaudreuil-soulanges',
    'montréal',
    'national capital',
    'outaouais',
    'saguenay-lac-saint-jean',
    'moose jaw',
    'regina metropolitan area',
    'saskatoon metropolitan area',
]

# Page footer that can be glued in front of a job line, in either layout:
#   "2024 salary guide |61director of operations 110.0 -140.0 ..."
#   "guide salarial 2024 |61..."
_FOOTER_PATTERN = re.compile(
    r"(?:\d{4}\s\S+\s\S+|guide\s+salarial\s+\d{4})\s*\|\s*\d+"
)

# Leading non-digit run (the job title) followed by the rest of the line.
_JOB_LINE_PATTERN = re.compile(r"^(\D+)(.*)$")

# One "low - high" salary range; the decimal separator may be "," or ".",
# e.g. "47,7-60,4" or "68.9 -87.5".
_RANGE_PATTERN = re.compile(r"(\d+[.,]\d+)\s*-\s*(\d+[.,]\d+)")


def parse_salary_line(line):
    """Parse one lower-cased text line into a salary row.

    Args:
        line: A single line of extracted page text.

    Returns:
        A tuple ``(job_title, entry_range, mid_range, senior_range)`` where
        each range is formatted ``"low-high"``, or ``None`` when the line
        does not hold exactly three salary ranges (lines with four ranges
        are skipped as well).
    """
    line = _FOOTER_PATTERN.sub("", line)
    if "-" not in line:
        return None

    match = _JOB_LINE_PATTERN.match(line)
    if match is None:
        # The line starts with a digit, so there is no job title to read.
        print(f"No match found on this line: {line}")
        return None

    ranges = _RANGE_PATTERN.findall(match.group(2))
    if len(ranges) != 3:
        # Hyphenated text without salaries (e.g. a region name) or a
        # four-column table: never emit a row built from partial data.
        return None

    job_title = match.group(1).strip()
    entry_range, mid_range, senior_range = (
        f"{low}-{high}" for low, high in ranges
    )
    return job_title, entry_range, mid_range, senior_range


def scrape_salaries(pages):
    """Collect salary rows from an iterable of PDF pages.

    Args:
        pages: Iterable of objects exposing ``extract_text()``.

    Returns:
        A dict of equally long lists keyed by ``province``, ``region``,
        ``job``, ``entry``, ``mid`` and ``senior``.
    """
    data = {'province': [], 'region': [], 'job': [], 'entry': [], 'mid': [], 'senior': []}
    current_region = ""
    current_province = ""

    for page_number, page in enumerate(pages, start=1):
        print(page_number)
        text = (page.extract_text() or "").lower()

        # Province and region are detected once per page from the whole
        # page text; the previous value is kept when none is found.
        current_province = next(
            (name for name in provinces if name in text), current_province
        )
        current_region = next(
            (name for name in regions if name in text), current_region
        )

        for line in text.splitlines():
            row = parse_salary_line(line)
            if row is None:
                continue
            job_title, entry_range, mid_range, senior_range = row
            data['job'].append(job_title)
            data['entry'].append(entry_range)
            data['mid'].append(mid_range)
            data['senior'].append(senior_range)
            data['province'].append(current_province)
            data['region'].append(current_region)

    return data


if __name__ == "__main__":
    # creating a pdf reader object
    reader = PdfReader("data/2024 Salary Guide _PDF.pdf")
    print(len(reader.pages))

    data = scrape_salaries(reader.pages)

    # save data to pickle file (the raw dict is pickled, not the DataFrame)
    output_file = 'data/salary_guide.pkl'
    pd.to_pickle(data, output_file)
    df = pd.DataFrame(data)
    print(df.head())

0 comments on commit 5e1fffd

Please sign in to comment.