-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d0f6d60
commit 5e1fffd
Showing
6 changed files
with
690 additions
and
191 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# importing required modules | ||
from PyPDF2 import PdfReader | ||
import re | ||
from pathlib import Path | ||
import pandas as pd | ||
|
||
# Extract (job, entry, mid, senior) salary ranges from the salary-guide PDF and
# save them, tagged with the province/region found on each page.

# Province names, lower-cased to match the lower-cased page text.
provinces = [
    "ontario",
    "québec",
    "british columbia",
    "alberta",
    "manitoba",
    "saskatchewan",
    "nova scotia",
    "new brunswick",
    "newfoundland & labrador",
    "prince edward island",
]

# Region names, lower-cased. Order matters: the first entry found on a page
# wins. The strings are matched verbatim (including the trailing space in
# 'montérégie - agglomeration ' and the repeated 'york region').
regions = [
    'calgary metropolitan region',
    'edmonton metropolitan region',
    'northern alberta',
    'red deer',
    'fort st. john',
    'fraser valley',
    'greater vancouver',
    'greater victoria',
    'kelowna',
    'prince george',
    'surrey/delta',
    'winnipeg metropolitan region',
    'greater moncton',
    'saint john',
    'york region',
    'conception bay - st. john’s',
    'greater halifax',
    'brant county',
    'cambridge',
    'durham region',
    'frontenac county',
    'greater hamilton area',
    'greater sudbury area',
    'greater toronto area',
    'halton region',
    'hastings county',
    'london area',
    'niagara region',
    'ottawa metropolitan region',
    'oxford county',
    'peel region',
    'peterborough county',
    'simcoe county',
    'waterloo region',
    'wellington county',
    'windsor-essex county',
    'york region',
    'queens - charlottetown',
    'centre-du-québec',
    'chaudière-appalaches',
    'estrie',
    'lanaudière',
    'laurentides',
    'laval',
    'mauricie',
    'montérégie - agglomeration ',
    'montérégie - brome-missisquoi',
    'montérégie - la haute-yamaska',
    'montérégie - les maskoutains',
    'montérégie - vaudreuil-soulanges',
    'montréal',
    'national capital',
    'outaouais',
    'saguenay-lac-saint-jean',
    'moose jaw',
    'regina metropolitan area',
    'saskatoon metropolitan area',
]

# Marker identifying a line that carries the page header/footer text.
_PAGE_HEADER_MARKER = "guide salarial 2024"
# Removes a "<year> <word> <word> |<page>" prefix, e.g. "2023 salary guide |61".
# NOTE(review): this shape has the year first, while the marker above has the
# year last -- confirm against the real PDF text that this still matches.
_PAGE_HEADER_PATTERN = re.compile(r"\d{4}\s\S+\s\S+\s+\|\d+")
# Splits a line into its leading non-digit part (job title) and the rest.
_TITLE_PATTERN = re.compile(r"^(\D+)(.*)$")
# One "low - high" range with decimal commas, e.g. "47,7-60,4" or "53,5 -68,2".
_RANGE_PATTERN = re.compile(r'(\d+,\d+)\s*-\s*(\d+,\d+)')


def _first_match(candidates, text):
    """Return the first item of *candidates* contained in *text*, else None."""
    return next((name for name in candidates if name in text), None)


def parse_salary_line(line: str):
    """Parse one lower-cased line of page text into a salary row.

    Returns a ``(job_title, entry, mid, senior)`` tuple of strings, where each
    range is formatted ``"low-high"``, or ``None`` when the line does not hold
    exactly three salary ranges (lines with four ranges are skipped on
    purpose, as are lines with none).
    """
    if _PAGE_HEADER_MARKER in line:
        line = _PAGE_HEADER_PATTERN.sub("", line)

    # Every salary row contains at least one "low-high" range.
    if "-" not in line:
        return None

    match = _TITLE_PATTERN.match(line)
    if match is None:
        # Happens when the line starts with a digit: there is no job title.
        print(f"No match found on this line: {line}")
        return None

    ranges = _RANGE_PATTERN.findall(match.group(2))
    if len(ranges) != 3:
        # Only three-column rows are recorded. Returning here (instead of
        # falling through) prevents re-recording the previous row's ranges.
        return None

    job_title = match.group(1).strip()
    # Each level uses its OWN low and high bound.
    entry_range, mid_range, senior_range = (
        f"{low}-{high}" for low, high in ranges
    )
    return job_title, entry_range, mid_range, senior_range


def collect_salary_data(page_texts):
    """Build the column dict of salary rows from an iterable of page texts.

    Each page's text is lower-cased. The first known province/region found
    anywhere on a page applies to that page's rows; a page without one keeps
    the value from the preceding pages. Prints the 1-based page number of
    every page processed.

    Returns a dict with the keys 'province', 'region', 'job', 'entry', 'mid'
    and 'senior', each mapping to a list with one string per recorded row.
    """
    data = {'province': [], 'region': [], 'job': [], 'entry': [], 'mid': [], 'senior': []}
    current_province = ""
    current_region = ""

    for page_number, raw_text in enumerate(page_texts, start=1):
        print(page_number)
        text = (raw_text or "").lower()

        # The lookup is per page, so do it once rather than once per line.
        current_province = _first_match(provinces, text) or current_province
        current_region = _first_match(regions, text) or current_region

        for line in text.splitlines():
            row = parse_salary_line(line)
            if row is None:
                continue
            job_title, entry_range, mid_range, senior_range = row
            data['job'].append(job_title)
            data['entry'].append(entry_range)
            data['mid'].append(mid_range)
            data['senior'].append(senior_range)
            data['province'].append(current_province)
            data['region'].append(current_region)

    return data


def main():
    """Read the salary-guide PDF, extract the rows and save them to disk."""
    # creating a pdf reader object
    reader = PdfReader("data/2024 Salary Guide _PDF.pdf")
    print(len(reader.pages))

    data = collect_salary_data(page.extract_text() for page in reader.pages)

    # save data to pickle file (the plain column dict, not the DataFrame)
    output_file = 'data/salary_guide.pkl'
    pd.to_pickle(data, output_file)
    df = pd.DataFrame(data)
    print(df.head())


if __name__ == "__main__":
    main()