-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDF_Application_Parser.py
96 lines (85 loc) · 4.36 KB
/
PDF_Application_Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import re
import csv
import os
from PyPDF2 import PdfReader, PdfFileReader
# Regular expressions for the labelled fields of an application PDF.
#
# Each key is the column name used in the output CSV; each value is a regex
# whose named group captures the field's value. parse_application_text()
# reads the value back through ``match.lastgroup``, so every pattern must wrap
# the value in a named group.
#
# NOTE(review): the ``\w+`` captures (names, Country, State, City, the yes/no
# answers and the opportunity source) stop at the first non-word character, so
# a value such as "New York" is captured as "New". Whether multi-word values
# can occur depends on the PDF layout -- confirm against real documents.
# NOTE(review): "Cell Phone" only accepts digits and whitespace; a number
# written like "(555) 123-4567" will not match at all.
PATTERNS = {
    "Application Date": r"Application Date:\s*(?P<ApplicationDate>[\d/]+)",
    "Job Title": r"Job Title:\s*(?P<JobTitle>[^\n]+)",
    "First Name": r"First Name:\s*(?P<FirstName>\w+)",
    "Last Name": r"Last Name:\s*(?P<LastName>\w+)",
    # '-' and '+' are allowed so that addresses such as
    # "jane-doe+jobs@my-company.com" are not cut off at the first hyphen.
    "Email": r"Email:\s*(?P<Email>[\w.+\-@]+)",
    "Country": r"Country:\s*(?P<Country>\w+)",
    "State": r"State:\s*(?P<State>\w+)",
    "City": r"City:\s*(?P<City>\w+)",
    "Zip/Postal Code": r"Zip/Postal Code:\s*(?P<ZipCode>[\d\s]+)",
    "Cell Phone": r"Cell Phone:\s*(?P<CellPhone>[\d\s]+)",
    "Are you 18 years of age or older?": r"Are you 18 years of age or older\?\s*(?P<Age18>\w+)",
    "Are you legally authorized to work in the United States?": r"Are you legally authorized to work in the United States\?\s*(?P<AuthorizedToWork>\w+)",
    # The optional "/ hr" suffix sits inside the named group so it is part of
    # the captured value.
    "Desired Compensation": r"Desired Compensation:\s*(?P<DesiredCompensation>[,\d\-\.]+(\s*/\s*hr)?)",
    "Available Start Date": r"Available Start Date:\s*(?P<StartDate>[\d/]+)",
    "How did you hear about this opportunity?": r"How did you hear about this opportunity\?\s*(?P<OpportunitySource>\w+)",
    # Add more patterns as needed
}
# Working locations, all resolved against the directory the script is started
# from: the CSV output folder, the application PDFs and the resume PDFs.
current_directory = os.getcwd()
data_directory, applications_directory, resumes_directory = (
    os.path.join(current_directory, relative_path)
    for relative_path in ("data", "documents/applications", "documents/resumes")
)
def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of all pages joined without a separator, or ``None`` when
        PyPDF2 cannot parse the file (a message is printed in that case).
    """
    # The exception class lives in PyPDF2.errors; it is not an attribute of the
    # reader class, so looking it up there fails while the handler is evaluated.
    from PyPDF2.errors import PdfReadError

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            # "or ''" keeps join() from failing should a page yield no text.
            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    except (PdfReadError, KeyError):
        print(f"Error reading file: {file_path}")
        return None
def parse_application_text(text, patterns=None):
    """Extract the labelled application fields from a block of text.

    Args:
        text: Text of one application document. ``None`` or an empty string
            yields an empty result.
        patterns: Optional mapping of field label to a regex whose value is
            wrapped in a named group. Defaults to the module-level
            ``PATTERNS``.

    Returns:
        A dict mapping each label whose pattern matched to the captured
        value with surrounding whitespace removed. Labels whose pattern did
        not match are omitted.
    """
    if not text:
        return {}
    if patterns is None:
        patterns = PATTERNS

    parsed_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            # Character classes such as [\d\s]+ run across the line break, so
            # the raw capture can end in spaces or a newline; strip it before
            # it reaches the CSV.
            parsed_data[key] = match.group(match.lastgroup).strip()
    return parsed_data
def parse_application(folder_path):
    """Parse every ``.pdf`` file found under *folder_path*.

    The folder is walked recursively. Files whose path does not end in
    ``.pdf`` are reported on stdout and skipped. PDFs that cannot be read, or
    from which no field could be extracted, are left out of the result.

    Returns:
        A dict keyed by PDF file name (without its directory) whose values
        are the field dicts produced by ``parse_application_text``.
    """
    results = {}
    for dirpath, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if not full_path.endswith('.pdf'):
                print(f"Skipping unsupported file: {full_path}")
                continue

            pdf_text = read_pdf(full_path)
            if pdf_text is None:
                continue

            fields = parse_application_text(pdf_text)
            if fields:
                results[filename] = fields
    return results
def match_pdf_with_data(directory, parsed_applications):
    """Attach resume PDFs to the applications whose applicant they mention.

    Every ``.pdf`` under *directory* is read. The first application (in dict
    order) whose first name and last name both occur in the resume text,
    ignoring case, gets a ``"Resume File"`` entry holding the resume's file
    name. Applications lacking a parsed first or last name are never matched.

    Args:
        directory: Folder that is walked recursively for resume PDFs.
        parsed_applications: Dict of application file name to field dict, as
            produced by ``parse_application``. It is updated in place.

    Returns:
        The same ``parsed_applications`` dict.
    """
    # Compile each applicant's name patterns once rather than once per resume.
    # Applications without both names are skipped here: re.escape(None) would
    # raise TypeError.
    name_patterns = []
    for app_file, app_data in parsed_applications.items():
        first_name = app_data.get('First Name')
        last_name = app_data.get('Last Name')
        if not first_name or not last_name:
            continue
        name_patterns.append((
            app_file,
            re.compile(re.escape(first_name), re.IGNORECASE),
            re.compile(re.escape(last_name), re.IGNORECASE),
        ))

    if not name_patterns:
        # Nothing can match, so there is no point in reading any resume.
        return parsed_applications

    for root, _dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if not file_path.endswith('.pdf'):
                continue
            text = read_pdf(file_path)
            if not text:
                continue
            for app_file, first_name_pattern, last_name_pattern in name_patterns:
                if first_name_pattern.search(text) and last_name_pattern.search(text):
                    # Record the resume under the "Resume File" key.
                    parsed_applications[app_file]["Resume File"] = file
                    break  # a resume is assigned to at most one application
    return parsed_applications
if __name__ == "__main__":
    # Parse every application PDF, then attach the resume that mentions each
    # applicant's name.
    applications = parse_application(applications_directory)
    applications = match_pdf_with_data(resumes_directory, applications)

    # Create the output folder on first run; open() would fail without it.
    os.makedirs(data_directory, exist_ok=True)
    output_file = os.path.join(data_directory, "pg-2_parsed_applications.csv")

    # An explicit UTF-8 encoding keeps the output independent of the platform
    # default, which may be unable to encode characters extracted from PDFs.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        # Header: 'Application' (the PDF file name), one column per PATTERNS
        # key, then 'Resume File'.
        fieldnames = ['Application'] + list(PATTERNS.keys()) + ['Resume File']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for file_name, parsed_data in applications.items():
            # Fields missing from parsed_data are left empty by DictWriter.
            writer.writerow({'Application': file_name, **parsed_data})