data_extractor.py
import json
from typing import Dict

from openai import OpenAI

def clean_openai_response(response_text: str) -> str:
    """Clean an OpenAI response by removing markdown formatting."""
    # Remove markdown code block indicators
    response_text = response_text.replace('```json', '').replace('```', '')
    # Remove leading/trailing whitespace
    return response_text.strip()
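
# A quick illustration (hypothetical input, not part of the original module):
#   clean_openai_response('```json\n{"name": "Jane"}\n```')  # -> '{"name": "Jane"}'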

def truncate_content(content: str, max_chars: int = 10000 // 3) -> str:
    """Truncate content by removing navigation, headers, footers, and other boilerplate."""
    if not content or len(content) <= max_chars:
        return content

    # Split content into lines
    lines = content.split('\n')

    # Common header/footer/navigation patterns to skip
    skip_patterns = [
        'navigation', 'menu', 'copyright', 'footer', 'header',
        'privacy policy', 'terms of use', 'skip to content',
        'search', 'social media', 'follow us', 'contact us',
        'all rights reserved', '©', 'cookie'
    ]

    # Keep only relevant content lines
    filtered_lines = []
    in_main_content = False
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Skip lines that match header/footer patterns
        if any(pattern in line.lower() for pattern in skip_patterns):
            continue
        # Look for main content markers
        if any(marker in line.lower() for marker in ['biography', 'research', 'publications', 'about', 'profile']):
            in_main_content = True
        if in_main_content:
            filtered_lines.append(line)
            # Stop once enough content has been collected
            if len('\n'.join(filtered_lines)) >= max_chars:
                break

    # If no main content markers were found, fall back to the middle portion
    if not filtered_lines:
        middle_start = len(lines) // 4
        middle_end = len(lines) * 3 // 4
        filtered_lines = lines[middle_start:middle_end]

    # Join (with newlines, matching the length check above) and truncate if still needed
    result = '\n'.join(filtered_lines)
    if len(result) > max_chars:
        return result[:max_chars // 2] + "\n...[content truncated]...\n" + result[-max_chars // 2:]
    return result
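
# A minimal usage sketch (hypothetical inputs, not part of the original module).
# Short content passes through untouched; long content containing none of the
# 'biography'/'research'/... markers falls back to its middle portion, capped
# near max_chars (~3,333 characters by default):
#   truncate_content("short text")            # -> "short text"
#   len(truncate_content("filler\n" * 2000))  # -> roughly 3,400 at most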

def extract_researcher_data(page_content: str) -> Dict:
    """Extract structured researcher data using OpenAI."""
    client = OpenAI()

    # Truncate content to stay within token limits
    truncated_content = truncate_content(page_content)

    system_prompt = """You are a research profile analyzer assessing academic profiles for PhD opportunities.

Return ONLY a JSON object with these exact keys:
- position (current academic position/title)
- institute (university or research institution name)
- department (academic department name)
- advisor (if PhD student/postdoc, their advisor's name)
- interests (research interests and areas)
- email (extract any email addresses)
- funding_likelihood (assess as "High", "Medium", or "Low" based on position, institution reputation, and research area)

For funding_likelihood assessment:
- "High": Full professors at top universities, or researchers in well-funded fields
- "Medium": Associate professors, or professors at mid-tier institutions
- "Low": Non-faculty positions or institutions with limited research funding"""

    user_prompt = f"""
Extract information from this webpage content, paying special attention to details that indicate funding availability.

Consider:
- Institution's research standing and size
- Professor's seniority and position
- Department's prominence
- Research field's typical funding availability

Content: {truncated_content}

Return all fields, ensuring funding_likelihood is "High", "Medium", or "Low" based on the above criteria.
"""

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # gpt-3.5-turbo chosen for better stability
            temperature=0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Clean and parse the response
        response_text = completion.choices[0].message.content
        cleaned_response = clean_openai_response(response_text)

        try:
            data = json.loads(cleaned_response)
            # Drop None values and empty dicts/lists
            data = {k: v for k, v in data.items()
                    if v and (not isinstance(v, (dict, list)) or len(v) > 0)}
            return data
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            print(f"Cleaned response: {cleaned_response[:200]}...")
            return {}
    except Exception as e:
        print(f"OpenAI extraction error: {e}")
        return {}
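

# A minimal, hedged usage sketch (not part of the original module). It assumes
# OPENAI_API_KEY is set in the environment (the OpenAI client reads it
# automatically) and uses made-up page content; the returned fields depend
# entirely on the model's response.
if __name__ == "__main__":
    sample_page = (
        "About Dr. Jane Doe\n"
        "Professor, Department of Computer Science, Example University\n"
        "Research interests: machine learning, natural language processing\n"
        "Contact: jane.doe@example.edu\n"
    )
    extracted = extract_researcher_data(sample_page)
    print(json.dumps(extracted, indent=2))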