-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
74 lines (61 loc) · 2.55 KB
/
main.py
File metadata and controls
74 lines (61 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# main.py — upgrade PHQ-9 levels and add normalized score (0–1), save to a chosen folder
import pandas as pd
import numpy as np
import re
from pathlib import Path
from typing import List, Optional
# ---- Paths (use r"" or forward slashes) ----
SRC_PATH = Path(r"E:\Academic 7th Sem\Final Year Project\RAG\Data_Anotation\Dataset\PHQ9_Student_Depression_Dataset_Updated.xlsx")
OUT_DIR = Path(r"E:\Academic 7th Sem\Final Year Project\RAG\Data_Anotation\Output")
OUT_PATH = OUT_DIR / "PHQ9_with_updated_levels.xlsx"  # <-- final file path

# Validate the input with an explicit raise rather than `assert`: assertions
# are removed when Python runs with -O, which would let a missing file slip
# through to a less helpful error inside read_excel.
if not SRC_PATH.exists():
    raise FileNotFoundError(f"Input file not found at {SRC_PATH}")

# Create the output folder only once the input is known to exist, so a failed
# run does not leave an empty directory behind.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Load ----------
# Explicit engine makes the dependency clear
df = pd.read_excel(SRC_PATH, engine="openpyxl")
# ---------- Helpers ----------
def find_total_score_column(columns: List[str]) -> Optional[str]:
    """Return the first column label that looks like a PHQ-9 total score.

    Three passes are made over ``columns`` (each in the given order); the
    first pass that produces a hit decides the result:

    1. A case-insensitive regex: ``phq``, an optional ``-``/``_``/space,
       ``9`` at a word boundary, later followed by ``total`` or ``score``.
    2. An exact, case-sensitive match of the whitespace-stripped label
       against a list of known spellings.
    3. A relaxed match that ignores case and every non-alphanumeric
       character. This catches headers the first two passes miss, such as
       ``phq9_score`` (the underscore defeats the regex word boundary and
       the lower case defeats the exact list) or ``Total PHQ-9``.

    Args:
        columns: Column labels to inspect. Each label is converted with
            ``str`` before matching.

    Returns:
        The matching label exactly as it appears in ``columns``, or ``None``
        when no label matches.
    """
    patt = re.compile(r"(?:^|\b)phq[-_ ]?9\b.*?(total|score)", re.IGNORECASE)
    for c in columns:
        if patt.search(str(c)):
            return c

    candidates = ("PHQ9", "PHQ_9", "PHQ-9", "PHQ9_Score", "PHQ9_Total", "Total_PHQ9", "Total Score PHQ9", "PHQ-9 Score")
    for c in columns:
        if str(c).strip() in candidates:
            return c

    def _squash(label: object) -> str:
        # Lower-case the label and drop separators/punctuation so that
        # "PHQ-9 Score", "phq9_score" and "Phq 9 score" all compare equal.
        return re.sub(r"[^0-9a-z]", "", str(label).casefold())

    # Last resort only: runs after the two original passes, so any label they
    # would have returned is still returned unchanged.
    relaxed = {_squash(name) for name in candidates}
    for c in columns:
        if _squash(c) in relaxed:
            return c
    return None
def safe_add(df: pd.DataFrame, colname: str, series: pd.Series) -> str:
    """Attach ``series`` to ``df`` without overwriting an existing column.

    If ``colname`` is free it is used as-is. Otherwise ``__2``, ``__3``, ...
    is appended until an unused label is found. ``df`` is modified in place.

    Args:
        df: Frame that receives the new column.
        colname: Preferred label for the new column.
        series: Values to store under the chosen label.

    Returns:
        The label that was actually used.
    """
    chosen = colname
    suffix = 2  # the first alternative tried is "<colname>__2"
    while chosen in df.columns:
        chosen = f"{colname}__{suffix}"
        suffix += 1
    df[chosen] = series
    return chosen
# ---------- Locate the score column & derive the raw score ----------
total_col = find_total_score_column(df.columns.tolist())
if total_col is None:
    raise RuntimeError("Couldn't find a PHQ-9 total score column. Please ensure your file has one (e.g., 'PHQ-9 Score').")

# Cells that cannot be parsed as numbers become NaN; parsed values are then
# clamped into the interval [0, 27].
phq_raw = pd.to_numeric(df[total_col], errors="coerce")
phq_raw = phq_raw.clip(lower=0, upper=27)

# ---------- Add requested columns ----------
# Score rescaled to 0..1 (divided by 27), rounded to four decimal places
normalized = phq_raw.div(27.0).round(4)
def updated_level(x) -> Optional[str]:
    """Map a PHQ-9 total score to one of three level labels.

    Bands (contiguous, so fractional scores are classified too):

    * ``0 <= x < 5``   -> ``"Minimal"``
    * ``5 <= x < 15``  -> ``"Moderate"``
    * ``15 <= x <= 27`` -> ``"Severe"``

    For whole-number scores this is the same as 0-4 / 5-14 / 15-27. Using
    half-open bands means a value such as 4.5 or 14.5 no longer falls into
    a gap between bands and comes back as ``None``.

    Args:
        x: The score to classify; may be missing (``None``/NaN).

    Returns:
        The level label, or ``None`` when ``x`` is missing or lies outside
        the range 0..27.
    """
    # Missing values and out-of-range scores get no label.
    if pd.isna(x) or not 0 <= x <= 27:
        return None
    if x < 5:
        return "Minimal"
    if x < 15:
        return "Moderate"
    return "Severe"
# Label every row's raw score, then attach both derived columns under names
# that do not collide with columns already present in the sheet.
updated_levels = phq_raw.apply(updated_level)
col_norm = safe_add(df, "Normalized_PHQ9_Score", normalized)
col_lvl = safe_add(df, "Updated_Depression_Level", updated_levels)

# ---------- Save ----------
# Write with openpyxl as well, so reading and writing share one dependency
df.to_excel(OUT_PATH, index=False, sheet_name="Updated", engine="openpyxl")
print(f"✅ Wrote: {OUT_PATH}")

# Show the first ten rows of the source score next to the two new columns.
preview = df[[total_col, col_norm, col_lvl]].head(10)
print(preview.to_string(index=False))