Data_Anotation/main.py at main · BlissMe/Data_Anotation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# main.py — upgrade PHQ-9 levels and add normalized score (0–1), save to a chosen folder
import pandas as pd
import numpy as np
import re
from pathlib import Path
from typing import List, Optional

# ---- Paths (use r"" or forward slashes) ----
# Input workbook and output folder are hard-coded; edit these two constants
# to point the script at a different dataset.
SRC_PATH = Path(r"E:\Academic 7th Sem\Final Year Project\RAG\Data_Anotation\Dataset\PHQ9_Student_Depression_Dataset_Updated.xlsx")
OUT_DIR = Path(r"E:\Academic 7th Sem\Final Year Project\RAG\Data_Anotation\Output")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_PATH = OUT_DIR / "PHQ9_with_updated_levels.xlsx"   # <-- final file path

# Validate with an explicit raise instead of `assert`: assert statements are
# removed when Python runs with -O, which would silently skip this check.
if not SRC_PATH.exists():
    raise FileNotFoundError(f"Input file not found at {SRC_PATH}")

# ---------- Load ----------
# Explicit engine makes the dependency clear
df = pd.read_excel(SRC_PATH, engine="openpyxl")

# ---------- Helpers ----------
def find_total_score_column(columns: List[str]) -> Optional[str]:
    """Return the first column label that looks like a PHQ-9 total score.

    Two passes are made over ``columns``, each in the given order:

    1. A case-insensitive regex search for "phq9" (optionally written with
       a hyphen, underscore or space before the 9) followed later in the
       label by "total" or "score".
    2. A fallback comparison of the stripped label against a fixed list of
       known names, ignoring letter case.

    A regex hit anywhere in ``columns`` wins over any fallback hit.

    Args:
        columns: Column labels to inspect; each is converted with ``str()``
            before matching.

    Returns:
        The matching label exactly as it appears in ``columns``, or ``None``
        when neither pass finds a match.
    """
    patt = re.compile(r"(?:^|\b)phq[-_ ]?9\b.*?(total|score)", re.IGNORECASE)
    for c in columns:
        if patt.search(str(c)):
            return c

    # The word boundary after the "9" in the pattern above does not fire
    # before "_" (underscore is a word character), so underscore-joined
    # labels such as "PHQ9_Score" are only caught here. Compare case-folded
    # so that "phq9_score" or "PHQ9_TOTAL" are recognized as well.
    known_labels = {
        label.casefold()
        for label in (
            "PHQ9",
            "PHQ_9",
            "PHQ-9",
            "PHQ9_Score",
            "PHQ9_Total",
            "Total_PHQ9",
            "Total Score PHQ9",
            "PHQ-9 Score",
        )
    }
    for c in columns:
        if str(c).strip().casefold() in known_labels:
            return c
    return None

def safe_add(df: pd.DataFrame, colname: str, series: pd.Series) -> str:
    """Store ``series`` in ``df`` under a column name that is not yet taken.

    ``colname`` is used as-is when it is free. Otherwise ``colname__2``,
    ``colname__3``, ... are tried in turn until an unused name is found.
    ``df`` is modified in place.

    Args:
        df: Frame that receives the new column.
        colname: Preferred column name.
        series: Values to assign to the new column.

    Returns:
        The column name that was actually used.
    """
    candidate = colname
    next_suffix = 2  # the first alternative name carries the suffix "__2"
    while candidate in df.columns:
        candidate = f"{colname}__{next_suffix}"
        next_suffix += 1
    df[candidate] = series
    return candidate

# ---------- Detect & compute raw ----------
# Locate the total-score column among the loaded frame's labels; stop early
# when none is recognizable.
total_col = find_total_score_column(df.columns.tolist())
if total_col is None:
    raise RuntimeError("Couldn't find a PHQ-9 total score column. Please ensure your file has one (e.g., 'PHQ-9 Score').")

# Cells that cannot be parsed as numbers become NaN; parsed values are then
# limited to the 0..27 range.
phq_raw = pd.to_numeric(df[total_col], errors="coerce")
phq_raw = phq_raw.clip(lower=0, upper=27)

# ---------- Add requested columns ----------
# Normalized score on 0..1, rounded to four decimals
normalized = phq_raw.div(27.0).round(4)

def updated_level(x):
    """Map a PHQ-9 total score to one of three level labels.

    Bands are contiguous, so fractional scores inside 0..27 (for example
    4.5 or 14.5) receive a label instead of falling between two
    integer-only ranges:

    * ``0 <= x < 5``    -> "Minimal"
    * ``5 <= x < 15``   -> "Moderate"
    * ``15 <= x <= 27`` -> "Severe"

    Args:
        x: Score to classify; may be missing (NaN/None).

    Returns:
        The level label, or ``None`` when ``x`` is missing or lies outside
        the 0..27 range.
    """
    if pd.isna(x):
        return None
    # Reject out-of-range values up front so the thresholds below only need
    # an upper bound each.
    if x < 0 or x > 27:
        return None
    if x < 5:
        return "Minimal"
    if x < 15:
        return "Moderate"
    return "Severe"

# Label every score with its level
updated_levels = phq_raw.apply(updated_level)

# Append both derived columns; safe_add returns the name actually used in
# case the preferred one already exists in the frame.
col_norm = safe_add(df, "Normalized_PHQ9_Score", normalized)
col_lvl = safe_add(df, "Updated_Depression_Level", updated_levels)

# ---------- Save ----------
# Use openpyxl to write too (only one dependency needed)
df.to_excel(OUT_PATH, sheet_name="Updated", index=False, engine="openpyxl")

print(f"✅ Wrote: {OUT_PATH}")
# Preview: first ten rows of the source score next to the two new columns
print(df.head(10)[[total_col, col_norm, col_lvl]].to_string(index=False))