# CoDHy/hvalidator.py (baksho/CoDHy repository on GitHub, main branch)

import json
import requests
from Bio import Entrez

class ValidationAgent:
    """Review drug-combination hypotheses using PubMed, the knowledge graph and an LLM.

    For each hypothesis the agent:

    1. searches PubMed for the combination (with and without the disease term),
    2. collects per-drug status, side effects and provenance URLs from ``kg``,
    3. asks the model served at ``api_url`` for a JSON verdict,
    4. merges the verdict and the literature metadata back into the hypothesis.

    ``kg`` must provide ``get_validated_subgraph(drug_name=, focus_gene=,
    cancer_type=)`` and a ``driver`` attribute with a ``session()`` context
    manager whose ``run(...).data()`` returns a list of dict rows.
    """

    # Cypher that fetches one drug's clinical phase, withdrawal flag, side
    # effects and the source URLs of its edges. Kept at class level so the
    # text is built once rather than once per drug.
    _DRUG_PROFILE_QUERY = """
                MATCH (d:Drug)
                WHERE toLower(d.name) CONTAINS toLower($name)
                  OR  toLower($name) CONTAINS toLower(d.name)

                OPTIONAL MATCH (g:Gene)
                WHERE toLower(g.name) CONTAINS toLower($focus_gene)
                  OR toLower($focus_gene) CONTAINS toLower(g.name)

                WITH d, g
                OPTIONAL MATCH (d)-[r1]-(:Phase)
                OPTIONAL MATCH (d)-[r2]-(g)

                WITH d, g, (collect(DISTINCT r1.source_url) + collect(DISTINCT r2.source_url)) as phase_urls
                OPTIONAL MATCH (d)-[r_civic]-(x)
                WHERE r_civic.source = "CIViC"

                WITH d, g, phase_urls, collect(DISTINCT r_civic.source_url) as civic_urls
                OPTIONAL MATCH (t:ClinicalTrial)-[r_trial1]-(d)
                OPTIONAL MATCH (t:ClinicalTrial)-[r_trial2]-(g)

                WITH d, g, phase_urls, civic_urls, (collect(DISTINCT r_trial1.source_url) + collect(DISTINCT r_trial2.source_url)) as trial_urls, collect(DISTINCT t.nct_id) as nct_ids
                OPTIONAL MATCH (g)-[r_pathway]-(p:Pathway)

                WITH d, g, phase_urls, civic_urls, trial_urls, nct_ids, collect(DISTINCT r_pathway.source_url) as reactome_urls
                OPTIONAL MATCH (d)-[:CAUSES_SIDE_EFFECT]->(se:SideEffect)

                RETURN
                    d.name as drug_name,
                    d.max_phase as phase,
                    d.is_withdrawn as withdrawn,
                    g.name as gene_name,
                    phase_urls,
                    civic_urls,
                    trial_urls,
                    reactome_urls,
                    collect(DISTINCT se.name)[..15] as side_effects
                """

    # Hypothesis keys that receive provenance URLs, mapped to the query column
    # they are filled from.
    _URL_FIELDS = {
        "chembl_urls": "phase_urls",
        "civic_urls": "civic_urls",
        "trial_urls": "trial_urls",
        "reactome_urls": "reactome_urls",
    }

    def __init__(self, kg, model_name=None, OLLAMA_BASE_URL="http://localhost:11434/api/generate"):
        """Store the graph handle and model settings, and set the Entrez e-mail.

        Args:
            kg: Knowledge-graph wrapper (see the class docstring for what is used).
            model_name: Model name sent to the generate endpoint; defaults to
                ``"llama3.1"``.
            OLLAMA_BASE_URL: Full URL of the generate endpoint.
        """
        self.kg = kg
        self.model = model_name or "llama3.1"
        self.api_url = OLLAMA_BASE_URL
        # Keep an address the caller already set on Bio.Entrez; only fall back
        # to the project default when none is configured. (Looking the dotted
        # name up in globals() could never find it.)
        self.email = getattr(Entrez, "email", None) or "basak.suvinava@mh-hannover.de"
        Entrez.email = self.email

    def check_combination_evidence(self, drugs, cancer_type):
        """
        Performs a 'Just-in-Time' PubMed search for the specific combination.

        Args:
            drugs: List of drug names making up the combination.
            cancer_type: Disease term added to the in-context query. When it is
                empty or ``None`` the in-context result mirrors the exact one.

        Returns:
            A dict that always contains ``status``, ``urls``, ``ne_hits``,
            ``nc_hits``, ``is_novel_exact`` and ``is_novel_context``. With
            fewer than two drugs the status is ``"Single Agent"``, a ``note``
            key is added, and the novelty flags are ``False`` because there is
            no combination to be novel.
        """
        if len(drugs) < 2:
            return {
                "status": "Single Agent",
                "urls": [],
                "note": "Single drug",
                # Same keys as the combination result so callers can read them
                # unconditionally.
                "ne_hits": 0,
                "nc_hits": 0,
                "is_novel_exact": False,
                "is_novel_context": False,
            }

        # Construct Query: "Drug A" AND "Drug B"
        quoted = [f'"{d}"' for d in drugs]
        query_ne = f"({' AND '.join(quoted)})"                  # Novelty-Exact (NE)
        ne_hits, ne_ids = self._get_pubmed_count(query_ne)

        if cancer_type:
            query_nc = f"{query_ne} AND {cancer_type}"          # Novelty-in-Context (NC)
            nc_hits, nc_ids = self._get_pubmed_count(query_nc)
        else:
            # Without a disease term the two queries are identical; reuse the
            # result instead of searching PubMed for the literal word "None".
            nc_hits, nc_ids = ne_hits, ne_ids

        has_literature = ne_hits > 0 or nc_hits > 0
        found_urls = []
        if has_literature:
            # dict.fromkeys de-duplicates while keeping a stable order.
            unique_ids = dict.fromkeys([*ne_ids, *nc_ids])
            found_urls = [f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" for pmid in unique_ids]

        return {
            "status": "Validated (Literature Exists)" if has_literature else "Inferred (Novel Combination)",
            "urls": found_urls,
            "ne_hits": ne_hits,
            "nc_hits": nc_hits,
            "is_novel_exact": ne_hits == 0,
            "is_novel_context": nc_hits == 0,
        }

    def _get_pubmed_count(self, query):
        """Return ``(hit_count, first_pmids)`` for a PubMed query.

        At most five PMIDs are requested. This is best effort: any failure is
        reported on stdout and yields ``(0, [])``.
        NOTE(review): a failed lookup is therefore indistinguishable from
        "no literature" for callers.
        """
        try:
            handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            return int(record["Count"]), list(record["IdList"])
        except Exception as exc:
            print(f"   PubMed lookup failed for query {query!r}: {exc}")
            return 0, []

    @staticmethod
    def _parse_drug_names(combo_str):
        """Split a 'A + B, C 10mg' style string into first-word drug names."""
        parts = combo_str.replace("+", ",").split(",")
        return [part.split()[0] for part in parts if part.strip()]

    @staticmethod
    def _clean_urls(url_list):
        """Keep only string entries that look like http(s) URLs."""
        return [u for u in (url_list or []) if isinstance(u, str) and u.startswith("http")]

    def _profile_drug(self, drug, focus_gene, cancer_type):
        """Build the prompt line for one drug and collect its provenance URLs.

        Returns:
            ``(profile_line, urls)`` where ``urls`` maps each hypothesis URL key
            to a list of cleaned URLs (empty lists when the drug is unknown).
        """
        validated_facts = self.kg.get_validated_subgraph(
            drug_name=drug,
            focus_gene=focus_gene,
            cancer_type=cancer_type
        )
        fact_entries = []
        for record in validated_facts or []:
            source = record.get('r.source', 'Unknown')
            rel = record.get('type(r)', 'interacts with')
            obj = record.get('o.name', 'Unknown')
            fact_entries.append(f"({source} Fact): {drug} {rel} {obj}")

        with self.kg.driver.session() as session:
            rows = session.run(self._DRUG_PROFILE_QUERY, name=drug, focus_gene=focus_gene).data()

        urls = {field: [] for field in self._URL_FIELDS}
        if not rows:
            return f"DRUG: {drug} | Not found in Knowledge Graph.", urls

        # Only the first row is used, as in the original implementation.
        row = rows[0]
        for field, column in self._URL_FIELDS.items():
            urls[field] = self._clean_urls(row[column])

        phase = row['phase']
        status_str = f"Phase {phase}" if phase else "Unknown Status"
        if row['withdrawn']:
            status_str += " (WITHDRAWN)"
        side_effects = row['side_effects']
        se_str = ", ".join(side_effects) if side_effects else "No data"
        # The label is emitted once here; it used to be duplicated.
        facts_str = "; ".join(fact_entries) if fact_entries else "No data"
        profile = (
            f"DRUG: {row['drug_name']} | STATUS: {status_str} | "
            f"SIDE EFFECTS: {se_str} | Biological Facts: {facts_str}"
        )
        return profile, urls

    @staticmethod
    def _build_prompt(combo_str, cancer_type, combo_check, safety_str):
        """Assemble the full prompt (system text plus task) sent to the model."""
        # The task text tells the model to cite the "Real PubMed Hits" from the
        # evidence section, so the URLs must actually be listed there.
        pubmed_hits = "\n".join(combo_check['urls']) or "None found"
        system_prompt = "You are a clinical auditor. You need to validate the drug combination hypothesis for the given disease."
        user_prompt = f"""
            HYPOTHESIS: {combo_str}
            DISEASE: {cancer_type}

            [EVIDENCE CHECK]
            Status: {combo_check['status']}
            Real PubMed Hits:
            {pubmed_hits}

            [INDIVIDUAL DRUG PROFILES]
            {safety_str}

            TASK:
            1. Assign a Safety Score (1-10). If a drug is "WITHDRAWN", score must be < 3.
            2. Evaluate Plausibility (Biological sense) (Low/Moderate/High) with detailed reasoning.
            3. Assess Combination Toxicity Risk (Low/Moderate/High) with proper reasoning based on the [EVIDENCE CHECK] and [INDIVIDUAL_DRUG_PROFILES]. If Evidence Status is "Inferred", then you MUST predict the toxicity and state "Predicted based on individual profiles" and explain your reasoning for the overlapping toxicities.
            4. Write a short critique.
            5. Provide supporting evidences:
              - For every claim, explain the finding.
              - For citation of you claim, you MUST output the "Real PubMed Hits" listed above in the Evidence Check section.
              - If the list is empty, WRITE "No direct clinical study found." after your claim and cite no URL.
              - DO NOT invent new URLs.

            HARD RULES FOR HALLUCINATION PREVENTION:
            - DO NOT use numeric placeholders like [1], [2].
            - DO NOT use placeholders like [Journal Name], [Year], or [Source] etc.
            - DO NOT generate fake PubMed links (e.g., pubmed.ncbi.nlm.nih.gov/12345678).
            - DO NOT make up Source Names or IDs.
            - If you do not see a "http..." link in the context provided above, DO NOT WRITE A URL.

            OUTPUT FORMAT (JSON):
            {{
                "safety_score": 8,
                "plausibility": "(Low/Moderate/High). Reason: ...",
                "combination_toxicity_risk": "(Low/Moderate/High). Reason: ...",
                "critique": "...",
                "supporting_evidence": "..."
            }}
            """
        return system_prompt + "\n\n" + user_prompt

    def _request_verdict(self, prompt):
        """POST the prompt to the model endpoint and return its verdict dict.

        Transport and HTTP errors propagate to the caller. A reply that is not
        a JSON object is converted into an explicit error verdict.
        """
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {"temperature": 0.0}
        }
        response = requests.post(self.api_url, json=payload, timeout=300)
        # Treat an HTTP error as a failure instead of reading its body as an
        # empty verdict.
        response.raise_for_status()
        try:
            verdict = json.loads(response.json().get("response", "{}"))
        except (ValueError, TypeError, AttributeError):
            verdict = None
        if not isinstance(verdict, dict):
            return {"verdict": "Error", "safety_score": 0, "critique": "Validation agent failed to output JSON."}
        return verdict

    def validate_batch(self, hypotheses_list):
        """Validate every hypothesis dict in ``hypotheses_list``.

        Each dict is read for ``combination``, ``focus_gene``,
        ``target_disease`` and ``id`` and is updated **in place** with the
        provenance URL lists, the model verdict and the literature metadata
        (``source_urls``, ``evidence_status``, ``ne_hits``, ``nc_hits``,
        ``is_novel_exact``, ``is_novel_context``). Non-dict entries are skipped.

        Returns:
            The list of processed hypothesis dicts. When the model call fails
            an entry carries ``verdict == "Error"`` and ``safety_score == 0``.
        """
        print("\nValidation Agent is reviewing candidates...")
        validated_results = []

        if not hypotheses_list:
            print("   No hypotheses to validate.")
            return []

        for hypo in hypotheses_list:
            if not isinstance(hypo, dict):
                print(f"   Skipping invalid data format: {hypo}")
                continue

            # `or ''` also covers an explicit None value for the key.
            combo_str = hypo.get('combination') or ''
            drugs = self._parse_drug_names(combo_str)
            focus_gene = hypo.get('focus_gene')
            cancer_type = hypo.get('target_disease')

            print(f"   -> Reviewing Hypothesis {hypo.get('id', '?')}: {combo_str}")

            combo_check = self.check_combination_evidence(drugs, cancer_type)

            # Retrieve Individual Safety Data and URLs from Graph
            safety_context = []
            collected = {field: [] for field in self._URL_FIELDS}
            for drug in drugs:
                profile, urls = self._profile_drug(drug, focus_gene, cancer_type)
                safety_context.append(profile)
                for field, values in urls.items():
                    collected[field].extend(values)
            for field, values in collected.items():
                # Order-preserving de-duplication keeps the output reproducible.
                hypo[field] = list(dict.fromkeys(values))

            prompt = self._build_prompt(combo_str, cancer_type, combo_check, "\n".join(safety_context))

            try:
                hypo.update(self._request_verdict(prompt))
            except Exception as e:
                print(f"Validation Error: {e}")
                hypo.update({"verdict": "Error", "safety_score": 0})

            # Applied after the verdict so model output cannot overwrite these
            # keys, and on both paths so failed entries keep their evidence.
            hypo['source_urls'] = list(dict.fromkeys(combo_check['urls']))
            hypo['evidence_status'] = combo_check['status']
            hypo['ne_hits'] = combo_check['ne_hits']
            hypo['nc_hits'] = combo_check['nc_hits']
            hypo['is_novel_exact'] = combo_check['is_novel_exact']
            hypo['is_novel_context'] = combo_check['is_novel_context']

            validated_results.append(hypo)

        return validated_results