invisible-threads/find_debates.py at main · baboonzero/invisible-threads · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
"""
Find genuine debates using topic-stance extraction.

Step 1: Extract TOPIC + STANCE from each insight
Step 2: Group insights by topic
Step 3: Find opposing stances within topics

Usage:
    modal run insights_first/find_debates.py --input insights_first/data/modal_extraction_20260120_024600.json
"""

import modal
import json
import numpy as np
from datetime import datetime
from collections import defaultdict

# Modal application object; the GPU class and the local entrypoint in this
# file are registered on it through its decorators.
app = modal.App("debate-finder")

# Hugging Face model identifier handed to vLLM when a container starts.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
# Named Modal volume (created if it does not exist yet). DebateFinder mounts it
# at /model-cache and points vLLM's download_dir there -- presumably so the
# downloaded weights are reused between runs.
model_volume = modal.Volume.from_name("qwen-model-cache", create_if_missing=True)

# Container image for the GPU workers: Debian slim with Python 3.11 plus the
# inference dependencies installed via pip.
vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("vllm>=0.6.0", "torch", "transformers", "huggingface_hub")
)

# Step 1: Extract topic and stance
# Prompt template rendered with str.format(insight=...). The doubled braces
# ({{ and }}) are format escapes: they come out as single literal braces, so the
# JSON skeleton shown to the model survives formatting. The three keys listed
# in that skeleton ("topic", "stance", "topic_category") are what the model is
# asked to return.
EXTRACTION_PROMPT = """Extract the TOPIC and STANCE from this business insight.

TOPIC: The specific question or decision this insight addresses.
- Should be a question like "How to handle X?" or "Should you do Y?"
- Be specific, not vague categories

STANCE: The position this insight takes on that topic.
- What does it recommend or conclude?
- Be direct and specific

## Insight:
{insight}

## Respond with JSON only:
{{
  "topic": "The specific question/decision addressed (as a question)",
  "stance": "The position taken (direct statement)",
  "topic_category": "2-3 word category (e.g., 'hiring', 'product strategy', 'team management')"
}}"""

# Step 3: Check for genuine opposition
# Prompt template rendered with str.format(topic=..., stance_a=..., insight_a=...,
# stance_b=..., insight_b=...). The doubled braces ({{ and }}) are format escapes
# that render as literal braces around the JSON skeleton. The model is asked for
# a boolean verdict plus a 1-10 quality score; the scoring rubric sits after
# the skeleton at the end of the prompt.
OPPOSITION_PROMPT = """Do these two insights take GENUINELY OPPOSITE positions on the same topic?

## Topic: {topic}

## Insight A stance: {stance_a}
Full insight: {insight_a}

## Insight B stance: {stance_b}
Full insight: {insight_b}

## Criteria for GENUINE OPPOSITION:
- They address the EXACT same decision/question
- They recommend OPPOSITE actions (not just different emphasis)
- A person couldn't follow both - they're mutually exclusive

## NOT opposition:
- Different aspects of same topic
- Complementary perspectives
- Different contexts where both could apply
- One is more specific than the other

## Respond with JSON:
{{
  "is_genuine_opposition": true/false,
  "opposition_quality": 1-10,
  "debate_question": "The question they disagree on (or null)",
  "side_a": "Position A in 5 words (or null)",
  "side_b": "Position B in 5 words (or null)",
  "explanation": "Why this is or isn't genuine opposition"
}}

Quality 8-10: Clear "do X" vs "don't do X" on exact same decision
Quality 5-7: Tension but could coexist in some contexts
Quality 1-4: Different topics or complementary views"""


def _parse_json_object(response: str) -> dict:
    """Decode the first flat ``{...}`` object found in an LLM response.

    Args:
        response: Raw text generated by the model.

    Returns:
        The decoded JSON object on success. Otherwise a dict with an
        ``"error"`` key (``"No JSON"`` when no brace-delimited span exists,
        ``"Parse failed"`` when the span is not valid JSON) and the first
        200 characters of the response under ``"raw"`` for debugging.
    """
    import re

    # [^{}]* deliberately matches only a flat object (no nested braces),
    # which is the shape both prompts ask the model to produce.
    match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
    if match is None:
        return {"error": "No JSON", "raw": response[:200]}
    try:
        return json.loads(match.group())
    except json.JSONDecodeError:
        # Only malformed JSON is treated as a soft failure; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return {"error": "Parse failed", "raw": response[:200]}


@app.cls(
    gpu="A10G",
    image=vllm_image,
    volumes={"/model-cache": model_volume},
    timeout=600,
    scaledown_window=300,
)
class DebateFinder:
    """GPU-backed worker that runs both LLM passes of the debate pipeline.

    The model is loaded once per container in ``load_model``; the two batch
    methods then reuse ``self.llm`` for topic/stance extraction and for the
    pairwise opposition check.
    """

    @modal.enter()
    def load_model(self):
        """Load the vLLM engine and build the sampling parameters for both passes."""
        from vllm import LLM, SamplingParams

        self.llm = LLM(
            model=MODEL_ID,
            download_dir="/model-cache",
            trust_remote_code=True,
            max_model_len=4096,
            gpu_memory_utilization=0.9,
        )
        self.extract_params = SamplingParams(temperature=0.2, max_tokens=300)
        self.oppose_params = SamplingParams(temperature=0.2, max_tokens=400)

    @modal.method()
    def extract_topic_stance_batch(self, insights: list[tuple]) -> list[dict]:
        """Extract topic and stance from a batch of insights.

        Args:
            insights: ``(insight_text, idx)`` pairs; ``idx`` is passed through
                untouched so the caller can match results to its own list.

        Returns:
            One dict per input, in input order. Each holds the model's parsed
            JSON (or an ``"error"``/``"raw"`` pair when parsing failed) plus
            ``"idx"`` and ``"insight_text"`` copied from the input.
        """
        if not insights:
            # Nothing to generate; avoid calling the engine with no prompts.
            return []

        prompts = [EXTRACTION_PROMPT.format(insight=text) for text, _ in insights]
        outputs = self.llm.generate(prompts, self.extract_params)

        results = []
        for (text, idx), output in zip(insights, outputs):
            result = _parse_json_object(output.outputs[0].text.strip())
            result["idx"] = idx
            result["insight_text"] = text
            results.append(result)

        return results

    @modal.method()
    def check_opposition_batch(self, pairs: list[dict]) -> list[dict]:
        """Check if pairs of insights are genuinely opposed.

        Args:
            pairs: Dicts with keys ``topic``, ``idx_a``, ``idx_b``,
                ``stance_a``, ``stance_b``, ``insight_a`` and ``insight_b``.

        Returns:
            One dict per pair, in input order: the model's parsed verdict (or
            an ``"error"``/``"raw"`` pair when parsing failed) with the seven
            input fields copied back in so each result is self-describing.
        """
        if not pairs:
            return []

        prompts = [
            OPPOSITION_PROMPT.format(
                topic=p['topic'],
                stance_a=p['stance_a'],
                insight_a=p['insight_a'],
                stance_b=p['stance_b'],
                insight_b=p['insight_b'],
            )
            for p in pairs
        ]
        outputs = self.llm.generate(prompts, self.oppose_params)

        # Input fields echoed onto every result; assigned after parsing so the
        # caller's values win over any same-named keys the model invented.
        passthrough = (
            "idx_a", "idx_b", "topic",
            "insight_a", "insight_b",
            "stance_a", "stance_b",
        )

        results = []
        for p, output in zip(pairs, outputs):
            result = _parse_json_object(output.outputs[0].text.strip())
            for key in passthrough:
                result[key] = p[key]
            results.append(result)

        return results


@app.local_entrypoint()
def main(input: str, topic_similarity: float = 0.7, min_quality: int = 7, batch_size: int = 50):
    """Find genuine debates through topic-stance extraction.

    Pipeline: (1) ask the LLM for a topic and stance per insight, (2) group
    insights whose topic embeddings are similar, (3) ask the LLM whether pairs
    inside each group genuinely oppose each other, then print and save the
    results.

    Args:
        input: Path to a JSON file with a top-level ``"results"`` list. Entries
            need ``"has_insight"`` and ``"insight"``; each insight is read as a
            dict with ``"insight_text"`` and ``"episode_title"``.
        topic_similarity: Minimum cosine similarity between topic embeddings
            for two insights to land in the same group.
        min_quality: Minimum ``opposition_quality`` score for a pair to be
            reported as a debate.
        batch_size: Number of prompts sent to the GPU worker per remote call.

    The output file name is ``input`` with ``modal_extraction_`` replaced by
    ``debates_``; if that marker is absent, ``_debates`` is appended to the
    file stem so the input file is never overwritten.
    """
    import os
    import time

    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    def quality_of(result: dict) -> float:
        """Return the model's score as a number; 0 when missing or non-numeric."""
        # The score comes from LLM-generated JSON, so it may be null, a string
        # such as "8", or absent entirely.
        try:
            return float(result.get('opposition_quality'))
        except (TypeError, ValueError):
            return 0.0

    def is_genuine(result: dict) -> bool:
        """Interpret the model's verdict, tolerating a quoted "true"/"false"."""
        flag = result.get('is_genuine_opposition')
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        return bool(flag)

    def is_debate(result: dict) -> bool:
        """True when the pair is genuine opposition at or above min_quality."""
        return is_genuine(result) and quality_of(result) >= min_quality

    # Load insights
    print(f"Loading insights from {input}...")
    with open(input, 'r', encoding='utf-8') as f:
        data = json.load(f)

    insights = [r['insight'] for r in data['results'] if r['has_insight'] and r['insight']]
    print(f"Loaded {len(insights)} insights")

    # Step 1: Extract topic + stance from all insights
    print(f"\n{'='*60}")
    print("STEP 1: Extracting topics and stances")
    print('='*60)

    finder = DebateFinder()

    # idx travels with each insight because results arrive out of order below.
    insight_inputs = [(i['insight_text'], idx) for idx, i in enumerate(insights)]
    batches = [insight_inputs[i:i+batch_size] for i in range(0, len(insight_inputs), batch_size)]

    start = time.time()

    extracted = []
    for batch_results in finder.extract_topic_stance_batch.map(batches, order_outputs=False):
        extracted.extend(batch_results)
        print(f"  Progress: {len(extracted)}/{len(insights)} insights processed")

    print(f"Extraction complete in {time.time() - start:.1f}s")

    # Filter out errors; topic and stance must be strings because they are
    # embedded and sliced later on.
    valid_extracted = [
        e for e in extracted
        if isinstance(e.get('topic'), str) and isinstance(e.get('stance'), str)
    ]
    print(f"Valid extractions: {len(valid_extracted)}/{len(extracted)}")

    if not valid_extracted:
        # An empty embedding matrix cannot be fed to cosine_similarity.
        print("No valid topic/stance extractions. Nothing to compare.")
        return

    # Step 2: Group by topic using embeddings
    print(f"\n{'='*60}")
    print("STEP 2: Grouping by topic similarity")
    print('='*60)

    print("Computing topic embeddings...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    topics = [e['topic'] for e in valid_extracted]
    topic_embeddings = model.encode(topics, show_progress_bar=True)

    # Find similar topic pairs
    print(f"Finding similar topics (threshold={topic_similarity})...")
    sim_matrix = cosine_similarity(topic_embeddings)

    def episode_of(extraction: dict):
        """Episode title of the insight an extraction came from."""
        return insights[extraction['idx']]['episode_title']

    # Group insights by similar topics: greedy, each insight joins at most one
    # group, seeded by the first unused insight.
    topic_groups = defaultdict(list)
    used = set()

    for i in range(len(valid_extracted)):
        if i in used:
            continue

        group = [i]
        used.add(i)
        seed_episode = episode_of(valid_extracted[i])

        for j in range(i + 1, len(valid_extracted)):
            if j in used or sim_matrix[i, j] < topic_similarity:
                continue
            # Also check same episode exclusion
            if episode_of(valid_extracted[j]) != seed_episode:
                group.append(j)
                used.add(j)

        if len(group) >= 2:
            topic_groups[valid_extracted[i]['topic']].extend(group)

    print(f"Found {len(topic_groups)} topic groups with 2+ insights")

    # Step 3: Check for opposition within groups
    print(f"\n{'='*60}")
    print("STEP 3: Finding opposing stances within topics")
    print('='*60)

    # Create pairs to check
    pairs_to_check = []
    for topic, indices in topic_groups.items():
        for i, idx_i in enumerate(indices):
            e_i = valid_extracted[idx_i]
            for idx_j in indices[i+1:]:
                e_j = valid_extracted[idx_j]

                # Grouping only compares members with the seed, so two
                # non-seed members can still share an episode; skip those.
                if episode_of(e_i) == episode_of(e_j):
                    continue

                pairs_to_check.append({
                    'topic': topic,
                    'idx_a': e_i['idx'],
                    'idx_b': e_j['idx'],
                    'stance_a': e_i['stance'],
                    'stance_b': e_j['stance'],
                    'insight_a': e_i['insight_text'],
                    'insight_b': e_j['insight_text'],
                })

    print(f"Checking {len(pairs_to_check)} potential debate pairs...")

    if not pairs_to_check:
        print("No pairs to check. Try lowering topic_similarity threshold.")
        return

    # Batch check opposition
    pair_batches = [pairs_to_check[i:i+batch_size] for i in range(0, len(pairs_to_check), batch_size)]

    start = time.time()
    opposition_results = []
    debates_so_far = 0
    for batch_results in finder.check_opposition_batch.map(pair_batches, order_outputs=False):
        opposition_results.extend(batch_results)
        debates_so_far += sum(1 for r in batch_results if is_debate(r))
        print(f"  Progress: {len(opposition_results)}/{len(pairs_to_check)} pairs, {debates_so_far} debates found")

    print(f"Opposition check complete in {time.time() - start:.1f}s")

    # Filter to genuine debates, best first
    debates = sorted(
        (r for r in opposition_results if is_debate(r)),
        key=quality_of,
        reverse=True,
    )

    # Summary
    print(f"\n{'='*60}")
    print("DEBATE ANALYSIS RESULTS")
    print('='*60)
    print(f"Topic groups analyzed: {len(topic_groups)}")
    print(f"Pairs checked: {len(opposition_results)}")
    print(f"Genuine oppositions found: {sum(1 for r in opposition_results if is_genuine(r))}")
    print(f"High-quality debates (>={min_quality}): {len(debates)}")

    # Show debates
    if debates:
        print(f"\n{'='*60}")
        print("GENUINE DEBATES DISCOVERED")
        print('='*60)

        for i, d in enumerate(debates[:15], 1):
            # `or` rather than a .get default: the model may return an
            # explicit null for these fields.
            question = d.get('debate_question') or 'Untitled'
            side_a = d.get('side_a') or d['stance_a'][:50]
            side_b = d.get('side_b') or d['stance_b'][:50]

            print(f"\n{i}. {question} (Quality: {d.get('opposition_quality')}/10)")
            print(f"   Topic: {d['topic'][:60]}...")
            print(f"\n   SIDE A: {side_a}")
            print(f"   \"{d['insight_a'][:120]}...\"")
            print(f"\n   SIDE B: {side_b}")
            print(f"   \"{d['insight_b'][:120]}...\"")

            # Episode info
            ep_a = insights[d['idx_a']]['episode_title'][:35]
            ep_b = insights[d['idx_b']]['episode_title'][:35]
            print(f"\n   Episodes: '{ep_a}...' vs '{ep_b}...'")

    # Save results
    output = input.replace("modal_extraction_", "debates_")
    if output == input:
        # The marker was not in the path; writing to `output` as-is would
        # overwrite the input file, so derive a distinct name instead.
        root, ext = os.path.splitext(input)
        output = f"{root}_debates{ext or '.json'}"

    output_data = {
        'metadata': {
            'timestamp': datetime.now().isoformat(),
            'input_file': input,
            'topic_similarity_threshold': topic_similarity,
            'min_quality': min_quality,
            'total_insights': len(insights),
            'valid_extractions': len(valid_extracted),
            'topic_groups': len(topic_groups),
            'pairs_checked': len(opposition_results),
            'debates_found': len(debates),
        },
        'debates': debates,
        'topic_extractions': valid_extracted,
        'all_opposition_checks': opposition_results,
    }

    with open(output, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {output}")