Skip to content

Commit 807d641

Browse files
committed
feat(community/og): 抓取加固 + SiteAdapter + LLM 兜底 + 管理员重抓
背景
DC 分享 9 条全部入库但 6 条 og_title 为 NULL,feed 卡片基本空白。
逐条根因不同:mp.weixin.qq.com 反爬+体积超限;arxiv.org/pdf 是 PDF 不是 HTML;
scholar.google.com/scholar_url 是 click-tracking 返 204;其他静默失败。

Phase 1: OgFetchService 加固
- UA 改纯 Chrome 伪装(去 Bot 字样),绕开微信软反爬
- MAX_BODY_BYTES 2MB → 8MB,覆盖微信带 base64 inline 图的臃肿页面
- 流式扫到 </head> 立即停读,绝大多数站点只读几十 KB 就够(早停 + 限流双兜底)
- Content-Type 非 HTML(application/pdf 等)直接软失败 "non-html content-type:..."

Phase 2: SiteAdapter 抽象
- 接口 + UrlNormalizer 链式(最多 5 跳防环)
- ArxivPdfAdapter: arxiv.org/pdf/<id> → arxiv.org/abs/<id>(pdf 二进制无 OG,abs 才有)
- ScholarUrlAdapter: 从 scholar_url?url=<real> 提真实链接(Scholar 是 tracker)
- 链式串联:scholar_url(arxiv pdf) → arxiv abs,一跳通杀

Phase 3: LLM 兜底
- OgFallbackService:抓不到 title 时用 OpenAI/DeepSeek 根据 URL 猜 title/desc
- EnrichmentWorker 在 OG fetch 后插一步:title 为空才调,避免无效消耗 token
- 复用 OpenAiProperties,未配 apiKey 时直接降级返回空(不失败)

Phase 3.5: 管理员重抓 endpoint
- POST /api/admin/community/{id}/refetch-og:单条触发异步 enrichment
- POST /api/admin/community/refetch-og/bulk:body {"ids":[1,2,3]} 或 {"ids":["all"]}
  限流单次最多 100 条,"all" 自动找 og_title IS NULL 的批量重跑
- Repository 加 findIdsMissingOgTitle(limit) 配合上面 bulk all 模式

测试
- 12 个原 OgFetch test 全过(构造器加 UrlNormalizer 参数后修复)
- 16 个新 SiteAdapterTests:arxiv/scholar/链式 全覆盖
- 10 个原 EnrichmentWorker test 全过(@Mock 加 OgFallbackService)
- 全 suite 仅剩 2 个 pre-existing seed user 失败(PR #22 时已确认无关)

Spring 注入
- OgFetchService 主 ctor 加 @Autowired,避免 Spring 在 prod ctor + test ctor 之间困惑
1 parent aca38b9 commit 807d641

15 files changed

Lines changed: 723 additions & 26 deletions

src/main/java/com/involutionhell/backend/community/controller/SharedLinkAdminController.java

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.involutionhell.backend.community.model.SharedLink;
77
import com.involutionhell.backend.community.model.SharedLinkStatus;
88
import com.involutionhell.backend.community.repository.SharedLinkRepository;
9+
import com.involutionhell.backend.community.service.SharedLinkEnrichmentWorker;
910
import com.involutionhell.backend.community.service.SharedLinkService;
1011
import org.slf4j.Logger;
1112
import org.slf4j.LoggerFactory;
@@ -44,11 +45,14 @@ public class SharedLinkAdminController {
4445

4546
private final SharedLinkService service;
4647
private final SharedLinkRepository linkRepo;
48+
private final SharedLinkEnrichmentWorker enrichmentWorker;
4749

4850
/**
 * Creates the admin controller with its collaborators supplied through constructor injection.
 *
 * @param service          shared-link service used by the admin endpoints
 * @param linkRepo         repository used to look links up by id
 * @param enrichmentWorker worker that re-runs enrichment for a link id
 */
public SharedLinkAdminController(
        SharedLinkService service,
        SharedLinkRepository linkRepo,
        SharedLinkEnrichmentWorker enrichmentWorker) {
    this.enrichmentWorker = enrichmentWorker;
    this.linkRepo = linkRepo;
    this.service = service;
}
5357

5458
@GetMapping("/pending")
@@ -91,4 +95,58 @@ public ResponseEntity<ApiResponse<SharedLinkView>> reject(
9195
.orElseGet(() -> ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
9296
.body(new ApiResponse<>(false, "link disappeared after update", null)));
9397
}
98+
99+
/**
100+
* 重跑 enrichment(OG 抓取 + LLM 兜底 + 分类)。
101+
*
102+
* 用途:
103+
* - OG 抓取规则升级后回填历史链接
104+
* - 单条链接首次抓取卡在限流 / 反爬,等过段时间手动重试
105+
* - 测试新 SiteAdapter / OgFallback 效果
106+
*
107+
* 异步触发,不等结果——立即返回 202 Accepted,前端轮询 /pending 看新数据。
108+
* status 不变(不会把 APPROVED 推回 PENDING),enrich() 内部会原地覆盖 og_* 字段。
109+
*/
110+
@PostMapping("/{id}/refetch-og")
111+
public ResponseEntity<ApiResponse<Map<String, Object>>> refetchOg(@PathVariable Long id) {
112+
Optional<SharedLink> maybe = linkRepo.findById(id);
113+
if (maybe.isEmpty()) {
114+
return ResponseEntity.status(HttpStatus.NOT_FOUND)
115+
.body(new ApiResponse<>(false, "link not found", null));
116+
}
117+
log.info("admin refetch-og shared-link id={}", id);
118+
enrichmentWorker.enrich(id);
119+
return ResponseEntity.accepted()
120+
.body(ApiResponse.ok(Map.of("id", id, "queued", true)));
121+
}
122+
123+
/**
124+
* 批量重跑 enrichment。POST body: {"ids": [1, 2, 3]}。
125+
* 用 ids: ["all"] 表示对全表扫描所有 og_title IS NULL 的链接重跑(运维用)。
126+
*
127+
* 防误操作:单次最多 100 条;"all" 也走相同上限。
128+
*/
129+
@PostMapping("/refetch-og/bulk")
130+
public ResponseEntity<ApiResponse<Map<String, Object>>> bulkRefetchOg(
131+
@RequestBody Map<String, Object> body) {
132+
Object idsRaw = body.get("ids");
133+
List<Long> ids;
134+
if (idsRaw instanceof List<?> raw && raw.size() == 1 && "all".equals(raw.get(0))) {
135+
ids = service.findIdsMissingOg(100);
136+
} else if (idsRaw instanceof List<?> raw) {
137+
ids = raw.stream()
138+
.map(v -> v instanceof Number n ? n.longValue() : Long.parseLong(v.toString()))
139+
.limit(100)
140+
.toList();
141+
} else {
142+
return ResponseEntity.badRequest()
143+
.body(new ApiResponse<>(false, "missing or invalid 'ids' field", null));
144+
}
145+
log.info("admin bulk refetch-og count={}", ids.size());
146+
for (Long id : ids) {
147+
enrichmentWorker.enrich(id);
148+
}
149+
return ResponseEntity.accepted()
150+
.body(ApiResponse.ok(Map.of("queued", ids.size(), "ids", ids)));
151+
}
94152
}

src/main/java/com/involutionhell/backend/community/repository/JdbcSharedLinkRepository.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,17 @@ public void touchProbeLastAt(Long id) {
248248
jdbc.update("UPDATE shared_links SET probe_last_at = NOW() WHERE id = ?", id);
249249
}
250250

251+
@Override
252+
public List<Long> findIdsMissingOgTitle(int limit) {
253+
// 用 LIMIT 防止运维误调动了几万条全量重跑把 LLM 兜底配额炸了。
254+
// 排序按 id ASC,先补老的(更可能是历史抓取规则不完善)。
255+
return jdbc.queryForList(
256+
"SELECT id FROM shared_links WHERE og_title IS NULL ORDER BY id ASC LIMIT ?",
257+
Long.class,
258+
limit
259+
);
260+
}
261+
251262
private String serializeFlags(Map<String, Boolean> flags) {
252263
try {
253264
return objectMapper.writeValueAsString(flags == null ? new HashMap<>() : flags);

src/main/java/com/involutionhell/backend/community/repository/SharedLinkRepository.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,7 @@ void updateEnrichment(Long id,
7070

7171
/** 探活扫描需要的最小字段集合。 */
7272
record ProbeTarget(Long id, String url, int probeFailCount) {}
73+
74+
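/**
 * Finds the ids of links whose {@code og_title} is NULL; used by the admin bulk OG re-fetch.
 *
 * @param limit maximum number of ids to return
 * @return ids of links missing an OG title, at most {@code limit} of them
 */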
75+
List<Long> findIdsMissingOgTitle(int limit);
7376
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
package com.involutionhell.backend.community.service;
2+
3+
import com.involutionhell.backend.openai.config.OpenAiProperties;
4+
import org.slf4j.Logger;
5+
import org.slf4j.LoggerFactory;
6+
import org.springframework.stereotype.Service;
7+
import tools.jackson.databind.JsonNode;
8+
import tools.jackson.databind.ObjectMapper;
9+
10+
import java.net.URI;
11+
import java.net.http.HttpClient;
12+
import java.net.http.HttpRequest;
13+
import java.net.http.HttpResponse;
14+
import java.nio.charset.StandardCharsets;
15+
import java.time.Duration;
16+
import java.util.LinkedHashMap;
17+
import java.util.List;
18+
import java.util.Map;
19+
20+
/**
21+
* OG 抓取失败时的 LLM 兜底:用 LLM 根据 URL 本身(host + path)猜个 title/description。
22+
*
23+
* 何时触发:{@link OgFetchService} 返回的 errorMessage 非 null(PDF / 反爬 / 限流 等)。
24+
* 不能保证准确,但能避免 feed 卡片完全空白 ——
25+
* 用户看到 "&lt;arxiv 论文 id&gt; — 摘要:暂不可用" 也比一行 URL 强。
26+
*
27+
* 复用 {@link OpenAiProperties}(model/apiUrl/apiKey),跟 ClassificationService 同一套 LLM。
28+
* 失败降级:返回空结果(title/description 都 null),调用方自己决定显示策略。
29+
*/
30+
@Service
31+
public class OgFallbackService {
32+
33+
private static final Logger log = LoggerFactory.getLogger(OgFallbackService.class);
34+
private static final Duration TIMEOUT = Duration.ofSeconds(20);
35+
36+
/**
37+
* Prompt 设计原则:
38+
* - 强约束只返回 JSON,禁止 markdown / 解释,跟 ClassificationService 一致
39+
* - 明确"不能编造内容"——只根据 URL 本身可推断的信息给标题
40+
* - 失败标识:所有字段为空时返回 {"title":null,"description":null}
41+
*/
42+
private static final String SYSTEM_PROMPT = """
43+
你是一个网页元数据猜测助手。用户会给你一条无法直接抓取 OG meta 的 URL,
44+
请根据 URL 的 host、path、query 推断出最可能的标题和一句简短描述。
45+
46+
约束:
47+
- 只能基于 URL 本身可推断的事实,**不要编造文章内容**
48+
- 标题尽量贴近真实页面标题的风格(如 arxiv 论文用 "[Paper] <id>",
49+
微信公众号用 "<公众号> · <推断主题>",github repo 用 "<owner>/<repo>" 等)
50+
- 描述一句话说明这是什么类型的资源(论文、技术博客、新闻报道、代码仓库、视频、PDF 文档...)
51+
- 中文输出
52+
- 完全无法推断时返回 {"title":null,"description":null}
53+
54+
严格只返回 JSON,不要任何解释、代码块标记(不要 ```json)或其他文字:
55+
{"title":"<推断标题>", "description":"<一句话描述>"}
56+
""";
57+
58+
private final HttpClient httpClient;
59+
private final ObjectMapper objectMapper;
60+
private final OpenAiProperties properties;
61+
62+
public OgFallbackService(HttpClient httpClient,
63+
ObjectMapper objectMapper,
64+
OpenAiProperties properties) {
65+
this.httpClient = httpClient;
66+
this.objectMapper = objectMapper;
67+
this.properties = properties;
68+
}
69+
70+
/**
71+
* 给 URL 猜个 OG。失败返回 (null, null),让调用方决定是否回填。
72+
*/
73+
public Guess guess(String url, String host) {
74+
if (url == null || url.isBlank()) {
75+
return Guess.empty();
76+
}
77+
// 没配 apiKey 时直接降级,避免发空请求被 LLM 服务拒绝
78+
if (properties.apiKey() == null || properties.apiKey().isBlank()) {
79+
log.debug("og-fallback 跳过:未配置 OPENAI_API_KEY");
80+
return Guess.empty();
81+
}
82+
try {
83+
String userContent = "URL: " + url + "\nHost: " + (host == null ? "(unknown)" : host);
84+
String requestBody = buildRequestBody(SYSTEM_PROMPT, userContent);
85+
HttpRequest request = HttpRequest.newBuilder(URI.create(properties.apiUrl() + "/chat/completions"))
86+
.header("Content-Type", "application/json")
87+
.header("Authorization", "Bearer " + properties.apiKey())
88+
.timeout(TIMEOUT)
89+
.POST(HttpRequest.BodyPublishers.ofString(requestBody, StandardCharsets.UTF_8))
90+
.build();
91+
92+
HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
93+
if (response.statusCode() < 200 || response.statusCode() >= 300) {
94+
log.warn("og-fallback LLM 失败(HTTP 非 2xx): host={} status={}", host, response.statusCode());
95+
return Guess.empty();
96+
}
97+
return parseLlmResponse(response.body(), host);
98+
} catch (InterruptedException e) {
99+
Thread.currentThread().interrupt();
100+
return Guess.empty();
101+
} catch (Exception e) {
102+
log.warn("og-fallback 异常: host={} error={}", host, e.getMessage());
103+
return Guess.empty();
104+
}
105+
}
106+
107+
private String buildRequestBody(String systemContent, String userContent) {
108+
try {
109+
Map<String, Object> body = new LinkedHashMap<>();
110+
body.put("model", properties.model());
111+
body.put("temperature", 0);
112+
body.put("messages", List.of(
113+
Map.of("role", "system", "content", systemContent),
114+
Map.of("role", "user", "content", userContent)
115+
));
116+
return objectMapper.writeValueAsString(body);
117+
} catch (Exception e) {
118+
// ObjectMapper 写 LinkedHashMap 不会抛 checked exception,但兜底以防万一
119+
throw new IllegalStateException("og-fallback 构造请求体失败", e);
120+
}
121+
}
122+
123+
private Guess parseLlmResponse(String body, String host) {
124+
try {
125+
JsonNode root = objectMapper.readTree(body);
126+
String content = root.path("choices").path(0).path("message").path("content").asString(null);
127+
if (content == null || content.isBlank()) return Guess.empty();
128+
// LLM 偶尔会带 ```json ... ``` 包装,剥一下
129+
String stripped = content.trim();
130+
if (stripped.startsWith("```")) {
131+
int firstNewline = stripped.indexOf('\n');
132+
if (firstNewline > 0) stripped = stripped.substring(firstNewline + 1);
133+
if (stripped.endsWith("```")) stripped = stripped.substring(0, stripped.length() - 3);
134+
stripped = stripped.trim();
135+
}
136+
JsonNode payload = objectMapper.readTree(stripped);
137+
String title = nullableText(payload.path("title"));
138+
String description = nullableText(payload.path("description"));
139+
if (title == null && description == null) {
140+
return Guess.empty();
141+
}
142+
return new Guess(title, description);
143+
} catch (Exception e) {
144+
log.warn("og-fallback LLM 响应解析失败: host={} error={}", host, e.getMessage());
145+
return Guess.empty();
146+
}
147+
}
148+
149+
private static String nullableText(JsonNode node) {
150+
if (node == null || node.isMissingNode() || node.isNull()) return null;
151+
String s = node.asString(null);
152+
if (s == null) return null;
153+
s = s.trim();
154+
return s.isEmpty() ? null : s;
155+
}
156+
157+
/**
158+
* 兜底猜测结果。两个字段都为 null 表示无可用猜测。
159+
*/
160+
public record Guess(String title, String description) {
161+
public static Guess empty() { return new Guess(null, null); }
162+
public boolean isEmpty() { return title == null && description == null; }
163+
}
164+
}

0 commit comments

Comments
 (0)