diff --git a/clis/jianyu/detail.ts b/clis/jianyu/detail.ts new file mode 100644 index 000000000..c0d9a93ac --- /dev/null +++ b/clis/jianyu/detail.ts @@ -0,0 +1,21 @@ +import { cli, Strategy } from '@jackwener/opencli/registry'; +import { runProcurementDetail } from './shared/procurement-detail.js'; + +cli({ + site: 'jianyu', + name: 'detail', + description: '读取剑鱼标讯详情页并抽取证据字段', + domain: 'www.jianyu360.cn', + strategy: Strategy.COOKIE, + browser: true, + args: [ + { name: 'url', required: true, positional: true, help: 'Detail page URL from jianyu/search' }, + { name: 'query', help: 'Optional query for evidence ranking' }, + ], + columns: ['title', 'publish_time', 'content_type', 'project_code', 'budget_or_limit', 'deadline_or_open_time', 'url'], + func: async (page, kwargs) => runProcurementDetail(page, { + url: kwargs.url, + query: kwargs.query, + site: 'jianyu', + }), +}); diff --git a/clis/jianyu/search.test.ts b/clis/jianyu/search.test.ts index 5b024cc74..1581ae852 100644 --- a/clis/jianyu/search.test.ts +++ b/clis/jianyu/search.test.ts @@ -2,6 +2,14 @@ import { describe, expect, it } from 'vitest'; import { __test__ } from './search.js'; describe('jianyu search helpers', () => { + it('builds candidate URLs with supsearch as preferred entry', () => { + const candidates = __test__.buildSearchCandidates('procurement'); + expect(candidates[0]).toContain('/jylab/supsearch/index.html'); + expect(candidates[0]).toContain('keywords=procurement'); + expect(candidates[0]).toContain('selectType=title'); + expect(candidates[0]).toContain('searchGroup=1'); + }); + it('builds supsearch URL with required query params', () => { const url = __test__.buildSearchUrl('procurement'); expect(url).toContain('keywords=procurement'); @@ -23,4 +31,59 @@ describe('jianyu search helpers', () => { ]); expect(deduped).toHaveLength(2); }); + + it('filters obvious navigation rows before quality gate', () => { + const filtered = __test__.filterNavigationRows('电梯', [ + { title: '招标公告', url: 'https://www.jianyu360.cn/list/stype/ZBGG.html', date: '' }, + { title: '帮助中心', url: 'https://www.jianyu360.cn/helpCenter/index', date: '' }, + { title: '某项目电梯采购公告', url: 'https://www.jianyu360.cn/notice/detail/123', date: '2026-04-07' }, + ]); + expect(filtered).toHaveLength(1); + expect(filtered[0].title).toContain('电梯采购公告'); + }); + + it('rejects procurement rows that do not contain query evidence', () => { + const filtered = __test__.filterNavigationRows('电梯', [ + { + title: '某项目采购公告', + url: 'https://www.jianyu360.cn/notice/detail/123', + date: '2026-04-07', + contextText: '招标公告 项目编号:ABC-123', + }, + ]); + expect(filtered).toHaveLength(0); + }); + + it('parses search-index markdown headings', () => { + const rows = __test__.parseSearchIndexMarkdown(` +## [标题一](http://duckduckgo.com/l/?uddg=https%3A%2F%2Fbeijing.jianyu360.cn%2Fjybx%2F20260401_26033143187897.html) +## [标题二](https://www.jianyu360.cn/nologin/content/ABC.html) +`); + expect(rows).toHaveLength(2); + expect(rows[0].title).toBe('标题一'); + expect(rows[1].url).toContain('jianyu360.cn/nologin/content'); + }); + + it('unwraps duckduckgo redirect links', () => { + const direct = __test__.unwrapDuckDuckGoUrl('https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.jianyu360.cn%2Fnologin%2Fcontent%2FXYZ.html'); + expect(direct).toBe('https://www.jianyu360.cn/nologin/content/XYZ.html'); + }); + + it('extracts publish date from jianyu jybx urls', () => { + const date = __test__.extractDateFromJianyuUrl('https://shandong.jianyu360.cn/jybx/20260310_26030938267551.html'); + expect(date).toBe('2026-03-10'); + }); + + it('normalizes api payload rows with fallback url/title fields', () => { + const normalized = __test__.normalizeApiRow({ + noticeTitle: '某项目电梯采购公告', + detailUrl: '/jybx/20260310_26030938267551.html', + publishTime: '2026-03-10 09:00:00', + buyer: '测试单位', + }); + expect(normalized).toBeTruthy(); + expect(normalized?.title).toContain('电梯采购公告'); + expect(normalized?.url).toContain('/jybx/20260310_26030938267551.html'); + expect(normalized?.date).toBe('2026-03-10'); + }); }); diff --git a/clis/jianyu/search.ts b/clis/jianyu/search.ts index 8bc5b09d6..359e8b113 100644 --- a/clis/jianyu/search.ts +++ b/clis/jianyu/search.ts @@ -3,17 +3,61 @@ */ import { cli, Strategy } from '@jackwener/opencli/registry'; import { AuthRequiredError } from '@jackwener/opencli/errors'; +import { + buildSearchCandidates, + cleanText, + dedupeCandidates, + detectAuthPrompt, + normalizeDate, + searchRowsFromEntries, +} from './shared/china-bid-search.js'; +import { toProcurementSearchRecords } from './shared/procurement-contract.js'; -interface JianyuCandidate { - title: string; - url: string; - date: string; -} - +const SITE = 'jianyu'; +const DOMAIN = 'www.jianyu360.cn'; const SEARCH_ENTRY = 'https://www.jianyu360.cn/jylab/supsearch/index.html'; +const SEARCH_ENTRIES = [ + SEARCH_ENTRY, + 'https://www.jianyu360.cn/list/stype/ZBGG.html', + 'https://www.jianyu360.cn/', +]; +const SEARCH_INDEX_PROXY = 'https://r.jina.ai/http://duckduckgo.com/html/?q='; +const PROCUREMENT_TITLE_HINT = /(公告|招标|采购|中标|成交|项目|投标|结果|notice|tender|procurement|bidding)/i; +const AUTH_REQUIRED_HINT = /(请在下图依次点击|登录即可获得更多浏览权限|验证登录|请完成验证|图形验证码)/; +const NAVIGATION_PATH_PREFIXES = [ + '/product/', + '/front/', + '/helpcenter/', + '/brand/', + '/page_workdesktop/', + '/list/', + '/list/stype/', + '/list/rmxm', + '/big/page/', + '/jylab/', + '/tags/', + '/sitemap', + '/datasmt/', + '/bank/', + '/hj/', + '/exhibition/', + '/swordfish/page_big_pc/search/', +]; +const JIANYU_API_TYPES = ['fType', 'eType', 'vType', 'mType'] as const; -function cleanText(value: unknown): string { - return typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : ''; +interface JianyuApiPayload { + antiVerify?: number; + error_code?: number; + hasLogin?: boolean; + textVerify?: string; + list?: unknown[]; +} + +interface JianyuApiResponse { + type: string; + ok: boolean; + status: number; + payload?: JianyuApiPayload; } export function buildSearchUrl(query: string): string { @@ -24,133 +68,469 @@ export function buildSearchUrl(query: string): string { return url.toString(); } -export function normalizeDate(raw: string): string { - const normalized = cleanText(raw); - const match = normalized.match(/(20\d{2})[.\-/年](\d{1,2})[.\-/月](\d{1,2})/); - if (!match) return ''; - const year = match[1]; - const month = match[2].padStart(2, '0'); - const day = match[3].padStart(2, '0'); - return `${year}-${month}-${day}`; +function siteSearchCandidates(query: string): string[] { + const preferred = buildSearchUrl(query); + const fallbacks = buildSearchCandidates(query, SEARCH_ENTRIES, ['keywords', 'keyword', 'q', 'search', 'title']); + const ordered: string[] = []; + const seen = new Set(); + for (const candidate of [preferred, ...fallbacks]) { + const value = cleanText(candidate); + if (!value || seen.has(value)) continue; + seen.add(value); + ordered.push(value); + } + return ordered; +} + +function isLikelyNavigationUrl(rawUrl: string): boolean { + const urlText = cleanText(rawUrl); + if (!urlText) return true; + try { + const parsed = new URL(urlText); + const path = cleanText(parsed.pathname).toLowerCase().replace(/\/+$/, '/') || '/'; + if (path === '/') return true; + if (NAVIGATION_PATH_PREFIXES.some((prefix) => path.startsWith(prefix))) return true; + return false; + } catch { + return true; + } +} + +function filterNavigationRows(query: string, items: Array<{ + title?: string; + url?: string; + date?: string; + contextText?: string; +}>): Array<{ + title: string; + url: string; + date?: string; + contextText?: string; +}> { + const queryTokens = cleanText(query).split(/\s+/).filter(Boolean).map((token) => token.toLowerCase()); + return items + .map((item) => ({ + title: cleanText(item.title), + url: cleanText(item.url), + date: normalizeDate(cleanText(item.date)), + contextText: cleanText(item.contextText), + })) + .filter((item) => { + if (!item.title || !item.url) return false; + const haystack = `${item.title} ${item.contextText}`.toLowerCase(); + const hasQuery = queryTokens.length === 0 || queryTokens.some((token) => haystack.includes(token)); + const hasProcurementHint = PROCUREMENT_TITLE_HINT.test(`${item.title} ${item.contextText}`); + const hasDate = !!item.date; + if (!hasQuery) return false; + if (!isLikelyNavigationUrl(item.url)) return true; + return hasDate && hasProcurementHint; + }); +} + +async function isAuthRequired(page: any): Promise { + const pageText = cleanText(await page.evaluate('document.body ? document.body.innerText : ""')); + if (AUTH_REQUIRED_HINT.test(pageText)) return true; + return detectAuthPrompt(page); +} + +function toAbsoluteJianyuUrl(rawUrl: string): string { + const value = cleanText(rawUrl); + if (!value) return ''; + if (value.startsWith('http://') || value.startsWith('https://')) return value; + if (value.startsWith('//')) return `https:${value}`; + if (value.startsWith('/')) { + try { + return new URL(value, SEARCH_ENTRY).toString(); + } catch { + return ''; + } + } + return ''; +} + +function extractDateFromJianyuUrl(rawUrl: string): string { + const value = cleanText(rawUrl); + if (!value) return ''; + const matched = value.match(/\/(20\d{2})(\d{2})(\d{2})(?:[_/]|$)/); + if (!matched) return ''; + return `${matched[1]}-${matched[2]}-${matched[3]}`; +} + +function flattenStrings(input: unknown, depth = 0): string[] { + if (depth > 2 || input == null) return []; + if (typeof input === 'string' || typeof input === 'number') { + const text = cleanText(String(input)); + return text ? [text] : []; + } + if (Array.isArray(input)) { + return input.flatMap((item) => flattenStrings(item, depth + 1)); + } + if (typeof input === 'object') { + return Object.values(input as Record).flatMap((item) => flattenStrings(item, depth + 1)); + } + return []; +} + +function pickString(record: Record, keys: string[]): string { + for (const key of keys) { + const value = record[key]; + if (typeof value === 'string' || typeof value === 'number') { + const text = cleanText(String(value)); + if (text) return text; + } + } + return ''; } -function dedupeCandidates(items: JianyuCandidate[]): JianyuCandidate[] { - const deduped: JianyuCandidate[] = []; +function normalizeApiRow(item: unknown): { + title: string; + url: string; + date?: string; + contextText?: string; +} | null { + if (!item || typeof item !== 'object') return null; + const record = item as Record; + const allStrings = flattenStrings(record); + + let url = toAbsoluteJianyuUrl(pickString(record, [ + 'url', + 'detailUrl', + 'detailURL', + 'link', + 'href', + 'articleUrl', + 'newsUrl', + 'contentUrl', + 'jumpUrl', + 'sourceUrl', + ])); + if (!url) { + const maybeUrl = allStrings.find((value) => /jianyu360\.cn|\/jybx\/|\/nologin\/content\//i.test(value)) || ''; + url = toAbsoluteJianyuUrl(maybeUrl); + } + + let title = cleanText(pickString(record, [ + 'title', + 'noticeTitle', + 'bidTitle', + 'projectName', + 'name', + 'articleTitle', + 'newsTitle', + 'tenderTitle', + 'contentTitle', + ])); + if (!title) { + title = allStrings.find((value) => value.length >= 8 && PROCUREMENT_TITLE_HINT.test(value)) || ''; + } + + const date = normalizeDate(pickString(record, [ + 'publishTime', + 'publishDate', + 'pubDate', + 'createTime', + 'time', + 'releaseTime', + 'date', + ])) || extractDateFromJianyuUrl(url); + + const contextText = cleanText([ + pickString(record, ['content', 'summary', 'desc', 'description', 'buyer', 'winner', 'agency', 'industry']), + ...allStrings.slice(0, 6), + ].filter(Boolean).join(' ')); + + if (!title || !url) return null; + return { + title, + url, + date, + contextText, + }; +} + +function parseSearchIndexMarkdown(markdown: string): Array<{ title: string; url: string }> { + const rows: Array<{ title: string; url: string }> = []; + for (const line of markdown.split('\n')) { + const text = line.trim(); + if (!text.startsWith('## [')) continue; + const right = text.slice(3); + const sep = right.lastIndexOf(']('); + if (sep <= 0 || !right.endsWith(')')) continue; + const title = cleanText(right.slice(1, sep)); + const url = cleanText(right.slice(sep + 2, -1)); + if (!title || !url) continue; + rows.push({ title, url }); + } + return rows; +} + +function unwrapDuckDuckGoUrl(rawUrl: string): string { + const candidate = cleanText(rawUrl); + if (!candidate) return ''; + const normalized = candidate.startsWith('//') ? `https:${candidate}` : candidate; + try { + const parsed = new URL(normalized); + const host = parsed.hostname.toLowerCase(); + if (!host.endsWith('duckduckgo.com')) return normalized; + const uddg = parsed.searchParams.get('uddg'); + if (!uddg) return normalized; + try { + return decodeURIComponent(uddg); + } catch { + return uddg; + } + } catch { + return ''; + } +} + +function isJianyuHost(rawUrl: string): boolean { + const value = cleanText(rawUrl); + if (!value) return false; + try { + return new URL(value).hostname.toLowerCase().endsWith('jianyu360.cn'); + } catch { + return false; + } +} + +function buildIndexQueryVariants(query: string): string[] { + const tokens = cleanText(query).split(/\s+/).filter(Boolean); + const values = [cleanText(query), ...tokens]; + const ordered: string[] = []; const seen = new Set(); - for (const item of items) { - const key = `${item.title}\t${item.url}`; - if (seen.has(key)) continue; - seen.add(key); - deduped.push(item); + for (const value of values) { + const text = cleanText(value); + if (!text || seen.has(text)) continue; + seen.add(text); + ordered.push(text); + } + return ordered; +} + +async function fetchDuckDuckGoIndexRows(query: string, limit: number): Promise> { + const results: Array<{ title: string; url: string; date?: string; contextText?: string }> = []; + const seen = new Set(); + + for (const variant of buildIndexQueryVariants(query)) { + if (results.length >= limit) break; + const fullQuery = `site:jianyu360.cn ${variant}`; + const url = `${SEARCH_INDEX_PROXY}${encodeURIComponent(fullQuery)}`; + let responseText = ''; + try { + const response = await fetch(url, { + headers: { + Accept: 'text/plain, text/markdown, */*', + 'User-Agent': 'opencli-jianyu-search/1.0', + }, + }); + if (!response.ok) continue; + responseText = await response.text(); + } catch { + continue; + } + + const indexedRows = parseSearchIndexMarkdown(responseText); + for (const row of indexedRows) { + const unwrapped = unwrapDuckDuckGoUrl(row.url); + const absoluteUrl = toAbsoluteJianyuUrl(unwrapped) || cleanText(unwrapped); + if (!isJianyuHost(absoluteUrl)) continue; + const key = `${row.title}\t${absoluteUrl}`; + if (seen.has(key)) continue; + seen.add(key); + results.push({ + title: cleanText(row.title), + url: absoluteUrl, + date: extractDateFromJianyuUrl(absoluteUrl), + contextText: cleanText(`${row.title} ${variant}`), + }); + if (results.length >= limit) break; + } + } + + return results; +} + +async function fetchJianyuApiRows(page: any, query: string, limit: number): Promise<{ + rows: Array<{ title: string; url: string; date?: string; contextText?: string }>; + challenge: boolean; +}> { + try { + await page.goto(buildSearchUrl(query)); + await page.wait(2); + + const payload = await page.evaluate(` + (async () => { + const now = Math.floor(Date.now() / 1000); + const body = { + searchGroup: 1, + reqType: 'lastNews', + pageNum: 1, + pageSize: Math.max(20, Math.min(${Math.max(20, limit)}, 50)), + keyWords: ${JSON.stringify(query)}, + searchMode: 0, + bidField: '', + publishTime: \`\${now - 3600 * 24 * 365 * 3}-\${now}\`, + selectType: 'title,content', + subtype: '', + exclusionWords: '', + buyer: '', + winner: '', + agency: '', + industry: '', + province: '', + city: '', + district: '', + buyerClass: '', + fileExists: '', + price: '', + buyerTel: '', + winnerTel: '', + }; + const responses = []; + const types = ${JSON.stringify([...JIANYU_API_TYPES])}; + for (const type of types) { + try { + const response = await fetch('/jyapi/jybx/core/' + type + '/searchList', { + method: 'POST', + headers: { + Accept: 'application/json, text/plain, */*', + 'Content-Type': 'application/json', + }, + credentials: 'include', + body: JSON.stringify(body), + }); + let raw = null; + try { + raw = await response.json(); + } catch { + raw = null; + } + const dataList = raw && raw.data && Array.isArray(raw.data.list) ? raw.data.list : []; + responses.push({ + type, + ok: response.ok, + status: response.status, + payload: { + antiVerify: raw && typeof raw.antiVerify === 'number' ? raw.antiVerify : undefined, + error_code: raw && typeof raw.error_code === 'number' ? raw.error_code : undefined, + hasLogin: raw && typeof raw.hasLogin === 'boolean' ? raw.hasLogin : undefined, + textVerify: raw && typeof raw.textVerify === 'string' ? raw.textVerify.slice(0, 16) : undefined, + list: dataList, + }, + }); + } catch { + responses.push({ + type, + ok: false, + status: 0, + }); + } + } + const challenge = responses.some((item) => item && item.payload && item.payload.antiVerify === -1); + return { challenge, responses }; + })() + `) as { + challenge?: unknown; + responses?: unknown[]; + }; + + const rows: Array<{ title: string; url: string; date?: string; contextText?: string }> = []; + const seen = new Set(); + const responses = Array.isArray(payload?.responses) ? payload.responses : []; + for (const response of responses) { + if (!response || typeof response !== 'object') continue; + const meta = response as { payload?: unknown }; + const body = meta.payload; + if (!body || typeof body !== 'object') continue; + const list = (body as JianyuApiPayload).list; + if (!Array.isArray(list)) continue; + for (const item of list) { + const row = normalizeApiRow(item); + if (!row) continue; + const key = `${row.title}\t${row.url}`; + if (seen.has(key)) continue; + seen.add(key); + rows.push(row); + if (rows.length >= limit) break; + } + if (rows.length >= limit) break; + } + + const challenge = Boolean(payload?.challenge); + return { rows, challenge }; + } catch { + return { rows: [], challenge: false }; } - return deduped; } cli({ - site: 'jianyu', + site: SITE, name: 'search', description: '搜索剑鱼标讯公告', - domain: 'www.jianyu360.cn', + domain: DOMAIN, strategy: Strategy.COOKIE, browser: true, args: [ { name: 'query', required: true, positional: true, help: 'Search keyword, e.g. "procurement"' }, { name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' }, ], - columns: ['rank', 'title', 'date', 'url'], + columns: ['rank', 'content_type', 'title', 'publish_time', 'project_code', 'budget_or_limit', 'url'], func: async (page, kwargs) => { const query = cleanText(kwargs.query); const limit = Math.max(1, Math.min(Number(kwargs.limit) || 20, 50)); - const searchUrl = buildSearchUrl(query); + const apiResult = await fetchJianyuApiRows(page, query, limit); + const mergedRows = dedupeCandidates(filterNavigationRows(query, apiResult.rows)); - await page.goto(searchUrl); - await page.wait(2); + const extractedRows = await searchRowsFromEntries(page, { + query, + candidateUrls: siteSearchCandidates(query), + allowedHostFragments: ['jianyu360.cn'], + limit, + }); + const domRows = dedupeCandidates(filterNavigationRows(query, extractedRows)); + const rows = dedupeCandidates([...mergedRows, ...domRows]); - const payload = await page.evaluate(` - (() => { - const clean = (value) => (value || '').replace(/\\s+/g, ' ').trim(); - const toAbsolute = (href) => { - if (!href) return ''; - if (href.startsWith('http://') || href.startsWith('https://')) return href; - if (href.startsWith('/')) return new URL(href, window.location.origin).toString(); - return ''; - }; - const parseDate = (text) => { - const normalized = clean(text); - const match = normalized.match(/(20\\d{2})[.\\-/年](\\d{1,2})[.\\-/月](\\d{1,2})/); - if (!match) return ''; - const month = String(match[2]).padStart(2, '0'); - const day = String(match[3]).padStart(2, '0'); - return match[1] + '-' + month + '-' + day; - }; - const pickDateText = (node) => { - let cursor = node; - for (let i = 0; i < 4 && cursor; i++) { - const text = clean(cursor.innerText || cursor.textContent || ''); - const date = parseDate(text); - if (date) return date; - cursor = cursor.parentElement; - } - return ''; - }; + if (rows.length === 0) { + const indexedRows = await fetchDuckDuckGoIndexRows(query, limit); + const filteredIndexedRows = dedupeCandidates(filterNavigationRows(query, indexedRows)); + if (filteredIndexedRows.length > 0) { + return toProcurementSearchRecords(filteredIndexedRows, { + site: SITE, + query, + limit, + }); + } - const anchors = Array.from( - document.querySelectorAll('a[href*="/nologin/content/"], a[href*="/content/"]'), + if (apiResult.challenge || await isAuthRequired(page)) { + throw new AuthRequiredError( + DOMAIN, + '[taxonomy=selector_drift] site=jianyu command=search blocked by human verification / access challenge', ); - const rows = []; - const seen = new Set(); - for (const anchor of anchors) { - const url = toAbsolute(anchor.getAttribute('href') || anchor.href || ''); - const title = clean(anchor.textContent || ''); - if (!url || !title || title.length < 4) continue; - const key = title + '\\t' + url; - if (seen.has(key)) continue; - seen.add(key); - rows.push({ - title, - url, - date: pickDateText(anchor), - }); - } - return rows; - })() - `); - - const pageText = cleanText(await page.evaluate('document.body ? document.body.innerText : ""')); - if ( - !Array.isArray(payload) - && /(请先登录|登录后|未登录|验证码)/.test(pageText) - ) { - throw new AuthRequiredError( - 'www.jianyu360.cn', - 'Jianyu search results require login or human verification', - ); + } } - const rows = Array.isArray(payload) - ? payload - .filter((item): item is JianyuCandidate => !!item && typeof item === 'object') - .map((item) => ({ - title: cleanText(item.title), - url: cleanText(item.url), - date: normalizeDate(cleanText(item.date)), - })) - .filter((item) => item.title && item.url) - : []; - - return dedupeCandidates(rows) - .slice(0, limit) - .map((item, index) => ({ - rank: index + 1, - title: item.title, - date: item.date, - url: item.url, - })); + return toProcurementSearchRecords(rows, { + site: SITE, + query, + limit, + }); }, }); export const __test__ = { + buildSearchCandidates: siteSearchCandidates, buildSearchUrl, normalizeDate, dedupeCandidates, + filterNavigationRows, + parseSearchIndexMarkdown, + unwrapDuckDuckGoUrl, + extractDateFromJianyuUrl, + normalizeApiRow, }; diff --git a/clis/jianyu/shared/china-bid-search.ts b/clis/jianyu/shared/china-bid-search.ts new file mode 100644 index 000000000..d661a4e73 --- /dev/null +++ b/clis/jianyu/shared/china-bid-search.ts @@ -0,0 +1,192 @@ +import { + type ProcurementSearchCandidateRaw, + cleanText, + normalizeDate, +} from './procurement-contract.js'; + +export type BidSearchCandidate = ProcurementSearchCandidateRaw; + +export { cleanText, normalizeDate }; + +export function dedupeCandidates(items: BidSearchCandidate[]): BidSearchCandidate[] { + const deduped: BidSearchCandidate[] = []; + const seen = new Set(); + for (const item of items) { + const key = `${cleanText(item.title)}\t${cleanText(item.url)}`; + if (seen.has(key)) continue; + seen.add(key); + deduped.push(item); + } + return deduped; +} + +function withQuery(baseUrl: string, key: string, query: string): string | null { + try { + const url = new URL(baseUrl); + url.searchParams.set(key, query); + return url.toString(); + } catch { + return null; + } +} + +export function buildSearchCandidates( + query: string, + baseEntries: string[], + queryKeys: string[] = ['keyword', 'keywords', 'q', 'search', 'title'], +): string[] { + const keyword = cleanText(query); + const candidates: string[] = []; + if (keyword) { + for (const entry of baseEntries) { + for (const key of queryKeys) { + const withKeyword = withQuery(entry, key, keyword); + if (withKeyword) candidates.push(withKeyword); + } + } + } + candidates.push(...baseEntries); + const ordered: string[] = []; + const seen = new Set(); + for (const item of candidates) { + const value = cleanText(item); + if (!value || seen.has(value)) continue; + seen.add(value); + ordered.push(value); + } + return ordered; +} + +export async function detectAuthPrompt(page: any): Promise { + const pageText = cleanText(await page.evaluate('document.body ? document.body.innerText : ""')); + return /(请先登录|未登录|登录后|验证码|人机验证|权限不足|无权限|请完善信息后访问)/.test(pageText); +} + +export async function searchRowsFromEntries( + page: any, + { + query, + candidateUrls, + allowedHostFragments, + limit, + }: { + query: string; + candidateUrls: string[]; + allowedHostFragments: string[]; + limit: number; + }, +): Promise { + const queryText = cleanText(query); + const rows: BidSearchCandidate[] = []; + + for (const targetUrl of candidateUrls) { + await page.goto(targetUrl); + await page.wait(2); + + const payload = await page.evaluate(` + (() => { + const clean = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const parseDate = (text) => { + const normalized = clean(text); + const match = normalized.match(/(20\\d{2})[.\\-/年](\\d{1,2})[.\\-/月](\\d{1,2})/); + if (!match) return ''; + return match[1] + '-' + String(match[2]).padStart(2, '0') + '-' + String(match[3]).padStart(2, '0'); + }; + const toAbsolute = (href) => { + if (!href) return ''; + if (href.startsWith('http://') || href.startsWith('https://')) return href; + if (href.startsWith('/')) return new URL(href, window.location.origin).toString(); + return ''; + }; + + const token = ${JSON.stringify(queryText)}; + const tokenParts = token.split(/\\s+/).filter(Boolean).map((part) => part.toLowerCase()); + const allowedHosts = ${JSON.stringify(allowedHostFragments.map((item) => item.toLowerCase()))}; + const procurementHints = ['招标', '采购', '公告', '项目', '中标', '成交', '询价', '竞价', '比选', '投标', 'notice', 'tender', 'procurement', 'bidding']; + const rowSelectors = [ + 'table tbody tr', + 'table tr', + 'ul li', + 'ol li', + 'article', + 'section', + '.list li', + '.notice li', + '[class*="list"] li', + '[class*="notice"] li', + '[class*="item"]', + '[class*="row"]', + ]; + + const rowNodes = []; + const rowSeen = new Set(); + for (const selector of rowSelectors) { + const nodes = Array.from(document.querySelectorAll(selector)); + for (const node of nodes) { + const text = clean(node.innerText || node.textContent || ''); + if (!text || text.length < 8) continue; + const lowerText = text.toLowerCase(); + const hasDate = /(20\\d{2})[.\\-/年](\\d{1,2})[.\\-/月](\\d{1,2})/.test(text); + const hasHint = procurementHints.some((hint) => lowerText.includes(hint)); + const hasQuery = tokenParts.length === 0 || tokenParts.some((part) => lowerText.includes(part)); + if (!hasDate && !hasHint && !hasQuery) continue; + if (rowSeen.has(node)) continue; + rowSeen.add(node); + rowNodes.push(node); + } + } + + const rows = []; + const seen = new Set(); + for (const node of rowNodes) { + const contextText = clean(node.innerText || node.textContent || ''); + const contextLower = contextText.toLowerCase(); + const hasHint = procurementHints.some((hint) => contextLower.includes(hint)); + const hasQuery = tokenParts.length === 0 || tokenParts.some((part) => contextLower.includes(part)); + if (!hasHint && !hasQuery) continue; + + const anchors = Array.from(node.querySelectorAll('a[href]')); + for (const anchor of anchors) { + const title = clean(anchor.textContent || ''); + if (!title || title.length < 4) continue; + const url = toAbsolute(anchor.getAttribute('href') || anchor.href || ''); + if (!url) continue; + const lowerUrl = url.toLowerCase(); + const hostMatched = allowedHosts.length === 0 || allowedHosts.some((item) => lowerUrl.includes(item)); + if (!hostMatched) continue; + + const key = title + '\\t' + url; + if (seen.has(key)) continue; + seen.add(key); + rows.push({ + title, + url, + date: parseDate(contextText), + contextText, + }); + } + } + return rows; + })() + `); + + if (Array.isArray(payload)) { + for (const item of payload) { + if (!item || typeof item !== 'object') continue; + const row = item as Record; + const candidate: BidSearchCandidate = { + title: cleanText(row.title), + url: cleanText(row.url), + date: normalizeDate(cleanText(row.date)), + contextText: cleanText(row.contextText), + }; + if (!candidate.title || !candidate.url) continue; + rows.push(candidate); + } + } + + if (rows.length >= limit) break; + } + + return dedupeCandidates(rows).slice(0, limit); +} diff --git a/clis/jianyu/shared/procurement-contract.test.ts b/clis/jianyu/shared/procurement-contract.test.ts new file mode 100644 index 000000000..116a9b406 --- /dev/null +++ b/clis/jianyu/shared/procurement-contract.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest'; +import { + __test__, + toProcurementDetailRecord, + toProcurementSearchRecords, +} from './procurement-contract.js'; + +describe('procurement contract helpers', () => { + it('builds v2 search records with compatibility fields', () => { + const rows = toProcurementSearchRecords( + [ + { + title: '某项目电梯采购公告', + url: 'https://example.com/notice/detail?id=1', + date: '2026-04-09', + contextText: '招标公告 项目编号:ABC-123 预算金额:100万元 投标截止时间:2026-04-30', + }, + ], + { + site: 'jianyu', + query: '电梯', + limit: 10, + }, + ); + expect(rows).toHaveLength(1); + expect(rows[0].rank).toBe(1); + expect(rows[0].publish_time).toBe('2026-04-09'); + expect(rows[0].date).toBe('2026-04-09'); + expect(rows[0].summary).toBe(rows[0].snippet); + expect(rows[0].content_type).toBe('notice'); + expect(rows[0].source_site).toBe('jianyu'); + expect(rows[0].project_code).toContain('ABC-123'); + }); + + it('throws extraction_drift when all rows are navigation noise', () => { + expect(() => toProcurementSearchRecords( + [ + { + title: '官网首页', + url: 'https://example.com/index', + contextText: '官网首页 联系我们', + }, + ], + { + site: 'ggzy', + query: '电梯', + limit: 10, + }, + )).toThrow('[taxonomy=extraction_drift]'); + }); + + it('rejects rows that look like procurement notices but miss the query', () => { + expect(() => toProcurementSearchRecords( + [ + { + title: '某项目采购公告', + url: 'https://example.com/notice/detail?id=1', + contextText: '招标公告 项目编号:ABC-123 预算金额:100万元', + }, + ], + { + site: 'jianyu', + query: '电梯', + limit: 10, + }, + )).toThrow('[taxonomy=extraction_drift]'); + }); + + it('builds detail record with evidence blocks', () => { + const detail = toProcurementDetailRecord( + { + title: '电梯采购公告', + url: 'https://example.com/notice/detail/100', + contextText: '项目编号:A-100。预算金额:200万元。投标截止时间:2026年04月30日。', + }, + { + site: 'powerchina', + query: '电梯', + }, + ); + expect(detail.content_type).toBe('notice'); + expect(detail.detail_text).toContain('预算金额'); + expect(detail.evidence_blocks.length).toBeGreaterThan(0); + }); + + it('classifies detail urls and content type', () => { + expect(__test__.isDetailPage('https://a.com/notice/detail?id=1')).toBe(true); + expect(__test__.isDetailPage('https://shandong.jianyu360.cn/jybx/20260310_26030938267551.html')).toBe(true); + expect(__test__.isDetailPage('https://a.com/search?page=1')).toBe(false); + expect(__test__.classifyContentType('中标结果公告', 'https://a.com/detail/1', '中标候选人')).toBe('result'); + expect(__test__.classifyContentType( + '电梯采购公告', + 'https://shandong.jianyu360.cn/jybx/20260310_26030938267551.html', + '首页 帮助中心 招标公告', + )).toBe('notice'); + }); +}); diff --git a/clis/jianyu/shared/procurement-contract.ts b/clis/jianyu/shared/procurement-contract.ts new file mode 100644 index 000000000..ad262bb54 --- /dev/null +++ b/clis/jianyu/shared/procurement-contract.ts @@ -0,0 +1,438 @@ +export type ProcurementContentType = 'notice' | 'result' | 'news' | 'navigation' | 'unknown'; + +export type ProcurementTaxonomy = + | 'network_dns' + | 'relay_unavailable' + | 'selector_drift' + | 'timeout' + | 'empty_result' + | 'extraction_drift'; + +export interface ProcurementSearchCandidateRaw { + title: string; + url: string; + date?: string; + contextText?: string; +} + +interface ProcurementCoreRecord { + title: string; + url: string; + date: string; + publish_time: string; + source_site: string; + is_detail_page: boolean; + content_type: ProcurementContentType; + project_owner: string; + project_code: string; + budget_or_limit: string; + deadline_or_open_time: string; + snippet: string; + summary: string; + quality_flags: string[]; +} + +export interface ProcurementSearchRecord extends ProcurementCoreRecord { + rank: number; +} + +export interface ProcurementDetailRecord extends ProcurementCoreRecord { + detail_text: string; + evidence_blocks: string[]; +} + +const PROCUREMENT_HINTS = [ + '招标', + '采购', + '公告', + '项目', + '中标', + '成交', + '流标', + '终止', + '询价', + '竞价', + '比选', + '投标', + 'tender', + 'procurement', + 'bidding', + 'bid', + 'notice', +]; + +const RESULT_HINTS = [ + '中标', + '成交', + '结果', + '候选人', + '中选', + '定标', + '评标', + 'award', + 'winner', +]; + +const NOTICE_HINTS = [ + '招标', + '采购', + '询价', + '比选', + '公告', + '竞争性', + '邀请', + '投标', + 'tender', + 'procurement', + 'notice', +]; + +const NEWS_HINTS = [ + '新闻', + '资讯', + '动态', + '政策', + '简讯', + 'news', + 'article', +]; + +const NAVIGATION_HINTS = [ + '首页', + '官网', + '网站地图', + '联系我们', + '帮助中心', + 'english', + 'login', + '注册', + '导航', + '法规', + '政策文件', + '服务平台', + '信用中国', +]; + +const DETAIL_URL_HINTS = [ + '/detail', + '/content', + '/jybx/', + '/notice', + '/article', + '/view', + '/project', + '/bid', + 'detail=', + 'id=', +]; + +const LIST_URL_HINTS = [ + '/search', + '/list', + '/index', + '/home', + '/portal', + '/channel', + 'page=', +]; + +const OWNER_PATTERNS = [ + /(?:招标人|采购人|业主|建设单位|项目单位)\s*[::]\s*([^\n,。;]{2,60})/i, +]; + +const CODE_PATTERNS = [ + /(?:项目编号|招标编号|采购编号|项目编码|项目代码|编号)\s*[::]\s*([A-Za-z0-9\-_/]{4,60})/i, +]; + +const BUDGET_PATTERNS = [ + /(?:预算(?:金额)?|控制价|最高限价|限价|采购金额|合同估算价)\s*[::]\s*([^\n,。;]{2,80})/i, +]; + +const DEADLINE_PATTERNS = [ + /(?:报名截止时间|投标截止时间|开标时间|响应文件递交截止时间|截止时间|开标日期)\s*[::]\s*([^\n,。;]{2,80})/i, +]; + +const DATE_PATTERN = /(20\d{2})[.\-/年](\d{1,2})[.\-/月](\d{1,2})/; + +export function cleanText(value: unknown): string { + return typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : ''; +} + +export function normalizeDate(raw: string): string { + const normalized = cleanText(raw); + const match = normalized.match(DATE_PATTERN); + if (!match) return ''; + const year = match[1]; + const month = match[2].padStart(2, '0'); + const day = match[3].padStart(2, '0'); + return `${year}-${month}-${day}`; +} + +function uniqueInOrder(values: string[]): string[] { + const ordered: string[] = []; + const seen = new Set(); + for (const value of values) { + const text = cleanText(value); + if (!text || seen.has(text)) continue; + seen.add(text); + ordered.push(text); + } + return ordered; +} + +function containsAny(haystack: string, needles: string[]): boolean { + return needles.some((needle) => haystack.includes(needle.toLowerCase())); +} + +function extractByPatterns(text: string, patterns: RegExp[]): string { + for (const pattern of patterns) { + const matched = text.match(pattern); + if (matched?.[1]) return cleanText(matched[1]); + } + return ''; +} + +function deriveSnippet(text: string): string { + const normalized = cleanText(text); + if (!normalized) return ''; + return normalized.slice(0, 220); +} + +function splitEvidenceBlocks(text: string, query: string): string[] { + const normalized = cleanText(text); + if (!normalized) return []; + const queryTokens = query + .split(/\s+/) + .map((item) => item.toLowerCase().trim()) + .filter(Boolean); + const chunks = normalized + .split(/[。!?;\n]/) + .map((chunk) => cleanText(chunk)) + .filter(Boolean); + + const ranked = chunks + .map((chunk) => { + const lower = chunk.toLowerCase(); + const tokenScore = queryTokens.length === 0 + ? 0 + : queryTokens.reduce((score, token) => (lower.includes(token) ? score + 2 : score), 0); + const procurementScore = containsAny(lower, PROCUREMENT_HINTS) ? 1 : 0; + return { + chunk, + score: tokenScore + procurementScore, + }; + }) + .sort((a, b) => b.score - a.score || b.chunk.length - a.chunk.length) + .slice(0, 5) + .map((item) => item.chunk); + + return uniqueInOrder(ranked); +} + +function classifyContentType(title: string, url: string, contextText: string): ProcurementContentType { + const haystack = `${title} ${contextText} ${url}`.toLowerCase(); + if (containsAny(haystack, RESULT_HINTS)) return 'result'; + if (containsAny(haystack, NOTICE_HINTS)) return 'notice'; + if (containsAny(haystack, NEWS_HINTS)) return 'news'; + if (containsAny(haystack, NAVIGATION_HINTS)) return 'navigation'; + return 'unknown'; +} + +function isDetailPage(url: string): boolean { + const lower = cleanText(url).toLowerCase(); + if (!lower) return false; + const hasDetailToken = DETAIL_URL_HINTS.some((hint) => lower.includes(hint)); + if (!hasDetailToken) return false; + const hasListToken = LIST_URL_HINTS.some((hint) => lower.includes(hint)); + return !hasListToken; +} + +function buildQualityFlags(core: ProcurementCoreRecord): string[] { + const flags: string[] = []; + if (!core.project_owner) flags.push('missing_project_owner'); + if (!core.project_code) flags.push('missing_project_code'); + if (!core.budget_or_limit) flags.push('missing_budget'); + if (!core.deadline_or_open_time) flags.push('missing_deadline'); + if (core.content_type === 'navigation') flags.push('navigation_risk'); + if (!core.is_detail_page) flags.push('list_page_url'); + return flags; +} + +function queryMatched(text: string, query: string): boolean { + const tokenParts = query + .split(/\s+/) + .map((part) => part.toLowerCase().trim()) + .filter(Boolean); + if (tokenParts.length === 0) return true; + const lower = text.toLowerCase(); + return tokenParts.some((part) => lower.includes(part)); +} + +function normalizeCoreRecord( + row: ProcurementSearchCandidateRaw, + { + sourceSite, + }: { + sourceSite: string; + }, +): ProcurementCoreRecord { + const title = cleanText(row.title); + const url = cleanText(row.url); + const contextText = cleanText(row.contextText); + const date = normalizeDate(cleanText(row.date || contextText)); + const publishTime = date; + const contentType = classifyContentType(title, url, contextText); + const projectOwner = extractByPatterns(contextText, OWNER_PATTERNS); + const projectCode = extractByPatterns(contextText, CODE_PATTERNS); + const budget = extractByPatterns(contextText, BUDGET_PATTERNS); + const deadline = extractByPatterns(contextText, DEADLINE_PATTERNS); + const snippet = deriveSnippet(contextText || title); + + const core: ProcurementCoreRecord = { + title, + url, + date, + publish_time: publishTime, + source_site: sourceSite, + is_detail_page: isDetailPage(url), + content_type: contentType, + project_owner: projectOwner, + project_code: projectCode, + budget_or_limit: budget, + deadline_or_open_time: deadline, + snippet, + summary: snippet, + quality_flags: [], + }; + core.quality_flags = buildQualityFlags(core); + return core; +} + +function qualityRejectReason(core: ProcurementCoreRecord, query: string): string | null { + if (!core.title || !core.url) return 'missing_identity'; + if (core.content_type === 'navigation') return 'navigation_only'; + + const searchable = `${core.title} ${core.snippet} ${core.url}`.toLowerCase(); + const hasQuery = queryMatched(searchable, query); + if (!hasQuery) return 'query_mismatch'; + return null; +} + +function dedupeByTitleUrl(items: T[]): T[] { + const deduped: T[] = []; + const seen = new Set(); + for (const item of items) { + const key = `${item.title}\t${item.url}`; + if (seen.has(key)) continue; + seen.add(key); + deduped.push(item); + } + return deduped; +} + +export function formatTaxonomyError( + taxonomy: ProcurementTaxonomy, + { + site, + command, + detail, + }: { + site: string; + command: 'search' | 'detail'; + detail: string; + }, +): string { + return `[taxonomy=${taxonomy}] site=${site} command=${command} ${cleanText(detail)}`; +} + +export function taxonomyError( + taxonomy: ProcurementTaxonomy, + context: { + site: string; + command: 'search' | 'detail'; + detail: string; + }, +): Error { + return new Error(formatTaxonomyError(taxonomy, context)); +} + +export function toProcurementSearchRecords( + rows: ProcurementSearchCandidateRaw[], + { + site, + query, + limit, + }: { + site: string; + query: string; + limit: number; + }, +): ProcurementSearchRecord[] { + const normalizedRows = dedupeByTitleUrl(rows.map((row) => normalizeCoreRecord(row, { sourceSite: site }))); + const accepted: ProcurementCoreRecord[] = []; + for (const row of normalizedRows) { + const rejectReason = qualityRejectReason(row, query); + if (rejectReason) continue; + accepted.push(row); + } + + if (normalizedRows.length > 0 && accepted.length === 0) { + throw taxonomyError('extraction_drift', { + site, + command: 'search', + detail: `all rows rejected by quality gate (raw=${normalizedRows.length})`, + }); + } + + return accepted + .slice(0, Math.max(1, limit)) + .map((row, index) => ({ + rank: index + 1, + ...row, + })); +} + +export function toProcurementDetailRecord( + { + title, + url, + contextText, + publishTime, + }: { + title: string; + url: string; + contextText: string; + publishTime?: string; + }, + { + site, + query = '', + }: { + site: string; + query?: string; + }, +): ProcurementDetailRecord { + const core = normalizeCoreRecord( + { + title, + url, + date: publishTime, + contextText, + }, + { sourceSite: site }, + ); + const detailText = cleanText(contextText).slice(0, 6000); + const evidenceBlocks = splitEvidenceBlocks(detailText, query); + return { + ...core, + detail_text: detailText, + evidence_blocks: evidenceBlocks, + }; +} + +export const __test__ = { + classifyContentType, + isDetailPage, + splitEvidenceBlocks, + qualityRejectReason, +}; diff --git a/clis/jianyu/shared/procurement-detail.test.ts b/clis/jianyu/shared/procurement-detail.test.ts new file mode 100644 index 000000000..14686ebfd --- /dev/null +++ b/clis/jianyu/shared/procurement-detail.test.ts @@ -0,0 +1,83 @@ +import { describe, expect, it } from 'vitest'; +import { runProcurementDetail } from './procurement-detail.js'; + +function createPage( + evaluateImpl: () => Promise | unknown, +) { + return { + goto: async () => {}, + wait: async () => {}, + evaluate: async () => evaluateImpl(), + }; +} + +describe('procurement detail runner', () => { + it('retries transient execution-context errors and succeeds', async () => { + let attempts = 0; + const page = createPage(async () => { + attempts += 1; + if (attempts < 3) { + throw new Error('Execution context was destroyed.'); + } + return { + title: '电梯采购公告', + detailText: '项目编号:ABC-100 预算金额:100万元 截止时间:2026-04-30', + publishTime: '2026-04-09', + }; + }); + + const rows = await runProcurementDetail(page as never, { + url: 'https://example.com/jybx/20260409_1.html', + site: 'jianyu', + query: '电梯', + }); + + expect(attempts).toBe(3); + expect(rows).toHaveLength(1); + expect(rows[0].title).toContain('电梯采购公告'); + }); + + it('retries empty_result once and succeeds on the next attempt', async () => { + let attempts = 0; + const page = createPage(async () => { + attempts += 1; + if (attempts === 1) { + return { + title: '', + detailText: '', + publishTime: '', + }; + } + return { + title: '防爆电梯采购公告', + detailText: '采购内容:防爆电梯2台。', + publishTime: '2026-03-10', + }; + }); + + const rows = await runProcurementDetail(page as never, { + url: 'https://example.com/jybx/20260310_1.html', + site: 'jianyu', + query: '防爆电梯', + }); + + expect(attempts).toBe(2); + expect(rows).toHaveLength(1); + expect(rows[0].title).toContain('防爆电梯'); + }); + + it('does not retry non-retryable extraction_drift errors', async () => { + let attempts = 0; + const page = createPage(async () => { + attempts += 1; + return null; + }); + + await expect(runProcurementDetail(page as never, { + url: 'https://example.com/jybx/20260310_1.html', + site: 'jianyu', + query: '电梯', + })).rejects.toThrow('[taxonomy=extraction_drift]'); + expect(attempts).toBe(1); + }); +}); diff --git a/clis/jianyu/shared/procurement-detail.ts b/clis/jianyu/shared/procurement-detail.ts new file mode 100644 index 000000000..c61cbd529 --- /dev/null +++ b/clis/jianyu/shared/procurement-detail.ts @@ -0,0 +1,118 @@ +import { type IPage } from '@jackwener/opencli/types'; +import { + cleanText, + toProcurementDetailRecord, + taxonomyError, +} from './procurement-contract.js'; + +const DETAIL_MAX_ATTEMPTS = 3; +const RETRYABLE_DETAIL_ERROR_PATTERNS = [ + /execution context was destroyed/i, + /detached/i, + /target closed/i, + /cannot find context with specified id/i, + /\[taxonomy=empty_result\]/i, +]; + +function isRetryableDetailError(error: unknown): boolean { + const message = error instanceof Error + ? cleanText(error.message) + : cleanText(String(error ?? '')); + if (!message) return false; + return RETRYABLE_DETAIL_ERROR_PATTERNS.some((pattern) => pattern.test(message)); +} + +async function extractDetailPayload(page: IPage, targetUrl: string) { + await page.goto(targetUrl); + await page.wait(2); + + return await page.evaluate(` + (() => { + const clean = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const title = clean(document.title || ''); + const bodyText = clean(document.body ? document.body.innerText : ''); + const maxLength = 12000; + const limitedText = bodyText.length > maxLength ? bodyText.slice(0, maxLength) : bodyText; + const dateMatch = limitedText.match(/(20\\d{2})[.\\-/年](\\d{1,2})[.\\-/月](\\d{1,2})/); + const publishTime = dateMatch + ? dateMatch[1] + '-' + String(dateMatch[2]).padStart(2, '0') + '-' + String(dateMatch[3]).padStart(2, '0') + : ''; + return { + title, + detailText: limitedText, + publishTime, + }; + })() + `); +} + +export async function runProcurementDetail( + page: IPage, + { + url, + site, + query = '', + }: { + url: string; + site: string; + query?: string; + }, +) { + const targetUrl = cleanText(url); + if (!targetUrl) { + throw taxonomyError('relay_unavailable', { + site, + command: 'detail', + detail: 'missing required detail url', + }); + } + + let lastError: unknown = null; + for (let attempt = 1; attempt <= DETAIL_MAX_ATTEMPTS; attempt += 1) { + try { + const payload = await extractDetailPayload(page, targetUrl); + if (!payload || typeof payload !== 'object') { + throw taxonomyError('extraction_drift', { + site, + command: 'detail', + detail: `detail extraction returned invalid payload: ${targetUrl}`, + }); + } + + const row = payload as Record; + const title = cleanText(row.title); + const detailText = cleanText(row.detailText); + const publishTime = cleanText(row.publishTime); + if (!title && !detailText) { + throw taxonomyError('empty_result', { + site, + command: 'detail', + detail: `detail page has no readable content: ${targetUrl}`, + }); + } + + return [ + toProcurementDetailRecord( + { + title: title || targetUrl, + url: targetUrl, + contextText: detailText, + publishTime, + }, + { + site, + query, + }, + ), + ]; + } catch (error) { + lastError = error; + if (attempt >= DETAIL_MAX_ATTEMPTS || !isRetryableDetailError(error)) { + throw error; + } + await page.wait(Math.min(1.5, 0.5 * attempt)); + } + } + + throw lastError; +} diff --git a/docs/adapters/browser/jianyu.md b/docs/adapters/browser/jianyu.md index 7aa382376..52bf2a3c7 100644 --- a/docs/adapters/browser/jianyu.md +++ b/docs/adapters/browser/jianyu.md @@ -6,7 +6,8 @@ | Command | Description | |---------|-------------| -| `opencli jianyu search "" --limit ` | Search Jianyu bid notices and return normalized result rows | +| `opencli jianyu search "" --limit ` | Search Jianyu bid notices (V2 structured contract) | +| `opencli jianyu detail ""` | Extract detail-page evidence blocks from a search URL | ## Usage Examples @@ -16,6 +17,9 @@ opencli jianyu search "procurement" --limit 20 -f json # Search another keyword with a smaller window opencli jianyu search "substation" --limit 10 -f json + +# Extract structured detail evidence +opencli jianyu detail "https://www.jianyu360.cn/nologin/content/....html" -f json ``` ## Prerequisites @@ -25,12 +29,13 @@ opencli jianyu search "substation" --limit 10 -f json ## Notes -- This adapter reads visible search result content only. -- The `date` field is normalized to `YYYY-MM-DD` when date text is detectable. +- `search` now returns V2 fields: `publish_time`, `source_site`, `content_type`, `is_detail_page`, `snippet`, `quality_flags`, plus compatible `date/summary`. +- `detail` returns the same structured fields and adds `detail_text` + `evidence_blocks`. +- Date fields are normalized to `YYYY-MM-DD` when date text is detectable. - Results are deduplicated by `title + url`. - `--limit` defaults to `20` and is capped at `50`. ## Troubleshooting - If the page shows login/verification prompts, complete it in Chrome and retry. -- If the command returns empty results, confirm the keyword and page availability on Jianyu UI first. +- If the command returns no valid rows due to noise/navigation pages, it reports taxonomy-style extraction errors instead of silent weak results.