diff --git a/docs/adapters/browser/1688.md b/docs/adapters/browser/1688.md new file mode 100644 index 00000000..53364f56 --- /dev/null +++ b/docs/adapters/browser/1688.md @@ -0,0 +1,50 @@ +# 1688 + +**Mode**: 🔐 Browser · **Domain**: `1688.com` + +## Commands + +| Command | Description | +|---------|-------------| +| `opencli 1688 search "<query>"` | Search public product candidates with price, MOQ, seller link, and visible badges | +| `opencli 1688 item <url-or-offer-id>` | Read a public product detail page with price tiers, MOQ, delivery text, and seller basics | +| `opencli 1688 store <url-or-member-id>` | Read a public supplier/store page with company info, years on platform, categories, and visible service signals | + +## Usage Examples + +```bash +# Search products +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 + +# JSON output +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 -f json + +# Read an item by offer id +opencli 1688 item 841141931191 -f json + +# Read an item by URL +opencli 1688 item https://detail.1688.com/offer/841141931191.html -f json + +# Read a supplier store +opencli 1688 store https://shop52908bfw19166.1688.com/ -f json + +# Read a supplier by member id +opencli 1688 store b2b-22154705262941f196 -f json +``` + +## Prerequisites + +- Chrome running and **logged into** `1688.com` +- [Browser Bridge extension](/guide/browser-bridge) installed + +## Notes + +- This adapter only returns fields visible on public pages. It does not send inquiries, place orders, or access seller back office data. +- Prefer stable identifiers such as `offer_id`, `member_id`, and `shop_id` for follow-up workflows. +- `item` can be more sensitive to the active browser target than `search` or `store`. + +## Troubleshooting + +- If `opencli 1688 item` reports `did not expose product context`, first make sure the open page is a real `detail.1688.com` item page. +- If the browser target is too broad, retry with `OPENCLI_CDP_TARGET=detail.1688.com`. 
+- If you hit a slider or verification page, refresh the real page in Chrome and retry. diff --git a/docs/adapters/index.md b/docs/adapters/index.md index ae3d4ea4..55cd6c6b 100644 --- a/docs/adapters/index.md +++ b/docs/adapters/index.md @@ -42,6 +42,7 @@ Run `opencli list` for the live registry. | **[tiktok](/adapters/browser/tiktok)** | `explore` `search` `profile` `user` `following` `follow` `unfollow` `like` `unlike` `comment` `save` `unsave` `live` `notifications` `friends` | 🔐 Browser | | **[google](/adapters/browser/google)** | `news` `search` `suggest` `trends` | 🌐 / 🔐 | | **[jd](/adapters/browser/jd)** | `item` | 🔐 Browser | +| **[1688](/adapters/browser/1688)** | `search` `item` `store` | 🔐 Browser | | **[web](/adapters/browser/web)** | `read` | 🔐 Browser | | **[weixin](/adapters/browser/weixin)** | `download` | 🔐 Browser | | **[36kr](/adapters/browser/36kr)** | `news` `hot` `search` `article` | 🌐 / 🔐 | diff --git a/docs/developer/testing.md b/docs/developer/testing.md index 730b6f30..f6712d57 100644 --- a/docs/developer/testing.md +++ b/docs/developer/testing.md @@ -131,6 +131,8 @@ npx vitest src/ - `browser-public.test.ts` 使用 `tryBrowserCommand()`,站点反爬或地域限制导致空数据时会 warn + pass - `browser-auth.test.ts` 验证 **graceful failure**,重点是不 crash、不 hang、错误信息可控 - 如需测试完整登录态,保持 Chrome 登录态并安装 Browser Bridge 扩展,再手动运行对应测试 +- 对依赖具体 host 页面上下文的 browser adapter,除了单测外,还应手动验证真实命令,并把必要的 target host 约束写进 adapter docs / troubleshooting +- 对会主动导航页面的 browser commands,手动验证时优先串行执行;多个 CLI 进程同时连到同一个 CDP target 可能互相覆盖导航,制造假的 adapter 故障 --- diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index e5d310fe..72a26e77 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -12,6 +12,17 @@ - Your login session in Chrome might have expired. Open a normal Chrome tab, navigate to the target site, and log in or refresh the page. - Some sites have geographic restrictions (e.g., Bilibili, Zhihu from outside China). 
+### Browser command opens the page but still cannot read context + +- A healthy Browser Bridge connection does not guarantee that the current page target exposes the data your adapter expects. +- Some browser adapters are sensitive to the active host or page context. +- Example: `opencli 1688 item` may fail with `did not expose product context` if the target is too broad. +- Retry on a real item page, refresh the page in Chrome, and if needed narrow the target, for example: + +```bash +OPENCLI_CDP_TARGET=detail.1688.com opencli 1688 item 841141931191 -f json +``` + ### Node API errors - Make sure you are using **Node.js >= 20**. Some dependencies require modern Node APIs. diff --git a/src/clis/1688/item.test.ts b/src/clis/1688/item.test.ts new file mode 100644 index 00000000..9e9394d2 --- /dev/null +++ b/src/clis/1688/item.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './item.js'; + +describe('1688 item normalization', () => { + it('normalizes public item payload into reportable fields', () => { + const result = __test__.normalizeItemPayload({ + href: 'https://detail.1688.com/offer/887904326744.html', + title: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077 - 阿里巴巴', + bodyText: ` + 青岛沁澜衣品服装有限公司 + 入驻13年 + 主营:大码女装 + 店铺回头率 + 87% + 山东青岛 + 3套起批 + 已售1600+套 + 支持定制logo + `, + offerTitle: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077', + offerId: 887904326744, + seller: { + companyName: '青岛沁澜衣品服装有限公司', + memberId: 'b2b-1641351767', + winportUrl: 'https://yinuoweierfushi.1688.com', + }, + trade: { + beginAmount: 3, + priceDisplay: '96.00-98.00', + unit: '套', + saleCount: 1655, + offerIDatacenterSellInfo: { + 面料名称: '莫代尔', + 主面料成分: '莫代尔纤维', + sellPointModel: '{"ignore":true}', + }, + offerPriceModel: { + currentPrices: [ + { beginAmount: 3, price: '98.00' }, + { beginAmount: 50, price: '97.00' }, + ], + }, + }, + gallery: { + mainImage: ['https://example.com/1.jpg'], + offerImgList: ['https://example.com/2.jpg'], + wlImageInfos: [{ fullPathImageURI: 
'https://example.com/3.jpg' }], + }, + services: [ + { serviceName: '延期必赔', agreeDeliveryHours: 360 }, + { serviceName: '品质保障' }, + ], + }); + + expect(result.offer_id).toBe('887904326744'); + expect(result.member_id).toBe('b2b-1641351767'); + expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.price_text).toBe('¥96.00-98.00'); + expect(result.moq_text).toBe('3套起批'); + expect(result.origin_place).toBe('山东青岛'); + expect(result.delivery_days_text).toBe('360小时内发货'); + expect(result.private_label_text).toBe('支持定制logo'); + expect(result.visible_attributes).toEqual({ + 面料名称: '莫代尔', + 主面料成分: '莫代尔纤维', + }); + }); +}); diff --git a/src/clis/1688/item.ts b/src/clis/1688/item.ts new file mode 100644 index 00000000..1db98bef --- /dev/null +++ b/src/clis/1688/item.ts @@ -0,0 +1,280 @@ +import { CommandExecutionError } from '../../errors.js'; +import { cli, Strategy } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { isRecord } from '../../utils.js'; +import { + assertNotCaptcha, + buildCaptchaHint, + buildDetailUrl, + buildProvenance, + cleanMultilineText, + cleanText, + extractLocation, + extractMemberId, + extractOfferId, + extractShopId, + gotoAndReadState, + normalizePriceTiers, + parseMoqText, + parsePriceText, + toNumber, + uniqueNonEmpty, +} from './shared.js'; + +interface BuyerProtectionModel { + serviceName?: string; + shortBuyerDesc?: string; + packageBuyerDesc?: string; + textDesc?: string; + agreeDeliveryHours?: number; +} + +interface ItemBrowserPayload { + href?: string; + title?: string; + bodyText?: string; + offerTitle?: string; + offerId?: string | number; + seller?: { + companyName?: string; + memberId?: string; + winportUrl?: string; + sellerWinportUrlMap?: Record; + }; + trade?: { + beginAmount?: string | number; + priceDisplay?: string; + unit?: string; + saleCount?: string | number; + offerIDatacenterSellInfo?: Record; + offerPriceModel?: { + currentPrices?: Array<{ beginAmount?: string | number; price?: string 
| number }>; + }; + }; + gallery?: { + mainImage?: string[]; + offerImgList?: string[]; + wlImageInfos?: Array<{ fullPathImageURI?: string }>; + }; + shipping?: { + deliveryLimitText?: string; + logisticsText?: string; + protectionInfos?: BuyerProtectionModel[]; + buyerProtectionModel?: BuyerProtectionModel[]; + }; + services?: BuyerProtectionModel[]; +} + +function normalizeItemPayload(payload: ItemBrowserPayload): Record { + const href = cleanText(payload.href); + const bodyText = cleanMultilineText(payload.bodyText); + const sellerName = cleanText(payload.seller?.companyName); + const sellerUrl = cleanText( + payload.seller?.winportUrl + ?? payload.seller?.sellerWinportUrlMap?.defaultUrl + ?? payload.seller?.sellerWinportUrlMap?.indexUrl, + ); + const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(href) || ''; + const memberId = cleanText(payload.seller?.memberId) || extractMemberId(href) || null; + const shopId = extractShopId(sellerUrl) ?? extractShopId(href); + const unit = cleanText(payload.trade?.unit); + const priceDisplay = cleanText(payload.trade?.priceDisplay); + const priceRange = parsePriceText(priceDisplay ? `¥${priceDisplay}` : bodyText); + const moqText = extractMoqText(bodyText, payload.trade?.beginAmount, unit); + const moq = parseMoqText(moqText); + const services = uniqueServices(payload); + const serviceBadges = uniqueNonEmpty(services.map((service) => cleanText(service.serviceName))); + const attributes = normalizeVisibleAttributes(payload.trade?.offerIDatacenterSellInfo); + + const detailUrl = offerId ? buildDetailUrl(offerId) : href; + const provenance = buildProvenance(href || detailUrl); + const priceTiers = normalizePriceTiers(payload.trade?.offerPriceModel?.currentPrices ?? [], unit || null); + const images = uniqueNonEmpty([ + ...(payload.gallery?.mainImage ?? []), + ...(payload.gallery?.offerImgList ?? []), + ...((payload.gallery?.wlImageInfos ?? []).map((item) => item.fullPathImageURI ?? 
'')), + ]); + + return { + offer_id: offerId, + member_id: memberId, + shop_id: shopId, + title: cleanText(payload.offerTitle) || stripAlibabaSuffix(payload.title) || firstNonEmptyLine(bodyText), + item_url: detailUrl, + ...provenance, + main_images: images, + price_text: priceRange.price_text, + price_tiers: priceTiers, + currency: priceRange.currency ?? 'CNY', + moq_text: moq.moq_text, + moq_value: moq.moq_value, + seller_name: sellerName || null, + seller_url: sellerUrl || null, + shop_name: sellerName || null, + origin_place: extractLocation(bodyText), + delivery_days_text: extractDeliveryDaysText(bodyText, services, payload.shipping), + customization_text: extractKeywordLine(bodyText, ['来样定制', '来图定制', '支持定制', '可定制', '定制']), + private_label_text: extractKeywordLine(bodyText, ['贴牌', '贴标', '定制logo', '打logo', 'OEM', 'ODM']), + visible_attributes: attributes, + sales_text: extractSalesText(bodyText), + service_badges: serviceBadges, + stock_quantity: extractStockQuantity(bodyText), + }; +} + +function normalizeVisibleAttributes(raw: unknown): Record { + if (!isRecord(raw)) return {}; + const entries = Object.entries(raw) + .filter(([key, value]) => key !== 'sellPointModel' && cleanText(key) && cleanText(String(value))) + .map(([key, value]) => [cleanText(key), cleanText(String(value))] as const); + return Object.fromEntries(entries); +} + +function uniqueServices(payload: ItemBrowserPayload): BuyerProtectionModel[] { + const combined = [ + ...(Array.isArray(payload.services) ? payload.services : []), + ...(Array.isArray(payload.shipping?.protectionInfos) ? payload.shipping.protectionInfos : []), + ...(Array.isArray(payload.shipping?.buyerProtectionModel) ? 
payload.shipping.buyerProtectionModel : []), + ]; + + const seen = new Set(); + const result: BuyerProtectionModel[] = []; + for (const service of combined) { + const key = cleanText(service.serviceName); + if (!key || seen.has(key)) continue; + seen.add(key); + result.push(service); + } + return result; +} + +function stripAlibabaSuffix(title: string | undefined): string { + return cleanText(title).replace(/\s*-\s*阿里巴巴$/, '').trim(); +} + +function firstNonEmptyLine(text: string): string { + return text.split('\n').map((line) => cleanText(line)).find(Boolean) ?? ''; +} + +function extractMoqText(bodyText: string, beginAmount: string | number | undefined, unit: string): string { + const lineMatch = bodyText.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/); + if (lineMatch) return lineMatch[0]; + + const moqValue = toNumber(beginAmount); + if (moqValue !== null) { + return `${moqValue}${unit || ''}起批`; + } + + return ''; +} + +function extractDeliveryDaysText( + bodyText: string, + services: BuyerProtectionModel[], + shipping: ItemBrowserPayload['shipping'], +): string | null { + const shippingText = cleanText(shipping?.deliveryLimitText) || cleanText(shipping?.logisticsText); + if (shippingText) return shippingText; + + const textMatch = bodyText.match(/\d+\s*(?:小时|天)(?:内)?发货/); + if (textMatch) return textMatch[0]; + + const hourMatch = services.find((service) => typeof service.agreeDeliveryHours === 'number'); + if (hourMatch && typeof hourMatch.agreeDeliveryHours === 'number') { + return `${hourMatch.agreeDeliveryHours}小时内发货`; + } + + return null; +} + +function extractKeywordLine(bodyText: string, keywords: string[]): string | null { + const lines = bodyText.split('\n').map((line) => cleanText(line)).filter(Boolean); + for (const line of lines) { + if (keywords.some((keyword) => line.includes(keyword))) { + return line; + } + } + return null; +} + +function extractSalesText(bodyText: string): string | null { + const match = 
bodyText.match(/(?:全网销量|已售)\s*\d+(?:\.\d+)?\+?[件套个]?/); + return match ? cleanText(match[0]) : null; +} + +function extractStockQuantity(bodyText: string): number | null { + const match = bodyText.match(/库存\s*(\d+)/); + return match ? Number.parseInt(match[1], 10) : null; +} + +async function readItemPayload(page: IPage, itemUrl: string): Promise { + let state = await gotoAndReadState(page, itemUrl, 2500, 'item'); + if (state.href && !state.href.includes('/offer/')) { + assertNotCaptcha(state, 'item'); + } + + const payload = await page.evaluate(` + (() => { + const root = window.context ?? {}; + const model = root.result?.global?.globalData?.model ?? null; + const toJson = (value) => JSON.parse(JSON.stringify(value ?? null)); + return { + href: window.location.href, + title: document.title || '', + bodyText: document.body ? document.body.innerText || '' : '', + offerTitle: model?.offerTitleModel?.subject ?? '', + offerId: model?.tradeModel?.offerId ?? '', + seller: toJson(model?.sellerModel), + trade: toJson(model?.tradeModel), + gallery: toJson(root.result?.data?.gallery?.fields ?? null), + shipping: toJson(root.result?.data?.shippingServices?.fields ?? null), + services: toJson(root.result?.data?.shippingServices?.fields?.protectionInfos ?? []), + }; + })() + `) as ItemBrowserPayload; + + if (!cleanText(String(payload.offerId ?? 
''))) { + state = await gotoAndReadState(page, itemUrl, 2500, 'item'); + assertNotCaptcha(state, 'item'); + throw new CommandExecutionError( + '1688 item page did not expose product context', + `${buildCaptchaHint('item')} If the page is still open but blank, refresh the item page in Chrome and retry.`, + ); + } + + return payload; +} + +cli({ + site: '1688', + name: 'item', + description: '1688 商品详情(公开商品字段、价格阶梯、卖家基础信息)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'input', + required: true, + positional: true, + help: '1688 商品 URL 或 offer ID(如 887904326744)', + }, + ], + columns: ['offer_id', 'title', 'price_text', 'moq_text', 'seller_name', 'origin_place'], + func: async (page, kwargs) => { + const itemUrl = buildDetailUrl(String(kwargs.input ?? '')); + const payload = await readItemPayload(page, itemUrl); + return [normalizeItemPayload(payload)]; + }, +}); + +export const __test__ = { + normalizeItemPayload, + normalizeVisibleAttributes, + stripAlibabaSuffix, + extractMoqText, + extractDeliveryDaysText, + extractKeywordLine, + extractSalesText, + extractStockQuantity, +}; diff --git a/src/clis/1688/search.test.ts b/src/clis/1688/search.test.ts new file mode 100644 index 00000000..e204480c --- /dev/null +++ b/src/clis/1688/search.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './search.js'; + +describe('1688 search normalization', () => { + it('normalizes search candidates into structured result rows', () => { + const result = __test__.normalizeSearchCandidate({ + item_url: 'https://detail.1688.com/offer/887904326744.html', + title: '宿舍置物架桌面加高架', + container_text: '宿舍置物架桌面加高架 ¥56.00 2套起批 山东青岛 已售300+套', + price_text: '¥ 56 .00', + sales_text: '300+套', + moq_text: '2套起批', + tag_items: ['退货包运费', '回头率52%'], + hover_items: ['验厂报告'], + seller_name: '青岛沁澜衣品服装有限公司', + seller_url: 'https://yinuoweierfushi.1688.com', + }, 1, 
'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=置物架'); + + expect(result.rank).toBe(1); + expect(result.offer_id).toBe('887904326744'); + expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.price_text).toBe('¥56.00'); + expect(result.price_min).toBe(56); + expect(result.price_max).toBe(56); + expect(result.moq_value).toBe(2); + expect(result.location).toBe('山东青岛'); + expect(result.sales_text).toBe('300+套'); + expect(result.badges).toEqual(expect.arrayContaining(['退货包运费', '验厂报告'])); + expect(result.return_rate_text).toBe('回头率52%'); + }); + + it('extracts offer id from mobile detail search links', () => { + const result = __test__.normalizeSearchCandidate({ + item_url: 'http://detail.m.1688.com/page/index.html?offerId=910933345396&sortType=&pageId=', + title: '', + container_text: '桌面书桌办公室工位收纳展示新中式博古架多层茶具厨房摆放置物架 ¥24.3 已售20+件', + price_text: '¥ 14 .28', + sales_text: '1500+件', + moq_text: '≥2个', + seller_name: '泰商国际贸易(宁阳)有限公司', + seller_url: 'http://tsgjmy.1688.com/', + }, 1, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=桌面置物架'); + + expect(result.offer_id).toBe('910933345396'); + expect(result.shop_id).toBe('tsgjmy'); + expect(result.title).toContain('桌面书桌办公室工位收纳展示'); + expect(result.price_text).toBe('¥14.28'); + expect(result.sales_text).toBe('1500+件'); + expect(result.moq_text).toBe('≥2个'); + expect(result.moq_value).toBe(2); + }); +}); diff --git a/src/clis/1688/search.ts b/src/clis/1688/search.ts new file mode 100644 index 00000000..87976a84 --- /dev/null +++ b/src/clis/1688/search.ts @@ -0,0 +1,302 @@ +import { CommandExecutionError } from '../../errors.js'; +import { cli, Strategy } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { + FACTORY_BADGE_PATTERNS, + SERVICE_BADGE_PATTERNS, + assertNotCaptcha, + buildProvenance, + buildSearchUrl, + cleanText, + extractBadges, + extractLocation, + extractMemberId, + extractOfferId, + extractShopId, + gotoAndReadState, + 
limitCandidates, + parseMoqText, + parsePriceText, + uniqueNonEmpty, +} from './shared.js'; + +interface SearchPayload { + href?: string; + title?: string; + bodyText?: string; + candidates?: Array<{ + item_url?: string; + title?: string; + container_text?: string; + desc_rows?: string[]; + price_text?: string | null; + sales_text?: string | null; + hover_price_text?: string | null; + moq_text?: string | null; + tag_items?: string[]; + hover_items?: string[]; + seller_name?: string | null; + seller_url?: string | null; + }>; +} + +const SEARCH_ITEM_URL_PATTERNS = [ + 'detail.1688.com/offer/', + 'detail.m.1688.com/page/index.html?offerId=', +]; + +function normalizeSearchCandidate( + candidate: NonNullable[number], + rank: number, + sourceUrl: string, +): Record { + const itemUrl = cleanText(candidate.item_url); + const containerText = cleanText(candidate.container_text); + const priceText = firstNonEmpty([ + normalizeInlineText(candidate.price_text), + normalizeInlineText(extractPriceText(candidate.hover_price_text)), + ]); + const priceRange = parsePriceText(priceText || containerText); + const moq = parseMoqText(firstNonEmpty([ + normalizeInlineText(candidate.moq_text), + normalizeInlineText(extractMoqText(candidate.hover_price_text)), + normalizeInlineText(extractMoqText(containerText)), + ])); + const sellerUrl = cleanText(candidate.seller_url); + const evidenceText = uniqueNonEmpty([ + containerText, + ...(candidate.desc_rows ?? []), + ...(candidate.tag_items ?? []), + ...(candidate.hover_items ?? []), + ]).join('\n'); + const badges = extractBadges(evidenceText, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]); + const salesText = firstNonEmpty([ + extractSalesText(candidate.sales_text), + extractSalesText(containerText) ?? 
'', + ]) || null; + + return { + rank, + offer_id: extractOfferId(itemUrl), + member_id: extractMemberId(sellerUrl), + shop_id: extractShopId(sellerUrl), + title: cleanText(candidate.title) || firstLine(containerText), + source_url: sourceUrl, + fetched_at: new Date().toISOString(), + strategy: 'cookie', + price_text: priceRange.price_text, + price_min: priceRange.price_min, + price_max: priceRange.price_max, + currency: priceRange.currency ?? 'CNY', + moq_text: moq.moq_text, + moq_value: moq.moq_value, + seller_name: cleanText(candidate.seller_name) || null, + seller_url: sellerUrl || null, + item_url: itemUrl, + location: extractLocation(containerText), + badges, + sales_text: salesText, + return_rate_text: extractReturnRateText(candidate.tag_items ?? []), + }; +} + +function extractMoqText(text: string | null | undefined): string { + const normalized = normalizeInlineText(text); + return normalized.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/i)?.[0] + ?? normalized.match(/≥\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)?/i)?.[0] + ?? normalized.match(/\d+(?:\.\d+)?\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)/i)?.[0] + ?? ''; +} + +function extractPriceText(text: string | null | undefined): string { + const normalized = normalizeInlineText(text); + return normalized.match(/[¥$€]\s*\d+(?:\.\d+)?/)?.[0] ?? ''; +} + +function extractSalesText(text: string | null | undefined): string | null { + const normalized = normalizeInlineText(text); + if (!normalized) return null; + if (/^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized)) { + return normalized; + } + const match = normalized.match(/(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/); + return match ? cleanText(match[0]) : null; +} + +function firstLine(text: string): string { + return text.split(/\s+/).find(Boolean) ?? ''; +} + +function firstNonEmpty(values: Array): string { + return values.map((value) => cleanText(value)).find(Boolean) ?? 
''; +} + +function normalizeInlineText(text: string | null | undefined): string { + return cleanText(text) + .replace(/([¥$€])\s+(?=\d)/g, '$1') + .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2') + .replace(/\s*([~-])\s*/g, '$1') + .trim(); +} + +function extractReturnRateText(values: string[]): string | null { + return uniqueNonEmpty(values.map((value) => normalizeInlineText(value))) + .find((value) => /^回头率\s*\d+(?:\.\d+)?%$/.test(value)) + ?? null; +} + +async function readSearchPayload(page: IPage, query: string): Promise { + const url = buildSearchUrl(query); + const state = await gotoAndReadState(page, url, 2500, 'search'); + assertNotCaptcha(state, 'search'); + + return await page.evaluate(` + (() => { + const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}.some((pattern) => (href || '').includes(pattern)); + const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))]; + const collectTexts = (root, selector) => uniqueTexts( + Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''), + ); + const firstText = (root, selectors) => { + for (const selector of selectors) { + const node = root.querySelector(selector); + const value = normalizeText(node ? 
node.innerText || node.textContent || '' : ''); + if (value) return value; + } + return ''; + }; + const findMoqText = (values, priceText) => { + const moqPattern = /(≥\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)?)|(\\d+(?:\\.\\d+)?\\s*(?:~|-|至|到)\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只))|(\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)\\s*起批)/i; + return values.find((value) => moqPattern.test(value)) + || normalizeText(priceText).match(moqPattern)?.[0] + || ''; + }; + const isSellerHref = (href) => { + if (!href) return false; + try { + const url = new URL(href, window.location.href); + const host = url.hostname || ''; + if (!host.endsWith('.1688.com')) return false; + if (host === 's.1688.com' || host === 'r.1688.com' || host === 'air.1688.com' || host === 'detail.1688.com' || host === 'detail.m.1688.com' || host === 'dj.1688.com') { + return false; + } + return true; + } catch { + return false; + } + }; + const collectCandidates = () => { + const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || '')); + const seen = new Set(); + const items = []; + + const pickContainer = (anchor) => { + let node = anchor; + while (node && node !== document.body) { + const text = normalizeText(node.innerText || node.textContent || ''); + if (text.length >= 40 && text.length <= 2000) { + return node; + } + node = node.parentElement; + } + return anchor; + }; + + for (const anchor of anchors) { + const href = anchor.href || ''; + if (!href || seen.has(href)) continue; + seen.add(href); + + const container = pickContainer(anchor); + const tagItems = collectTexts(container, '.offer-tag-row .offer-desc-item'); + const hoverItems = collectTexts(container, '.offer-hover-wrapper .offer-desc-item'); + const sellerAnchor = Array.from(container.querySelectorAll('a')) + .find((link) => isSellerHref(link.href || '')); + const hoverPriceText = firstText(container, [ + '.offer-hover-wrapper .hover-price-item', + '.offer-hover-wrapper .price-item', + ]); 
+ + items.push({ + item_url: href, + title: firstText(container, ['.offer-title-row .title-text', '.offer-title-row']) + || normalizeText(anchor.innerText || anchor.textContent || ''), + container_text: normalizeText(container.innerText || container.textContent || ''), + desc_rows: collectTexts(container, '.offer-desc-row'), + price_text: firstText(container, ['.offer-price-row .price-item']), + sales_text: firstText(container, ['.offer-price-row .col-desc_after', '.offer-desc-row .col-desc_after']), + hover_price_text: hoverPriceText, + moq_text: findMoqText(hoverItems, hoverPriceText), + tag_items: tagItems, + hover_items: hoverItems, + seller_name: sellerAnchor ? normalizeText(sellerAnchor.innerText || sellerAnchor.textContent || '') : null, + seller_url: sellerAnchor ? sellerAnchor.href : null, + }); + } + + return items; + }; + + return { + href: window.location.href, + title: document.title || '', + bodyText: document.body ? document.body.innerText || '' : '', + candidates: collectCandidates(), + }; + })() + `) as SearchPayload; +} + +cli({ + site: '1688', + name: 'search', + description: '1688 商品搜索(结果候选、卖家链接、价格/MOQ/销量文本)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'query', + required: true, + positional: true, + help: '搜索关键词,如 "置物架"', + }, + { + name: 'limit', + type: 'int', + default: 20, + help: '结果数量上限(默认 20)', + }, + ], + columns: ['rank', 'title', 'price_text', 'moq_text', 'seller_name', 'location'], + func: async (page, kwargs) => { + const query = String(kwargs.query ?? ''); + const limit = Math.max(1, Number(kwargs.limit) || 20); + const payload = await readSearchPayload(page, query); + const sourceUrl = cleanText(payload.href) || buildSearchUrl(query); + const candidates = limitCandidates(payload.candidates ?? 
[], limit) + .filter((candidate) => cleanText(candidate.item_url)); + + if (candidates.length === 0) { + throw new CommandExecutionError( + '1688 search did not expose any result cards', + 'The search page likely hit a slider challenge or changed its DOM. Open the same query in Chrome, solve any challenge, keep a clean 1688 tab selected, and retry.', + ); + } + + const provenance = buildProvenance(sourceUrl); + return candidates.map((candidate, index) => ({ + ...normalizeSearchCandidate(candidate, index + 1, sourceUrl), + fetched_at: provenance.fetched_at, + strategy: provenance.strategy, + })); + }, +}); + +export const __test__ = { + normalizeSearchCandidate, + extractMoqText, + extractSalesText, + firstLine, +}; diff --git a/src/clis/1688/shared.test.ts b/src/clis/1688/shared.test.ts new file mode 100644 index 00000000..3395fc1d --- /dev/null +++ b/src/clis/1688/shared.test.ts @@ -0,0 +1,59 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './shared.js'; + +describe('1688 shared helpers', () => { + it('builds encoded search URLs', () => { + expect(__test__.buildSearchUrl('置物架')).toBe( + 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=%E7%BD%AE%E7%89%A9%E6%9E%B6', + ); + }); + + it('extracts stable ids from 1688 inputs', () => { + expect(__test__.extractOfferId('887904326744')).toBe('887904326744'); + expect(__test__.extractOfferId('https://detail.1688.com/offer/887904326744.html')).toBe('887904326744'); + expect(__test__.extractMemberId('https://winport.m.1688.com/page/index.html?memberId=b2b-1641351767')).toBe('b2b-1641351767'); + expect(__test__.extractMemberId('b2b-22154705262941f196')).toBe('b2b-22154705262941f196'); + expect(__test__.resolveStoreUrl('b2b-22154705262941f196')).toBe( + 'https://winport.m.1688.com/page/index.html?memberId=b2b-22154705262941f196', + ); + expect(__test__.extractShopId('https://yinuoweierfushi.1688.com/page/index.html')).toBe('yinuoweierfushi'); + }); + + it('parses price ranges and 
moq text', () => { + expect(__test__.parsePriceText('¥96.00-98.00')).toEqual({ + price_text: '¥96.00-98.00', + price_min: 96, + price_max: 98, + currency: 'CNY', + }); + + expect(__test__.parsePriceText('¥ 14 .28')).toEqual({ + price_text: '¥14.28', + price_min: 14.28, + price_max: 14.28, + currency: 'CNY', + }); + + expect(__test__.parseMoqText('3套起批')).toEqual({ + moq_text: '3套起批', + moq_value: 3, + }); + + expect(__test__.parseMoqText('2~999个')).toEqual({ + moq_text: '2~999个', + moq_value: 2, + }); + }); + + it('extracts location and captcha states', () => { + expect(__test__.extractLocation('山东青岛 送至 江苏苏州')).toBe('山东青岛'); + expect(__test__.extractMetric(`主营:家装建材 +地址:江苏省常州市武进区横林镇崔桥崔卫路40号`, '主营')).toBe('家装建材'); + expect(__test__.extractMetric('常州市优品诺家居科技有限公司是家居用品、家居用品等产品专业生产加工的公司', '生产加工')).toBe(null); + expect(__test__.isCaptchaState({ + href: 'https://s.1688.com/_____tmd_____/punish', + title: '验证码拦截', + body_text: '请拖动下方滑块完成验证', + })).toBe(true); + }); +}); diff --git a/src/clis/1688/shared.ts b/src/clis/1688/shared.ts new file mode 100644 index 00000000..b4a904bc --- /dev/null +++ b/src/clis/1688/shared.ts @@ -0,0 +1,527 @@ +import { ArgumentError, AuthRequiredError, CommandExecutionError } from '../../errors.js'; +import type { IPage } from '../../types.js'; + +export const SITE = '1688'; +export const HOME_URL = 'https://www.1688.com/'; +export const SEARCH_URL_PREFIX = 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords='; +export const DETAIL_URL_PREFIX = 'https://detail.1688.com/offer/'; +export const STORE_MOBILE_URL_PREFIX = 'https://winport.m.1688.com/page/index.html?memberId='; +export const STRATEGY = 'cookie'; + +const CAPTCHA_URL_MARKER = '/_____tmd_____/punish'; +const CAPTCHA_TEXT_PATTERNS = [ + '请拖动下方滑块完成验证', + '请按住滑块,拖动到最右边', + '通过验证以确保正常访问', + '验证码拦截', + '访问验证', + '滑动验证', +]; +export const FACTORY_BADGE_PATTERNS = [ + '源头工厂', + '深度验厂', + '实力工厂', + '工厂档案', + '加工专区', + '验厂报告', + '厂家直销', + '生产厂家', + '工厂直供', +]; +export const 
SERVICE_BADGE_PATTERNS = [ + '延期必赔', + '品质保障', + '破损包赔', + '退货包运费', + '晚发必赔', + '7*24小时响应', + '48小时发货', + '72小时发货', + '后天达', + '包邮', + '闪电拿样', +]; +const CHINA_LOCATIONS = [ + '北京', + '天津', + '上海', + '重庆', + '河北', + '山西', + '辽宁', + '吉林', + '黑龙江', + '江苏', + '浙江', + '安徽', + '福建', + '江西', + '山东', + '河南', + '湖北', + '湖南', + '广东', + '海南', + '四川', + '贵州', + '云南', + '陕西', + '甘肃', + '青海', + '台湾', + '内蒙古', + '广西', + '西藏', + '宁夏', + '新疆', + '香港', + '澳门', +]; + +export interface ProvenanceFields { + source_url: string; + fetched_at: string; + strategy: string; +} + +export interface PageState { + href: string; + title: string; + body_text: string; +} + +export interface PriceRange { + price_text: string; + price_min: number | null; + price_max: number | null; + currency: string | null; +} + +export interface MoqValue { + moq_text: string; + moq_value: number | null; +} + +export interface PriceTier { + quantity_text: string; + quantity_min: number | null; + price_text: string; + price: number | null; + currency: string | null; +} + +export interface SearchCandidate { + item_url: string; + title: string; + container_text: string; + seller_name: string | null; + seller_url: string | null; +} + +export function cleanText(value: unknown): string { + return typeof value === 'string' + ? value.replace(/\u00a0/g, ' ').replace(/\s+/g, ' ').trim() + : ''; +} + +export function cleanMultilineText(value: unknown): string { + return typeof value === 'string' + ? 
/**
 * Validate that `input` is an absolute URL on 1688.com and return its
 * normalized string form (as produced by `URL#toString()`).
 *
 * The host must be exactly `1688.com` or a subdomain of it. A plain
 * `endsWith('1688.com')` check is not enough because it also accepts
 * look-alike hosts such as `evil1688.com`, so the suffix is matched on a
 * label boundary.
 *
 * @param input - Candidate store URL.
 * @returns The normalized URL string.
 * @throws ArgumentError when `input` is not a parseable URL or is not hosted
 *   on 1688.com.
 */
export function canonicalizeStoreUrl(input: string): string {
  let url: URL;
  try {
    url = new URL(input);
  } catch {
    throw new ArgumentError('Invalid 1688 store URL');
  }

  const host = url.hostname.toLowerCase();
  const isOn1688 = host === '1688.com' || host.endsWith('.1688.com');
  if (!isOn1688) {
    throw new ArgumentError('Invalid 1688 store URL');
  }

  return url.toString();
}
/**
 * Pull a 1688 member ID (`b2b-…`) out of a raw ID or a URL.
 *
 * Explicit locations are checked first: a `memberId=` query parameter, then
 * a `/winport/<id>.html` path. Only when neither is present does the function
 * fall back to the first free-standing `b2b-…` token. Running the loose scan
 * first would make the explicit patterns unreachable and would let an
 * unrelated `b2b-…` token earlier in the URL (for example a tracking
 * parameter) win over the real `memberId`.
 *
 * @param input - Member ID or URL text.
 * @returns The member ID as written in `input`, or `null` when none is found.
 */
export function extractMemberId(input: string): string | null {
  const normalized = cleanText(input);
  if (!normalized) return null;

  const explicit = normalized.match(/[?&]memberId=(b2b-[a-z0-9]+)/i)
    ?? normalized.match(/\/winport\/(b2b-[a-z0-9]+)\.html/i);
  if (explicit) return explicit[1] ?? null;

  return normalized.match(/\bb2b-[a-z0-9]+\b/i)?.[0] ?? null;
}
/**
 * Convert raw tier records (`beginAmount` / `price`) into `PriceTier` rows.
 *
 * Tiers that carry no usable price are dropped. Prices may arrive either as
 * strings or as numbers: `cleanText()` returns `''` for anything that is not
 * a string, so a numeric price is rendered with `String()` instead. Without
 * that, a tier whose price `toNumber()` parses correctly would be filtered
 * out for having an empty `price_text`.
 *
 * @param rawTiers - Tier records with loosely typed fields.
 * @param unit - Unit appended to the quantity text (e.g. `套`), or `null`.
 * @returns One `PriceTier` per input tier that has a price, in input order.
 */
export function normalizePriceTiers(
  rawTiers: Array<{ beginAmount?: unknown; price?: unknown }>,
  unit: string | null,
): PriceTier[] {
  return rawTiers
    .map((tier): PriceTier => {
      const quantityMin = toNumber(tier.beginAmount);
      const price = toNumber(tier.price);
      // Non-finite numbers make toNumber() return null and are treated as "no price".
      const priceText = typeof tier.price === 'number'
        ? (price !== null ? String(price) : '')
        : cleanText(tier.price);

      return {
        quantity_text: quantityMin !== null
          ? `${quantityMin}${unit ?? ''}`
          : '',
        quantity_min: quantityMin,
        price_text: priceText,
        price,
        currency: priceText ? 'CNY' : null,
      };
    })
    .filter((tier) => tier.price_text !== '');
}
/**
 * Find the "入驻N年" (years on platform) marker in page text.
 *
 * Whitespace, including line breaks, is tolerated between `入驻`, the digits
 * and `年`, because text taken from `innerText` can split the three parts
 * across lines. The result is always returned in the compact `入驻N年` form.
 *
 * @param text - Page or card text to scan.
 * @returns The compact marker (e.g. `入驻13年`), or `null` when absent.
 */
export function extractYearsOnPlatform(text: string): string | null {
  const match = text.match(/入驻\s*(\d+)\s*年/);
  return match ? `入驻${match[1]}年` : null;
}
/**
 * Derive a category list from the "主营" (main business) line of `text`.
 *
 * The main-business value is split on list separators and passed through
 * `uniqueNonEmpty`, which cleans each entry, drops empties and removes
 * duplicates. Besides `、`, `,`, `/` and `|`, the separator set covers the
 * full-width comma `,` and both semicolon forms (`;`, `;`); without them a
 * list written with those separators comes back as one long category.
 *
 * @param text - Page text that may contain a "主营" line.
 * @returns Distinct category names in order of appearance; `[]` when no
 *   main-business value is found.
 */
export function guessTopCategories(text: string): string[] {
  const mainBusiness = extractMainBusiness(text);
  if (!mainBusiness) return [];
  return uniqueNonEmpty(mainBusiness.split(/[、,,;;/|]/));
}
/**
 * Open the 1688 homepage and verify that the browser profile is logged in.
 *
 * The page is loaded through `gotoAndReadState` with the action label
 * `'homepage'`, so a lost-target error raised there names the same action as
 * the captcha hint built below. The login check is heuristic: the homepage
 * must show a search input and at least one logged-in marker, and must not
 * show any login prompt.
 *
 * @param page - Browser page used for navigation and in-page evaluation.
 * @throws CommandExecutionError when the homepage is a slider/captcha page.
 * @throws AuthRequiredError when the homepage does not look logged in.
 */
export async function ensure1688Session(page: IPage): Promise<void> {
  const state = await gotoAndReadState(page, HOME_URL, 1500, 'homepage');
  if (isCaptchaState(state)) {
    throw new CommandExecutionError(
      '1688 homepage is currently blocked by a slider challenge',
      buildCaptchaHint('homepage'),
    );
  }

  // The script runs inside the page and returns three booleans read from the DOM.
  const authState = await page.evaluate(`
    (() => {
      const text = document.body ? document.body.innerText || '' : '';
      const hasSearchInput = !!document.querySelector('input#alisearch-input, input[name="keywords"]');
      const hasLoggedMarker = ['采购车', '收藏的品', '我的足迹', '全部订单']
        .some((label) => text.includes(label));
      const hasLoginPrompt = ['请登录', '立即登录', '登录后']
        .some((label) => text.includes(label));
      return {
        hasSearchInput,
        hasLoggedMarker,
        hasLoginPrompt,
      };
    })()
  `) as { hasSearchInput?: boolean; hasLoggedMarker?: boolean; hasLoginPrompt?: boolean };

  // Missing fields count as "not logged in": both positive signals must be
  // strictly true and the login prompt must not be true.
  const isLoggedIn = authState.hasSearchInput === true
    && authState.hasLoggedMarker === true
    && authState.hasLoginPrompt !== true;

  if (!isLoggedIn) {
    throw new AuthRequiredError(
      '1688.com',
      '1688 is not logged in in the shared Chrome profile',
    );
  }
}
/**
 * Return at most `limit` leading elements of `values`, always allowing at
 * least one.
 *
 * `limit` is truncated toward zero; zero, negative and `NaN` limits are
 * treated as 1. The input array is not modified, so it is accepted as
 * `readonly`.
 *
 * @typeParam T - Element type, declared on the function so the signature
 *   type-checks.
 * @param values - Candidate list.
 * @param limit - Requested maximum number of elements.
 * @returns A new array holding the first `max(1, trunc(limit))` elements.
 */
export function limitCandidates<T>(values: readonly T[], limit: number): T[] {
  const normalizedLimit = Math.max(1, Math.trunc(limit) || 1);
  return values.slice(0, normalizedLimit);
}
+ bodyText: ` + 青岛沁澜衣品服装有限公司 + 电话:86 0532 86655366 + 手机:15963238678 + 地址:山东省青岛市即墨区环秀街道办事处湘江二路97号甲 + `, + }, + seed: { + bodyText: ` + 入驻13年 + 主营:大码女装 + 店铺回头率 + 87% + 延期必赔 + 品质保障 + `, + seller: { + companyName: '青岛沁澜衣品服装有限公司', + memberId: 'b2b-1641351767', + winportUrl: 'https://yinuoweierfushi.1688.com', + }, + services: [{ serviceName: '延期必赔' }, { serviceName: '品质保障' }], + }, + }); + + expect(result.member_id).toBe('b2b-1641351767'); + expect(result.store_url).toBe('https://yinuoweierfushi.1688.com'); + expect(result.company_url).toBe('https://yinuoweierfushi.1688.com/page/contactinfo.html'); + expect(result.years_on_platform_text).toBe('入驻13年'); + expect(result.location).toBe('山东省青岛市即墨区环秀街道办事处湘江二路97号甲'); + expect(result.return_rate_text).toBe('87%'); + expect(result.top_categories).toEqual(['大码女装']); + expect(result.service_badges).toEqual(['延期必赔', '品质保障']); + }); + + it('builds contact urls and extracts offer ids', () => { + expect(__test__.buildContactUrl('https://yinuoweierfushi.1688.com')).toBe( + 'https://yinuoweierfushi.1688.com/page/contactinfo.html', + ); + expect(__test__.firstOfferId([ + 'https://detail.1688.com/offer/887904326744.html', + ])).toBe('887904326744'); + }); + + it('collects deduplicated offer ids from input and store links', () => { + expect(__test__.collectOfferIds( + 'https://detail.1688.com/offer/887904326744.html', + { + href: 'https://yinuoweierfushi.1688.com/page/index.html', + bodyText: '', + offerLinks: [ + 'https://detail.1688.com/offer/887904326744.html', + 'https://detail.1688.com/offer/123456789012.html', + ], + }, + { + href: 'https://yinuoweierfushi.1688.com/page/contactinfo.html', + bodyText: '', + offerLinks: [ + 'https://detail.1688.com/offer/123456789012.html', + 'https://detail.1688.com/offer/999999999999.html', + ], + }, + )).toEqual(['887904326744', '123456789012', '999999999999']); + }); +}); diff --git a/src/clis/1688/store.ts b/src/clis/1688/store.ts new file mode 100644 index 00000000..da7024e4 --- /dev/null +++ 
/**
 * Gather the distinct offer IDs found in the raw CLI input and in the offer
 * links scraped from the store page and the contact page.
 *
 * Sources are scanned in that order (input, store links, contact links) and
 * each ID is kept at the position where it was first seen.
 *
 * @param rawInput - The user's original argument (may itself carry an offer ID).
 * @param storePayload - Scraped store page, or `null`.
 * @param contactPayload - Scraped contact page, or `null`.
 * @returns Distinct offer IDs in first-seen order.
 */
function collectOfferIds(
  rawInput: string,
  storePayload: StoreBrowserPayload | null,
  contactPayload: StoreBrowserPayload | null,
): string[] {
  const storeLinks = storePayload?.offerLinks ?? [];
  const contactLinks = contactPayload?.offerLinks ?? [];
  const sources = uniqueNonEmpty([rawInput, ...storeLinks, ...contactLinks]);

  // A Set preserves insertion order, which gives first-seen ordering.
  const seen = new Set<string>();
  for (const source of sources) {
    const offerId = extractOfferId(source);
    if (offerId) {
      seen.add(offerId);
    }
  }
  return Array.from(seen);
}
/**
 * Build the contact-info page URL (`/page/contactinfo.html`) for the host of
 * `storeUrl`. Path, query, fragment and port of the input are discarded.
 *
 * Only `http:` and `https:` URLs with a non-empty host are accepted. An href
 * read back from a browser tab can be something like `about:blank` or a
 * browser error page; for those this returns `null` instead of a URL that
 * nothing can navigate to.
 *
 * @param storeUrl - Store URL or the current page href.
 * @returns The contact page URL, or `null` when `storeUrl` is not a usable
 *   web URL.
 */
function buildContactUrl(storeUrl: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(storeUrl);
  } catch {
    return null;
  }

  const isWebUrl = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  if (!isWebUrl || !parsed.hostname) {
    return null;
  }

  return `${parsed.protocol}//${parsed.hostname}/page/contactinfo.html`;
}
/**
 * Extract the repeat-purchase rate ("回头率") percentage from page text.
 *
 * One pattern covers every layout: the value may follow the label directly,
 * after spaces, on the next line (`\s` also matches line breaks), or after an
 * ASCII or full-width colon.
 *
 * @param text - Page text to scan.
 * @returns The percentage including the `%` sign (e.g. `87%`), or `null`.
 */
function extractReturnRate(text: string): string | null {
  const match = text.match(/回头率\s*[::]?\s*([0-9.]+%)/);
  return match ? match[1] : null;
}
/**
 * Try candidate offers in order and return the first item seed that can be
 * read, or `null` when none of the attempted offers yields one.
 *
 * A `CommandExecutionError` from `readItemSeed` means "this offer is not
 * usable" and moves on to the next candidate on purpose (best effort); any
 * other error is rethrown unchanged.
 *
 * @param page - Browser page used to load each item.
 * @param offerIds - Candidate offer IDs, most preferred first.
 * @param maxAttempts - Upper bound on how many candidates are tried. Defaults
 *   to 8, the cap this function previously hard-coded.
 * @returns The first usable seed, or `null`.
 */
async function readFirstUsableItemSeed(
  page: IPage,
  offerIds: string[],
  maxAttempts: number = 8,
): Promise<StoreItemSeed | null> {
  for (const offerId of offerIds.slice(0, Math.max(0, maxAttempts))) {
    try {
      return await readItemSeed(page, offerId);
    } catch (err) {
      if (!(err instanceof CommandExecutionError)) throw err;
    }
  }
  return null;
}