diff --git a/packages/bot/src/scheduler-registry.ts b/packages/bot/src/scheduler-registry.ts index fb5c3c4..a6f18c6 100644 --- a/packages/bot/src/scheduler-registry.ts +++ b/packages/bot/src/scheduler-registry.ts @@ -21,7 +21,7 @@ import { getScoreService } from './services/score.service'; import { getAttendanceService, getFineService } from './services'; import { ActivityScoreType, curationSources, getDb, members } from '@blog-study/shared/db'; -import { extractOgImage } from '@blog-study/shared/utils'; +import { extractFirstImage, extractOgImage } from '@blog-study/shared/utils'; import { getCurrentRound } from './services/round.service'; import { eq } from 'drizzle-orm'; import logger from './lib/logger'; @@ -80,8 +80,9 @@ export async function registerAllJobs(boss: PgBoss, client: Client): Promise null); + // OG 이미지 추출 (실패 시 RSS content 첫 이미지 fallback) + const thumbnailUrl = await extractOgImage(item.link).catch(() => null) + ?? extractFirstImage(item.description); const result = await postService.create({ memberId: member.id, @@ -262,7 +263,8 @@ export async function registerAllJobs(boss: PgBoss, client: Client): Promise): NormalizedFeedItem[] { const { format, feed } = result; + // HTML 엔티티 디코딩 (" ‘ 등) + const decodeField = (val: string | undefined): string | undefined => + val ? decode(val) : undefined; + if (format === 'atom') { return (feed.entries ?? []).map((entry) => ({ - title: entry.title, + title: decodeField(entry.title), link: entry.links?.[0]?.href, pubDate: entry.published ?? entry.updated, description: entry.summary ?? entry.content, @@ -37,7 +41,7 @@ export function extractFeedItems(result: ReturnType): Normaliz if (format === 'rss') { return (feed.items ?? []).map((item) => ({ - title: item.title, + title: decodeField(item.title), link: item.link, pubDate: item.pubDate ? String(item.pubDate) : undefined, description: item.description, @@ -49,7 +53,7 @@ export function extractFeedItems(result: ReturnType): Normaliz if (format === 'json') { return (feed.items ?? []).map((item) => ({ - title: item.title, + title: decodeField(item.title), link: item.url ?? item.external_url, pubDate: item.date_published ?? item.date_modified, description: item.summary ?? item.content_text, @@ -59,7 +63,7 @@ export function extractFeedItems(result: ReturnType): Normaliz // RDF return (feed.items ?? []).map((item) => ({ - title: item.title, + title: decodeField(item.title), link: item.link, pubDate: item.dc?.date, description: item.description, @@ -100,6 +104,17 @@ export function sanitizeDescription(html: string | undefined): string | null { return sanitized.length > 300 ? sanitized.slice(0, 300) + '...' : sanitized; } +/** + * HTML content에서 첫 번째 이미지 URL 추출 (RSS content:encoded fallback용) + */ +export function extractFirstImage(html: string | null | undefined): string | null { + if (!html) return null; + const match = html.match(/]+src=["']([^"']+)["']/i); + const imgUrl = match?.[1] ?? null; + if (imgUrl && !isSafeUrl(imgUrl)) return null; + return imgUrl; +} + /** * URL에서 og:image 메타태그 추출 (5초 타임아웃) * SSRF 보호: 내부 URL 차단 + OG 이미지 URL 검증 @@ -124,7 +139,19 @@ export async function extractOgImage(url: string): Promise { html.match(/]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) || html.match(/]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i); - const ogImageUrl = match?.[1] ?? null; + let ogImageUrl = match?.[1] ?? null; + + // og:image 없으면 JSON-LD Schema.org image fallback (Medium 등) + if (!ogImageUrl) { + const jsonLdMatch = html.match(/]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/i); + if (jsonLdMatch?.[1] && jsonLdMatch[1].length < 100_000) { + try { + const ld = JSON.parse(jsonLdMatch[1]); + const ldImage = ld.image?.url || ld.image?.contentUrl || (typeof ld.image === 'string' ? ld.image : null); + if (ldImage && isSafeUrl(ldImage)) ogImageUrl = ldImage; + } catch { /* invalid JSON-LD */ } + } + } // SSRF 방지: OG 이미지 URL 자체도 안전한지 검증 if (ogImageUrl && !isSafeUrl(ogImageUrl)) { diff --git a/packages/shared/src/utils/url-validator.ts b/packages/shared/src/utils/url-validator.ts index 9a8eedf..4684160 100644 --- a/packages/shared/src/utils/url-validator.ts +++ b/packages/shared/src/utils/url-validator.ts @@ -61,7 +61,7 @@ export interface UrlValidationResult { const PLATFORM_PATTERNS: Record = { velog: /^https?:\/\/velog\.io\/@[\w-]+(\/posts)?\/?$/, tistory: /^https?:\/\/[\w-]+\.tistory\.com\/?$/, - medium: /^https?:\/\/medium\.com\/@[\w-]+\/?$/, + medium: /^https?:\/\/(medium\.com\/@[\w-]+|(?!www\.)[\w-]+\.medium\.com)\/?$/, unknown: /^https?:\/\/.+/, }; diff --git a/packages/web/src/app/api/posts/preview/route.ts b/packages/web/src/app/api/posts/preview/route.ts index faf09ac..d117864 100644 --- a/packages/web/src/app/api/posts/preview/route.ts +++ b/packages/web/src/app/api/posts/preview/route.ts @@ -68,6 +68,24 @@ export async function POST(request: Request) { if (thumbnailUrl) thumbnailUrl = decodeHtmlEntities(thumbnailUrl); if (thumbnailUrl && !isSafeUrl(thumbnailUrl)) thumbnailUrl = null; + // og:image 없으면 JSON-LD → 본문 첫 이미지 순서로 fallback + if (!thumbnailUrl) { + // JSON-LD Schema.org image (Medium 등 JS 렌더링 플랫폼) + const jsonLdMatch = html.match(/]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/i); + if (jsonLdMatch?.[1] && jsonLdMatch[1].length < 100_000) { + try { + const ld = JSON.parse(jsonLdMatch[1]); + const ldImage = ld.image?.url || ld.image?.contentUrl || (typeof ld.image === 'string' ? ld.image : null); + if (ldImage && isSafeUrl(ldImage)) thumbnailUrl = ldImage; + } catch { /* invalid JSON-LD */ } + } + } + if (!thumbnailUrl) { + const imgMatch = html.match(/]+src=["']([^"']+)["']/i); + const fallback = imgMatch?.[1] || null; + if (fallback && isSafeUrl(fallback)) thumbnailUrl = fallback; + } + // og:description > meta description const ogDescMatch = html.match(/]*property=["']og:description["'][^>]*content=["']([^"']+)["']/i) || diff --git a/packages/web/src/lib/rss-detect.ts b/packages/web/src/lib/rss-detect.ts index f473282..18b0fad 100644 --- a/packages/web/src/lib/rss-detect.ts +++ b/packages/web/src/lib/rss-detect.ts @@ -28,8 +28,13 @@ function constructRssUrl(blogUrl: string, platform: string): string | null { case 'tistory': return `${url.protocol}//${url.hostname}/rss`; case 'medium': { - const match = url.pathname.match(/^\/@([\w-]+)\/?$/); - return match ? `https://medium.com/feed/@${match[1]}` : null; + // medium.com/@username 형식 + const pathMatch = url.pathname.match(/^\/@([\w-]+)\/?$/); + if (pathMatch) return `https://medium.com/feed/@${pathMatch[1]}`; + // username.medium.com 서브도메인 형식 + const subdomainMatch = url.hostname.match(/^([\w-]+)\.medium\.com$/); + if (subdomainMatch) return `https://${subdomainMatch[1]}.medium.com/feed`; + return null; } default: return null;