Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions packages/bot/src/scheduler-registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import { getScoreService } from './services/score.service';
import { getAttendanceService, getFineService } from './services';

import { ActivityScoreType, curationSources, getDb, members } from '@blog-study/shared/db';
import { extractOgImage } from '@blog-study/shared/utils';
import { extractFirstImage, extractOgImage } from '@blog-study/shared/utils';
import { getCurrentRound } from './services/round.service';
import { eq } from 'drizzle-orm';
import logger from './lib/logger';
Expand Down Expand Up @@ -80,8 +80,9 @@ export async function registerAllJobs(boss: PgBoss, client: Client): Promise<voi
for (const item of items) {
if (item.pubDate < POST_CUTOFF_DATE) continue;

// OG 이미지 추출 (실패해도 글 등록은 진행)
const thumbnailUrl = await extractOgImage(item.link).catch(() => null);
// OG 이미지 추출 (실패 시 RSS content 첫 이미지 fallback)
const thumbnailUrl = await extractOgImage(item.link).catch(() => null)
?? extractFirstImage(item.description);

const result = await postService.create({
memberId: member.id,
Expand Down Expand Up @@ -262,7 +263,8 @@ export async function registerAllJobs(boss: PgBoss, client: Client): Promise<voi
category: '',
tags: item.categories ?? [],
description: item.description,
thumbnailUrl: result?.status === 'fulfilled' ? result.value : null,
thumbnailUrl: (result?.status === 'fulfilled' ? result.value : null)
?? extractFirstImage(item.description),
};
});

Expand Down
37 changes: 32 additions & 5 deletions packages/shared/src/utils/feed-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ export interface NormalizedFeedItem {
export function extractFeedItems(result: ReturnType<typeof parseFeed>): NormalizedFeedItem[] {
const { format, feed } = result;

// HTML 엔티티 디코딩 (&quot; &lsquo; 등)
const decodeField = (val: string | undefined): string | undefined =>
val ? decode(val) : undefined;

if (format === 'atom') {
return (feed.entries ?? []).map((entry) => ({
title: entry.title,
title: decodeField(entry.title),
link: entry.links?.[0]?.href,
pubDate: entry.published ?? entry.updated,
description: entry.summary ?? entry.content,
Expand All @@ -37,7 +41,7 @@ export function extractFeedItems(result: ReturnType<typeof parseFeed>): Normaliz

if (format === 'rss') {
return (feed.items ?? []).map((item) => ({
title: item.title,
title: decodeField(item.title),
link: item.link,
pubDate: item.pubDate ? String(item.pubDate) : undefined,
description: item.description,
Expand All @@ -49,7 +53,7 @@ export function extractFeedItems(result: ReturnType<typeof parseFeed>): Normaliz

if (format === 'json') {
return (feed.items ?? []).map((item) => ({
title: item.title,
title: decodeField(item.title),
link: item.url ?? item.external_url,
pubDate: item.date_published ?? item.date_modified,
description: item.summary ?? item.content_text,
Expand All @@ -59,7 +63,7 @@ export function extractFeedItems(result: ReturnType<typeof parseFeed>): Normaliz

// RDF
return (feed.items ?? []).map((item) => ({
title: item.title,
title: decodeField(item.title),
link: item.link,
pubDate: item.dc?.date,
description: item.description,
Expand Down Expand Up @@ -100,6 +104,17 @@ export function sanitizeDescription(html: string | undefined): string | null {
return sanitized.length > 300 ? sanitized.slice(0, 300) + '...' : sanitized;
}

/**
 * Pulls an image URL out of an HTML fragment by locating the first `<img>`
 * tag that carries a quoted `src=` attribute. Intended as a thumbnail
 * fallback for RSS `content:encoded` bodies.
 *
 * @param html - HTML markup to scan; may be null, undefined, or empty.
 * @returns The captured attribute value, or null when the input is empty,
 *   nothing matches, or `isSafeUrl` rejects the captured URL.
 */
export function extractFirstImage(html: string | null | undefined): string | null {
  // Nothing to scan for null, undefined, or an empty string.
  if (!html) {
    return null;
  }

  const found = /<img[^>]+src=["']([^"']+)["']/i.exec(html);
  const candidate = found?.[1] ?? null;
  if (candidate === null) {
    return null;
  }

  // Only hand back URLs that pass the shared safety check.
  return isSafeUrl(candidate) ? candidate : null;
}

/**
* URL에서 og:image 메타태그 추출 (5초 타임아웃)
* SSRF 보호: 내부 URL 차단 + OG 이미지 URL 검증
Expand All @@ -124,7 +139,19 @@ export async function extractOgImage(url: string): Promise<string | null> {
html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) ||
html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i);

const ogImageUrl = match?.[1] ?? null;
let ogImageUrl = match?.[1] ?? null;

// og:image 없으면 JSON-LD Schema.org image fallback (Medium 등)
if (!ogImageUrl) {
const jsonLdMatch = html.match(/<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/i);
if (jsonLdMatch?.[1] && jsonLdMatch[1].length < 100_000) {
try {
const ld = JSON.parse(jsonLdMatch[1]);
const ldImage = ld.image?.url || ld.image?.contentUrl || (typeof ld.image === 'string' ? ld.image : null);
if (ldImage && isSafeUrl(ldImage)) ogImageUrl = ldImage;
} catch { /* invalid JSON-LD */ }
}
}

// SSRF 방지: OG 이미지 URL 자체도 안전한지 검증
if (ogImageUrl && !isSafeUrl(ogImageUrl)) {
Expand Down
2 changes: 1 addition & 1 deletion packages/shared/src/utils/url-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ export interface UrlValidationResult {
const PLATFORM_PATTERNS: Record<BlogPlatform, RegExp> = {
velog: /^https?:\/\/velog\.io\/@[\w-]+(\/posts)?\/?$/,
tistory: /^https?:\/\/[\w-]+\.tistory\.com\/?$/,
medium: /^https?:\/\/medium\.com\/@[\w-]+\/?$/,
medium: /^https?:\/\/(medium\.com\/@[\w-]+|(?!www\.)[\w-]+\.medium\.com)\/?$/,
unknown: /^https?:\/\/.+/,
};

Expand Down
18 changes: 18 additions & 0 deletions packages/web/src/app/api/posts/preview/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ export async function POST(request: Request) {
if (thumbnailUrl) thumbnailUrl = decodeHtmlEntities(thumbnailUrl);
if (thumbnailUrl && !isSafeUrl(thumbnailUrl)) thumbnailUrl = null;

// og:image 없으면 JSON-LD → 본문 첫 이미지 순서로 fallback
if (!thumbnailUrl) {
// JSON-LD Schema.org image (Medium 등 JS 렌더링 플랫폼)
const jsonLdMatch = html.match(/<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/i);
if (jsonLdMatch?.[1] && jsonLdMatch[1].length < 100_000) {
try {
const ld = JSON.parse(jsonLdMatch[1]);
const ldImage = ld.image?.url || ld.image?.contentUrl || (typeof ld.image === 'string' ? ld.image : null);
if (ldImage && isSafeUrl(ldImage)) thumbnailUrl = ldImage;
} catch { /* invalid JSON-LD */ }
}
}
if (!thumbnailUrl) {
const imgMatch = html.match(/<img[^>]+src=["']([^"']+)["']/i);
const fallback = imgMatch?.[1] || null;
if (fallback && isSafeUrl(fallback)) thumbnailUrl = fallback;
}

// og:description > meta description
const ogDescMatch =
html.match(/<meta[^>]*property=["']og:description["'][^>]*content=["']([^"']+)["']/i) ||
Expand Down
9 changes: 7 additions & 2 deletions packages/web/src/lib/rss-detect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,13 @@ function constructRssUrl(blogUrl: string, platform: string): string | null {
case 'tistory':
return `${url.protocol}//${url.hostname}/rss`;
case 'medium': {
const match = url.pathname.match(/^\/@([\w-]+)\/?$/);
return match ? `https://medium.com/feed/@${match[1]}` : null;
// medium.com/@username 형식
const pathMatch = url.pathname.match(/^\/@([\w-]+)\/?$/);
if (pathMatch) return `https://medium.com/feed/@${pathMatch[1]}`;
// username.medium.com 서브도메인 형식
const subdomainMatch = url.hostname.match(/^([\w-]+)\.medium\.com$/);
if (subdomainMatch) return `https://${subdomainMatch[1]}.medium.com/feed`;
return null;
}
default:
return null;
Expand Down
Loading