Skip to content

Commit c1b1d74

Browse files
committed
fix: handle google crawler
1 parent da9a55b commit c1b1d74

File tree

4 files changed

+211
-1
lines changed

4 files changed

+211
-1
lines changed

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"@testing-library/react": "^9.3.2",
2121
"@testing-library/user-event": "^7.1.2",
2222
"@types/aws-lambda": "^8.10.39",
23+
"@types/cidr-matcher": "^2.1.2",
2324
"@types/codemirror": "^0.0.84",
2425
"@types/date-fns": "^2.6.0",
2526
"@types/ioredis": "^4.14.3",
@@ -63,6 +64,7 @@
6364
"babel-preset-react-app": "^9.1.0",
6465
"camelcase": "^5.3.1",
6566
"case-sensitive-paths-webpack-plugin": "2.2.0",
67+
"cidr-matcher": "^2.1.1",
6668
"codemirror": "^5.51.0",
6769
"copy-webpack-plugin": "^5.1.1",
6870
"core-js": "^3.6.4",

src/server/crawlerCheck.ts

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// crawlerCheck.ts — cached lookup of Googlebot and Google user-triggered-fetcher IP ranges
2+
import axios from 'axios';
3+
import CidrMatcher from 'cidr-matcher';
4+
5+
/**
 * Shape expected of the JSON documents downloaded from the two Google
 * IP-range URLs used by GoogleCrawlIPCache.
 *
 * Each entry in `prefixes` may carry an IPv4 prefix, an IPv6 prefix, or both;
 * every prefix that is present is fed to the CIDR matcher.
 * NOTE(review): the response body is not validated at runtime, so these
 * fields are only what the code assumes the payload contains.
 */
export interface IPRangeDoc {
6+
prefixes: {
7+
ipv4Prefix?: string;
8+
ipv6Prefix?: string;
9+
service: string; // e.g., "googlebot", "user-triggered-fetchers"
10+
scope: string; // e.g., "global"
11+
}[];
12+
creationTime?: string;
13+
}
14+
15+
/**
 * One cached snapshot produced by a successful fetch of both documents.
 *
 * - fetchedAt: Date.now() timestamp (ms) taken when the snapshot was built.
 * - googlebot / userFetchers: the two response bodies as received.
 * - cidrs: de-duplicated union of every IPv4/IPv6 prefix from both documents.
 * - matcher: CidrMatcher constructed from `cidrs`.
 */
type CachePack = {
16+
fetchedAt: number;
17+
googlebot: IPRangeDoc;
18+
userFetchers: IPRangeDoc;
19+
cidrs: string[];
20+
matcher: CidrMatcher;
21+
};
22+
23+
/**
 * In-memory, time-limited cache of Google's published crawler IP ranges
 * (Googlebot plus user-triggered fetchers), exposed as a CIDR matcher.
 *
 * Both range documents are downloaded together on first use and again once
 * the cached copy is older than `cacheDurationMs`. Concurrent callers share a
 * single in-flight download. If a refresh fails while an older snapshot is
 * still held, that stale snapshot is served instead of rejecting, so a
 * temporary upstream outage does not break every caller.
 */
export default class GoogleCrawlIPCache {
  private static readonly URL_GOOGLEBOT =
    'https://developers.google.com/search/apis/ipranges/googlebot.json';
  private static readonly URL_USER_FETCHERS =
    'https://developers.google.com/search/apis/ipranges/user-triggered-fetchers-google.json';
  private static readonly ONE_HOUR = 60 * 60 * 1000;
  private static readonly REQUEST_TIMEOUT_MS = 15000;

  private cache: CachePack | null = null;
  private inflight: Promise<CachePack> | null = null;
  private readonly cacheDurationMs: number;

  /**
   * @param cacheDurationMs How long a fetched snapshot counts as fresh, in
   *   milliseconds. Defaults to one hour.
   */
  constructor(cacheDurationMs: number = GoogleCrawlIPCache.ONE_HOUR) {
    this.cacheDurationMs = cacheDurationMs;
  }

  /**
   * Tests an address against the merged CIDR matcher (IPv4 and IPv6).
   *
   * @param ip Address to test.
   * @returns `true` when the address falls inside a published range. Empty
   *   input, or input the matcher fails on, yields `false` rather than an
   *   exception, so a malformed client address is never treated as a crawler.
   * @throws When the ranges cannot be fetched and no earlier snapshot exists.
   */
  public async isCrawlerIP(ip: string): Promise<boolean> {
    if (typeof ip !== 'string' || ip === '') return false;

    const pack = await this.getCachePack();
    try {
      return pack.matcher.contains(ip);
    } catch {
      // The matcher is given caller-supplied text; treat unparseable input as
      // "not a crawler" instead of failing the whole request.
      return false;
    }
  }

  /** Returns a copy of the merged, de-duplicated CIDR list. */
  public async getAllCIDRs(): Promise<string[]> {
    const pack = await this.getCachePack();
    return pack.cidrs.slice();
  }

  /** Returns the two source documents exactly as they were received. */
  public async getRawDocs(): Promise<{
    googlebot: IPRangeDoc;
    userFetchers: IPRangeDoc;
  }> {
    const pack = await this.getCachePack();
    return { googlebot: pack.googlebot, userFetchers: pack.userFetchers };
  }

  /** Drops the cached snapshot so the next lookup has to download again. */
  public invalidate(): void {
    this.cache = null;
  }

  /** Timestamp (ms) of the last successful fetch, or `null` if none yet. */
  public getLastFetched(): number | null {
    return this.cache?.fetchedAt ?? null;
  }

  /** Remaining freshness time in ms; `0` when nothing is cached or it expired. */
  public getTtlRemainingMs(): number {
    if (!this.cache) return 0;
    const remain = this.cache.fetchedAt + this.cacheDurationMs - Date.now();
    return Math.max(0, remain);
  }

  // ---- internals ----

  /**
   * Returns the fresh snapshot if there is one; otherwise joins the refresh
   * that is already running, or starts a new one.
   */
  private async getCachePack(): Promise<CachePack> {
    if (
      this.cache &&
      Date.now() - this.cache.fetchedAt < this.cacheDurationMs
    ) {
      return this.cache;
    }

    if (!this.inflight) {
      this.inflight = this.refresh();
    }
    return this.inflight;
  }

  /**
   * Downloads a new snapshot and stores it. On failure, falls back to the
   * snapshot still held (if any); rethrows only when there is nothing to serve.
   * Note: a stale snapshot stays expired, so the next lookup retries the
   * download.
   */
  private async refresh(): Promise<CachePack> {
    try {
      const pack = await this.fetchBoth();
      this.cache = pack;
      return pack;
    } catch (err) {
      const stale = this.cache;
      if (stale) {
        console.warn(
          '[GoogleCrawlIPCache] refresh failed; serving stale ranges:',
          err,
        );
        return stale;
      }
      throw err;
    } finally {
      // Always cleared after the awaited fetch settles, i.e. after the caller
      // has assigned `this.inflight`.
      this.inflight = null;
    }
  }

  /** Downloads both documents in parallel and builds the merged matcher. */
  private async fetchBoth(): Promise<CachePack> {
    const requestConfig = {
      headers: {
        'User-Agent': 'GoogleCrawlIPCache/1.0',
        Accept: 'application/json',
      },
      timeout: GoogleCrawlIPCache.REQUEST_TIMEOUT_MS,
    };

    const [gbRes, ufRes] = await Promise.all([
      axios.get<IPRangeDoc>(GoogleCrawlIPCache.URL_GOOGLEBOT, requestConfig),
      axios.get<IPRangeDoc>(
        GoogleCrawlIPCache.URL_USER_FETCHERS,
        requestConfig,
      ),
    ]);

    const googlebot = gbRes.data;
    const userFetchers = ufRes.data;

    // Merge the prefixes of both documents (duplicates removed) for the matcher.
    const uniqueCidrs = GoogleCrawlIPCache.collectCidrs(
      googlebot,
      userFetchers,
    );
    const matcher = new CidrMatcher(uniqueCidrs);

    return {
      fetchedAt: Date.now(),
      googlebot,
      userFetchers,
      cidrs: uniqueCidrs,
      matcher,
    };
  }

  /**
   * Collects every IPv4/IPv6 prefix from the given documents in document
   * order, keeping the first occurrence of each. Bodies that are missing, or
   * whose `prefixes` is not an array, contribute nothing.
   */
  private static collectCidrs(
    ...docs: (IPRangeDoc | null | undefined)[]
  ): string[] {
    const seen = new Set<string>();
    for (const doc of docs) {
      const prefixes = doc?.prefixes;
      if (!Array.isArray(prefixes)) continue;
      for (const entry of prefixes) {
        if (entry?.ipv4Prefix) seen.add(entry.ipv4Prefix);
        if (entry?.ipv6Prefix) seen.add(entry.ipv6Prefix);
      }
    }
    return Array.from(seen);
  }
}

src/server/rateLimitMiddleware.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
import { Middleware } from 'koa';
22
import { redis } from './CacheManager';
3+
import GoogleBotIPCache from './crawlerCheck';
4+
5+
const cache = new GoogleBotIPCache(); // default TTL: 1 hour
6+
/**
 * Startup diagnostic: loads the crawler IP ranges through the shared cache
 * and logs how many prefixes each document holds and the merged CIDR total.
 *
 * Never rejects; any failure is logged and swallowed so a fetch problem at
 * module load does not take the server down.
 */
async function main(): Promise<void> {
  try {
    const docs = await cache.getRawDocs();
    // The documents come straight from the network, so `prefixes` may be
    // absent; count such a document as empty instead of throwing.
    console.log(
      '[googlebot] prefixes:',
      docs.googlebot?.prefixes?.length ?? 0,
    );
    console.log(
      '[userFetchers] prefixes:',
      docs.userFetchers?.prefixes?.length ?? 0,
    );

    const cidrs = await cache.getAllCIDRs();
    console.log('Total unique CIDRs:', cidrs.length);
  } catch (e) {
    console.error('[GoogleCrawlIPCache] error:', e);
  }
}
18+
19+
// Runs once at module load and is not awaited; main() catches and logs its
// own errors. A successful run leaves the fetched ranges in `cache`.
main();
320

421
const parseNumber = (value: string | null) => {
522
if (value === null) return null;
@@ -12,13 +29,25 @@ const WHITELIST_IPS = (process.env.REACT_APP_WHITELIST_IPS ?? '')
1229
.split(',')
1330
.map((ip) => ip.trim());
1431

32+
// NOTE(review): prints the configured whitelist once at module load — looks
// like leftover debugging output; confirm it should ship.
console.log({ WHITELIST_IPS });
33+
1534
const rateLimitMiddleware: Middleware = async (ctx, next) => {
1635
const ip = ctx.request.ips.slice(-1)[0] || ctx.request.ip;
1736

18-
if (WHITELIST_IPS.some((whitelistIp) => ip.includes(whitelistIp))) {
37+
if (
38+
WHITELIST_IPS.length > 0 &&
39+
WHITELIST_IPS.some((whitelistIp) => ip.includes(whitelistIp))
40+
) {
1941
return next();
2042
}
2143

44+
const isCrawler = await cache.isCrawlerIP(ip);
45+
if (isCrawler) {
46+
return next();
47+
} else {
48+
console.log(`[${ip}] is not a crawler`);
49+
}
50+
2251
const isBlockedUrl = await redis.get(`${ctx.url}:blocked`);
2352
if (isBlockedUrl === '1') {
2453
ctx.status = 429;

yarn.lock

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1709,6 +1709,11 @@
17091709
"@types/connect" "*"
17101710
"@types/node" "*"
17111711

1712+
"@types/cidr-matcher@^2.1.2":
1713+
version "2.1.2"
1714+
resolved "https://registry.yarnpkg.com/@types/cidr-matcher/-/cidr-matcher-2.1.2.tgz#0ac8f46a1b11040b237907cd0255c3179b9c302a"
1715+
integrity sha512-tuyfj7hfDCBolTk+TDPC6OA6YlKIS3dpvYDGNKzRMVq/wdrl6LD5MDqRoVKPhJ+4nuOrBz2yKnzclzy7t28NyA==
1716+
17121717
"@types/codemirror@^0.0.84":
17131718
version "0.0.84"
17141719
resolved "https://registry.yarnpkg.com/@types/codemirror/-/codemirror-0.0.84.tgz#b0cfca79ccdfd45ffe1f737668276a31b3149ebd"
@@ -3751,6 +3756,13 @@ ci-info@^2.0.0:
37513756
resolved "https://registry.yarnpkg.com/ci-info/-/ci-info-2.0.0.tgz#67a9e964be31a51e15e5010d58e6f12834002f46"
37523757
integrity sha512-5tK7EtrZ0N+OLFMthtqOj4fI2Jeb88C4CAZPu25LDVUgXJ0A3Js4PMGqrn0JU1W0Mh1/Z8wZzYPxqUrXeBboCQ==
37533758

3759+
cidr-matcher@^2.1.1:
3760+
version "2.1.1"
3761+
resolved "https://registry.yarnpkg.com/cidr-matcher/-/cidr-matcher-2.1.1.tgz#01a489f291bfbc7a3a14358120a6d839a98b1c90"
3762+
integrity sha512-QPJRz4HDQxpB8AZWEqd6ejVp+siArXh3u1MYaUFV85cd293StGSMb87jVe0z9gS92KsFwxCxjb3utO3e5HKHTw==
3763+
dependencies:
3764+
ip6addr "^0.2.2"
3765+
37543766
cipher-base@^1.0.0, cipher-base@^1.0.1, cipher-base@^1.0.3:
37553767
version "1.0.4"
37563768
resolved "https://registry.yarnpkg.com/cipher-base/-/cipher-base-1.0.4.tgz#8760e4ecc272f4c363532f926d874aae2c1397de"
@@ -7240,6 +7252,14 @@ ip-regex@^2.1.0:
72407252
resolved "https://registry.yarnpkg.com/ip-regex/-/ip-regex-2.1.0.tgz#fa78bf5d2e6913c911ce9f819ee5146bb6d844e9"
72417253
integrity sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk=
72427254

7255+
ip6addr@^0.2.2:
7256+
version "0.2.5"
7257+
resolved "https://registry.yarnpkg.com/ip6addr/-/ip6addr-0.2.5.tgz#06e134f44b4e1a684fd91b24035dca7a53b8f759"
7258+
integrity sha512-9RGGSB6Zc9Ox5DpDGFnJdIeF0AsqXzdH+FspCfPPaU/L/4tI6P+5lIoFUFm9JXs9IrJv1boqAaNCQmoDADTSKQ==
7259+
dependencies:
7260+
assert-plus "^1.0.0"
7261+
jsprim "^2.0.2"
7262+
72437263
ip@^1.1.0, ip@^1.1.5:
72447264
version "1.1.5"
72457265
resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.5.tgz#bdded70114290828c0a039e72ef25f5aaec4354a"
@@ -8207,6 +8227,11 @@ json-schema@0.2.3:
82078227
resolved "https://registry.yarnpkg.com/json-schema/-/json-schema-0.2.3.tgz#b480c892e59a2f05954ce727bd3f2a4e882f9e13"
82088228
integrity sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=
82098229

8230+
json-schema@0.4.0:
8231+
version "0.4.0"
8232+
resolved "https://registry.yarnpkg.com/json-schema/-/json-schema-0.4.0.tgz#f7de4cf6efab838ebaeb3236474cbba5a1930ab5"
8233+
integrity sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==
8234+
82108235
json-stable-stringify-without-jsonify@^1.0.1:
82118236
version "1.0.1"
82128237
resolved "https://registry.yarnpkg.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz#9db7b59496ad3f3cfef30a75142d2d930ad72651"
@@ -8265,6 +8290,16 @@ jsprim@^1.2.2:
82658290
json-schema "0.2.3"
82668291
verror "1.10.0"
82678292

8293+
jsprim@^2.0.2:
8294+
version "2.0.2"
8295+
resolved "https://registry.yarnpkg.com/jsprim/-/jsprim-2.0.2.tgz#77ca23dbcd4135cd364800d22ff82c2185803d4d"
8296+
integrity sha512-gqXddjPqQ6G40VdnI6T6yObEC+pDNvyP95wdQhkWkg7crHH3km5qP1FsOXEkzEQwnz6gz5qGTn1c2Y52wP3OyQ==
8297+
dependencies:
8298+
assert-plus "1.0.0"
8299+
extsprintf "1.3.0"
8300+
json-schema "0.4.0"
8301+
verror "1.10.0"
8302+
82688303
jsx-ast-utils@^2.2.1:
82698304
version "2.2.3"
82708305
resolved "https://registry.yarnpkg.com/jsx-ast-utils/-/jsx-ast-utils-2.2.3.tgz#8a9364e402448a3ce7f14d357738310d9248054f"

0 commit comments

Comments
 (0)