diff --git a/library/agent/Agent.test.ts b/library/agent/Agent.test.ts index 3e1745fc8..cc48d10e0 100644 --- a/library/agent/Agent.test.ts +++ b/library/agent/Agent.test.ts @@ -58,6 +58,8 @@ wrap(fetch, "fetch", function mock() { pattern: "Bytespider", }, ], + blockedSignatureAgents: "chatgpt.com", + monitoredSignatureAgents: "", } satisfies Response), }; }; diff --git a/library/agent/Agent.ts b/library/agent/Agent.ts index 7b6cc7294..1d7a0589d 100644 --- a/library/agent/Agent.ts +++ b/library/agent/Agent.ts @@ -398,6 +398,8 @@ export class Agent { monitoredIPAddresses, monitoredUserAgents, userAgentDetails, + blockedSignatureAgents, + monitoredSignatureAgents, } = await fetchBlockedLists(this.token); this.serviceConfig.updateBlockedIPAddresses(blockedIPAddresses); this.serviceConfig.updateBlockedUserAgents(blockedUserAgents); @@ -405,6 +407,10 @@ export class Agent { this.serviceConfig.updateMonitoredIPAddresses(monitoredIPAddresses); this.serviceConfig.updateMonitoredUserAgents(monitoredUserAgents); this.serviceConfig.updateUserAgentDetails(userAgentDetails); + this.serviceConfig.updateBlockedSignatureAgents(blockedSignatureAgents); + this.serviceConfig.updateMonitoredSignatureAgents( + monitoredSignatureAgents + ); } catch (error: any) { console.error(`Aikido: Failed to update blocked lists: ${error.message}`); } diff --git a/library/agent/ServiceConfig.ts b/library/agent/ServiceConfig.ts index 0371c6585..a0501a723 100644 --- a/library/agent/ServiceConfig.ts +++ b/library/agent/ServiceConfig.ts @@ -26,6 +26,8 @@ export class ServiceConfig { private monitoredIPAddresses: { list: IPMatcher; key: string }[] = []; private monitoredUserAgentRegex: RegExp | undefined; private userAgentDetails: { pattern: RegExp; key: string }[] = []; + private blockedSignatureAgentRegex: RegExp | undefined; + private monitoredSignatureAgentRegex: RegExp | undefined; constructor( endpoints: EndpointConfig[], @@ -278,4 +280,45 @@ export class ServiceConfig { hasReceivedAnyStats() { return this.receivedAnyStats; } + + updateBlockedSignatureAgents(blockedSignatureAgents: string) { + if (!blockedSignatureAgents) { + // If an empty string is passed, we want to set the regex to undefined + // e.g. new RegExp("").test("abc") == true + this.blockedSignatureAgentRegex = undefined; + return; + } + this.blockedSignatureAgentRegex = safeCreateRegExp( + blockedSignatureAgents, + "i" + ); + } + + updateMonitoredSignatureAgents(monitoredSignatureAgents: string) { + if (!monitoredSignatureAgents) { + // If an empty string is passed, we want to set the regex to undefined + // e.g. new RegExp("").test("abc") == true + this.monitoredSignatureAgentRegex = undefined; + return; + } + + this.monitoredSignatureAgentRegex = safeCreateRegExp( + monitoredSignatureAgents, + "i" + ); + } + + isSignatureAgentBlocked(ua: string): boolean { + if (this.blockedSignatureAgentRegex) { + return this.blockedSignatureAgentRegex.test(ua); + } + return false; + } + + isSignatureAgentMonitored(ua: string): boolean { + if (this.monitoredSignatureAgentRegex) { + return this.monitoredSignatureAgentRegex.test(ua); + } + return false; + } } diff --git a/library/agent/api/fetchBlockedLists.ts b/library/agent/api/fetchBlockedLists.ts index 57045f82e..2bae2a403 100644 --- a/library/agent/api/fetchBlockedLists.ts +++ b/library/agent/api/fetchBlockedLists.ts @@ -25,6 +25,8 @@ export type Response = { // If we want to collect stats about the individual user agents, // we can loop through the userAgentDetails and match each pattern. userAgentDetails: UserAgentDetails[]; + blockedSignatureAgents: string; + monitoredSignatureAgents: string; }; export async function fetchBlockedLists(token: Token): Promise { @@ -56,6 +58,8 @@ export async function fetchBlockedLists(token: Token): Promise { blockedUserAgents: string; monitoredUserAgents: string; userAgentDetails: UserAgentDetails[]; + blockedSignatureAgents: string; + monitoredSignatureAgents: string; } = JSON.parse(body); return { @@ -85,5 +89,13 @@ export async function fetchBlockedLists(token: Token): Promise { result && Array.isArray(result.userAgentDetails) ? result.userAgentDetails : [], + blockedSignatureAgents: + result && typeof result.blockedSignatureAgents === "string" + ? result.blockedSignatureAgents + : "", + monitoredSignatureAgents: + result && typeof result.monitoredSignatureAgents === "string" + ? result.monitoredSignatureAgents + : "", }; } diff --git a/library/sources/HTTPServer.stats.test.ts b/library/sources/HTTPServer.stats.test.ts index cc4e072e4..c1f335913 100644 --- a/library/sources/HTTPServer.stats.test.ts +++ b/library/sources/HTTPServer.stats.test.ts @@ -49,7 +49,13 @@ wrap(fetchBlockedLists, "fetchBlockedLists", function fetchBlockedLists() { key: "google_extended", pattern: "Google-Extended", }, + { + key: "chatgpt_agent", + pattern: "chatgpt.com", + }, ], + blockedSignatureAgents: "", + monitoredSignatureAgents: "chatgpt.com", } satisfies Response; }; }); @@ -95,10 +101,19 @@ t.test("it tracks monitored user agents", async () => { }, timeoutInMS: 500, }), - ]).then(([response1, response2, response3]) => { + fetch({ + url: new URL("http://localhost:3327/test"), + method: "GET", + headers: { + "Signature-Agent": "chatgpt.com", + }, + timeoutInMS: 500, + }), + ]).then(([response1, response2, response3, response4]) => { t.equal(response1.statusCode, 200); t.equal(response2.statusCode, 200); t.equal(response3.statusCode, 200); + t.equal(response4.statusCode, 200); const stats = agent.getInspectionStatistics().getStats(); t.same(stats.userAgents, { breakdown: { @@ -106,6 +121,8 @@ t.test("it tracks monitored user agents", async () => { ai_data_scrapers: 1, // eslint-disable-next-line camelcase google_extended: 1, + // eslint-disable-next-line camelcase + chatgpt_agent: 1, }, }); t.same(stats.ipAddresses, { diff --git a/library/sources/HTTPServer.test.ts b/library/sources/HTTPServer.test.ts index 18d49e7db..d1b9c5ed4 100644 --- a/library/sources/HTTPServer.test.ts +++ b/library/sources/HTTPServer.test.ts @@ -80,6 +80,8 @@ wrap(fetchBlockedLists, "fetchBlockedLists", function fetchBlockedLists() { monitoredUserAgents: "", monitoredIPAddresses: [], userAgentDetails: [], + blockedSignatureAgents: "", + monitoredSignatureAgents: "", } satisfies Response; }; }); diff --git a/library/sources/Hono.allowedIPAddresses.test.ts b/library/sources/Hono.allowedIPAddresses.test.ts index f80fb2a41..18f690cc9 100644 --- a/library/sources/Hono.allowedIPAddresses.test.ts +++ b/library/sources/Hono.allowedIPAddresses.test.ts @@ -49,6 +49,8 @@ wrap(fetch, "fetch", function mock(original) { ], monitoredIPAddresses: [], monitoredUserAgents: "", + blockedSignatureAgents: "", + monitoredSignatureAgents: "", } satisfies Response), }; } diff --git a/library/sources/Hono.test.ts b/library/sources/Hono.test.ts index cec75f32f..cd7d9d4ac 100644 --- a/library/sources/Hono.test.ts +++ b/library/sources/Hono.test.ts @@ -47,6 +47,8 @@ wrap(fetch, "fetch", function mock(original) { pattern: "attacker", }, ], + blockedSignatureAgents: "chatgpt.com", + monitoredSignatureAgents: "", } satisfies Response), }; } @@ -592,6 +594,36 @@ t.test("bypass list works", opts, async (t) => { }); t.equal(response4.statusCode, 200); + // It does not block bypassed ip because of signature agent + const response5 = await fetch.fetch({ + url: new URL("http://127.0.0.1:8769/"), + headers: { + "X-Forwarded-For": "123.1.2.254", + "Signature-Agent": "chatgpt.com", + }, + }); + t.equal(response5.statusCode, 200); + + // It blocks non-bypassed ip because of signature agent + const response6 = await fetch.fetch({ + url: new URL("http://127.0.0.1:8769/"), + headers: { + "X-Forwarded-For": "2.3.4.5", + "Signature-Agent": "chatgpt.com", + }, + }); + t.equal(response6.statusCode, 403); + + // Not blocked signature agent + const response7 = await fetch.fetch({ + url: new URL("http://127.0.0.1:8769/"), + headers: { + "X-Forwarded-For": "2.3.4.5", + "Signature-Agent": "some-other-agent", + }, + }); + t.equal(response7.statusCode, 200); + // Cleanup server server.close(); }); diff --git a/library/sources/http-server/checkIfRequestIsBlocked.ts b/library/sources/http-server/checkIfRequestIsBlocked.ts index 6d30dd2df..ca1384859 100644 --- a/library/sources/http-server/checkIfRequestIsBlocked.ts +++ b/library/sources/http-server/checkIfRequestIsBlocked.ts @@ -118,11 +118,9 @@ export function checkIfRequestIsBlocked( ? context.headers["user-agent"] : undefined; - const isUserAgentBlocked = userAgent - ? agent.getConfig().isUserAgentBlocked(userAgent) - : ({ blocked: false } as const); - if (userAgent) { + const isUserAgentBlocked = agent.getConfig().isUserAgentBlocked(userAgent); + const isMonitoredUserAgent = agent .getConfig() .isMonitoredUserAgent(userAgent); @@ -134,17 +132,51 @@ export function checkIfRequestIsBlocked( .getMatchingUserAgentKeys(userAgent); agent.getInspectionStatistics().onUserAgentMatches(userAgentKeys); } + + if (isUserAgentBlocked.blocked) { + res.statusCode = 403; + res.setHeader("Content-Type", "text/plain"); + + res.end( + "You are not allowed to access this resource because you have been identified as a bot." + ); + + return true; + } } - if (isUserAgentBlocked.blocked) { - res.statusCode = 403; - res.setHeader("Content-Type", "text/plain"); + const signatureAgent = + context.headers && typeof context.headers["signature-agent"] === "string" + ? context.headers["signature-agent"] + : undefined; - res.end( - "You are not allowed to access this resource because you have been identified as a bot." - ); + if (signatureAgent) { + const isSignatureAgentBlocked = agent + .getConfig() + .isSignatureAgentBlocked(signatureAgent); - return true; + const isMonitoredSignatureAgent = agent + .getConfig() + .isSignatureAgentMonitored(signatureAgent); + + if (isSignatureAgentBlocked || isMonitoredSignatureAgent) { + // Find all the matching user agent keys when it's a blocked or monitored user agent + const userAgentKeys = agent + .getConfig() + .getMatchingUserAgentKeys(signatureAgent); + agent.getInspectionStatistics().onUserAgentMatches(userAgentKeys); + } + + if (isSignatureAgentBlocked) { + res.statusCode = 403; + res.setHeader("Content-Type", "text/plain"); + + res.end( + "You are not allowed to access this resource because you have been identified as a bot." + ); + + return true; + } } return false;