From 6be8d71ebb1bdd2762de11a84280fb1108fa42cd Mon Sep 17 00:00:00 2001 From: Paul Irish Date: Thu, 6 Feb 2025 16:14:22 -0800 Subject: [PATCH 1/3] literally wtf --- lib/create-entity-finder-api.js | 43 +++++++++++++++++++--------- lib/create-entity-finder-api.test.js | 2 +- package.json | 3 +- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/lib/create-entity-finder-api.js b/lib/create-entity-finder-api.js index 78ae7c8..e300d29 100644 --- a/lib/create-entity-finder-api.js +++ b/lib/create-entity-finder-api.js @@ -3,22 +3,38 @@ const DOMAIN_CHARACTERS = /([a-z0-9.-]+\.[a-z0-9]+|localhost)/i const IP_REGEX = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ const ROOT_DOMAIN_REGEX = /[^.]+\.([^.]+|(gov|com|co|ne)\.\w{2})$/i -function getDomainFromOriginOrURL(originOrURL) { - if (typeof originOrURL !== 'string') return null - if (originOrURL.length > 10000 || originOrURL.startsWith('data:')) return null +// const DOMAIN_IN_URL_REGEX = /:\/\/(\S*?)(:\d+)?(\/|$)/ +// const DOMAIN_CHARACTERS = /([a-z0-9.-]+?\.[a-z0-9]+|localhost)/i +// const IP_REGEX = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ +// const ROOT_DOMAIN_REGEX = /[^.]+?\.([^.]+?|(gov|com|co|ne)\.\w{2})$/i + +/** + * @param {string} originOrURL + * @return {[string|null, string|null]} - The first item is the root domain, the second item is the domain. + */ +function parseDomains(originOrURL) { + if (typeof originOrURL !== 'string') return [null, null] + if (originOrURL.length > 10000 || originOrURL.startsWith('data:')) return [null, null] let m = originOrURL.match(DOMAIN_IN_URL_REGEX) - if (m) return m[1] + let domain; + if (m) { + domain = m[1] + } m = originOrURL.match(DOMAIN_CHARACTERS) - if (m) return m[0] - return null + if (m) { + domain = m[0] + } + + if (!domain) return [null, null] + if (IP_REGEX.test(domain)) return [domain, domain] + m = domain.match(ROOT_DOMAIN_REGEX) + const rootDomain = m && m[0] || domain; + + return [rootDomain, domain] } -function getRootDomain(originOrURL) { - const domain = getDomainFromOriginOrURL(originOrURL) - if (!domain) return null - if (IP_REGEX.test(domain)) return domain - const match = domain.match(ROOT_DOMAIN_REGEX) - return (match && match[0]) || domain +function getRootDomain(originOrURL,) { + return parseDomains(originOrURL)[0]; } function sliceSubdomainFromDomain(domain, rootDomain) { @@ -30,8 +46,7 @@ function sliceSubdomainFromDomain(domain, rootDomain) { } function getEntityInDataset(entityByDomain, entityBySubDomain, entityByRootDomain, originOrURL) { - const domain = getDomainFromOriginOrURL(originOrURL) - const rootDomain = getRootDomain(domain) + const [rootDomain,domain] = parseDomains(originOrURL); if (!domain || !rootDomain) return undefined if (entityByDomain.has(domain)) return entityByDomain.get(domain) diff --git a/lib/create-entity-finder-api.test.js b/lib/create-entity-finder-api.test.js index 5f4145a..4224fd0 100644 --- a/lib/create-entity-finder-api.test.js +++ b/lib/create-entity-finder-api.test.js @@ -44,7 +44,7 @@ describe('getEntity', () => { expect(api.getEntity('https://baz.bar.example.co.uk/path').name).toEqual('Domain') }) - it.skip('stress test', () => { + it('stress test', () => { const urls = fs .readFileSync(path.join(__dirname, '../data/random-urls.txt'), 'utf8') .split('\n') diff --git a/package.json b/package.json index 606171f..f75f51f 100644 --- a/package.json +++ b/package.json @@ -41,6 +41,5 @@ "httparchive-nostats-subset": "./lib/subsets/httparchive-nostats.js", "httparchive-subset": "./lib/subsets/httparchive.js" } - }, - "packageManager": "yarn@4.1.1" + } } From 83d3920c3d5523467bcd7138790f019ada2c0059 Mon Sep 17 00:00:00 2001 From: Paul Irish Date: Thu, 6 Feb 2025 16:19:54 -0800 Subject: [PATCH 2/3] ok --- lib/create-entity-finder-api.js | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/create-entity-finder-api.js b/lib/create-entity-finder-api.js index e300d29..f63cb53 100644 --- a/lib/create-entity-finder-api.js +++ b/lib/create-entity-finder-api.js @@ -1,13 +1,8 @@ const DOMAIN_IN_URL_REGEX = /:\/\/(\S*?)(:\d+)?(\/|$)/ -const DOMAIN_CHARACTERS = /([a-z0-9.-]+\.[a-z0-9]+|localhost)/i +const DOMAIN_CHARACTERS = /(?:[a-z0-9.-]+\.[a-z0-9]+|localhost)/i const IP_REGEX = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ const ROOT_DOMAIN_REGEX = /[^.]+\.([^.]+|(gov|com|co|ne)\.\w{2})$/i -// const DOMAIN_IN_URL_REGEX = /:\/\/(\S*?)(:\d+)?(\/|$)/ -// const DOMAIN_CHARACTERS = /([a-z0-9.-]+?\.[a-z0-9]+|localhost)/i -// const IP_REGEX = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ -// const ROOT_DOMAIN_REGEX = /[^.]+?\.([^.]+?|(gov|com|co|ne)\.\w{2})$/i - /** * @param {string} originOrURL * @return {[string|null, string|null]} - The first item is the root domain, the second item is the domain. From 9234a8f7b79e8ee21935481971e5dadda66fa212 Mon Sep 17 00:00:00 2001 From: Paul Irish Date: Thu, 6 Feb 2025 16:29:10 -0800 Subject: [PATCH 3/3] revert those --- lib/create-entity-finder-api.js | 10 +++++----- lib/create-entity-finder-api.test.js | 2 +- package.json | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/create-entity-finder-api.js b/lib/create-entity-finder-api.js index f63cb53..aa42497 100644 --- a/lib/create-entity-finder-api.js +++ b/lib/create-entity-finder-api.js @@ -13,12 +13,12 @@ function parseDomains(originOrURL) { let m = originOrURL.match(DOMAIN_IN_URL_REGEX) let domain; if (m) { - domain = m[1] - } + domain = m[1] + } m = originOrURL.match(DOMAIN_CHARACTERS) if (m) { - domain = m[0] - } + domain = m[0] + } if (!domain) return [null, null] if (IP_REGEX.test(domain)) return [domain, domain] @@ -41,7 +41,7 @@ function sliceSubdomainFromDomain(domain, rootDomain) { } function getEntityInDataset(entityByDomain, entityBySubDomain, entityByRootDomain, originOrURL) { - const [rootDomain,domain] = parseDomains(originOrURL); + const [rootDomain, domain] = parseDomains(originOrURL); if (!domain || !rootDomain) return undefined if (entityByDomain.has(domain)) return entityByDomain.get(domain) diff --git a/lib/create-entity-finder-api.test.js b/lib/create-entity-finder-api.test.js index 4224fd0..5f4145a 100644 --- a/lib/create-entity-finder-api.test.js +++ b/lib/create-entity-finder-api.test.js @@ -44,7 +44,7 @@ describe('getEntity', () => { expect(api.getEntity('https://baz.bar.example.co.uk/path').name).toEqual('Domain') }) - it('stress test', () => { + it.skip('stress test', () => { const urls = fs .readFileSync(path.join(__dirname, '../data/random-urls.txt'), 'utf8') .split('\n') diff --git a/package.json b/package.json index f75f51f..606171f 100644 --- a/package.json +++ b/package.json @@ -41,5 +41,6 @@ "httparchive-nostats-subset": "./lib/subsets/httparchive-nostats.js", "httparchive-subset": "./lib/subsets/httparchive.js" } - } + }, + "packageManager": "yarn@4.1.1" }