diff --git a/bin/build-entity-json-files.js b/bin/build-entity-json-files.js index 2b9e457..263a916 100644 --- a/bin/build-entity-json-files.js +++ b/bin/build-entity-json-files.js @@ -38,7 +38,7 @@ const entitiesInHTTPArchive = _(httpArchiveData) // Find all the unique entities for our domains found in HTTPArchive .map(({domain}) => getEntity(domain)) .filter(Boolean) - .uniq() + .uniqBy(e => e.name) // Use the original entity which has the minimal form .map(e => sourceEntities.find(candidate => candidate.name === e.name)) .value() diff --git a/bin/fix-line-delimited-json.js b/bin/fix-line-delimited-json.js new file mode 100644 index 0000000..bd48acb --- /dev/null +++ b/bin/fix-line-delimited-json.js @@ -0,0 +1,12 @@ +const fs = require('fs') +const path = require('path') + +const fileToFix = path.resolve(process.cwd(), process.argv[2]) +console.log('Fixing', fileToFix, '...') +const lines = fs + .readFileSync(process.argv[2], 'utf8') + .split('\n') + .filter(Boolean) +JSON.parse(lines[0]) + +fs.writeFileSync(fileToFix, '[\n' + lines.join(',') + '\n]') diff --git a/bin/generate-canonical-domain-csv.js b/bin/generate-canonical-domain-csv.js index b3f0971..518f20f 100644 --- a/bin/generate-canonical-domain-csv.js +++ b/bin/generate-canonical-domain-csv.js @@ -26,7 +26,7 @@ const entries = Array.from(observedDomains) .map(domain => { const entity = getEntity(domain) if (!entity) { - return undefined + return [domain, domain, 'unknown'] } return [domain, entity.domains[0], entity.categories[0] || 'other'] diff --git a/lib/create-entity-finder-api.js b/lib/create-entity-finder-api.js index 00a05b6..b8db6f8 100644 --- a/lib/create-entity-finder-api.js +++ b/lib/create-entity-finder-api.js @@ -40,7 +40,11 @@ function createAPIFromDataset(entities_) { entity.averageExecutionTime = entity.totalExecutionTime / entity.totalOccurrences for (const domain of entity.domains) { - if (entityByDomain.has(domain)) throw new Error(`Duplicate domain ${domain}`) + if (entityByDomain.has(domain)) { + const duplicate = entityByDomain.get(domain) + throw new Error(`Duplicate domain ${domain} (${entity.name} and ${duplicate.name})`) + } + entityByDomain.set(domain, entity) const rootDomain = getRootDomain(domain) diff --git a/lib/index.test.js b/lib/index.test.js index 073a189..34c31db 100644 --- a/lib/index.test.js +++ b/lib/index.test.js @@ -54,59 +54,59 @@ describe('getRootDomain', () => { describe('getEntity', () => { it('works for direct domain usage', () => { expect(getEntity('https://js.connect.facebook.net/lib.js')).toMatchInlineSnapshot(` -Object { - "averageExecutionTime": 161.0291502603836, - "categories": Array [ - "social", - ], - "company": "Facebook", - "domains": Array [ - "*.atlassbx.com", - "*.facebook.com", - "*.fbsbx.com", - "fbcdn-photos-e-a.akamaihd.net", - "*.facebook.net", - "*.fbcdn.net", - ], - "examples": Array [ - "www.facebook.com", - "connect.facebook.net", - "staticxx.facebook.com", - "static.xx.fbcdn.net", - "m.facebook.com", - "an.facebook.com", - "platform-lookaside.fbsbx.com", - ], - "homepage": "https://www.facebook.com", - "name": "Facebook", - "totalExecutionTime": 578108820, - "totalOccurrences": 3590088, -} -`) + Object { + "averageExecutionTime": 222.95578518974813, + "categories": Array [ + "social", + ], + "company": "Facebook", + "domains": Array [ + "*.atlassbx.com", + "*.facebook.com", + "*.fbsbx.com", + "fbcdn-photos-e-a.akamaihd.net", + "*.facebook.net", + "*.fbcdn.net", + ], + "examples": Array [ + "www.facebook.com", + "connect.facebook.net", + "staticxx.facebook.com", + "static.xx.fbcdn.net", + "m.facebook.com", + "an.facebook.com", + "platform-lookaside.fbsbx.com", + ], + "homepage": "https://www.facebook.com", + "name": "Facebook", + "totalExecutionTime": 322128748, + "totalOccurrences": 1444810, + } + `) }) it('works for inferred domain usage', () => { expect(getEntity('https://unknown.typekit.net/fonts.css')).toMatchInlineSnapshot(` -Object { - "averageExecutionTime": 93.6581576026637, - "categories": Array [ - "cdn", - ], - "company": "Adobe", - "domains": Array [ - "*.typekit.com", - "*.typekit.net", - ], - "examples": Array [ - "use.typekit.net", - "p.typekit.net", - ], - "homepage": "https://fonts.adobe.com/", - "name": "Adobe TypeKit", - "totalExecutionTime": 1940878, - "totalOccurrences": 20723, -} -`) + Object { + "averageExecutionTime": 105.38858905165768, + "categories": Array [ + "cdn", + ], + "company": "Adobe", + "domains": Array [ + "*.typekit.com", + "*.typekit.net", + ], + "examples": Array [ + "use.typekit.net", + "p.typekit.net", + ], + "homepage": "https://fonts.adobe.com/", + "name": "Adobe TypeKit", + "totalExecutionTime": 1230201, + "totalOccurrences": 11673, + } + `) }) it('does not over-infer', () => {