From 0aa83727be75dea3954c2cfa04ed2bd054f16562 Mon Sep 17 00:00:00 2001 From: Guillaume NICOLAS Date: Wed, 28 Aug 2024 11:15:16 +0200 Subject: [PATCH] feat: keep saving in file all observed domains with minimum observations --- bin/automated-update.js | 51 ++++++++++++++++------------- sql/most-observed-domains-query.sql | 21 ++++++++++++ 2 files changed, 50 insertions(+), 22 deletions(-) create mode 100644 sql/most-observed-domains-query.sql diff --git a/bin/automated-update.js b/bin/automated-update.js index 06eebfe..fc5faf7 100644 --- a/bin/automated-update.js +++ b/bin/automated-update.js @@ -120,6 +120,7 @@ async function main() { const observedDomainsFilename = `${__dirname}/../data/${dateStringHypens}-observed-domains.json` const entityScriptingFilename = `${__dirname}/../data/${dateStringHypens}-entity-scripting.json` + const mostObservedDomainsFilename = `${__dirname}/../sql/most-observed-domains-query.sql` const allObservedDomainsFilename = `${__dirname}/../sql/all-observed-domains-query.sql` const entityPerPageFilename = `${__dirname}/../sql/entity-per-page.sql` @@ -131,6 +132,10 @@ async function main() { exitFn: () => process.exit(1), }) + const mostObservedDomainsQuery = getQueryForTable( + mostObservedDomainsFilename, + dateStringUnderscore + ) const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringUnderscore) const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringUnderscore) @@ -142,24 +147,21 @@ async function main() { const start = Date.now() - const domainEntityMapping = entities.reduce((array, {name, domains}) => { - return array.concat(domains.map(domain => ({name, domain}))) - }, []) - - const resultsStream = await getQueryResultStream(allObservedDomainsQuery, { - entities_string: JSON.stringify(domainEntityMapping), - }) - - // Observed domain json file pipe + //1. 
Get domains observed at least 50 times and write them to the 'observed-domains' json file let observedDomainsNbRows = 0 const observedDomainsFileWriterStream = fs.createWriteStream(observedDomainsFilename) - resultsStream - // stringify observed domain json (with json array prefix based on row index) - .pipe(getJSONStringTransformer(observedDomainsNbRows)) - // write to observed-domains json file - .pipe(observedDomainsFileWriterStream) + await getQueryResultStream(mostObservedDomainsQuery).then(stream => { + stream + // stringify observed domain json (with json array prefix based on row index) + .pipe(getJSONStringTransformer(observedDomainsNbRows)) + // write to observed-domains json file + .pipe(observedDomainsFileWriterStream) + }) - // Observed domain entity mapping table pipe + //2. Get and write in 'third_party_web' table all observed domains mapped to entity observed at least 50 times + const domainEntityMapping = entities.reduce((array, {name, domains}) => { + return array.concat(domains.map(domain => ({name, domain}))) + }, []) const thirdPartyWebTableWriterStream = new BigQuery() .dataset('third_party_web') .table(dateStringUnderscore) @@ -170,13 +172,18 @@ async function main() { {name: 'category', type: 'STRING'}, ], }) - resultsStream - // map observed domain to entity - .pipe(EntityCanonicalDomainTransformer) - // stringify json - .pipe(getJSONStringTransformer()) - // write to thrid_party_web table - .pipe(thirdPartyWebTableWriterStream) + + await getQueryResultStream(allObservedDomainsQuery, { + entities_string: JSON.stringify(domainEntityMapping), + }).then(stream => { + stream + // map observed domain to entity + .pipe(EntityCanonicalDomainTransformer) + // stringify json + .pipe(getJSONStringTransformer()) + // write to third_party_web table + .pipe(thirdPartyWebTableWriterStream) + }) // Wait both streams to finish await resolveOnFinished([observedDomainsFileWriterStream, thirdPartyWebTableWriterStream]) diff --git a/sql/most-observed-domains-query.sql 
b/sql/most-observed-domains-query.sql new file mode 100644 index 0000000..7e7df13 --- /dev/null +++ b/sql/most-observed-domains-query.sql @@ -0,0 +1,21 @@ +SELECT + domain, + COUNT(0) AS totalOccurrences +FROM + ( + SELECT + page, + NET.HOST(url) AS domain, + COUNT(0) AS totalOccurrences + FROM + `httparchive.requests.2022_01_01_mobile` + GROUP BY + page, + domain + ) +GROUP BY + domain +HAVING + totalOccurrences >= 50 +ORDER BY + totalOccurrences DESC \ No newline at end of file