From 938564c168c37f2538cca2e2ef0d57b09b24a616 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Tue, 5 Nov 2024 15:36:20 +0000 Subject: [PATCH 01/14] =?UTF-8?q?=E2=9C=A8=20explorer=20thumbnails=20code?= =?UTF-8?q?=20review=20touchups?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/@ourworldindata/core-table/src/CoreTableUtils.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts index 6f1a9bdb9e8..0e27c90f9b6 100644 --- a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts +++ b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts @@ -578,7 +578,6 @@ export const parseDelimited = ( const result = Papa.parse(str, { delimiter: delimiter ?? detectDelimiter(str), - skipEmptyLines: true, header: true, transformHeader: (header: string) => header.trim(), transform: (value: string) => value.trim(), From 98209283c18c20a4922b2cd304a6a5945d8e44a2 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Tue, 29 Oct 2024 21:51:34 +0000 Subject: [PATCH 02/14] =?UTF-8?q?=F0=9F=8E=89=20create=20ExplorerViewsAndC?= =?UTF-8?q?harts=20Algolia=20index?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/indexChartsToAlgolia.ts | 12 +- .../indexExplorerViewsAndChartsToAlgolia.ts | 142 ++++ baker/algolia/indexExplorerViewsToAlgolia.ts | 715 ++++++++++++------ baker/updateChartEntities.ts | 25 +- .../explorer/src/ExplorerProgram.ts | 2 + .../@ourworldindata/explorer/src/index.ts | 3 +- settings/clientSettings.ts | 3 + site/DataCatalog/DataCatalogUtils.ts | 8 +- site/search/ChartHit.tsx | 42 +- site/search/searchTypes.ts | 10 + 10 files changed, 732 insertions(+), 230 deletions(-) create mode 100644 baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts diff --git a/baker/algolia/indexChartsToAlgolia.ts b/baker/algolia/indexChartsToAlgolia.ts index 
c2de1064c34..96dcc8b2b84 100644 --- a/baker/algolia/indexChartsToAlgolia.ts +++ b/baker/algolia/indexChartsToAlgolia.ts @@ -2,7 +2,11 @@ import * as db from "../../db/db.js" import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js" -import { ChartRecord, SearchIndexName } from "../../site/search/searchTypes.js" +import { + ChartRecord, + ChartRecordType, + SearchIndexName, +} from "../../site/search/searchTypes.js" import { KeyChartLevel, OwidGdocLinkType, @@ -116,9 +120,10 @@ const parseAndProcessChartRecords = ( } } -const getChartsRecords = async ( +export const getChartsRecords = async ( knex: db.KnexReadonlyTransaction ): Promise => { + console.log("Fetching charts to index") const chartsToIndex = await db.knexRaw( knex, `-- sql @@ -139,6 +144,8 @@ const getChartsRecords = async ( WHERE cc.full ->> "$.isPublished" = 'true' AND c.isIndexable IS TRUE GROUP BY c.id + -- TODO: remove this, testing only + -- LIMIT 15 ) SELECT c.id, c.slug, @@ -192,6 +199,7 @@ const getChartsRecords = async ( const record = { objectID: c.id.toString(), + type: ChartRecordType.Chart, chartId: c.id, slug: c.slug, title: c.title, diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts new file mode 100644 index 00000000000..8d0e0e52226 --- /dev/null +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -0,0 +1,142 @@ +import Bugsnag from "@bugsnag/js" +import * as db from "../../db/db.js" +import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" +import { + ALGOLIA_INDEXING, + BUGSNAG_NODE_API_KEY, +} from "../../settings/serverSettings.js" +import { getAlgoliaClient } from "./configureAlgolia.js" +import { + ExplorerViewEntryWithExplorerInfo, + getExplorerViewRecords, +} from "./indexExplorerViewsToAlgolia.js" +import { getIndexName 
} from "../../site/search/searchClient.js" +import { + ChartRecord, + ChartRecordType, + SearchIndexName, +} from "../../site/search/searchTypes.js" +import { getChartsRecords } from "./indexChartsToAlgolia.js" + +function explorerViewRecordToChartRecord( + e: ExplorerViewEntryWithExplorerInfo +): ChartRecord { + return { + type: ChartRecordType.ExplorerView, + objectID: e.objectID!, + chartId: Math.floor(Math.random() * 1000000), + slug: e.explorerSlug, + queryParams: e.viewQueryParams, + title: e.viewTitle, + subtitle: e.explorerSubtitle, + variantName: "", + keyChartForTags: [], + tags: e.tags, + availableEntities: e.availableEntities, + publishedAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + numDimensions: e.numNonDefaultSettings, + titleLength: e.titleLength, + numRelatedArticles: 0, + views_7d: e.explorerViews_7d, + score: e.score, + } +} + +/** + * Scale explorer scores to the range of grapher scores + * e.g. if the highest explorer score is 100 and the highest grapher score is 1000, + * we want to scale the explorer scores to be between 0 and 1000 + */ +function scaleExplorerScores( + explorerRecords: ChartRecord[], + grapherRecords: ChartRecord[] +): ChartRecord[] { + const explorerScores = explorerRecords.map((e) => e.score) + const explorerScoreMax = Math.max(...explorerScores) + + const grapherScores = grapherRecords.map((e) => e.score) + const grapherScoreBounds = { + max: Math.max(...grapherScores), + min: Math.min(...grapherScores), + } + + // scale positive explorer scores to the range of grapher scores + // We want to keep negative scores because they're intentionally downranked as near-duplicates of existing views + return explorerRecords.map((e): ChartRecord => { + if (e.score < 0) return e + // A value between 0 and 1 + const normalized = e.score / explorerScoreMax + const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min + const scaled = Math.round( + normalized * grapherRange + grapherScoreBounds.min + ) + 
return { + ...e, + score: scaled, + } + }) +} + +// We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over. +// If we standardize the record shape, we could have this be the only index and have a `type` field +// to use in /search. +const indexExplorerViewsAndChartsToAlgolia = async () => { + const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts) + console.log( + `Indexing explorer views and charts to the "${indexName}" index on Algolia` + ) + if (!ALGOLIA_INDEXING) return + if (BUGSNAG_NODE_API_KEY) { + Bugsnag.start({ + apiKey: BUGSNAG_NODE_API_KEY, + context: "index-explorer-views-to-algolia", + autoTrackSessions: false, + }) + } + const client = getAlgoliaClient() + + if (!client) { + await logErrorAndMaybeSendToBugsnag( + `Failed indexing explorer views (Algolia client not initialized)` + ) + return + } + + try { + const { explorerViews, grapherViews } = + await db.knexReadonlyTransaction(async (trx) => { + return { + explorerViews: await getExplorerViewRecords(trx), + grapherViews: await getChartsRecords(trx), + } + }, db.TransactionCloseMode.Close) + + const convertedExplorerViews = explorerViews.map( + explorerViewRecordToChartRecord + ) + const scaledExplorerViews = scaleExplorerScores( + convertedExplorerViews, + grapherViews + ) + const records = [...scaledExplorerViews, ...grapherViews] + + const index = client.initIndex(indexName) + + console.log(`Indexing ${records.length} records`) + await index.replaceAllObjects(records) + console.log(`Indexing complete`) + } catch (e) { + await logErrorAndMaybeSendToBugsnag({ + name: `IndexExplorerViewsToAlgoliaError`, + message: `${e}`, + }) + } +} + +process.on("unhandledRejection", (e) => { + console.error(e) + process.exit(1) +}) + +void indexExplorerViewsAndChartsToAlgolia() diff --git a/baker/algolia/indexExplorerViewsToAlgolia.ts b/baker/algolia/indexExplorerViewsToAlgolia.ts index 7a637711986..4365cc221f1 100644 --- 
a/baker/algolia/indexExplorerViewsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsToAlgolia.ts @@ -1,42 +1,48 @@ import * as db from "../../db/db.js" -import { tsvFormat } from "d3-dsv" import { ExplorerChoiceParams, ExplorerControlType, GridBoolean, DecisionMatrix, + TableDef, } from "@ourworldindata/explorer" import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" import { ALGOLIA_INDEXING, BUGSNAG_NODE_API_KEY, + DATA_API_URL, } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { getIndexName } from "../../site/search/searchClient.js" import { SearchIndexName } from "../../site/search/searchTypes.js" -import { groupBy, keyBy, orderBy, partition } from "lodash" +import { at, get, groupBy, keyBy, mapValues, orderBy, partition } from "lodash" import { MarkdownTextWrap } from "@ourworldindata/components" -import { DbRawVariable } from "@ourworldindata/utils" import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" import Bugsnag from "@bugsnag/js" - -export type ExplorerBlockGraphers = { - type: "graphers" - block: { - title?: string - subtitle?: string - grapherId?: number - }[] -} +import { obtainAvailableEntitiesForAllGraphers } from "../updateChartEntities.js" +import { fetchS3MetadataByPath } from "../../db/model/Variable.js" +import { getVariableMetadataRoute } from "@ourworldindata/grapher" +import pMap from "p-map" +import { ExplorerAdminServer } from "../../explorerAdminServer/ExplorerAdminServer.js" +import { GIT_CMS_DIR } from "../../gitCms/GitCmsConstants.js" +import { parseDelimited } from "@ourworldindata/core-table" +import { + ColumnTypeNames, + CoreRow, + DbEnrichedVariable, +} from "@ourworldindata/types" interface ExplorerViewEntry { viewTitle: string viewSubtitle: string viewSettings: string[] viewQueryParams: string + availableEntities: string[] viewGrapherId?: number - viewFirstYIndicator?: string | number // Variable ID or ETL path + yVariableIds: 
Array // Variable IDs or ETL paths + tableSlug?: string + ySlugs: string[] /** * We often have several views with the same title within an explorer, e.g. "Population". @@ -52,13 +58,14 @@ interface ExplorerViewEntry { // viewViews_7d: number } -interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { +export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { explorerSlug: string explorerTitle: string explorerSubtitle: string explorerViews_7d: number viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug` numViewsWithinExplorer: number + tags: string[] score: number @@ -81,6 +88,169 @@ const explorerChoiceToViewSettings = ( }) } +type ExplorerIndicatorMetadata = Record< + string | number, + { + entityNames?: string[] + display: DbEnrichedVariable["display"] + titlePublic: DbEnrichedVariable["titlePublic"] + descriptionShort: DbEnrichedVariable["descriptionShort"] + name: DbEnrichedVariable["name"] + } +> + +async function fetchIndicatorMetadata( + records: Omit< + ExplorerViewEntry, + "viewTitleIndexWithinExplorer" | "titleLength" + >[], + trx: db.KnexReadonlyTransaction +): Promise { + function checkIsETLPath(idOrPath: string | number): idOrPath is string { + return typeof idOrPath === "string" + } + + const { etlPaths, ids } = records.reduce( + ({ etlPaths, ids }, record) => { + for (const yVariableId of record.yVariableIds) { + if (checkIsETLPath(yVariableId)) { + etlPaths.add(yVariableId) + } else { + ids.add(yVariableId) + } + } + return { etlPaths, ids } + }, + { etlPaths: new Set(), ids: new Set() } + ) + + const metadataFromDB = ( + await trx + .table("variables") + .select( + "id", + "catalogPath", + "name", + "titlePublic", + "display", + "name", + "descriptionShort" + ) + .whereIn("id", [...ids]) + .orWhereIn("catalogPath", [...etlPaths]) + ).map((row) => ({ + ...row, + display: row.display ? 
JSON.parse(row.display) : {}, + })) as DbEnrichedVariable[] + + const indicatorMetadataByIdAndPath = { + ...keyBy(metadataFromDB, "id"), + ...keyBy(metadataFromDB, "catalogPath"), + } as ExplorerIndicatorMetadata + + async function fetchEntitiesForId(id?: number) { + if (id) { + const metadata = await fetchS3MetadataByPath( + getVariableMetadataRoute(DATA_API_URL, id) + ) + const entityNames = get(metadata, "dimensions.entities.values", []) + .map((value) => value.name) + .filter((name): name is string => !!name) + + const idEntry = indicatorMetadataByIdAndPath[id] + if (idEntry) { + idEntry.entityNames = entityNames + } + const path = metadata.catalogPath + if (path) { + const pathEntry = indicatorMetadataByIdAndPath[path] + if (pathEntry) { + pathEntry.entityNames = entityNames + } + } + } + } + + await pMap( + metadataFromDB.map((meta) => meta.id), + fetchEntitiesForId, + { concurrency: 10 } + ) + + return indicatorMetadataByIdAndPath +} + +/** Almost always `"country"`, but sometimes things like `"location"` */ +function getEntityNameSlug(tableDef: TableDef): string { + return ( + tableDef.columnDefinitions?.find( + (col) => col.type === ColumnTypeNames.EntityName + )?.slug || "country" + ) +} + +/** + * Returns an aggregator function that can be used to aggregate entities per column in a parsed CSV + * e.g. if there's a column named "gdp", this will return an object like `{ gdp: Set }` + * containing all the entities that have any data for gdp. 
+ */ +function makeAggregator(entityNameSlug: string) { + return ( + result: Record>, + row: Record + ) => { + const entityName = row[entityNameSlug] + Object.keys(row).forEach((columnSlug) => { + if (columnSlug === entityNameSlug || columnSlug === "year") return + + const value = row[columnSlug] + if (value) { + if (!result[columnSlug]) { + result[columnSlug] = new Set() + } + result[columnSlug].add(entityName) + } + }) + + return result + } +} + +async function getEntitiesPerColumnPerTable( + tableDefs: TableDef[] +): Promise>> { + return pMap( + tableDefs, + (tableDef) => { + console.log("Fetching CSV table data from", tableDef.url) + return fetch(tableDef.url!) + .then((res) => res.text()) + .then((csv) => parseDelimited(csv)) + .then((parsed) => { + const entityNameSlug = getEntityNameSlug(tableDef) + const aggregateEntities = makeAggregator(entityNameSlug) + const entitiesPerColumn = parsed.reduce( + aggregateEntities, + {} + ) + + // Convert sets to arrays + const entityNamesAsArray = mapValues( + entitiesPerColumn, + (set) => Array.from(set) + ) as Record + + // Return an object like `{ almonds: { population: ["United States", "Canada"], area_harvested__ha: ["United States"] } }` + return { [tableDef.slug!]: entityNamesAsArray } + }) + }, + { + concurrency: 5, + } + // Merge all these objects together + ).then((results) => Object.assign({}, ...results)) +} + const computeScore = ( record: Omit & Partial @@ -89,251 +259,365 @@ const computeScore = ( record.numNonDefaultSettings * 50 - record.titleLength -const getExplorerViewRecordsForExplorerSlug = async ( - trx: db.KnexReadonlyTransaction, - slug: string -): Promise => { - const explorerConfig = await trx - .table("explorers") - .select("config") - .where({ slug }) - .first() - .then((row) => JSON.parse(row.config) as any) - - const explorerGrapherBlock: ExplorerBlockGraphers = - explorerConfig.blocks.filter( - (block: any) => block.type === "graphers" - )[0] as ExplorerBlockGraphers +interface 
IndicatorMetadata { + entityNames: string[] + titlePublic?: string + display?: { name: string } + name: string + descriptionShort?: string +} - if (explorerGrapherBlock === undefined) - throw new Error(`Explorer ${slug} has no grapher block`) +interface GrapherInfo { + id: number + title: string + subtitle: string +} - // TODO: Maybe make DecisionMatrix accept JSON directly - const tsv = tsvFormat(explorerGrapherBlock.block) - const explorerDecisionMatrix = new DecisionMatrix(tsv) +const parseYVariableIds = (matrixRow: CoreRow): (string | number)[] => { + return ( + matrixRow.yVariableIds + ?.trim() + .split(" ") + .map((idOrPath: string) => + isNaN(parseInt(idOrPath)) ? idOrPath : parseInt(idOrPath) + ) || [] + ) +} - console.log( - `Processing explorer ${slug} (${explorerDecisionMatrix.numRows} rows)` +const getNonDefaultSettings = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix +): [string, any][] => { + const defaultSettings = matrix.defaultSettings + return Object.entries(matrix.availableChoiceOptions).filter( + ([choiceName, choiceOptions]) => { + return ( + choiceOptions.length > 1 && + !(defaultSettings[choiceName] !== undefined + ? 
defaultSettings[choiceName] === choice[choiceName] + : choice[choiceName] === choiceOptions[0]) + ) + } ) +} - const defaultSettings = explorerDecisionMatrix.defaultSettings +const createBaseRecord = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix, + index: number +): Partial => { + matrix.setValuesFromChoiceParams(choice) + const nonDefaultSettings = getNonDefaultSettings(choice, matrix) + const yVariableIds = parseYVariableIds(matrix.selectedRow) + + return { + viewTitle: matrix.selectedRow.title, + viewSubtitle: matrix.selectedRow.subtitle, + viewSettings: explorerChoiceToViewSettings(choice, matrix), + availableEntities: [], + viewGrapherId: matrix.selectedRow.grapherId, + yVariableIds, + viewQueryParams: matrix.toString(), + viewIndexWithinExplorer: index, + numNonDefaultSettings: nonDefaultSettings.length, + tableSlug: matrix.selectedRow.tableSlug, + ySlugs: matrix.selectedRow.ySlugs?.split(" ") || [], + } +} - const records = explorerDecisionMatrix +const createBaseRecords = ( + matrix: DecisionMatrix +): Partial[] => { + return matrix .allDecisionsAsQueryParams() - .map((choice, i) => { - explorerDecisionMatrix.setValuesFromChoiceParams(choice) - - // Check which choices are non-default, i.e. are not the first available option in a dropdown/radio - const nonDefaultSettings = Object.entries( - explorerDecisionMatrix.availableChoiceOptions - ).filter(([choiceName, choiceOptions]) => { - // Keep only choices which are not the default, which is: - // - either the options marked as `default` in the decision matrix - // - or the first available option in the decision matrix - return ( - choiceOptions.length > 1 && - !(defaultSettings[choiceName] !== undefined - ? 
defaultSettings[choiceName] === choice[choiceName] - : choice[choiceName] === choiceOptions[0]) - ) - }) + .map((choice: ExplorerChoiceParams, index: number) => + createBaseRecord(choice, matrix, index) + ) +} - const record: Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" | "titleLength" - > = { - viewTitle: explorerDecisionMatrix.selectedRow.title, - viewSubtitle: explorerDecisionMatrix.selectedRow.subtitle, - viewSettings: explorerChoiceToViewSettings( - choice, - explorerDecisionMatrix - ), - viewGrapherId: explorerDecisionMatrix.selectedRow.grapherId, - viewFirstYIndicator: - explorerDecisionMatrix.selectedRow.yVariableIds - ?.trim() - .split(" ") - .at(0), - viewQueryParams: explorerDecisionMatrix.toString(), - - viewIndexWithinExplorer: i, - numNonDefaultSettings: nonDefaultSettings.length, - } - return record - }) +const fetchGrapherInfo = async ( + trx: db.KnexReadonlyTransaction, + grapherIds: number[] +): Promise> => { + return await trx + .select( + trx.raw("charts.id as id"), + trx.raw("chart_configs.full->>'$.title' as title"), + trx.raw("chart_configs.full->>'$.subtitle' as subtitle") + ) + .from("charts") + .join("chart_configs", { "charts.configId": "chart_configs.id" }) + .whereIn("charts.id", grapherIds) + .andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") + .then((rows) => keyBy(rows, "id")) +} + +const enrichRecordWithGrapherInfo = ( + record: Partial, + grapherInfo: Record, + availableEntities: Map, + slug: string +): Partial => { + if (!record.viewGrapherId) return record + + const grapher = grapherInfo[record.viewGrapherId] + if (!grapher) { + console.warn( + `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` + ) + return record + } + + return { + ...record, + availableEntities: + availableEntities.get(record.viewGrapherId)?.availableEntities ?? 
+ [], + viewTitle: grapher.title, + viewSubtitle: grapher.subtitle, + } +} - // Enrich `grapherId`-powered views with title/subtitle +const enrichWithGrapherData = async ( + records: Partial[], + trx: db.KnexReadonlyTransaction, + slug: string +): Promise[]> => { const grapherIds = records .filter((record) => record.viewGrapherId !== undefined) .map((record) => record.viewGrapherId as number) - if (grapherIds.length) { - console.log( - `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` - ) - const grapherIdToTitle = await trx - .select( - trx.raw("charts.id as id"), - trx.raw("chart_configs.full->>'$.title' as title"), - trx.raw("chart_configs.full->>'$.subtitle' as subtitle") - ) - .from("charts") - .join("chart_configs", { "charts.configId": "chart_configs.id" }) - .whereIn("charts.id", grapherIds) - .andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") - .then((rows) => keyBy(rows, "id")) - - for (const record of records) { - if (record.viewGrapherId !== undefined) { - const grapherInfo = grapherIdToTitle[record.viewGrapherId] - if (grapherInfo === undefined) { - console.warn( - `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` - ) - continue - } - record.viewTitle = grapherInfo.title - record.viewSubtitle = grapherInfo.subtitle - } - } - } + if (!grapherIds.length) return records - // Resolve the `yIndicatorIds` field - const yIndicatorIds = records - .map((record) => record.viewFirstYIndicator) - .filter((id) => id !== undefined) - .filter((id) => id !== "") + console.log( + `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` + ) + const grapherInfo = await fetchGrapherInfo(trx, grapherIds) + const availableEntities = await obtainAvailableEntitiesForAllGraphers( + trx, + grapherIds + ) - if (yIndicatorIds.length) { - console.log( - `Fetching indicator metadata from ${yIndicatorIds.length} indicators for explorer ${slug}` + return records.map((record) => + 
enrichRecordWithGrapherInfo( + record, + grapherInfo, + availableEntities, + slug ) + ) +} - type IndicatorRecord = Pick< - DbRawVariable, - | "id" - | "catalogPath" - | "titlePublic" - | "display" - | "name" - | "descriptionShort" - > - // The `yIndicatorId` can be a variable ID or a catalog path, and we want to resolve both - const indicatorIdToTitle: IndicatorRecord[] = await trx - .table("variables") - .select( - "id", - "catalogPath", - "name", - "titlePublic", - "display", - "name", - "descriptionShort" - ) - .whereIn("id", yIndicatorIds) - .orWhereIn("catalogPath", yIndicatorIds) - - const indicatorsKeyedByIdAndCatalogPath = indicatorIdToTitle.reduce( - (acc, indicator) => { - acc[indicator.id] = indicator - if (indicator.catalogPath) - acc[indicator.catalogPath] = indicator - return acc - }, - {} as Record +const enrichRecordWithTableData = ( + record: Partial, + entitiesPerColumnPerTable: Record> +): Partial => { + const { tableSlug, ySlugs } = record + if (!tableSlug || !ySlugs?.length) return record + + const availableEntities = ySlugs + .flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) + .filter((name, i, array) => array.indexOf(name) === i) + + return { ...record, availableEntities } +} + +const enrichRecordWithIndicatorData = ( + record: Partial, + indicatorMetadata: Record +): Partial => { + if (!record.yVariableIds?.length) return record + + const allEntities = at(indicatorMetadata, record.yVariableIds) + .flatMap((meta) => meta.entityNames) + .filter( + (name, i, array): name is string => + array.indexOf(name) === i && !!name ) - for (const record of records) { - if (record.viewFirstYIndicator !== undefined) { - const indicatorInfo = - indicatorsKeyedByIdAndCatalogPath[ - record.viewFirstYIndicator - ] - if (indicatorInfo === undefined) { - console.warn( - `Indicator id ${record.viewFirstYIndicator} not found for explorer ${slug}` - ) - continue - } + const result = { ...record, availableEntities: allEntities } - // This is the 
fallback chain for the grapher title. it's complicated. - record.viewTitle = - record.viewTitle ?? - indicatorInfo.titlePublic ?? - (indicatorInfo.display - ? JSON.parse(indicatorInfo.display).name - : undefined) ?? - indicatorInfo.name - record.viewSubtitle = - record.viewSubtitle ?? indicatorInfo.descriptionShort - } - } + const firstYIndicator = record.yVariableIds[0] + if (firstYIndicator === undefined) return result + + const indicatorInfo = indicatorMetadata[firstYIndicator] + if (!indicatorInfo) return result + + return { + ...result, + viewTitle: + record.viewTitle ?? + indicatorInfo.titlePublic ?? + indicatorInfo.display?.name ?? + indicatorInfo.name, + viewSubtitle: record.viewSubtitle ?? indicatorInfo.descriptionShort, } +} - // Drop any views where we couldn't obtain a title, for whatever reason - const [recordsWithViewTitle, recordsWithNoViewTitle] = partition( - records, - (record) => record.viewTitle !== undefined - ) +const enrichWithMetadata = async ( + records: Partial[], + indicatorMetadata: Record, + entitiesPerColumnPerTable: Record> +): Promise[]> => { + return records.map((record) => { + const withTableData = enrichRecordWithTableData( + record, + entitiesPerColumnPerTable + ) + return enrichRecordWithIndicatorData(withTableData, indicatorMetadata) + }) +} + +const cleanSubtitles = ( + records: Partial[] +): Partial[] => { + return records.map((record) => ({ + ...record, + viewSubtitle: record.viewSubtitle + ? 
new MarkdownTextWrap({ + text: record.viewSubtitle, + fontSize: 10, + }).plaintext + : undefined, + })) +} - for (const record of recordsWithNoViewTitle) { +async function logMissingTitles( + records: Partial[], + slug: string +): Promise { + for (const record of records) { await logErrorAndMaybeSendToBugsnag({ name: "ExplorerViewTitleMissing", message: `Explorer ${slug} has a view with no title: ${record.viewQueryParams}.`, }) } +} - // Remove Markdown from viewSubtitle; do this after fetching grapher info above, as it might also contain Markdown - const recordsWithTitleLength = recordsWithViewTitle.map((record) => { - if (record.viewSubtitle) { - record.viewSubtitle = new MarkdownTextWrap({ - text: record.viewSubtitle, - fontSize: 10, // doesn't matter, but is a mandatory field - }).plaintext - } - return { ...record, titleLength: record.viewTitle.length } - }) as Omit[] - - // Compute viewTitleIndexWithinExplorer: - // First, sort by score descending (ignoring views_7d, which is not relevant _within_ an explorer). - // Then, group by viewTitle. - // Finally, ungroup again, and keep track of the index of each element within the group. 
- const recordsSortedByScore = orderBy( - recordsWithTitleLength, - (record) => computeScore(record), - "desc" +async function finalizeRecords( + records: Partial[], + slug: string +): Promise { + const [withTitle, withoutTitle] = partition( + records, + (record) => record.viewTitle !== undefined ) - const recordsGroupedByViewTitle = groupBy(recordsSortedByScore, "viewTitle") - const recordsWithIndexWithinExplorer = Object.values( - recordsGroupedByViewTitle - ).flatMap((recordsGroup) => - recordsGroup.map((record, i) => ({ + + await logMissingTitles(withoutTitle, slug) + + const withCleanSubtitles = cleanSubtitles(withTitle) + const withTitleLength = withCleanSubtitles.map((record) => ({ + ...record, + titleLength: record.viewTitle!.length, + })) as Omit[] + + const sortedByScore = orderBy( + withTitleLength, + computeScore, + "desc" + ) as Omit[] + + const groupedByTitle = groupBy(sortedByScore, "viewTitle") + + return Object.values(groupedByTitle).flatMap((group, i) => + group.map((record) => ({ ...record, viewTitleIndexWithinExplorer: i, })) ) +} + +export const getExplorerViewRecordsForExplorerSlug = async ( + trx: db.KnexReadonlyTransaction, + slug: string, + explorerAdminServer: ExplorerAdminServer +): Promise => { + // Get explorer program and table definitions + const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) + const tableDefs = explorerProgram.tableSlugs + .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) + .filter((x) => x && x.url && x.slug) as TableDef[] - return recordsWithIndexWithinExplorer + // Fetch and process CSV table data + console.log( + `Fetching CSV table data for ${slug} and aggregating entities by column` + ) + const entitiesPerColumnPerTable = + await getEntitiesPerColumnPerTable(tableDefs) + console.log( + "Finished fetching CSV table data and aggregating entities by column" + ) + + // Create base records from decision matrix + console.log( + `Processing explorer ${slug} 
(${explorerProgram.decisionMatrix.numRows} rows)` + ) + const baseRecords = createBaseRecords(explorerProgram.decisionMatrix) + + // Enrich with grapher data + const recordsWithGrapherData = await enrichWithGrapherData( + baseRecords, + trx, + slug + ) + + // Fetch and apply indicator metadata + console.log("Fetching indicator metadata for explorer", slug) + const indicatorMetadata = await fetchIndicatorMetadata( + recordsWithGrapherData as any, + trx + ) + console.log("Fetched indicator metadata for explorer", slug) + + const enrichedRecords = await enrichWithMetadata( + recordsWithGrapherData, + indicatorMetadata as any, + entitiesPerColumnPerTable + ) + + // Finalize records with titles, sorting, and grouping + return finalizeRecords(enrichedRecords, slug) } -const getExplorerViewRecords = async ( - trx: db.KnexReadonlyTransaction -): Promise => { - const publishedExplorers = Object.values( - await db.getPublishedExplorersBySlug(trx) +async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { + const explorersBySlug = await db.getPublishedExplorersBySlug(trx) + const parentTags = await db.getParentTagsByChildName(trx) + const publishedExplorersWithTags = Object.values(explorersBySlug).map( + (explorer) => ({ + ...explorer, + tags: explorer.tags + .flatMap((tag) => [tag.name, ...parentTags[tag.name]]) + .filter( + (tag, index, array) => !!tag && array.indexOf(tag) === index + ), + }) ) + return publishedExplorersWithTags +} +export const getExplorerViewRecords = async ( + trx: db.KnexReadonlyTransaction +): Promise => { + console.log("Fetching explorer views to index") + const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) const pageviews = await getAnalyticsPageviewsByUrlObj(trx) + const explorerAdminServer = new ExplorerAdminServer(GIT_CMS_DIR) + let records = [] as ExplorerViewEntryWithExplorerInfo[] - for (const explorerInfo of publishedExplorers) { + for (const explorerInfo of publishedExplorersWithTags) { const 
explorerViewRecords = await getExplorerViewRecordsForExplorerSlug( trx, - explorerInfo.slug + explorerInfo.slug, + explorerAdminServer ) - const explorerPageviews = - pageviews[`/explorers/${explorerInfo.slug}`]?.views_7d ?? 0 + const explorerPageviews = get( + pageviews, + [`/explorers/${explorerInfo.slug}`, "views_7d"], + 0 + ) + // These have a score for ranking purposes, but it doesn't yet factor in the explorer's pageviews const unscoredRecords = explorerViewRecords.map( (record, i): Omit => ({ ...record, @@ -343,7 +627,7 @@ const getExplorerViewRecords = async ( explorerViews_7d: explorerPageviews, viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, numViewsWithinExplorer: explorerViewRecords.length, - + tags: explorerInfo.tags, objectID: `${explorerInfo.slug}-${i}`, }) ) @@ -385,8 +669,11 @@ const indexExplorerViewsToAlgolia = async () => { getExplorerViewRecords, db.TransactionCloseMode.Close ) + console.log(`Indexing ${records.length} explorer views to Algolia`) await index.replaceAllObjects(records) + console.log(`Indexing complete`) } catch (e) { + console.error(e) await logErrorAndMaybeSendToBugsnag({ name: `IndexExplorerViewsToAlgoliaError`, message: `${e}`, diff --git a/baker/updateChartEntities.ts b/baker/updateChartEntities.ts index 5ba84537674..9e9736c19dd 100644 --- a/baker/updateChartEntities.ts +++ b/baker/updateChartEntities.ts @@ -120,8 +120,10 @@ const obtainAvailableEntitiesForGrapherConfig = async ( } else return [] } -const obtainAvailableEntitiesForAllGraphers = async ( - trx: db.KnexReadonlyTransaction +export const obtainAvailableEntitiesForAllGraphers = async ( + trx: db.KnexReadonlyTransaction, + // Optional subset of IDs to restrict data fetching to + chartIds?: number[] ) => { const entityNameToIdMap = await mapEntityNamesToEntityIds(trx) @@ -134,10 +136,17 @@ const obtainAvailableEntitiesForAllGraphers = async ( FROM charts c JOIN chart_configs cc ON c.configId = cc.id WHERE cc.full ->> "$.isPublished" = 'true' 
+ ${chartIds && chartIds.length ? `AND c.id IN (${chartIds.join(",")})` : ""} ` ) - const availableEntitiesByChartId = new Map() + const availableEntitiesByChartId = new Map< + number, + { + availableEntities: string[] + availableEntityIds: number[] + } + >() await pMap( allPublishedGraphers, async (grapher) => { @@ -156,7 +165,10 @@ const obtainAvailableEntitiesForAllGraphers = async ( return [entityId] } ) - availableEntitiesByChartId.set(grapher.id, availableEntityIds) + availableEntitiesByChartId.set(grapher.id, { + availableEntities, + availableEntityIds, + }) console.log( grapher.id, @@ -194,7 +206,10 @@ const updateAvailableEntitiesForAllGraphers = async ( console.log("--- Updating charts_x_entities ---") await trx.delete().from(ChartsXEntitiesTableName) // clears out the WHOLE table - for (const [chartId, availableEntityIds] of availableEntitiesByChartId) { + for (const [ + chartId, + { availableEntityIds }, + ] of availableEntitiesByChartId) { const rows = availableEntityIds.map((entityId) => ({ chartId, entityId, diff --git a/packages/@ourworldindata/explorer/src/ExplorerProgram.ts b/packages/@ourworldindata/explorer/src/ExplorerProgram.ts index 07f998345f5..5535566d998 100644 --- a/packages/@ourworldindata/explorer/src/ExplorerProgram.ts +++ b/packages/@ourworldindata/explorer/src/ExplorerProgram.ts @@ -48,6 +48,7 @@ export interface TableDef { url?: string columnDefinitions?: OwidColumnDef[] inlineData?: string[][] + slug?: TableSlug } interface ExplorerGrapherInterface extends GrapherInterface { @@ -477,6 +478,7 @@ export class ExplorerProgram extends GridProgram { url, columnDefinitions, inlineData, + slug: tableSlug, } } } diff --git a/packages/@ourworldindata/explorer/src/index.ts b/packages/@ourworldindata/explorer/src/index.ts index 70eb37f3a43..036fc1450ef 100644 --- a/packages/@ourworldindata/explorer/src/index.ts +++ b/packages/@ourworldindata/explorer/src/index.ts @@ -11,6 +11,7 @@ export { EXPLORER_EMBEDDED_FIGURE_SELECTOR, 
ExplorerChartCreationMode, ExplorerContainerId, + ExplorerControlType, ExplorerControlTypeRegex, EXPLORERS_GIT_CMS_FOLDER, EXPLORERS_PREVIEW_ROUTE, @@ -23,7 +24,6 @@ export { type ExplorerChoice, type ExplorerChoiceOption, type ExplorerChoiceParams, - type ExplorerControlType, type ExplorerFullQueryParams, type ExplorersRouteResponse, type ExplorerStandardQueryParams, @@ -32,6 +32,7 @@ export { } from "./ExplorerConstants.js" export { + type TableDef, ExplorerProgram, EXPLORER_FILE_SUFFIX, makeFullPath, diff --git a/settings/clientSettings.ts b/settings/clientSettings.ts index 2fbccb0cb4c..73d9a362987 100644 --- a/settings/clientSettings.ts +++ b/settings/clientSettings.ts @@ -38,6 +38,9 @@ export const BAKED_SITE_EXPORTS_BASE_URL: string = export const GRAPHER_DYNAMIC_THUMBNAIL_URL: string = process.env.GRAPHER_DYNAMIC_THUMBNAIL_URL ?? `${BAKED_GRAPHER_URL}` +export const EXPLORER_DYNAMIC_THUMBNAIL_URL: string = + process.env.EXPLORER_DYNAMIC_THUMBNAIL_URL ?? `${BAKED_BASE_URL}/explorers` + export const GRAPHER_DYNAMIC_CONFIG_URL: string = process.env.GRAPHER_DYNAMIC_CONFIG_URL ?? 
`${BAKED_GRAPHER_URL}` diff --git a/site/DataCatalog/DataCatalogUtils.ts b/site/DataCatalog/DataCatalogUtils.ts index 040b3fbc2f5..7840b420882 100644 --- a/site/DataCatalog/DataCatalogUtils.ts +++ b/site/DataCatalog/DataCatalogUtils.ts @@ -4,7 +4,7 @@ import { SearchResponse, } from "instantsearch.js" import { getIndexName } from "../search/searchClient.js" -import { SearchIndexName } from "../search/searchTypes.js" +import { ChartRecordType, SearchIndexName } from "../search/searchTypes.js" import { TagGraphRoot } from "@ourworldindata/types" import { DataCatalogState } from "./DataCatalogState.js" import { countriesByName, Region } from "@ourworldindata/utils" @@ -13,13 +13,15 @@ import { SearchClient } from "algoliasearch" /** * Constants */ -const CHARTS_INDEX = getIndexName(SearchIndexName.Charts) +const CHARTS_INDEX = getIndexName(SearchIndexName.ExplorerViewsAndCharts) const DATA_CATALOG_ATTRIBUTES = [ "title", "slug", "availableEntities", "variantName", + "type", + "queryParams", ] /** @@ -44,6 +46,8 @@ export type IDataCatalogHit = { availableEntities: string[] objectID: string variantName: string | null + type: ChartRecordType + queryParams: string __position: number _highlightResult?: HitHighlightResult _snippetResult?: HitHighlightResult diff --git a/site/search/ChartHit.tsx b/site/search/ChartHit.tsx index a9b97fadeae..607f489322c 100644 --- a/site/search/ChartHit.tsx +++ b/site/search/ChartHit.tsx @@ -1,12 +1,14 @@ import React, { useEffect, useMemo, useState } from "react" import cx from "classnames" import { Region } from "@ourworldindata/utils" -import { IChartHit, SearchIndexName } from "./searchTypes.js" +import { ChartRecordType, IChartHit, SearchIndexName } from "./searchTypes.js" import { getEntityQueryStr, pickEntitiesForChartHit } from "./SearchUtils.js" import { HitAttributeHighlightResult } from "instantsearch.js" import { + BAKED_BASE_URL, BAKED_GRAPHER_EXPORTS_BASE_URL, BAKED_GRAPHER_URL, + EXPLORER_DYNAMIC_THUMBNAIL_URL, 
GRAPHER_DYNAMIC_THUMBNAIL_URL, } from "../../settings/clientSettings.js" import { getIndexName } from "./searchClient.js" @@ -21,6 +23,7 @@ import { } from "@ourworldindata/grapher" import { Highlight } from "react-instantsearch" import { IDataCatalogHit } from "../DataCatalog/DataCatalogUtils.js" +import { EXPLORERS_ROUTE_FOLDER } from "@ourworldindata/explorer" export function ChartHit({ hit, @@ -35,6 +38,7 @@ export function ChartHit({ }) { const [imgLoaded, setImgLoaded] = useState(false) const [imgError, setImgError] = useState(false) + const isExplorerView = hit.type === ChartRecordType.ExplorerView const entities = useMemo( () => @@ -51,10 +55,36 @@ export function ChartHit({ searchQueryRegionsMatches, ] ) - const queryStr = useMemo(() => getEntityQueryStr(entities), [entities]) - const previewUrl = queryStr - ? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${hit.slug}.svg${queryStr}` - : `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${hit.slug}.svg` + const entityQueryStr = useMemo( + () => getEntityQueryStr(entities), + [entities] + ) + + const fullQueryParams = isExplorerView + ? hit.queryParams! + entityQueryStr.replace("?", "&") + : entityQueryStr + + function createExplorerViewThumbnailUrl( + slug: string, + fullQueryParams: string + ): string { + return `${EXPLORER_DYNAMIC_THUMBNAIL_URL}/${slug}.svg${fullQueryParams}` + } + function createGrapherThumbnailUrl( + slug: string, + fullQueryParams?: string + ): string { + return fullQueryParams + ? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${slug}.svg${fullQueryParams}` + : `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${slug}.svg` + } + const previewUrl = isExplorerView + ? createExplorerViewThumbnailUrl(hit.slug, fullQueryParams) + : createGrapherThumbnailUrl(hit.slug, fullQueryParams) + + const chartUrl = isExplorerView + ? 
`${BAKED_BASE_URL}/${EXPLORERS_ROUTE_FOLDER}/${hit.slug}${fullQueryParams}` + : `${BAKED_GRAPHER_URL}/${hit.slug}${fullQueryParams}` useEffect(() => { setImgLoaded(false) @@ -63,7 +93,7 @@ export function ChartHit({ return ( & { viewTitleIndexWithinExplorer: number } +export enum ChartRecordType { + Chart = "chart", + ExplorerView = "explorerView", +} + export interface ChartRecord { + type: ChartRecordType objectID: string chartId: number slug: string + queryParams?: string title: string subtitle: string | undefined variantName: string @@ -81,6 +88,7 @@ export enum SearchIndexName { ExplorerViews = "explorer-views", Charts = "charts", Pages = "pages", + ExplorerViewsAndCharts = "explorer-views-and-charts", } export type SearchCategoryFilter = SearchIndexName | "all" @@ -96,4 +104,6 @@ export const indexNameToSubdirectoryMap: Record = { [SearchIndexName.Pages]: "", [SearchIndexName.Charts]: "/grapher", [SearchIndexName.ExplorerViews]: "/explorers", + // n/a - charts and explorers have different subdirectories, so this needs to be resolved elsewhere + [SearchIndexName.ExplorerViewsAndCharts]: "", } From 8311973b558f9afcee0e2cff44ac31f4d8358961 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Thu, 31 Oct 2024 19:56:44 -0400 Subject: [PATCH 03/14] =?UTF-8?q?=F0=9F=94=A8=20refactor=20algolia=20bakin?= =?UTF-8?q?g=20code=20for=20legibility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 3 +- adminSiteServer/apiRouter.ts | 2 +- baker/algolia/indexChartsToAlgolia.ts | 222 +----- .../indexExplorerViewsAndChartsToAlgolia.ts | 12 +- baker/algolia/indexExplorerViewsToAlgolia.ts | 637 +---------------- ...xToAlgolia.tsx => indexPagesToAlgolia.tsx} | 6 +- baker/algolia/utils/charts.ts | 153 +++++ baker/algolia/utils/explorerViews.ts | 639 ++++++++++++++++++ .../{algoliaUtils.tsx => utils/pages.ts} | 49 +- baker/algolia/utils/shared.ts | 40 ++ baker/algolia/utils/types.ts | 91 +++ 11 files changed, 963 insertions(+), 
891 deletions(-) rename baker/algolia/{indexToAlgolia.tsx => indexPagesToAlgolia.tsx} (88%) create mode 100644 baker/algolia/utils/charts.ts create mode 100644 baker/algolia/utils/explorerViews.ts rename baker/algolia/{algoliaUtils.tsx => utils/pages.ts} (91%) create mode 100644 baker/algolia/utils/shared.ts create mode 100644 baker/algolia/utils/types.ts diff --git a/Makefile b/Makefile index e8ed24f3e9e..faa6f072e78 100644 --- a/Makefile +++ b/Makefile @@ -297,9 +297,10 @@ update.chart-entities: itsJustJavascript reindex: itsJustJavascript @echo '==> Reindexing search in Algolia' node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js - node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js + node --enable-source-maps itsJustJavascript/baker/algolia/indexPagesToAlgolia.js node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js + node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js delete-algolia-index: itsJustJavascript @echo '==> Deleting Algolia index' diff --git a/adminSiteServer/apiRouter.ts b/adminSiteServer/apiRouter.ts index 231a2314cda..7cd65bb867e 100644 --- a/adminSiteServer/apiRouter.ts +++ b/adminSiteServer/apiRouter.ts @@ -125,7 +125,7 @@ import { denormalizeLatestCountryData } from "../baker/countryProfiles.js" import { indexIndividualGdocPost, removeIndividualGdocPostFromIndex, -} from "../baker/algolia/algoliaUtils.js" +} from "../baker/algolia/utils/pages.js" import { References } from "../adminSiteClient/ChartEditor.js" import { DeployQueueServer } from "../baker/DeployQueueServer.js" import { FunctionalRouter } from "./FunctionalRouter.js" diff --git a/baker/algolia/indexChartsToAlgolia.ts b/baker/algolia/indexChartsToAlgolia.ts index 96dcc8b2b84..92b0572f2cf 100644 --- a/baker/algolia/indexChartsToAlgolia.ts +++ 
b/baker/algolia/indexChartsToAlgolia.ts @@ -1,227 +1,9 @@ import * as db from "../../db/db.js" import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" -import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js" -import { - ChartRecord, - ChartRecordType, - SearchIndexName, -} from "../../site/search/searchTypes.js" -import { - KeyChartLevel, - OwidGdocLinkType, - excludeNullish, - isNil, - countries, - orderBy, - removeTrailingParenthetical, - uniq, -} from "@ourworldindata/utils" -import { MarkdownTextWrap } from "@ourworldindata/components" -import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" -import { getRelatedArticles } from "../../db/model/Post.js" +import { SearchIndexName } from "../../site/search/searchTypes.js" import { getIndexName } from "../../site/search/searchClient.js" -import { getPublishedLinksTo } from "../../db/model/Link.js" - -const computeScore = (record: Omit): number => { - const { numRelatedArticles, views_7d } = record - return numRelatedArticles * 500 + views_7d -} - -const countriesWithVariantNames = new Set( - countries - .filter((country) => country.variantNames?.length || country.shortName) - .map((country) => country.name) -) - -const processAvailableEntities = (availableEntities: string[] | null) => { - if (!availableEntities) return [] - - // Algolia is a bit weird with synonyms: - // If we have a synonym "USA" -> "United States", and we search for "USA", - // then it seems that Algolia can only find that within `availableEntities` - // if "USA" is within the first 100-or-so entries of the array. - // So, the easy solution is to sort the entities to ensure that countries - // with variant names are at the top. - // Also, entities containing a hyphen like "low-income countries" can also - // only be found if they're within the first 100-or-so entries. 
- // - @marcelgerber, 2024-03-25 - return orderBy( - availableEntities, - [ - (entityName) => - countriesWithVariantNames.has( - removeTrailingParenthetical(entityName) - ), - (entityName) => entityName.includes("-"), - (entityName) => entityName, - ], - ["desc", "desc", "asc"] - ) -} - -interface RawChartRecordRow { - id: number - slug: string - title: string - variantName: string - subtitle: string - numDimensions: string - publishedAt: string - updatedAt: string - entityNames: string - tags: string - keyChartForTags: string -} - -interface ParsedChartRecordRow { - id: number - slug: string - title: string - variantName: string - subtitle: string - numDimensions: string - publishedAt: string - updatedAt: string - entityNames: string[] - tags: string[] - keyChartForTags: string[] -} - -const parseAndProcessChartRecords = ( - rawRecord: RawChartRecordRow -): ParsedChartRecordRow => { - let parsedEntities: string[] = [] - if (rawRecord.entityNames !== null) { - // This is a very rough way to check for the Algolia record size limit, but it's better than the update failing - // because we exceed the 20KB record size limit - if (rawRecord.entityNames.length < 12000) - parsedEntities = excludeNullish( - JSON.parse(rawRecord.entityNames as string) as (string | null)[] - ) as string[] - else { - console.info( - `Chart ${rawRecord.id} has too many entities, skipping its entities` - ) - } - } - const entityNames = processAvailableEntities(parsedEntities) - - const tags = JSON.parse(rawRecord.tags) - const keyChartForTags = JSON.parse( - rawRecord.keyChartForTags as string - ).filter((t: string | null) => t) - - return { - ...rawRecord, - entityNames, - tags, - keyChartForTags, - } -} - -export const getChartsRecords = async ( - knex: db.KnexReadonlyTransaction -): Promise => { - console.log("Fetching charts to index") - const chartsToIndex = await db.knexRaw( - knex, - `-- sql - WITH indexable_charts_with_entity_names AS ( - SELECT c.id, - cc.slug, - cc.full ->> "$.title" 
AS title, - cc.full ->> "$.variantName" AS variantName, - cc.full ->> "$.subtitle" AS subtitle, - JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions, - c.publishedAt, - c.updatedAt, - JSON_ARRAYAGG(e.name) AS entityNames - FROM charts c - LEFT JOIN chart_configs cc ON c.configId = cc.id - LEFT JOIN charts_x_entities ce ON c.id = ce.chartId - LEFT JOIN entities e ON ce.entityId = e.id - WHERE cc.full ->> "$.isPublished" = 'true' - AND c.isIndexable IS TRUE - GROUP BY c.id - -- TODO: remove this, testing only - -- LIMIT 15 - ) - SELECT c.id, - c.slug, - c.title, - c.variantName, - c.subtitle, - c.numDimensions, - c.publishedAt, - c.updatedAt, - c.entityNames, -- this array may contain null values, will have to filter these out - JSON_ARRAYAGG(t.name) AS tags, - JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out - FROM indexable_charts_with_entity_names c - LEFT JOIN chart_tags ct ON c.id = ct.chartId - LEFT JOIN tags t on ct.tagId = t.id - GROUP BY c.id - HAVING COUNT(t.id) >= 1 - ` - ) - - const parsedRows = chartsToIndex.map(parseAndProcessChartRecords) - - const pageviews = await getAnalyticsPageviewsByUrlObj(knex) - - const parentTagsByChildName = await db.getParentTagsByChildName(knex) - - const records: ChartRecord[] = [] - for (const c of parsedRows) { - // Our search currently cannot render explorers, so don't index them because - // otherwise they will fail when rendered in the search results - if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue - - const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? [] - const linksFromGdocs = await getPublishedLinksTo( - knex, - [c.slug], - OwidGdocLinkType.Grapher - ) - - const plaintextSubtitle = isNil(c.subtitle) - ? 
undefined - : new MarkdownTextWrap({ - text: c.subtitle, - fontSize: 10, // doesn't matter, but is a mandatory field - }).plaintext - - const parentTags = c.tags.flatMap( - // a chart can be tagged with a tag that isn't in the tag graph - (tag) => parentTagsByChildName[tag] || [] - ) - - const record = { - objectID: c.id.toString(), - type: ChartRecordType.Chart, - chartId: c.id, - slug: c.slug, - title: c.title, - variantName: c.variantName, - subtitle: plaintextSubtitle, - availableEntities: c.entityNames, - numDimensions: parseInt(c.numDimensions), - publishedAt: c.publishedAt, - updatedAt: c.updatedAt, - tags: uniq([...c.tags, ...parentTags]), - keyChartForTags: c.keyChartForTags as string[], - titleLength: c.title.length, - // Number of references to this chart in all our posts and pages - numRelatedArticles: relatedArticles.length + linksFromGdocs.length, - views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 0, - } - const score = computeScore(record) - records.push({ ...record, score }) - } - - return records -} +import { getChartsRecords } from "./utils/charts.js" const indexChartsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index 8d0e0e52226..10cf3b0fd12 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -1,3 +1,4 @@ +import fs from "fs/promises" import Bugsnag from "@bugsnag/js" import * as db from "../../db/db.js" import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" @@ -6,17 +7,15 @@ import { BUGSNAG_NODE_API_KEY, } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" -import { - ExplorerViewEntryWithExplorerInfo, - getExplorerViewRecords, -} from "./indexExplorerViewsToAlgolia.js" +import { ExplorerViewEntryWithExplorerInfo } from "./utils/types.js" +import { 
getExplorerViewRecords } from "./utils/explorerViews.js" +import { getChartsRecords } from "./utils/charts.js" import { getIndexName } from "../../site/search/searchClient.js" import { ChartRecord, ChartRecordType, SearchIndexName, } from "../../site/search/searchTypes.js" -import { getChartsRecords } from "./indexChartsToAlgolia.js" function explorerViewRecordToChartRecord( e: ExplorerViewEntryWithExplorerInfo @@ -69,7 +68,7 @@ function scaleExplorerScores( const normalized = e.score / explorerScoreMax const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min const scaled = Math.round( - normalized * grapherRange + grapherScoreBounds.min + (normalized / 2) * grapherRange + grapherScoreBounds.min ) return { ...e, @@ -122,7 +121,6 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { const records = [...scaledExplorerViews, ...grapherViews] const index = client.initIndex(indexName) - console.log(`Indexing ${records.length} records`) await index.replaceAllObjects(records) console.log(`Indexing complete`) diff --git a/baker/algolia/indexExplorerViewsToAlgolia.ts b/baker/algolia/indexExplorerViewsToAlgolia.ts index 4365cc221f1..3a31b4757ce 100644 --- a/baker/algolia/indexExplorerViewsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsToAlgolia.ts @@ -1,646 +1,15 @@ +import fs from "fs/promises" import * as db from "../../db/db.js" -import { - ExplorerChoiceParams, - ExplorerControlType, - GridBoolean, - DecisionMatrix, - TableDef, -} from "@ourworldindata/explorer" -import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" import { ALGOLIA_INDEXING, BUGSNAG_NODE_API_KEY, - DATA_API_URL, } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { getIndexName } from "../../site/search/searchClient.js" import { SearchIndexName } from "../../site/search/searchTypes.js" -import { at, get, groupBy, keyBy, mapValues, orderBy, partition } from "lodash" -import { MarkdownTextWrap } from 
"@ourworldindata/components" -import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" import Bugsnag from "@bugsnag/js" -import { obtainAvailableEntitiesForAllGraphers } from "../updateChartEntities.js" -import { fetchS3MetadataByPath } from "../../db/model/Variable.js" -import { getVariableMetadataRoute } from "@ourworldindata/grapher" -import pMap from "p-map" -import { ExplorerAdminServer } from "../../explorerAdminServer/ExplorerAdminServer.js" -import { GIT_CMS_DIR } from "../../gitCms/GitCmsConstants.js" -import { parseDelimited } from "@ourworldindata/core-table" -import { - ColumnTypeNames, - CoreRow, - DbEnrichedVariable, -} from "@ourworldindata/types" - -interface ExplorerViewEntry { - viewTitle: string - viewSubtitle: string - viewSettings: string[] - viewQueryParams: string - availableEntities: string[] - - viewGrapherId?: number - yVariableIds: Array // Variable IDs or ETL paths - tableSlug?: string - ySlugs: string[] - - /** - * We often have several views with the same title within an explorer, e.g. "Population". - * In order to only display _one_ of these views in search results, we need a way to demote duplicates. - * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. - */ - viewTitleIndexWithinExplorer: number - - // Potential ranking criteria - viewIndexWithinExplorer: number - titleLength: number - numNonDefaultSettings: number - // viewViews_7d: number -} - -export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { - explorerSlug: string - explorerTitle: string - explorerSubtitle: string - explorerViews_7d: number - viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug` - numViewsWithinExplorer: number - tags: string[] - - score: number - - objectID?: string -} - -// Creates a search-ready string from a choice. 
-// Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. -// Imagine a "Per capita" checkbox, for example. If it's not ticked, then we don't want searches for "per capita" to wrongfully match it. -const explorerChoiceToViewSettings = ( - choices: ExplorerChoiceParams, - decisionMatrix: DecisionMatrix -): string[] => { - return Object.entries(choices).map(([choiceName, choiceValue]) => { - const choiceControlType = - decisionMatrix.choiceNameToControlTypeMap.get(choiceName) - if (choiceControlType === ExplorerControlType.Checkbox) - return choiceValue === GridBoolean.true ? choiceName : "" - else return choiceValue - }) -} - -type ExplorerIndicatorMetadata = Record< - string | number, - { - entityNames?: string[] - display: DbEnrichedVariable["display"] - titlePublic: DbEnrichedVariable["titlePublic"] - descriptionShort: DbEnrichedVariable["descriptionShort"] - name: DbEnrichedVariable["name"] - } -> - -async function fetchIndicatorMetadata( - records: Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" | "titleLength" - >[], - trx: db.KnexReadonlyTransaction -): Promise { - function checkIsETLPath(idOrPath: string | number): idOrPath is string { - return typeof idOrPath === "string" - } - - const { etlPaths, ids } = records.reduce( - ({ etlPaths, ids }, record) => { - for (const yVariableId of record.yVariableIds) { - if (checkIsETLPath(yVariableId)) { - etlPaths.add(yVariableId) - } else { - ids.add(yVariableId) - } - } - return { etlPaths, ids } - }, - { etlPaths: new Set(), ids: new Set() } - ) - - const metadataFromDB = ( - await trx - .table("variables") - .select( - "id", - "catalogPath", - "name", - "titlePublic", - "display", - "name", - "descriptionShort" - ) - .whereIn("id", [...ids]) - .orWhereIn("catalogPath", [...etlPaths]) - ).map((row) => ({ - ...row, - display: row.display ? 
JSON.parse(row.display) : {}, - })) as DbEnrichedVariable[] - - const indicatorMetadataByIdAndPath = { - ...keyBy(metadataFromDB, "id"), - ...keyBy(metadataFromDB, "catalogPath"), - } as ExplorerIndicatorMetadata - - async function fetchEntitiesForId(id?: number) { - if (id) { - const metadata = await fetchS3MetadataByPath( - getVariableMetadataRoute(DATA_API_URL, id) - ) - const entityNames = get(metadata, "dimensions.entities.values", []) - .map((value) => value.name) - .filter((name): name is string => !!name) - - const idEntry = indicatorMetadataByIdAndPath[id] - if (idEntry) { - idEntry.entityNames = entityNames - } - const path = metadata.catalogPath - if (path) { - const pathEntry = indicatorMetadataByIdAndPath[path] - if (pathEntry) { - pathEntry.entityNames = entityNames - } - } - } - } - - await pMap( - metadataFromDB.map((meta) => meta.id), - fetchEntitiesForId, - { concurrency: 10 } - ) - - return indicatorMetadataByIdAndPath -} - -/** Almost always `"country"`, but sometimes things like `"location"` */ -function getEntityNameSlug(tableDef: TableDef): string { - return ( - tableDef.columnDefinitions?.find( - (col) => col.type === ColumnTypeNames.EntityName - )?.slug || "country" - ) -} - -/** - * Returns an aggregator function that can be used to aggregate entities per column in a parsed CSV - * e.g. if there's a column named "gdp", this will return an object like `{ gdp: Set }` - * containing all the entities that have any data for gdp. 
- */ -function makeAggregator(entityNameSlug: string) { - return ( - result: Record>, - row: Record - ) => { - const entityName = row[entityNameSlug] - Object.keys(row).forEach((columnSlug) => { - if (columnSlug === entityNameSlug || columnSlug === "year") return - - const value = row[columnSlug] - if (value) { - if (!result[columnSlug]) { - result[columnSlug] = new Set() - } - result[columnSlug].add(entityName) - } - }) - - return result - } -} - -async function getEntitiesPerColumnPerTable( - tableDefs: TableDef[] -): Promise>> { - return pMap( - tableDefs, - (tableDef) => { - console.log("Fetching CSV table data from", tableDef.url) - return fetch(tableDef.url!) - .then((res) => res.text()) - .then((csv) => parseDelimited(csv)) - .then((parsed) => { - const entityNameSlug = getEntityNameSlug(tableDef) - const aggregateEntities = makeAggregator(entityNameSlug) - const entitiesPerColumn = parsed.reduce( - aggregateEntities, - {} - ) - - // Convert sets to arrays - const entityNamesAsArray = mapValues( - entitiesPerColumn, - (set) => Array.from(set) - ) as Record - - // Return an object like `{ almonds: { population: ["United States", "Canada"], area_harvested__ha: ["United States"] } }` - return { [tableDef.slug!]: entityNamesAsArray } - }) - }, - { - concurrency: 5, - } - // Merge all these objects together - ).then((results) => Object.assign({}, ...results)) -} - -const computeScore = ( - record: Omit & - Partial -) => - (record.explorerViews_7d ?? 0) * 10 - - record.numNonDefaultSettings * 50 - - record.titleLength - -interface IndicatorMetadata { - entityNames: string[] - titlePublic?: string - display?: { name: string } - name: string - descriptionShort?: string -} - -interface GrapherInfo { - id: number - title: string - subtitle: string -} - -const parseYVariableIds = (matrixRow: CoreRow): (string | number)[] => { - return ( - matrixRow.yVariableIds - ?.trim() - .split(" ") - .map((idOrPath: string) => - isNaN(parseInt(idOrPath)) ? 
idOrPath : parseInt(idOrPath) - ) || [] - ) -} - -const getNonDefaultSettings = ( - choice: ExplorerChoiceParams, - matrix: DecisionMatrix -): [string, any][] => { - const defaultSettings = matrix.defaultSettings - return Object.entries(matrix.availableChoiceOptions).filter( - ([choiceName, choiceOptions]) => { - return ( - choiceOptions.length > 1 && - !(defaultSettings[choiceName] !== undefined - ? defaultSettings[choiceName] === choice[choiceName] - : choice[choiceName] === choiceOptions[0]) - ) - } - ) -} - -const createBaseRecord = ( - choice: ExplorerChoiceParams, - matrix: DecisionMatrix, - index: number -): Partial => { - matrix.setValuesFromChoiceParams(choice) - const nonDefaultSettings = getNonDefaultSettings(choice, matrix) - const yVariableIds = parseYVariableIds(matrix.selectedRow) - - return { - viewTitle: matrix.selectedRow.title, - viewSubtitle: matrix.selectedRow.subtitle, - viewSettings: explorerChoiceToViewSettings(choice, matrix), - availableEntities: [], - viewGrapherId: matrix.selectedRow.grapherId, - yVariableIds, - viewQueryParams: matrix.toString(), - viewIndexWithinExplorer: index, - numNonDefaultSettings: nonDefaultSettings.length, - tableSlug: matrix.selectedRow.tableSlug, - ySlugs: matrix.selectedRow.ySlugs?.split(" ") || [], - } -} - -const createBaseRecords = ( - matrix: DecisionMatrix -): Partial[] => { - return matrix - .allDecisionsAsQueryParams() - .map((choice: ExplorerChoiceParams, index: number) => - createBaseRecord(choice, matrix, index) - ) -} - -const fetchGrapherInfo = async ( - trx: db.KnexReadonlyTransaction, - grapherIds: number[] -): Promise> => { - return await trx - .select( - trx.raw("charts.id as id"), - trx.raw("chart_configs.full->>'$.title' as title"), - trx.raw("chart_configs.full->>'$.subtitle' as subtitle") - ) - .from("charts") - .join("chart_configs", { "charts.configId": "chart_configs.id" }) - .whereIn("charts.id", grapherIds) - .andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") - .then((rows) 
=> keyBy(rows, "id")) -} - -const enrichRecordWithGrapherInfo = ( - record: Partial, - grapherInfo: Record, - availableEntities: Map, - slug: string -): Partial => { - if (!record.viewGrapherId) return record - - const grapher = grapherInfo[record.viewGrapherId] - if (!grapher) { - console.warn( - `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` - ) - return record - } - - return { - ...record, - availableEntities: - availableEntities.get(record.viewGrapherId)?.availableEntities ?? - [], - viewTitle: grapher.title, - viewSubtitle: grapher.subtitle, - } -} - -const enrichWithGrapherData = async ( - records: Partial[], - trx: db.KnexReadonlyTransaction, - slug: string -): Promise[]> => { - const grapherIds = records - .filter((record) => record.viewGrapherId !== undefined) - .map((record) => record.viewGrapherId as number) - - if (!grapherIds.length) return records - - console.log( - `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` - ) - const grapherInfo = await fetchGrapherInfo(trx, grapherIds) - const availableEntities = await obtainAvailableEntitiesForAllGraphers( - trx, - grapherIds - ) - - return records.map((record) => - enrichRecordWithGrapherInfo( - record, - grapherInfo, - availableEntities, - slug - ) - ) -} - -const enrichRecordWithTableData = ( - record: Partial, - entitiesPerColumnPerTable: Record> -): Partial => { - const { tableSlug, ySlugs } = record - if (!tableSlug || !ySlugs?.length) return record - - const availableEntities = ySlugs - .flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) - .filter((name, i, array) => array.indexOf(name) === i) - - return { ...record, availableEntities } -} - -const enrichRecordWithIndicatorData = ( - record: Partial, - indicatorMetadata: Record -): Partial => { - if (!record.yVariableIds?.length) return record - - const allEntities = at(indicatorMetadata, record.yVariableIds) - .flatMap((meta) => meta.entityNames) - .filter( - (name, i, array): name 
is string => - array.indexOf(name) === i && !!name - ) - - const result = { ...record, availableEntities: allEntities } - - const firstYIndicator = record.yVariableIds[0] - if (firstYIndicator === undefined) return result - - const indicatorInfo = indicatorMetadata[firstYIndicator] - if (!indicatorInfo) return result - - return { - ...result, - viewTitle: - record.viewTitle ?? - indicatorInfo.titlePublic ?? - indicatorInfo.display?.name ?? - indicatorInfo.name, - viewSubtitle: record.viewSubtitle ?? indicatorInfo.descriptionShort, - } -} - -const enrichWithMetadata = async ( - records: Partial[], - indicatorMetadata: Record, - entitiesPerColumnPerTable: Record> -): Promise[]> => { - return records.map((record) => { - const withTableData = enrichRecordWithTableData( - record, - entitiesPerColumnPerTable - ) - return enrichRecordWithIndicatorData(withTableData, indicatorMetadata) - }) -} - -const cleanSubtitles = ( - records: Partial[] -): Partial[] => { - return records.map((record) => ({ - ...record, - viewSubtitle: record.viewSubtitle - ? 
new MarkdownTextWrap({ - text: record.viewSubtitle, - fontSize: 10, - }).plaintext - : undefined, - })) -} - -async function logMissingTitles( - records: Partial[], - slug: string -): Promise { - for (const record of records) { - await logErrorAndMaybeSendToBugsnag({ - name: "ExplorerViewTitleMissing", - message: `Explorer ${slug} has a view with no title: ${record.viewQueryParams}.`, - }) - } -} - -async function finalizeRecords( - records: Partial[], - slug: string -): Promise { - const [withTitle, withoutTitle] = partition( - records, - (record) => record.viewTitle !== undefined - ) - - await logMissingTitles(withoutTitle, slug) - - const withCleanSubtitles = cleanSubtitles(withTitle) - const withTitleLength = withCleanSubtitles.map((record) => ({ - ...record, - titleLength: record.viewTitle!.length, - })) as Omit[] - - const sortedByScore = orderBy( - withTitleLength, - computeScore, - "desc" - ) as Omit[] - - const groupedByTitle = groupBy(sortedByScore, "viewTitle") - - return Object.values(groupedByTitle).flatMap((group, i) => - group.map((record) => ({ - ...record, - viewTitleIndexWithinExplorer: i, - })) - ) -} - -export const getExplorerViewRecordsForExplorerSlug = async ( - trx: db.KnexReadonlyTransaction, - slug: string, - explorerAdminServer: ExplorerAdminServer -): Promise => { - // Get explorer program and table definitions - const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) - const tableDefs = explorerProgram.tableSlugs - .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) - .filter((x) => x && x.url && x.slug) as TableDef[] - - // Fetch and process CSV table data - console.log( - `Fetching CSV table data for ${slug} and aggregating entities by column` - ) - const entitiesPerColumnPerTable = - await getEntitiesPerColumnPerTable(tableDefs) - console.log( - "Finished fetching CSV table data and aggregating entities by column" - ) - - // Create base records from decision matrix - console.log( - `Processing explorer 
${slug} (${explorerProgram.decisionMatrix.numRows} rows)` - ) - const baseRecords = createBaseRecords(explorerProgram.decisionMatrix) - - // Enrich with grapher data - const recordsWithGrapherData = await enrichWithGrapherData( - baseRecords, - trx, - slug - ) - - // Fetch and apply indicator metadata - console.log("Fetching indicator metadata for explorer", slug) - const indicatorMetadata = await fetchIndicatorMetadata( - recordsWithGrapherData as any, - trx - ) - console.log("Fetched indicator metadata for explorer", slug) - - const enrichedRecords = await enrichWithMetadata( - recordsWithGrapherData, - indicatorMetadata as any, - entitiesPerColumnPerTable - ) - - // Finalize records with titles, sorting, and grouping - return finalizeRecords(enrichedRecords, slug) -} - -async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { - const explorersBySlug = await db.getPublishedExplorersBySlug(trx) - const parentTags = await db.getParentTagsByChildName(trx) - const publishedExplorersWithTags = Object.values(explorersBySlug).map( - (explorer) => ({ - ...explorer, - tags: explorer.tags - .flatMap((tag) => [tag.name, ...parentTags[tag.name]]) - .filter( - (tag, index, array) => !!tag && array.indexOf(tag) === index - ), - }) - ) - return publishedExplorersWithTags -} - -export const getExplorerViewRecords = async ( - trx: db.KnexReadonlyTransaction -): Promise => { - console.log("Fetching explorer views to index") - const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) - const pageviews = await getAnalyticsPageviewsByUrlObj(trx) - - const explorerAdminServer = new ExplorerAdminServer(GIT_CMS_DIR) - - let records = [] as ExplorerViewEntryWithExplorerInfo[] - for (const explorerInfo of publishedExplorersWithTags) { - const explorerViewRecords = await getExplorerViewRecordsForExplorerSlug( - trx, - explorerInfo.slug, - explorerAdminServer - ) - - const explorerPageviews = get( - pageviews, - [`/explorers/${explorerInfo.slug}`, 
"views_7d"], - 0 - ) - // These have a score for ranking purposes, but it doesn't yet factor in the explorer's pageviews - const unscoredRecords = explorerViewRecords.map( - (record, i): Omit => ({ - ...record, - explorerSlug: explorerInfo.slug, - explorerTitle: explorerInfo.title, - explorerSubtitle: explorerInfo.subtitle, - explorerViews_7d: explorerPageviews, - viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, - numViewsWithinExplorer: explorerViewRecords.length, - tags: explorerInfo.tags, - objectID: `${explorerInfo.slug}-${i}`, - }) - ) - records = records.concat( - unscoredRecords.map((record) => ({ - ...record, - score: computeScore(record), - })) - ) - } - - return records -} +import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" +import { getExplorerViewRecords } from "./utils/explorerViews.js" const indexExplorerViewsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return diff --git a/baker/algolia/indexToAlgolia.tsx b/baker/algolia/indexPagesToAlgolia.tsx similarity index 88% rename from baker/algolia/indexToAlgolia.tsx rename to baker/algolia/indexPagesToAlgolia.tsx index 00e7044df9a..703f618c73b 100644 --- a/baker/algolia/indexToAlgolia.tsx +++ b/baker/algolia/indexPagesToAlgolia.tsx @@ -3,9 +3,9 @@ import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { SearchIndexName } from "../../site/search/searchTypes.js" import { getIndexName } from "../../site/search/searchClient.js" -import { getPagesRecords } from "./algoliaUtils.js" +import { getPagesRecords } from "./utils/pages.js" -const indexToAlgolia = async () => { +const indexPagesToAlgolia = async () => { if (!ALGOLIA_INDEXING) return const client = getAlgoliaClient() @@ -31,4 +31,4 @@ process.on("unhandledRejection", (e) => { process.exit(1) }) -void indexToAlgolia() +void indexPagesToAlgolia() diff --git a/baker/algolia/utils/charts.ts b/baker/algolia/utils/charts.ts new file 
mode 100644 index 00000000000..51f36eb0280 --- /dev/null +++ b/baker/algolia/utils/charts.ts @@ -0,0 +1,153 @@ +import { isNil, uniq } from "lodash" +import { MarkdownTextWrap } from "@ourworldindata/components" +import { KeyChartLevel, OwidGdocLinkType } from "@ourworldindata/types" +import * as db from "../../../db/db.js" +import { + ChartRecord, + ChartRecordType, +} from "../../../site/search/searchTypes.js" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { getRelatedArticles } from "../../../db/model/Post.js" +import { getPublishedLinksTo } from "../../../db/model/Link.js" +import { isPathRedirectedToExplorer } from "../../../explorerAdminServer/ExplorerRedirects.js" +import { ParsedChartRecordRow, RawChartRecordRow } from "./types.js" +import { excludeNullish } from "@ourworldindata/utils" +import { processAvailableEntities } from "./shared.js" + +const computeChartScore = (record: Omit): number => { + const { numRelatedArticles, views_7d } = record + return numRelatedArticles * 500 + views_7d +} + +const parseAndProcessChartRecords = ( + rawRecord: RawChartRecordRow +): ParsedChartRecordRow => { + let parsedEntities: string[] = [] + if (rawRecord.entityNames !== null) { + // This is a very rough way to check for the Algolia record size limit, but it's better than the update failing + // because we exceed the 20KB record size limit + if (rawRecord.entityNames.length < 12000) + parsedEntities = excludeNullish( + JSON.parse(rawRecord.entityNames as string) as (string | null)[] + ) as string[] + else { + console.info( + `Chart ${rawRecord.id} has too many entities, skipping its entities` + ) + } + } + const entityNames = processAvailableEntities(parsedEntities) + + const tags = JSON.parse(rawRecord.tags) + const keyChartForTags = JSON.parse( + rawRecord.keyChartForTags as string + ).filter((t: string | null) => t) + + return { + ...rawRecord, + entityNames, + tags, + keyChartForTags, + } +} + +export const getChartsRecords 
= async ( + knex: db.KnexReadonlyTransaction +): Promise => { + console.log("Fetching charts to index") + const chartsToIndex = await db.knexRaw( + knex, + `-- sql + WITH indexable_charts_with_entity_names AS ( + SELECT c.id, + cc.slug, + cc.full ->> "$.title" AS title, + cc.full ->> "$.variantName" AS variantName, + cc.full ->> "$.subtitle" AS subtitle, + JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions, + c.publishedAt, + c.updatedAt, + JSON_ARRAYAGG(e.name) AS entityNames + FROM charts c + LEFT JOIN chart_configs cc ON c.configId = cc.id + LEFT JOIN charts_x_entities ce ON c.id = ce.chartId + LEFT JOIN entities e ON ce.entityId = e.id + WHERE cc.full ->> "$.isPublished" = 'true' + AND c.isIndexable IS TRUE + GROUP BY c.id + ) + SELECT c.id, + c.slug, + c.title, + c.variantName, + c.subtitle, + c.numDimensions, + c.publishedAt, + c.updatedAt, + c.entityNames, -- this array may contain null values, will have to filter these out + JSON_ARRAYAGG(t.name) AS tags, + JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out + FROM indexable_charts_with_entity_names c + LEFT JOIN chart_tags ct ON c.id = ct.chartId + LEFT JOIN tags t on ct.tagId = t.id + GROUP BY c.id + HAVING COUNT(t.id) >= 1 + ` + ) + + const parsedRows = chartsToIndex.map(parseAndProcessChartRecords) + + const pageviews = await getAnalyticsPageviewsByUrlObj(knex) + + const parentTagsByChildName = await db.getParentTagsByChildName(knex) + + const records: ChartRecord[] = [] + for (const c of parsedRows) { + // Our search currently cannot render explorers, so don't index them because + // otherwise they will fail when rendered in the search results + if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue + + const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? 
[] + const linksFromGdocs = await getPublishedLinksTo( + knex, + [c.slug], + OwidGdocLinkType.Grapher + ) + + const plaintextSubtitle = isNil(c.subtitle) + ? undefined + : new MarkdownTextWrap({ + text: c.subtitle, + fontSize: 10, // doesn't matter, but is a mandatory field + }).plaintext + + const parentTags = c.tags.flatMap( + // a chart can be tagged with a tag that isn't in the tag graph + (tag) => parentTagsByChildName[tag] || [] + ) + + const record = { + objectID: c.id.toString(), + type: ChartRecordType.Chart, + chartId: c.id, + slug: c.slug, + title: c.title, + variantName: c.variantName, + subtitle: plaintextSubtitle, + availableEntities: c.entityNames, + numDimensions: parseInt(c.numDimensions), + publishedAt: c.publishedAt, + updatedAt: c.updatedAt, + tags: uniq([...c.tags, ...parentTags]), + keyChartForTags: c.keyChartForTags as string[], + titleLength: c.title.length, + // Number of references to this chart in all our posts and pages + numRelatedArticles: relatedArticles.length + linksFromGdocs.length, + views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 
0, + } + const score = computeChartScore(record) + records.push({ ...record, score }) + } + + return records +} diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts new file mode 100644 index 00000000000..ed8047f5d3a --- /dev/null +++ b/baker/algolia/utils/explorerViews.ts @@ -0,0 +1,639 @@ +import fs from "fs/promises" +import { + ExplorerChoiceParams, + ExplorerControlType, + GridBoolean, + DecisionMatrix, + TableDef, +} from "@ourworldindata/explorer" +import { at, get, groupBy, mapValues, orderBy, partition } from "lodash" +import { MarkdownTextWrap } from "@ourworldindata/components" +import { logErrorAndMaybeSendToBugsnag } from "../../../serverUtils/errorLog.js" +import { obtainAvailableEntitiesForAllGraphers } from "../../updateChartEntities.js" +import { fetchS3MetadataByPath } from "../../../db/model/Variable.js" +import { getVariableMetadataRoute } from "@ourworldindata/grapher" +import pMap from "p-map" +import { ExplorerAdminServer } from "../../../explorerAdminServer/ExplorerAdminServer.js" +import { GIT_CMS_DIR } from "../../../gitCms/GitCmsConstants.js" +import { parseDelimited } from "@ourworldindata/core-table" +import { + ColumnTypeNames, + CoreRow, + DbEnrichedVariable, +} from "@ourworldindata/types" + +import * as db from "../../../db/db.js" +import { DATA_API_URL } from "../../../settings/serverSettings.js" +import { keyBy } from "@ourworldindata/utils" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { + ExplorerViewEntry, + ExplorerViewEntryWithExplorerInfo, + GrapherInfo, + IndicatorMetadata, +} from "./types.js" +import { processAvailableEntities } from "./shared.js" + +// Creates a search-ready string from a choice. +// Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. +// Imagine a "Per capita" checkbox, for example. 
If it's not ticked, then we don't want searches for "per capita" to wrongfully match it. +const explorerChoiceToViewSettings = ( + choices: ExplorerChoiceParams, + decisionMatrix: DecisionMatrix +): string[] => { + return Object.entries(choices).map(([choiceName, choiceValue]) => { + const choiceControlType = + decisionMatrix.choiceNameToControlTypeMap.get(choiceName) + if (choiceControlType === ExplorerControlType.Checkbox) + return choiceValue === GridBoolean.true ? choiceName : "" + else return choiceValue + }) +} + +type ExplorerIndicatorMetadata = Record< + string | number, + { + entityNames?: string[] + display: DbEnrichedVariable["display"] + titlePublic: DbEnrichedVariable["titlePublic"] + descriptionShort: DbEnrichedVariable["descriptionShort"] + name: DbEnrichedVariable["name"] + } +> + +async function fetchIndicatorMetadata( + records: Omit< + ExplorerViewEntry, + "viewTitleIndexWithinExplorer" | "titleLength" + >[], + trx: db.KnexReadonlyTransaction +): Promise { + function checkIsETLPath(idOrPath: string | number): idOrPath is string { + return typeof idOrPath === "string" + } + + const { etlPaths, ids } = records.reduce( + ({ etlPaths, ids }, record) => { + for (const yVariableId of record.yVariableIds) { + if (checkIsETLPath(yVariableId)) { + etlPaths.add(yVariableId) + } else { + ids.add(yVariableId) + } + } + return { etlPaths, ids } + }, + { etlPaths: new Set(), ids: new Set() } + ) + + const metadataFromDB = ( + await trx + .table("variables") + .select( + "id", + "catalogPath", + "name", + "titlePublic", + "display", + "name", + "descriptionShort" + ) + .whereIn("id", [...ids]) + .orWhereIn("catalogPath", [...etlPaths]) + ).map((row) => ({ + ...row, + display: row.display ? 
JSON.parse(row.display) : {}, + })) as DbEnrichedVariable[] + + const indicatorMetadataByIdAndPath = { + ...keyBy(metadataFromDB, "id"), + ...keyBy(metadataFromDB, "catalogPath"), + } as ExplorerIndicatorMetadata + + async function fetchEntitiesForId(id?: number) { + if (id) { + const metadata = await fetchS3MetadataByPath( + getVariableMetadataRoute(DATA_API_URL, id) + ) + const entityNames = get(metadata, "dimensions.entities.values", []) + .map((value) => value.name) + .filter((name): name is string => !!name) + + const idEntry = indicatorMetadataByIdAndPath[id] + if (idEntry) { + idEntry.entityNames = entityNames + } + const path = metadata.catalogPath + if (path) { + const pathEntry = indicatorMetadataByIdAndPath[path] + if (pathEntry) { + pathEntry.entityNames = entityNames + } + } + } + } + + await pMap( + metadataFromDB.map((meta) => meta.id), + fetchEntitiesForId, + { concurrency: 10 } + ) + + return indicatorMetadataByIdAndPath +} + +/** Almost always `"country"`, but sometimes things like `"location"` */ +function getEntityNameSlug(tableDef: TableDef): string { + return ( + tableDef.columnDefinitions?.find( + (col) => col.type === ColumnTypeNames.EntityName + )?.slug || "country" + ) +} + +/** + * Returns an aggregator function that can be used to aggregate entities per column in a parsed CSV + * e.g. if there's a column named "gdp", this will return an object like `{ gdp: Set }` + * containing all the entities that have any data for gdp. 
+ */ +function makeAggregator(entityNameSlug: string) { + return ( + result: Record>, + row: Record + ) => { + const entityName = row[entityNameSlug] + Object.keys(row).forEach((columnSlug) => { + if (columnSlug === entityNameSlug || columnSlug === "year") return + + const value = row[columnSlug] + if (value) { + if (!result[columnSlug]) { + result[columnSlug] = new Set() + } + result[columnSlug].add(entityName) + } + }) + + return result + } +} + +async function getEntitiesPerColumnPerTable( + tableDefs: TableDef[] +): Promise>> { + return pMap( + tableDefs, + (tableDef) => { + console.log("Fetching CSV table data from", tableDef.url) + return fetch(tableDef.url!) + .then((res) => res.text()) + .then((csv) => parseDelimited(csv)) + .then((parsed) => { + const entityNameSlug = getEntityNameSlug(tableDef) + const aggregateEntities = makeAggregator(entityNameSlug) + const entitiesPerColumn = parsed.reduce( + aggregateEntities, + {} + ) + + // Convert sets to arrays + const entityNamesAsArray = mapValues( + entitiesPerColumn, + (set) => Array.from(set) + ) as Record + + // Return an object like `{ almonds: { population: ["United States", "Canada"], area_harvested__ha: ["United States"] } }` + return { [tableDef.slug!]: entityNamesAsArray } + }) + }, + { + concurrency: 5, + } + // Merge all these objects together + ).then((results) => Object.assign({}, ...results)) +} + +const computeExplorerViewScore = ( + record: Omit & + Partial +) => + (record.explorerViews_7d ?? 0) * 10 - + record.numNonDefaultSettings * 50 - + record.titleLength + +const parseYVariableIds = (matrixRow: CoreRow): (string | number)[] => { + return ( + matrixRow.yVariableIds + ?.trim() + .split(" ") + .map((idOrPath: string) => + isNaN(parseInt(idOrPath)) ? 
idOrPath : parseInt(idOrPath) + ) || [] + ) +} + +const getNonDefaultSettings = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix +): [string, any][] => { + const defaultSettings = matrix.defaultSettings + return Object.entries(matrix.availableChoiceOptions).filter( + ([choiceName, choiceOptions]) => { + return ( + choiceOptions.length > 1 && + !(defaultSettings[choiceName] !== undefined + ? defaultSettings[choiceName] === choice[choiceName] + : choice[choiceName] === choiceOptions[0]) + ) + } + ) +} + +const createBaseRecord = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix, + index: number +): Partial => { + matrix.setValuesFromChoiceParams(choice) + const nonDefaultSettings = getNonDefaultSettings(choice, matrix) + const yVariableIds = parseYVariableIds(matrix.selectedRow) + + return { + viewTitle: matrix.selectedRow.title, + viewSubtitle: matrix.selectedRow.subtitle, + viewSettings: explorerChoiceToViewSettings(choice, matrix), + availableEntities: [], + viewGrapherId: matrix.selectedRow.grapherId, + yVariableIds, + viewQueryParams: matrix.toString(), + viewIndexWithinExplorer: index, + numNonDefaultSettings: nonDefaultSettings.length, + tableSlug: matrix.selectedRow.tableSlug, + ySlugs: matrix.selectedRow.ySlugs?.split(" ") || [], + } +} + +const createBaseRecords = ( + matrix: DecisionMatrix +): Partial[] => { + return ( + matrix + .allDecisionsAsQueryParams() + // TODO: remove me, testing only + .slice(0, 5) + .map((choice: ExplorerChoiceParams, index: number) => + createBaseRecord(choice, matrix, index) + ) + ) +} + +const fetchGrapherInfo = async ( + trx: db.KnexReadonlyTransaction, + grapherIds: number[] +): Promise> => { + return await trx + .select( + trx.raw("charts.id as id"), + trx.raw("chart_configs.full->>'$.title' as title"), + trx.raw("chart_configs.full->>'$.subtitle' as subtitle") + ) + .from("charts") + .join("chart_configs", { "charts.configId": "chart_configs.id" }) + .whereIn("charts.id", grapherIds) + 
.andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") + .then((rows) => keyBy(rows, "id")) +} + +const enrichRecordWithGrapherInfo = ( + record: Partial, + grapherInfo: Record, + availableEntities: Map, + slug: string +): Partial => { + if (!record.viewGrapherId) return record + + const grapher = grapherInfo[record.viewGrapherId] + if (!grapher) { + console.warn( + `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` + ) + return record + } + + return { + ...record, + availableEntities: + availableEntities.get(record.viewGrapherId)?.availableEntities ?? + [], + viewTitle: grapher.title, + viewSubtitle: grapher.subtitle, + } +} + +const enrichWithGrapherData = async ( + records: Partial[], + trx: db.KnexReadonlyTransaction, + slug: string +): Promise[]> => { + const grapherIds = records + .filter((record) => record.viewGrapherId !== undefined) + .map((record) => record.viewGrapherId as number) + + if (!grapherIds.length) return records + + console.log( + `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` + ) + const grapherInfo = await fetchGrapherInfo(trx, grapherIds) + const availableEntities = await obtainAvailableEntitiesForAllGraphers( + trx, + grapherIds + ) + + return records.map((record) => + enrichRecordWithGrapherInfo( + record, + grapherInfo, + availableEntities, + slug + ) + ) +} + +const enrichRecordWithTableData = ( + record: Partial, + entitiesPerColumnPerTable: Record> +): Partial => { + const { tableSlug, ySlugs } = record + if (!tableSlug || !ySlugs?.length) return record + + // console.log("tableSlug", tableSlug) + // console.log("ySlugs", ySlugs) + // console.log("entitiesPerColumnPerTable", entitiesPerColumnPerTable) + const availableEntities = ySlugs + .flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) + .filter((name, i, array) => !!name && array.indexOf(name) === i) + + return { ...record, availableEntities } +} + +const enrichRecordWithIndicatorData = ( + record: Partial, + 
indicatorMetadata: Record +): Partial => { + if (!record.yVariableIds?.length) return record + + const allEntities = at(indicatorMetadata, record.yVariableIds) + .flatMap((meta) => meta.entityNames) + .filter( + (name, i, array): name is string => + array.indexOf(name) === i && !!name + ) + + const result = { ...record, availableEntities: allEntities } + + const firstYIndicator = record.yVariableIds[0] + if (firstYIndicator === undefined) return result + + const indicatorInfo = indicatorMetadata[firstYIndicator] + if (!indicatorInfo) return result + + return { + ...result, + viewTitle: + record.viewTitle ?? + indicatorInfo.titlePublic ?? + indicatorInfo.display?.name ?? + indicatorInfo.name, + viewSubtitle: record.viewSubtitle ?? indicatorInfo.descriptionShort, + } +} + +const enrichWithMetadata = async ( + records: Partial[], + indicatorMetadata: Record, + entitiesPerColumnPerTable: Record> +): Promise[]> => { + return records.map((record) => { + const withTableData = enrichRecordWithTableData( + record, + entitiesPerColumnPerTable + ) + return enrichRecordWithIndicatorData(withTableData, indicatorMetadata) + }) +} + +const cleanSubtitles = ( + records: Partial[] +): Partial[] => { + return records.map((record) => ({ + ...record, + viewSubtitle: record.viewSubtitle + ? 
new MarkdownTextWrap({ + text: record.viewSubtitle, + fontSize: 10, + }).plaintext + : undefined, + })) +} + +async function logMissingTitles( + records: Partial[], + slug: string +): Promise { + for (const record of records) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewTitleMissing", + message: `Explorer ${slug} has a view with no title: ${record.viewQueryParams}.`, + }) + } +} + +async function finalizeRecords( + records: Partial[], + slug: string +): Promise { + const [withTitle, withoutTitle] = partition( + records, + (record) => record.viewTitle !== undefined + ) + + await logMissingTitles(withoutTitle, slug) + + const withCleanSubtitles = cleanSubtitles(withTitle) + + const withTitleLength = withCleanSubtitles.map((record) => ({ + ...record, + titleLength: record.viewTitle!.length, + })) as Omit[] + + const withCleanedEntities = [] as Omit< + ExplorerViewEntry, + "viewTitleIndexWithinExplorer" + >[] + + for (const record of withTitleLength) { + const cleanedEntities = processAvailableEntities( + record.availableEntities + ) + if (!cleanedEntities.length) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewNoEntities", + message: `Explorer ${slug} has a view with no entities: ${record.viewQueryParams}.`, + }) + } + withCleanedEntities.push({ + ...record, + availableEntities: cleanedEntities, + }) + } + + const sortedByScore = orderBy( + withCleanedEntities, + computeExplorerViewScore, + "desc" + ) as Omit[] + + const groupedByTitle = groupBy(sortedByScore, "viewTitle") + + return Object.values(groupedByTitle).flatMap((group, i) => + group.map((record) => ({ + ...record, + viewTitleIndexWithinExplorer: i, + })) + ) +} + +export const getExplorerViewRecordsForExplorerSlug = async ( + trx: db.KnexReadonlyTransaction, + slug: string, + explorerAdminServer: ExplorerAdminServer +): Promise => { + // Get explorer program and table definitions + const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) + const tableDefs = 
explorerProgram.tableSlugs + .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) + .filter((x) => x && x.url && x.slug) as TableDef[] + + // Fetch and process CSV table data + console.log( + `Fetching CSV table data for ${slug} and aggregating entities by column` + ) + const entitiesPerColumnPerTable = + await getEntitiesPerColumnPerTable(tableDefs) + console.log( + "Finished fetching CSV table data and aggregating entities by column" + ) + + // Create base records from decision matrix + console.log( + `Processing explorer ${slug} (${explorerProgram.decisionMatrix.numRows} rows)` + ) + const baseRecords = createBaseRecords(explorerProgram.decisionMatrix) + + // Enrich with grapher data + const recordsWithGrapherData = await enrichWithGrapherData( + baseRecords, + trx, + slug + ) + + // Fetch and apply indicator metadata + console.log("Fetching indicator metadata for explorer", slug) + const indicatorMetadata = await fetchIndicatorMetadata( + recordsWithGrapherData as any, + trx + ) + console.log("Fetched indicator metadata for explorer", slug) + + const enrichedRecords = await enrichWithMetadata( + recordsWithGrapherData, + indicatorMetadata as any, + entitiesPerColumnPerTable + ) + + // Finalize records with titles, sorting, and grouping + return finalizeRecords(enrichedRecords, slug) +} + +async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { + const explorersBySlug = await db.getPublishedExplorersBySlug(trx) + const parentTags = await db.getParentTagsByChildName(trx) + const publishedExplorersWithTags = [] + + for (const explorer of Object.values(explorersBySlug).filter( + // TODO: testing, remove this + (e) => e.slug === "fish-stocks" + )) { + if (!explorer.tags.length) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerTagMissing", + message: `Explorer ${explorer.slug} has no tags.`, + }) + } + const tags = new Set() + // The DB query gets the tags for the explorer, but we need to add the parent tags as well. 
+ // This isn't done in the query because it would require a recursive CTE. + // It's easier to write that query once, separately, and reuse it + for (const tag of explorer.tags) { + tags.add(tag.name) + for (const parentTag of parentTags[tag.name]) { + tags.add(parentTag) + } + } + + publishedExplorersWithTags.push({ + ...explorer, + tags: Array.from(tags), + }) + } + + return publishedExplorersWithTags +} + +export const getExplorerViewRecords = async ( + trx: db.KnexReadonlyTransaction +): Promise => { + console.log("Fetching explorer views to index") + const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) + const pageviews = await getAnalyticsPageviewsByUrlObj(trx) + + const explorerAdminServer = new ExplorerAdminServer(GIT_CMS_DIR) + + let records = [] as ExplorerViewEntryWithExplorerInfo[] + for (const explorerInfo of publishedExplorersWithTags) { + const explorerViewRecords = await getExplorerViewRecordsForExplorerSlug( + trx, + explorerInfo.slug, + explorerAdminServer + ) + + const explorerPageviews = get( + pageviews, + [`/explorers/${explorerInfo.slug}`, "views_7d"], + 0 + ) + // These have a score for ranking purposes, but it doesn't yet factor in the explorer's pageviews + const unscoredRecords = explorerViewRecords.map( + (record, i): Omit => ({ + ...record, + explorerSlug: explorerInfo.slug, + explorerTitle: explorerInfo.title, + explorerSubtitle: explorerInfo.subtitle, + explorerViews_7d: explorerPageviews, + viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, + numViewsWithinExplorer: explorerViewRecords.length, + tags: explorerInfo.tags, + objectID: `${explorerInfo.slug}-${i}`, + }) + ) + records = records.concat( + unscoredRecords.map((record) => ({ + ...record, + score: computeExplorerViewScore(record), + })) + ) + } + + return records +} diff --git a/baker/algolia/algoliaUtils.tsx b/baker/algolia/utils/pages.ts similarity index 91% rename from baker/algolia/algoliaUtils.tsx rename to 
baker/algolia/utils/pages.ts index 70767e36cab..24d6d2781cb 100644 --- a/baker/algolia/algoliaUtils.tsx +++ b/baker/algolia/utils/pages.ts @@ -1,6 +1,6 @@ -import * as db from "../../db/db.js" -import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" -import { chunkParagraphs } from "../chunk.js" +import * as db from "../../../db/db.js" +import { ALGOLIA_INDEXING } from "../../../settings/serverSettings.js" +import { chunkParagraphs } from "../../chunk.js" import { countries, Country, @@ -17,36 +17,31 @@ import { DEFAULT_GDOC_FEATURED_IMAGE, DEFAULT_THUMBNAIL_FILENAME, } from "@ourworldindata/utils" -import { formatPost } from "../formatWordpressPost.js" +import { formatPost } from "../../formatWordpressPost.js" import ReactDOMServer from "react-dom/server.js" -import { getAlgoliaClient } from "./configureAlgolia.js" +import { getAlgoliaClient } from "../configureAlgolia.js" import { htmlToText } from "html-to-text" import { PageRecord, - PageType, SearchIndexName, -} from "../../site/search/searchTypes.js" -import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" -import { ArticleBlocks } from "../../site/gdocs/components/ArticleBlocks.js" +} from "../../../site/search/searchTypes.js" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { ArticleBlocks } from "../../../site/gdocs/components/ArticleBlocks.js" import React from "react" import { getFullPost, getPostTags, getPostsFromSnapshots, -} from "../../db/model/Post.js" -import { getIndexName } from "../../site/search/searchClient.js" +} from "../../../db/model/Post.js" +import { getIndexName } from "../../../site/search/searchClient.js" import { ObjectWithObjectID } from "@algolia/client-search" import { SearchIndex } from "algoliasearch" import { match, P } from "ts-pattern" -import { gdocFromJSON } from "../../db/model/Gdoc/GdocFactory.js" -import { formatUrls } from "../../site/formatting.js" +import { gdocFromJSON } from 
"../../../db/model/Gdoc/GdocFactory.js" +import { formatUrls } from "../../../site/formatting.js" +import { TypeAndImportance } from "./types.js" -interface TypeAndImportance { - type: PageType - importance: number -} - -const computeScore = (record: Omit): number => { +const computePageScore = (record: Omit): number => { const { importance, views_7d } = record return importance * 1000 + views_7d } @@ -70,7 +65,7 @@ function generateCountryRecords( documentType: "country-page" as const, thumbnailUrl: `/${DEFAULT_THUMBNAIL_FILENAME}`, } - const score = computeScore(record) + const score = computePageScore(record) return { ...record, score } }) } @@ -152,7 +147,7 @@ async function generateWordpressRecords( views_7d: pageviews[`/${post.path}`]?.views_7d ?? 0, documentType: "wordpress" as const, } - const score = computeScore(record) + const score = computePageScore(record) records.push({ ...record, score }) i += 1 } @@ -205,9 +200,13 @@ function generateGdocRecords( if (!gdoc.content.body) continue // Only rendering the blocks - not the page nav, title, byline, etc const renderedPostContent = ReactDOMServer.renderToStaticMarkup( -
- -
+ React.createElement( + "div", + null, + React.createElement(ArticleBlocks, { + blocks: gdoc.content.body, + }) + ) ) const chunks = generateChunksFromHtmlText(renderedPostContent) const postTypeAndImportance = getPostTypeAndImportance(gdoc) @@ -230,7 +229,7 @@ function generateGdocRecords( authors: gdoc.content.authors, thumbnailUrl, } - const score = computeScore(record) + const score = computePageScore(record) records.push({ ...record, score }) i += 1 } diff --git a/baker/algolia/utils/shared.ts b/baker/algolia/utils/shared.ts new file mode 100644 index 00000000000..6d8dfe300ce --- /dev/null +++ b/baker/algolia/utils/shared.ts @@ -0,0 +1,40 @@ +import { + countries, + excludeNullish, + orderBy, + removeTrailingParenthetical, +} from "@ourworldindata/utils" + +const countriesWithVariantNames = new Set( + countries + .filter((country) => country.variantNames?.length || country.shortName) + .map((country) => country.name) +) + +export const processAvailableEntities = ( + availableEntities: string[] | null +) => { + if (!availableEntities) return [] + + // Algolia is a bit weird with synonyms: + // If we have a synonym "USA" -> "United States", and we search for "USA", + // then it seems that Algolia can only find that within `availableEntities` + // if "USA" is within the first 100-or-so entries of the array. + // So, the easy solution is to sort the entities to ensure that countries + // with variant names are at the top. + // Also, entities containing a hyphen like "low-income countries" can also + // only be found if they're within the first 100-or-so entries. 
+ // - @marcelgerber, 2024-03-25 + return orderBy( + availableEntities, + [ + (entityName) => + countriesWithVariantNames.has( + removeTrailingParenthetical(entityName) + ), + (entityName) => entityName.includes("-"), + (entityName) => entityName, + ], + ["desc", "desc", "asc"] + ) +} diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts new file mode 100644 index 00000000000..4b269012991 --- /dev/null +++ b/baker/algolia/utils/types.ts @@ -0,0 +1,91 @@ +import { PageType } from "../../../site/search/searchTypes.js" + +/** Pages */ +export interface TypeAndImportance { + type: PageType + importance: number +} + +/** Charts */ +export interface RawChartRecordRow { + id: number + slug: string + title: string + variantName: string + subtitle: string + numDimensions: string + publishedAt: string + updatedAt: string + entityNames: string + tags: string + keyChartForTags: string +} + +export interface ParsedChartRecordRow { + id: number + slug: string + title: string + variantName: string + subtitle: string + numDimensions: string + publishedAt: string + updatedAt: string + entityNames: string[] + tags: string[] + keyChartForTags: string[] +} + +/** Explorers */ +export interface ExplorerViewEntry { + viewTitle: string + viewSubtitle: string + viewSettings: string[] + viewQueryParams: string + availableEntities: string[] + + viewGrapherId?: number + yVariableIds: Array // Variable IDs or ETL paths + tableSlug?: string + ySlugs: string[] + + /** + * We often have several views with the same title within an explorer, e.g. "Population". + * In order to only display _one_ of these views in search results, we need a way to demote duplicates. + * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. 
+ */ + viewTitleIndexWithinExplorer: number + + // Potential ranking criteria + viewIndexWithinExplorer: number + titleLength: number + numNonDefaultSettings: number + // viewViews_7d: number +} + +export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { + explorerSlug: string + explorerTitle: string + explorerSubtitle: string + explorerViews_7d: number + viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug` + numViewsWithinExplorer: number + tags: string[] + + score: number + + objectID?: string +} + +export interface IndicatorMetadata { + entityNames: string[] + titlePublic?: string + display?: { name: string } + name: string + descriptionShort?: string +} + +export interface GrapherInfo { + id: number + title: string + subtitle: string +} From ab957f8236be7bc3b20c8c996e003daf55ea6bb7 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Sat, 2 Nov 2024 02:59:00 +0000 Subject: [PATCH 04/14] =?UTF-8?q?=F0=9F=94=A8=20more=20explorer=20indexing?= =?UTF-8?q?=20refactoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../indexExplorerViewsAndChartsToAlgolia.ts | 7 +- baker/algolia/indexExplorerViewsToAlgolia.ts | 1 - baker/algolia/utils/explorerViews.ts | 506 ++++++++++-------- baker/algolia/utils/shared.ts | 1 - baker/algolia/utils/types.ts | 112 +++- db/db.ts | 20 +- .../types/src/dbTypes/Explorers.ts | 8 + .../types/src/gdocTypes/Gdoc.ts | 3 +- packages/@ourworldindata/types/src/index.ts | 1 + 9 files changed, 414 insertions(+), 245 deletions(-) diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index 10cf3b0fd12..a793aa84c20 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -1,4 +1,3 @@ -import fs from "fs/promises" import Bugsnag from "@bugsnag/js" import * as db from "../../db/db.js" import { 
logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" @@ -7,7 +6,7 @@ import { BUGSNAG_NODE_API_KEY, } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" -import { ExplorerViewEntryWithExplorerInfo } from "./utils/types.js" +import { ExplorerViewFinalRecord } from "./utils/types.js" import { getExplorerViewRecords } from "./utils/explorerViews.js" import { getChartsRecords } from "./utils/charts.js" import { getIndexName } from "../../site/search/searchClient.js" @@ -18,7 +17,7 @@ import { } from "../../site/search/searchTypes.js" function explorerViewRecordToChartRecord( - e: ExplorerViewEntryWithExplorerInfo + e: ExplorerViewFinalRecord ): ChartRecord { return { type: ChartRecordType.ExplorerView, @@ -68,7 +67,7 @@ function scaleExplorerScores( const normalized = e.score / explorerScoreMax const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min const scaled = Math.round( - (normalized / 2) * grapherRange + grapherScoreBounds.min + normalized * grapherRange + grapherScoreBounds.min ) return { ...e, diff --git a/baker/algolia/indexExplorerViewsToAlgolia.ts b/baker/algolia/indexExplorerViewsToAlgolia.ts index 3a31b4757ce..4c8568e4a2a 100644 --- a/baker/algolia/indexExplorerViewsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsToAlgolia.ts @@ -1,4 +1,3 @@ -import fs from "fs/promises" import * as db from "../../db/db.js" import { ALGOLIA_INDEXING, diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index ed8047f5d3a..c1f8fd5a19f 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -1,4 +1,3 @@ -import fs from "fs/promises" import { ExplorerChoiceParams, ExplorerControlType, @@ -6,7 +5,7 @@ import { DecisionMatrix, TableDef, } from "@ourworldindata/explorer" -import { at, get, groupBy, mapValues, orderBy, partition } from "lodash" +import { at, get, groupBy, mapValues, orderBy, partition, uniq } from "lodash" 
import { MarkdownTextWrap } from "@ourworldindata/components" import { logErrorAndMaybeSendToBugsnag } from "../../../serverUtils/errorLog.js" import { obtainAvailableEntitiesForAllGraphers } from "../../updateChartEntities.js" @@ -19,7 +18,7 @@ import { parseDelimited } from "@ourworldindata/core-table" import { ColumnTypeNames, CoreRow, - DbEnrichedVariable, + MinimalExplorerInfo, } from "@ourworldindata/types" import * as db from "../../../db/db.js" @@ -27,12 +26,21 @@ import { DATA_API_URL } from "../../../settings/serverSettings.js" import { keyBy } from "@ourworldindata/utils" import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" import { - ExplorerViewEntry, - ExplorerViewEntryWithExplorerInfo, - GrapherInfo, - IndicatorMetadata, + CsvUnenrichedExplorerViewRecord, + EnrichedExplorerRecord, + EntitiesByColumnDictionary, + ExplorerIndicatorMetadataDictionary, + ExplorerIndicatorMetadataFromDb, + ExplorerViewFinalRecord, + ExplorerViewBaseRecord, + ExplorerViewGrapherInfo, + GrapherEnrichedExplorerViewRecord, + GrapherUnenrichedExplorerViewRecord, + IndicatorEnrichedExplorerViewRecord, + IndicatorUnenrichedExplorerViewRecord, + CsvEnrichedExplorerViewRecord, } from "./types.js" -import { processAvailableEntities } from "./shared.js" +import { processAvailableEntities as processRecordAvailableEntities } from "./shared.js" // Creates a search-ready string from a choice. // Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. @@ -50,24 +58,21 @@ const explorerChoiceToViewSettings = ( }) } -type ExplorerIndicatorMetadata = Record< - string | number, - { - entityNames?: string[] - display: DbEnrichedVariable["display"] - titlePublic: DbEnrichedVariable["titlePublic"] - descriptionShort: DbEnrichedVariable["descriptionShort"] - name: DbEnrichedVariable["name"] - } -> - +/** + * Takes records with `yVariableIds` and fetches their metadata. 
+ * First it fetches base metadata from the DB, then it fetches availableEntities from S3. + * Returns a dictionary of metadata by id (and path, when possible): + * ``` + * { + * 123: { id: 123, name: "GDP", entityNames: ["United States", "Canada"] }, + * "an/etl#path": { id: "an/etl#path", name: "GDP", entityNames: ["United States", "Canada"] } + * } + * ``` + */ async function fetchIndicatorMetadata( - records: Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" | "titleLength" - >[], + records: IndicatorUnenrichedExplorerViewRecord[], trx: db.KnexReadonlyTransaction -): Promise { +): Promise { function checkIsETLPath(idOrPath: string | number): idOrPath is string { return typeof idOrPath === "string" } @@ -95,7 +100,6 @@ async function fetchIndicatorMetadata( "name", "titlePublic", "display", - "name", "descriptionShort" ) .whereIn("id", [...ids]) @@ -103,12 +107,12 @@ async function fetchIndicatorMetadata( ).map((row) => ({ ...row, display: row.display ? JSON.parse(row.display) : {}, - })) as DbEnrichedVariable[] + })) as ExplorerIndicatorMetadataFromDb[] const indicatorMetadataByIdAndPath = { ...keyBy(metadataFromDB, "id"), ...keyBy(metadataFromDB, "catalogPath"), - } as ExplorerIndicatorMetadata + } as ExplorerIndicatorMetadataDictionary async function fetchEntitiesForId(id?: number) { if (id) { @@ -178,9 +182,19 @@ function makeAggregator(entityNameSlug: string) { } } +/** + * Fetches the CSVs for all of an explorer's tables, parses them, and aggregates their entities per column. 
+ * Returns an object like: + * ``` + * { + * almonds: { population: ["United States", "Canada"], food__tonnes: ["United States"] }, + * olives: { population: ["United States", "Canada"], food__tonnes: ["United States", "Greece"] }, + * } + * ``` + */ async function getEntitiesPerColumnPerTable( tableDefs: TableDef[] -): Promise>> { +): Promise { return pMap( tableDefs, (tableDef) => { @@ -202,7 +216,6 @@ async function getEntitiesPerColumnPerTable( (set) => Array.from(set) ) as Record - // Return an object like `{ almonds: { population: ["United States", "Canada"], area_harvested__ha: ["United States"] } }` return { [tableDef.slug!]: entityNamesAsArray } }) }, @@ -213,11 +226,12 @@ async function getEntitiesPerColumnPerTable( ).then((results) => Object.assign({}, ...results)) } -const computeExplorerViewScore = ( - record: Omit & - Partial -) => - (record.explorerViews_7d ?? 0) * 10 - +const computeExplorerViewScore = (record: { + explorerViews_7d: number + numNonDefaultSettings: number + titleLength: number +}) => + (record.explorerViews_7d || 0) * 10 - record.numNonDefaultSettings * 50 - record.titleLength @@ -252,17 +266,18 @@ const getNonDefaultSettings = ( const createBaseRecord = ( choice: ExplorerChoiceParams, matrix: DecisionMatrix, - index: number -): Partial => { + index: number, + explorerInfo: MinimalExplorerInfo +): ExplorerViewBaseRecord => { matrix.setValuesFromChoiceParams(choice) const nonDefaultSettings = getNonDefaultSettings(choice, matrix) const yVariableIds = parseYVariableIds(matrix.selectedRow) return { + availableEntities: [], viewTitle: matrix.selectedRow.title, viewSubtitle: matrix.selectedRow.subtitle, viewSettings: explorerChoiceToViewSettings(choice, matrix), - availableEntities: [], viewGrapherId: matrix.selectedRow.grapherId, yVariableIds, viewQueryParams: matrix.toString(), @@ -270,19 +285,21 @@ const createBaseRecord = ( numNonDefaultSettings: nonDefaultSettings.length, tableSlug: matrix.selectedRow.tableSlug, ySlugs: 
matrix.selectedRow.ySlugs?.split(" ") || [], + explorerSlug: explorerInfo.slug, } } const createBaseRecords = ( + explorerInfo: MinimalExplorerInfo, matrix: DecisionMatrix -): Partial[] => { +): ExplorerViewBaseRecord[] => { return ( matrix .allDecisionsAsQueryParams() // TODO: remove me, testing only - .slice(0, 5) + // .slice (0, 5) .map((choice: ExplorerChoiceParams, index: number) => - createBaseRecord(choice, matrix, index) + createBaseRecord(choice, matrix, index, explorerInfo) ) ) } @@ -290,7 +307,7 @@ const createBaseRecords = ( const fetchGrapherInfo = async ( trx: db.KnexReadonlyTransaction, grapherIds: number[] -): Promise> => { +): Promise> => { return await trx .select( trx.raw("charts.id as id"), @@ -304,20 +321,19 @@ const fetchGrapherInfo = async ( .then((rows) => keyBy(rows, "id")) } -const enrichRecordWithGrapherInfo = ( - record: Partial, - grapherInfo: Record, +async function enrichRecordWithGrapherInfo( + record: GrapherUnenrichedExplorerViewRecord, + grapherInfo: Record, availableEntities: Map, - slug: string -): Partial => { - if (!record.viewGrapherId) return record - + explorerInfo: MinimalExplorerInfo +): Promise { const grapher = grapherInfo[record.viewGrapherId] if (!grapher) { - console.warn( - `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` - ) - return record + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewGrapherMissing", + message: `Explorer with slug "${explorerInfo.slug}" has a view with a missing grapher: ${record.viewQueryParams}.`, + }) + return } return { @@ -327,22 +343,20 @@ const enrichRecordWithGrapherInfo = ( [], viewTitle: grapher.title, viewSubtitle: grapher.subtitle, + titleLength: grapher.title.length, } } const enrichWithGrapherData = async ( - records: Partial[], trx: db.KnexReadonlyTransaction, - slug: string -): Promise[]> => { - const grapherIds = records - .filter((record) => record.viewGrapherId !== undefined) - .map((record) => record.viewGrapherId as number) - - if 
(!grapherIds.length) return records + records: GrapherUnenrichedExplorerViewRecord[], + explorerInfo: MinimalExplorerInfo +): Promise => { + if (!records.length) return [] + const grapherIds = records.map((record) => record.viewGrapherId as number) console.log( - `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` + `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${explorerInfo.slug}` ) const grapherInfo = await fetchGrapherInfo(trx, grapherIds) const availableEntities = await obtainAvailableEntitiesForAllGraphers( @@ -350,167 +364,213 @@ const enrichWithGrapherData = async ( grapherIds ) - return records.map((record) => - enrichRecordWithGrapherInfo( + const enrichedRecords: GrapherEnrichedExplorerViewRecord[] = [] + for (const record of records) { + const enrichedRecord = await enrichRecordWithGrapherInfo( record, grapherInfo, availableEntities, - slug + explorerInfo ) - ) + if (enrichedRecord) enrichedRecords.push(enrichedRecord) + } + return enrichedRecords } -const enrichRecordWithTableData = ( - record: Partial, - entitiesPerColumnPerTable: Record> -): Partial => { - const { tableSlug, ySlugs } = record - if (!tableSlug || !ySlugs?.length) return record +async function enrichRecordWithTableData( + record: CsvUnenrichedExplorerViewRecord, + entitiesPerColumnPerTable: EntitiesByColumnDictionary +): Promise { + const { tableSlug, ySlugs, viewTitle } = record + if (!tableSlug || !ySlugs?.length || !viewTitle) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewMissingData", + message: `Explorer with slug "${record.explorerSlug}" has a view with missing data: ${record.viewQueryParams}.`, + }) + return + } - // console.log("tableSlug", tableSlug) - // console.log("ySlugs", ySlugs) - // console.log("entitiesPerColumnPerTable", entitiesPerColumnPerTable) const availableEntities = ySlugs .flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) .filter((name, i, array) => !!name && 
array.indexOf(name) === i) - return { ...record, availableEntities } + return { + ...record, + availableEntities, + titleLength: viewTitle.length, + } } -const enrichRecordWithIndicatorData = ( - record: Partial, - indicatorMetadata: Record -): Partial => { - if (!record.yVariableIds?.length) return record - - const allEntities = at(indicatorMetadata, record.yVariableIds) - .flatMap((meta) => meta.entityNames) - .filter( - (name, i, array): name is string => - array.indexOf(name) === i && !!name +async function enrichWithTableData( + records: CsvUnenrichedExplorerViewRecord[], + entitiesPerColumnPerTable: EntitiesByColumnDictionary +): Promise { + const enrichedRecords: CsvEnrichedExplorerViewRecord[] = [] + + for (const record of records) { + const enrichedRecord = await enrichRecordWithTableData( + record, + entitiesPerColumnPerTable ) + if (enrichedRecord) { + enrichedRecords.push(enrichedRecord) + } + } + return enrichedRecords +} + +function enrichRecordWithIndicatorData( + record: IndicatorUnenrichedExplorerViewRecord, + indicatorMetadataDictionary: ExplorerIndicatorMetadataDictionary +): IndicatorEnrichedExplorerViewRecord { + const allEntityNames = at( + indicatorMetadataDictionary, + record.yVariableIds + ).flatMap((meta) => meta.entityNames) - const result = { ...record, availableEntities: allEntities } + const uniqueNonEmptyEntityNames = uniq(allEntityNames).filter( + Boolean + ) as string[] const firstYIndicator = record.yVariableIds[0] - if (firstYIndicator === undefined) return result - const indicatorInfo = indicatorMetadata[firstYIndicator] - if (!indicatorInfo) return result + const indicatorInfo = indicatorMetadataDictionary[firstYIndicator] + + const viewTitle = + record.viewTitle || + indicatorInfo.titlePublic || + indicatorInfo.display?.name || + (indicatorInfo.name as string) + + const viewSubtitle = + record.viewSubtitle || (indicatorInfo.descriptionShort as string) return { - ...result, - viewTitle: - record.viewTitle ?? 
- indicatorInfo.titlePublic ?? - indicatorInfo.display?.name ?? - indicatorInfo.name, - viewSubtitle: record.viewSubtitle ?? indicatorInfo.descriptionShort, + ...record, + availableEntities: uniqueNonEmptyEntityNames, + viewTitle, + viewSubtitle, + titleLength: viewTitle.length, } } -const enrichWithMetadata = async ( - records: Partial[], - indicatorMetadata: Record, - entitiesPerColumnPerTable: Record> -): Promise[]> => { - return records.map((record) => { - const withTableData = enrichRecordWithTableData( - record, - entitiesPerColumnPerTable +const enrichWithIndicatorMetadata = async ( + indicatorBaseRecords: IndicatorUnenrichedExplorerViewRecord[], + indicatorMetadataDictionary: ExplorerIndicatorMetadataDictionary +): Promise => { + return indicatorBaseRecords.map((indicatorBaseRecord) => + enrichRecordWithIndicatorData( + indicatorBaseRecord, + indicatorMetadataDictionary ) - return enrichRecordWithIndicatorData(withTableData, indicatorMetadata) - }) + ) } -const cleanSubtitles = ( - records: Partial[] -): Partial[] => { - return records.map((record) => ({ - ...record, - viewSubtitle: record.viewSubtitle +function processSubtitles( + records: EnrichedExplorerRecord[] +): EnrichedExplorerRecord[] { + return records.map((record) => { + // Remove markdown links from text + const viewSubtitle = record.viewSubtitle ? 
new MarkdownTextWrap({ text: record.viewSubtitle, fontSize: 10, }).plaintext - : undefined, - })) + : undefined + return { + ...record, + viewSubtitle, + } as EnrichedExplorerRecord + }) } -async function logMissingTitles( - records: Partial[], - slug: string -): Promise { +async function processAvailableEntities( + records: EnrichedExplorerRecord[] +): Promise { + const processedRecords: EnrichedExplorerRecord[] = [] for (const record of records) { - await logErrorAndMaybeSendToBugsnag({ - name: "ExplorerViewTitleMissing", - message: `Explorer ${slug} has a view with no title: ${record.viewQueryParams}.`, - }) + const availableEntities = processRecordAvailableEntities( + record.availableEntities + ) + if (!availableEntities) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewMissingData", + message: `Explorer with slug "${record.explorerSlug}" has a view with missing entities: ${record.viewQueryParams}.`, + }) + } else { + processedRecords.push({ + ...record, + availableEntities, + }) + } } + return processedRecords } async function finalizeRecords( - records: Partial[], - slug: string -): Promise { - const [withTitle, withoutTitle] = partition( - records, - (record) => record.viewTitle !== undefined - ) - - await logMissingTitles(withoutTitle, slug) + records: EnrichedExplorerRecord[], + slug: string, + pageviews: Record, + explorerInfo: MinimalExplorerInfo +): Promise { + const withCleanSubtitles = processSubtitles(records) - const withCleanSubtitles = cleanSubtitles(withTitle) + const withCleanEntities = await processAvailableEntities(withCleanSubtitles) - const withTitleLength = withCleanSubtitles.map((record) => ({ + const withPageviews = withCleanEntities.map((record) => ({ ...record, - titleLength: record.viewTitle!.length, - })) as Omit[] - - const withCleanedEntities = [] as Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" - >[] + explorerViews_7d: get(pageviews, [`/explorers/${slug}`, "views_7d"], 0), + })) - for (const record of 
withTitleLength) { - const cleanedEntities = processAvailableEntities( - record.availableEntities - ) - if (!cleanedEntities.length) { - await logErrorAndMaybeSendToBugsnag({ - name: "ExplorerViewNoEntities", - message: `Explorer ${slug} has a view with no entities: ${record.viewQueryParams}.`, - }) - } - withCleanedEntities.push({ + const unsortedFinalRecords = withPageviews.map( + ( + record, + i + ): Omit => ({ ...record, - availableEntities: cleanedEntities, + viewSettings: record.viewSettings.filter((x): x is string => !!x), + viewTitle: record.viewTitle!, + viewSubtitle: record.viewSubtitle!, + explorerSlug: explorerInfo.slug, + explorerTitle: explorerInfo.title, + explorerSubtitle: explorerInfo.subtitle, + viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, + numViewsWithinExplorer: withPageviews.length, + tags: explorerInfo.tags, + objectID: `${explorerInfo.slug}-${i}`, + score: computeExplorerViewScore(record), }) - } + ) const sortedByScore = orderBy( - withCleanedEntities, + unsortedFinalRecords, computeExplorerViewScore, "desc" - ) as Omit[] + ) const groupedByTitle = groupBy(sortedByScore, "viewTitle") - return Object.values(groupedByTitle).flatMap((group, i) => - group.map((record) => ({ - ...record, - viewTitleIndexWithinExplorer: i, - })) + const indexedExplorerViewData = Object.values(groupedByTitle).flatMap( + (records) => + records.map((record, i) => ({ + ...record, + viewTitleIndexWithinExplorer: i, + })) ) + + return indexedExplorerViewData } -export const getExplorerViewRecordsForExplorerSlug = async ( +export const getExplorerViewRecordsForExplorer = async ( trx: db.KnexReadonlyTransaction, - slug: string, + explorerInfo: MinimalExplorerInfo, + pageviews: Record, explorerAdminServer: ExplorerAdminServer -): Promise => { +): Promise => { + const { slug } = explorerInfo // Get explorer program and table definitions const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) + // TODO: why doesn't 
us-covid-data-explorer have tableSlugs or tableDefs? const tableDefs = explorerProgram.tableSlugs .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) .filter((x) => x && x.url && x.slug) as TableDef[] @@ -525,59 +585,80 @@ export const getExplorerViewRecordsForExplorerSlug = async ( "Finished fetching CSV table data and aggregating entities by column" ) - // Create base records from decision matrix console.log( - `Processing explorer ${slug} (${explorerProgram.decisionMatrix.numRows} rows)` + `Creating ${explorerProgram.decisionMatrix.numRows} base records for explorer ${slug}` + ) + const baseRecords = createBaseRecords( + explorerInfo, + explorerProgram.decisionMatrix ) - const baseRecords = createBaseRecords(explorerProgram.decisionMatrix) - // Enrich with grapher data - const recordsWithGrapherData = await enrichWithGrapherData( + const [grapherBaseRecords, nonGrapherBaseRecords] = partition( baseRecords, + (record) => record.viewGrapherId !== undefined + ) as [GrapherUnenrichedExplorerViewRecord[], ExplorerViewBaseRecord[]] + + const enrichedGrapherRecords = await enrichWithGrapherData( trx, - slug + grapherBaseRecords, + explorerInfo ) + const [indicatorBaseRecords, csvBaseRecords] = partition( + nonGrapherBaseRecords, + (record) => record.yVariableIds.length > 0 + ) as [ + IndicatorUnenrichedExplorerViewRecord[], + CsvUnenrichedExplorerViewRecord[], + ] + // Fetch and apply indicator metadata console.log("Fetching indicator metadata for explorer", slug) - const indicatorMetadata = await fetchIndicatorMetadata( - recordsWithGrapherData as any, + const indicatorMetadataDictionary = await fetchIndicatorMetadata( + indicatorBaseRecords, trx ) console.log("Fetched indicator metadata for explorer", slug) - const enrichedRecords = await enrichWithMetadata( - recordsWithGrapherData, - indicatorMetadata as any, + const enrichedIndicatorRecords = await enrichWithIndicatorMetadata( + indicatorBaseRecords, + indicatorMetadataDictionary + ) + + const 
enrichedCsvRecords = await enrichWithTableData( + csvBaseRecords, entitiesPerColumnPerTable ) - // Finalize records with titles, sorting, and grouping - return finalizeRecords(enrichedRecords, slug) + const enrichedRecords = [ + ...enrichedGrapherRecords, + ...enrichedIndicatorRecords, + ...enrichedCsvRecords, + ] + + // // Finalize records with titles, sorting, and grouping + return finalizeRecords(enrichedRecords, slug, pageviews, explorerInfo) } async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { const explorersBySlug = await db.getPublishedExplorersBySlug(trx) + // The DB query gets the tags for the explorer, but we need to add the parent tags as well. + // This isn't done in the query because it would require a recursive CTE. + // It's easier to write that query once, separately, and reuse it. const parentTags = await db.getParentTagsByChildName(trx) const publishedExplorersWithTags = [] - for (const explorer of Object.values(explorersBySlug).filter( - // TODO: testing, remove this - (e) => e.slug === "fish-stocks" - )) { + for (const explorer of Object.values(explorersBySlug)) { if (!explorer.tags.length) { await logErrorAndMaybeSendToBugsnag({ name: "ExplorerTagMissing", - message: `Explorer ${explorer.slug} has no tags.`, + message: `Explorer "${explorer.slug}" has no tags.`, }) } const tags = new Set() - // The DB query gets the tags for the explorer, but we need to add the parent tags as well. - // This isn't done in the query because it would require a recursive CTE. 
- // It's easier to write that query once, separately, and reuse it for (const tag of explorer.tags) { - tags.add(tag.name) - for (const parentTag of parentTags[tag.name]) { + tags.add(tag) + for (const parentTag of parentTags[tag]) { tags.add(parentTag) } } @@ -593,47 +674,24 @@ async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { export const getExplorerViewRecords = async ( trx: db.KnexReadonlyTransaction -): Promise => { - console.log("Fetching explorer views to index") +): Promise => { + console.log("Getting explorer view records") const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) const pageviews = await getAnalyticsPageviewsByUrlObj(trx) const explorerAdminServer = new ExplorerAdminServer(GIT_CMS_DIR) - let records = [] as ExplorerViewEntryWithExplorerInfo[] - for (const explorerInfo of publishedExplorersWithTags) { - const explorerViewRecords = await getExplorerViewRecordsForExplorerSlug( - trx, - explorerInfo.slug, - explorerAdminServer - ) - - const explorerPageviews = get( - pageviews, - [`/explorers/${explorerInfo.slug}`, "views_7d"], - 0 - ) - // These have a score for ranking purposes, but it doesn't yet factor in the explorer's pageviews - const unscoredRecords = explorerViewRecords.map( - (record, i): Omit => ({ - ...record, - explorerSlug: explorerInfo.slug, - explorerTitle: explorerInfo.title, - explorerSubtitle: explorerInfo.subtitle, - explorerViews_7d: explorerPageviews, - viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, - numViewsWithinExplorer: explorerViewRecords.length, - tags: explorerInfo.tags, - objectID: `${explorerInfo.slug}-${i}`, - }) - ) - records = records.concat( - unscoredRecords.map((record) => ({ - ...record, - score: computeExplorerViewScore(record), - })) - ) - } + const records = await pMap( + publishedExplorersWithTags, + (explorerInfo) => + getExplorerViewRecordsForExplorer( + trx, + explorerInfo, + pageviews, + explorerAdminServer + ), + { 
concurrency: 1 } + ).then((records) => records.flat()) return records } diff --git a/baker/algolia/utils/shared.ts b/baker/algolia/utils/shared.ts index 6d8dfe300ce..30911b3f1fd 100644 --- a/baker/algolia/utils/shared.ts +++ b/baker/algolia/utils/shared.ts @@ -1,6 +1,5 @@ import { countries, - excludeNullish, orderBy, removeTrailingParenthetical, } from "@ourworldindata/utils" diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts index 4b269012991..6cd50f82c40 100644 --- a/baker/algolia/utils/types.ts +++ b/baker/algolia/utils/types.ts @@ -1,3 +1,4 @@ +import { DbEnrichedVariable } from "@ourworldindata/types" import { PageType } from "../../../site/search/searchTypes.js" /** Pages */ @@ -62,6 +63,16 @@ export interface ExplorerViewEntry { // viewViews_7d: number } +export type ExplorerViewWithoutViewTitleIndex = Omit< + ExplorerViewEntry, + "viewTitleIndexWithinExplorer" +> + +export type EntitiesByColumnDictionary = Record< + string, + Record +> + export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { explorerSlug: string explorerTitle: string @@ -76,6 +87,105 @@ export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { objectID?: string } +export type ExplorerIndicatorMetadataFromDb = Pick< + DbEnrichedVariable, + | "id" + | "catalogPath" + | "name" + | "titlePublic" + | "display" + | "descriptionShort" +> + +export type ExplorerIndicatorMetadataDictionary = Record< + string | number, + ExplorerIndicatorMetadataFromDb & { + entityNames?: string[] + } +> + +export interface ExplorerViewBaseRecord { + availableEntities: string[] + numNonDefaultSettings: number + tableSlug?: string + viewGrapherId?: number + viewIndexWithinExplorer: number + viewQueryParams: string + // TODO: are nulls necessary here? 
+ viewSettings: Array + viewSubtitle?: string + viewTitle?: string + ySlugs: Array + yVariableIds: Array + explorerSlug: string +} + +export type GrapherUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: number +} + +export type GrapherEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewTitle: string + viewSubtitle: string + titleLength: number +} + +export type IndicatorUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: [] + tableSlug: never +} + +export type IndicatorEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + tableSlug: string + availableEntities: string[] + titleLength: number +} + +export type CsvUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + // TODO: why are there nulls here? + tableSlug: string | null +} + +export type CsvEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + tableSlug: string + titleLength: number +} + +export type EnrichedExplorerRecord = + | GrapherEnrichedExplorerViewRecord + | IndicatorEnrichedExplorerViewRecord + | CsvEnrichedExplorerViewRecord + +/** This is the final record we index to Algolia */ +export interface ExplorerViewFinalRecord { + objectID: string + explorerTitle: string + viewTitle: string + viewSettings: string[] + viewTitleIndexWithinExplorer: number + score: number + viewIndexWithinExplorer: number + viewSubtitle: string + viewQueryParams: string + titleLength: number + numNonDefaultSettings: number + explorerSlug: string + explorerSubtitle: string + explorerViews_7d: number + viewTitleAndExplorerSlug: string + numViewsWithinExplorer: number + // These 2 aren't currently used in the explorer-views index (used in /search), but we need them in the data catalog + tags: string[] + availableEntities: string[] +} + export interface IndicatorMetadata { entityNames: string[] titlePublic?: string @@ 
-84,7 +194,7 @@ export interface IndicatorMetadata { descriptionShort?: string } -export interface GrapherInfo { +export interface ExplorerViewGrapherInfo { id: number title: string subtitle: string diff --git a/db/db.ts b/db/db.ts index 8f28b746806..8cc02389120 100644 --- a/db/db.ts +++ b/db/db.ts @@ -9,7 +9,6 @@ import { import { registerExitHandler } from "./cleanup.js" import { createTagGraph, keyBy } from "@ourworldindata/utils" import { - DbChartTagJoin, ImageMetadata, MinimalDataInsightInterface, OwidGdocType, @@ -28,6 +27,7 @@ import { OwidGdoc, DbPlainTag, TagGraphNode, + MinimalExplorerInfo, } from "@ourworldindata/types" import { groupBy, uniq } from "lodash" import { gdocFromJSON } from "./model/Gdoc/GdocFactory.js" @@ -197,7 +197,7 @@ export const getSlugsWithPublishedGdocsSuccessors = async ( export const getExplorerTags = async ( knex: KnexReadonlyTransaction -): Promise<{ slug: string; tags: DbChartTagJoin[] }[]> => { +): Promise<{ slug: string; tags: Pick[] }[]> => { return knexRaw<{ slug: string; tags: string }>( knex, `-- sql @@ -216,21 +216,14 @@ export const getExplorerTags = async ( ).then((rows) => rows.map((row) => ({ slug: row.slug, - tags: JSON.parse(row.tags) as DbChartTagJoin[], + tags: JSON.parse(row.tags) as Pick[], })) ) } export const getPublishedExplorersBySlug = async ( knex: KnexReadonlyTransaction -): Promise<{ - [slug: string]: { - slug: string - title: string - subtitle: string - tags: DbChartTagJoin[] - } -}> => { +): Promise> => { const tags = await getExplorerTags(knex) const tagsBySlug = keyBy(tags, "slug") return knexRaw( @@ -246,11 +239,14 @@ export const getPublishedExplorersBySlug = async ( isPublished = TRUE` ).then((rows) => { const processed = rows.map((row: any) => { + const tagsForExplorer = tagsBySlug[row.slug] return { slug: row.slug, title: row.title, subtitle: row.subtitle === "null" ? "" : row.subtitle, - tags: tagsBySlug[row.slug]?.tags ?? [], + tags: tagsForExplorer + ? 
tagsForExplorer.tags.map((tag) => tag.name) + : [], } }) return keyBy(processed, "slug") diff --git a/packages/@ourworldindata/types/src/dbTypes/Explorers.ts b/packages/@ourworldindata/types/src/dbTypes/Explorers.ts index cafe0a7809e..9ccd0ef3ea1 100644 --- a/packages/@ourworldindata/types/src/dbTypes/Explorers.ts +++ b/packages/@ourworldindata/types/src/dbTypes/Explorers.ts @@ -10,3 +10,11 @@ export interface DbInsertExplorer { } export type DbPlainExplorer = Required // TODO: add enriched type and type config properly + +/** A sparse set of explorer metadata. Currently used to begin Algolia indexing with */ +export type MinimalExplorerInfo = { + slug: string + title: string + subtitle: string + tags: string[] +} diff --git a/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts b/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts index 6f15c14f15a..8adff305b33 100644 --- a/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts +++ b/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts @@ -11,7 +11,6 @@ import { RawBlockText, RefDictionary, } from "./ArchieMlComponents.js" -import { DbChartTagJoin } from "../dbTypes/ChartTags.js" import { MinimalTag } from "../dbTypes/Tags.js" import { DbEnrichedLatestWork } from "../domainTypes/Author.js" @@ -42,7 +41,7 @@ export interface LinkedChart { title: string subtitle?: string thumbnail?: string - tags: DbChartTagJoin[] + tags: string[] tab?: GrapherTabOption indicatorId?: number // in case of a datapage } diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts index 4f0e8a12d1a..0335c469664 100644 --- a/packages/@ourworldindata/types/src/index.ts +++ b/packages/@ourworldindata/types/src/index.ts @@ -502,6 +502,7 @@ export { type DbPlainExplorer, type DbInsertExplorer, ExplorersTableName, + type MinimalExplorerInfo, } from "./dbTypes/Explorers.js" export { type DbPlainExplorerVariable, From a373039ba86549c5d38b7cb136bb67201c9a48d5 Mon Sep 17 00:00:00 2001 From: Ike Saunders 
Date: Sat, 2 Nov 2024 03:04:37 +0000 Subject: [PATCH 05/14] =?UTF-8?q?=F0=9F=94=A8=20remove=20unused=20types?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/utils/explorerViews.ts | 14 +++----- baker/algolia/utils/types.ts | 50 +++------------------------- 2 files changed, 10 insertions(+), 54 deletions(-) diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index c1f8fd5a19f..bba0ce1237e 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -293,15 +293,11 @@ const createBaseRecords = ( explorerInfo: MinimalExplorerInfo, matrix: DecisionMatrix ): ExplorerViewBaseRecord[] => { - return ( - matrix - .allDecisionsAsQueryParams() - // TODO: remove me, testing only - // .slice (0, 5) - .map((choice: ExplorerChoiceParams, index: number) => - createBaseRecord(choice, matrix, index, explorerInfo) - ) - ) + return matrix + .allDecisionsAsQueryParams() + .map((choice: ExplorerChoiceParams, index: number) => + createBaseRecord(choice, matrix, index, explorerInfo) + ) } const fetchGrapherInfo = async ( diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts index 6cd50f82c40..ae1c57d49e0 100644 --- a/baker/algolia/utils/types.ts +++ b/baker/algolia/utils/types.ts @@ -37,56 +37,11 @@ export interface ParsedChartRecordRow { } /** Explorers */ -export interface ExplorerViewEntry { - viewTitle: string - viewSubtitle: string - viewSettings: string[] - viewQueryParams: string - availableEntities: string[] - - viewGrapherId?: number - yVariableIds: Array // Variable IDs or ETL paths - tableSlug?: string - ySlugs: string[] - - /** - * We often have several views with the same title within an explorer, e.g. "Population". - * In order to only display _one_ of these views in search results, we need a way to demote duplicates. 
- * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. - */ - viewTitleIndexWithinExplorer: number - - // Potential ranking criteria - viewIndexWithinExplorer: number - titleLength: number - numNonDefaultSettings: number - // viewViews_7d: number -} - -export type ExplorerViewWithoutViewTitleIndex = Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" -> - export type EntitiesByColumnDictionary = Record< string, Record > -export interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { - explorerSlug: string - explorerTitle: string - explorerSubtitle: string - explorerViews_7d: number - viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug` - numViewsWithinExplorer: number - tags: string[] - - score: number - - objectID?: string -} - export type ExplorerIndicatorMetadataFromDb = Pick< DbEnrichedVariable, | "id" @@ -169,6 +124,11 @@ export interface ExplorerViewFinalRecord { explorerTitle: string viewTitle: string viewSettings: string[] + /** + * We often have several views with the same title within an explorer, e.g. "Population". + * In order to only display _one_ of these views in search results, we need a way to demote duplicates. + * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. 
+ */ viewTitleIndexWithinExplorer: number score: number viewIndexWithinExplorer: number From c115fd7067f80eb314a84dfa36de6c948500ab25 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Sat, 2 Nov 2024 03:16:56 +0000 Subject: [PATCH 06/14] =?UTF-8?q?=F0=9F=90=9B=20fix=20typescript=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- site/gdocs/components/ExplorerTiles.tsx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/site/gdocs/components/ExplorerTiles.tsx b/site/gdocs/components/ExplorerTiles.tsx index 1f1208f0046..2884085df24 100644 --- a/site/gdocs/components/ExplorerTiles.tsx +++ b/site/gdocs/components/ExplorerTiles.tsx @@ -19,10 +19,9 @@ function ExplorerTile({ url }: { url: string }) { height={40} width={40} src={`${BAKED_BASE_URL}/images/tag-icons/${encodeURIComponent( - linkedChart.tags[0].name + linkedChart.tags[0] )}.svg`} - className="explorer-tile__icon" - alt={`Icon for topic ${linkedChart.tags[0].name}`} + alt={`Icon for topic ${linkedChart.tags[0]}`} loading="lazy" /> ) : null From 15739d01c6381d9917310aaeae964daf72b0e12a Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Sat, 2 Nov 2024 03:30:16 +0000 Subject: [PATCH 07/14] =?UTF-8?q?=F0=9F=8E=89=20add=20config=20for=20Explo?= =?UTF-8?q?rerViewsAndCharts=20index?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/configureAlgolia.ts | 34 +++++++++++++++++++ .../indexExplorerViewsAndChartsToAlgolia.ts | 5 ++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/baker/algolia/configureAlgolia.ts b/baker/algolia/configureAlgolia.ts index 84d43ca31fc..d230a16bf41 100644 --- a/baker/algolia/configureAlgolia.ts +++ b/baker/algolia/configureAlgolia.ts @@ -159,6 +159,40 @@ export const configureAlgolia = async () => { ], }) + const explorerViewsAndChartsIndex = client.initIndex( + getIndexName(SearchIndexName.ExplorerViewsAndCharts) + ) + + await 
explorerViewsAndChartsIndex.setSettings({ + ...baseSettings, + searchableAttributes: [ + "unordered(title)", + "unordered(slug)", + "unordered(variantName)", + "unordered(subtitle)", + "unordered(tags)", + "unordered(availableEntities)", + ], + ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"], + customRanking: [ + // For multiple explorer views with the same title, we want to avoid surfacing duplicates. + // So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc. + "asc(viewTitleIndexWithinExplorer)", + "desc(score)", + "asc(titleLength)", + ], + attributesToSnippet: ["subtitle:24"], + attributeForDistinct: "id", + optionalWords: ["vs"], + + // These lines below essentially demote matches in the `subtitle` and `availableEntities` fields: + // If we find a match (only) there, then it doesn't count towards `exact`, and is therefore ranked lower. + // We also disable prefix matching and typo tolerance on these. + disableExactOnAttributes: ["tags", "subtitle", "availableEntities"], + disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"], + disablePrefixOnAttributes: ["subtitle"], + }) + const synonyms = [ ["owid", "our world in data"], ["kids", "children"], diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index a793aa84c20..ccd628baa3e 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -18,7 +18,9 @@ import { function explorerViewRecordToChartRecord( e: ExplorerViewFinalRecord -): ChartRecord { +): ChartRecord & { + viewTitleIndexWithinExplorer: number +} { return { type: ChartRecordType.ExplorerView, objectID: e.objectID!, @@ -37,6 +39,7 @@ function explorerViewRecordToChartRecord( titleLength: e.titleLength, numRelatedArticles: 0, views_7d: e.explorerViews_7d, + viewTitleIndexWithinExplorer: e.viewTitleIndexWithinExplorer, score: e.score, } } 
From 4fe5cefecd86e98976c1fef17aabdc20979e20f2 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Mon, 4 Nov 2024 10:57:51 -0500 Subject: [PATCH 08/14] =?UTF-8?q?=E2=9C=A8=20omit=20grapher=20explorers=20?= =?UTF-8?q?from=20explorer-views-and-charts=20index?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../indexExplorerViewsAndChartsToAlgolia.ts | 91 ++++--------------- baker/algolia/utils/explorerViews.ts | 66 ++++++++++++++ baker/algolia/utils/types.ts | 35 ++++--- 3 files changed, 104 insertions(+), 88 deletions(-) diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index ccd628baa3e..f652b691b95 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -6,78 +6,15 @@ import { BUGSNAG_NODE_API_KEY, } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" -import { ExplorerViewFinalRecord } from "./utils/types.js" -import { getExplorerViewRecords } from "./utils/explorerViews.js" +import { + explorerViewRecordToChartRecord, + getExplorerViewRecords, + scaleExplorerScores, +} from "./utils/explorerViews.js" import { getChartsRecords } from "./utils/charts.js" import { getIndexName } from "../../site/search/searchClient.js" -import { - ChartRecord, - ChartRecordType, - SearchIndexName, -} from "../../site/search/searchTypes.js" - -function explorerViewRecordToChartRecord( - e: ExplorerViewFinalRecord -): ChartRecord & { - viewTitleIndexWithinExplorer: number -} { - return { - type: ChartRecordType.ExplorerView, - objectID: e.objectID!, - chartId: Math.floor(Math.random() * 1000000), - slug: e.explorerSlug, - queryParams: e.viewQueryParams, - title: e.viewTitle, - subtitle: e.explorerSubtitle, - variantName: "", - keyChartForTags: [], - tags: e.tags, - availableEntities: e.availableEntities, - publishedAt: new Date().toISOString(), - 
updatedAt: new Date().toISOString(), - numDimensions: e.numNonDefaultSettings, - titleLength: e.titleLength, - numRelatedArticles: 0, - views_7d: e.explorerViews_7d, - viewTitleIndexWithinExplorer: e.viewTitleIndexWithinExplorer, - score: e.score, - } -} - -/** - * Scale explorer scores to the range of grapher scores - * e.g. if the highest explorer score is 100 and the highest grapher score is 1000, - * we want to scale the explorer scores to be between 0 and 1000 - */ -function scaleExplorerScores( - explorerRecords: ChartRecord[], - grapherRecords: ChartRecord[] -): ChartRecord[] { - const explorerScores = explorerRecords.map((e) => e.score) - const explorerScoreMax = Math.max(...explorerScores) - - const grapherScores = grapherRecords.map((e) => e.score) - const grapherScoreBounds = { - max: Math.max(...grapherScores), - min: Math.min(...grapherScores), - } - - // scale positive explorer scores to the range of grapher scores - // We want to keep negative scores because they're intentionally downranked as near-duplicates of existing views - return explorerRecords.map((e): ChartRecord => { - if (e.score < 0) return e - // A value between 0 and 1 - const normalized = e.score / explorerScoreMax - const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min - const scaled = Math.round( - normalized * grapherRange + grapherScoreBounds.min - ) - return { - ...e, - score: scaled, - } - }) -} +import { SearchIndexName } from "../../site/search/searchTypes.js" +import { ConvertedExplorerChartHit } from "./utils/types.js" // We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over. 
// If we standardize the record shape, we could have this be the only index and have a `type` field @@ -113,11 +50,17 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { } }, db.TransactionCloseMode.Close) - const convertedExplorerViews = explorerViews.map( - explorerViewRecordToChartRecord - ) + const convertedNonGrapherExplorerViews: ConvertedExplorerChartHit[] = [] + for (const view of explorerViews) { + if (!view.viewGrapherId) { + convertedNonGrapherExplorerViews.push( + explorerViewRecordToChartRecord(view) + ) + } + } + const scaledExplorerViews = scaleExplorerScores( - convertedExplorerViews, + convertedNonGrapherExplorerViews, grapherViews ) const records = [...scaledExplorerViews, ...grapherViews] diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index bba0ce1237e..aca2397b227 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -39,8 +39,74 @@ import { IndicatorEnrichedExplorerViewRecord, IndicatorUnenrichedExplorerViewRecord, CsvEnrichedExplorerViewRecord, + ConvertedExplorerChartHit, } from "./types.js" import { processAvailableEntities as processRecordAvailableEntities } from "./shared.js" +import { + ChartRecord, + ChartRecordType, +} from "../../../site/search/searchTypes.js" + +export function explorerViewRecordToChartRecord( + e: ExplorerViewFinalRecord +): ConvertedExplorerChartHit { + return { + type: ChartRecordType.ExplorerView, + objectID: e.objectID!, + chartId: Math.floor(Math.random() * 1000000), + slug: e.explorerSlug, + queryParams: e.viewQueryParams, + title: e.viewTitle, + subtitle: e.explorerSubtitle, + variantName: "", + keyChartForTags: [], + tags: e.tags, + availableEntities: e.availableEntities, + publishedAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + numDimensions: e.numNonDefaultSettings, + titleLength: e.titleLength, + numRelatedArticles: 0, + views_7d: e.explorerViews_7d, + viewTitleIndexWithinExplorer: 
e.viewTitleIndexWithinExplorer, + score: e.score, + } +} + +/** + * Scale explorer scores to the range of grapher scores + * e.g. if the highest explorer score is 100 and the highest grapher score is 1000, + * we want to scale the explorer scores to be between 0 and 1000 + */ +export function scaleExplorerScores( + explorerRecords: ChartRecord[], + grapherRecords: ChartRecord[] +): ChartRecord[] { + const explorerScores = explorerRecords.map((e) => e.score) + const explorerScoreMax = Math.max(...explorerScores) + + const grapherScores = grapherRecords.map((e) => e.score) + const grapherScoreBounds = { + max: Math.max(...grapherScores), + min: Math.min(...grapherScores), + } + + // scale positive explorer scores to the range of grapher scores + // We want to keep negative scores because they're intentionally downranked as near-duplicates of existing views + return explorerRecords.map((e): ChartRecord => { + if (e.score < 0) return e + // A value between 0 and 1 + const normalized = e.score / explorerScoreMax + const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min + const scaled = Math.round( + normalized * grapherRange + grapherScoreBounds.min + ) + return { + ...e, + score: scaled, + } + }) +} // Creates a search-ready string from a choice. // Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. 
diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts index ae1c57d49e0..f8ebd9d505e 100644 --- a/baker/algolia/utils/types.ts +++ b/baker/algolia/utils/types.ts @@ -1,5 +1,5 @@ import { DbEnrichedVariable } from "@ourworldindata/types" -import { PageType } from "../../../site/search/searchTypes.js" +import { ChartRecord, PageType } from "../../../site/search/searchTypes.js" /** Pages */ export interface TypeAndImportance { @@ -37,6 +37,20 @@ export interface ParsedChartRecordRow { } /** Explorers */ +export interface IndicatorMetadata { + entityNames: string[] + titlePublic?: string + display?: { name: string } + name: string + descriptionShort?: string +} + +export interface ExplorerViewGrapherInfo { + id: number + title: string + subtitle: string +} + export type EntitiesByColumnDictionary = Record< string, Record @@ -118,7 +132,7 @@ export type EnrichedExplorerRecord = | IndicatorEnrichedExplorerViewRecord | CsvEnrichedExplorerViewRecord -/** This is the final record we index to Algolia */ +/** This is the final record we index to Algolia for the `explorer-views` index */ export interface ExplorerViewFinalRecord { objectID: string explorerTitle: string @@ -144,18 +158,11 @@ export interface ExplorerViewFinalRecord { // These 2 aren't currently used in the explorer-views index (used in /search), but we need them in the data catalog tags: string[] availableEntities: string[] + // Only used to filter out these views from the data catalog (because we already index graphers) + viewGrapherId?: number } -export interface IndicatorMetadata { - entityNames: string[] - titlePublic?: string - display?: { name: string } - name: string - descriptionShort?: string -} - -export interface ExplorerViewGrapherInfo { - id: number - title: string - subtitle: string +// This is the final record we index to Algolia for the `explorer-views-and-charts` index +export type ConvertedExplorerChartHit = ChartRecord & { + viewTitleIndexWithinExplorer: number } From 
6541daef646bfcb8c0806f436f399a4d69fee44d Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Wed, 6 Nov 2024 20:26:14 +0000 Subject: [PATCH 09/14] =?UTF-8?q?=E2=9C=A8=20explorer=20indexing=20code=20?= =?UTF-8?q?review=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/indexChartsToAlgolia.ts | 13 ++- baker/algolia/utils/explorerViews.ts | 116 +++++++++++--------------- baker/algolia/utils/types.ts | 5 +- baker/updateChartEntities.ts | 4 +- 4 files changed, 65 insertions(+), 73 deletions(-) diff --git a/baker/algolia/indexChartsToAlgolia.ts b/baker/algolia/indexChartsToAlgolia.ts index 92b0572f2cf..30086cdca04 100644 --- a/baker/algolia/indexChartsToAlgolia.ts +++ b/baker/algolia/indexChartsToAlgolia.ts @@ -1,12 +1,23 @@ import * as db from "../../db/db.js" -import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" +import { + ALGOLIA_INDEXING, + BUGSNAG_NODE_API_KEY, +} from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { SearchIndexName } from "../../site/search/searchTypes.js" import { getIndexName } from "../../site/search/searchClient.js" import { getChartsRecords } from "./utils/charts.js" +import Bugsnag from "@bugsnag/js" const indexChartsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return + if (BUGSNAG_NODE_API_KEY) { + Bugsnag.start({ + apiKey: BUGSNAG_NODE_API_KEY, + context: "index-explorer-views-to-algolia", + autoTrackSessions: false, + }) + } const client = getAlgoliaClient() if (!client) { diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index aca2397b227..21e51208351 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -8,7 +8,7 @@ import { import { at, get, groupBy, mapValues, orderBy, partition, uniq } from "lodash" import { MarkdownTextWrap } from "@ourworldindata/components" import { logErrorAndMaybeSendToBugsnag } from 
"../../../serverUtils/errorLog.js" -import { obtainAvailableEntitiesForAllGraphers } from "../../updateChartEntities.js" +import { obtainAvailableEntitiesForGraphers } from "../../updateChartEntities.js" import { fetchS3MetadataByPath } from "../../../db/model/Variable.js" import { getVariableMetadataRoute } from "@ourworldindata/grapher" import pMap from "p-map" @@ -53,7 +53,7 @@ export function explorerViewRecordToChartRecord( return { type: ChartRecordType.ExplorerView, objectID: e.objectID!, - chartId: Math.floor(Math.random() * 1000000), + chartId: -1, slug: e.explorerSlug, queryParams: e.viewQueryParams, title: e.viewTitle, @@ -74,35 +74,20 @@ export function explorerViewRecordToChartRecord( } /** - * Scale explorer scores to the range of grapher scores - * e.g. if the highest explorer score is 100 and the highest grapher score is 1000, - * we want to scale the explorer scores to be between 0 and 1000 + * Scale records' positive scores to be between 0 and 10000. */ -export function scaleExplorerScores( - explorerRecords: ChartRecord[], - grapherRecords: ChartRecord[] -): ChartRecord[] { - const explorerScores = explorerRecords.map((e) => e.score) - const explorerScoreMax = Math.max(...explorerScores) - - const grapherScores = grapherRecords.map((e) => e.score) - const grapherScoreBounds = { - max: Math.max(...grapherScores), - min: Math.min(...grapherScores), - } - - // scale positive explorer scores to the range of grapher scores - // We want to keep negative scores because they're intentionally downranked as near-duplicates of existing views - return explorerRecords.map((e): ChartRecord => { - if (e.score < 0) return e +export function scaleRecordScores(records: ChartRecord[]): ChartRecord[] { + const scores = records.map((r) => r.score) + const maxScore = Math.max(...scores) + return records.map((record): ChartRecord => { + // For ExplorerView records, we want to keep negative scores, + // because they're intentionally downranked as near-duplicates of 
existing views + if (record.score < 0) return record // A value between 0 and 1 - const normalized = e.score / explorerScoreMax - const grapherRange = grapherScoreBounds.max - grapherScoreBounds.min - const scaled = Math.round( - normalized * grapherRange + grapherScoreBounds.min - ) + const normalized = record.score / maxScore + const scaled = Math.round(normalized * 10000) return { - ...e, + ...record, score: scaled, } }) @@ -180,25 +165,23 @@ async function fetchIndicatorMetadata( ...keyBy(metadataFromDB, "catalogPath"), } as ExplorerIndicatorMetadataDictionary - async function fetchEntitiesForId(id?: number) { - if (id) { - const metadata = await fetchS3MetadataByPath( - getVariableMetadataRoute(DATA_API_URL, id) - ) - const entityNames = get(metadata, "dimensions.entities.values", []) - .map((value) => value.name) - .filter((name): name is string => !!name) + async function fetchEntitiesForId(id: number) { + const metadata = await fetchS3MetadataByPath( + getVariableMetadataRoute(DATA_API_URL, id) + ) + const entityNames = get(metadata, "dimensions.entities.values", []) + .map((value) => value.name) + .filter((name): name is string => !!name) - const idEntry = indicatorMetadataByIdAndPath[id] - if (idEntry) { - idEntry.entityNames = entityNames - } - const path = metadata.catalogPath - if (path) { - const pathEntry = indicatorMetadataByIdAndPath[path] - if (pathEntry) { - pathEntry.entityNames = entityNames - } + const idEntry = indicatorMetadataByIdAndPath[id] + if (idEntry) { + idEntry.entityNames = entityNames + } + const path = metadata.catalogPath + if (path) { + const pathEntry = indicatorMetadataByIdAndPath[path] + if (pathEntry) { + pathEntry.entityNames = entityNames } } } @@ -421,7 +404,7 @@ const enrichWithGrapherData = async ( `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${explorerInfo.slug}` ) const grapherInfo = await fetchGrapherInfo(trx, grapherIds) - const availableEntities = await 
obtainAvailableEntitiesForAllGraphers( + const availableEntities = await obtainAvailableEntitiesForGraphers( trx, grapherIds ) @@ -452,9 +435,9 @@ async function enrichRecordWithTableData( return } - const availableEntities = ySlugs - .flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) - .filter((name, i, array) => !!name && array.indexOf(name) === i) + const availableEntities = uniq( + ySlugs.flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) + ) return { ...record, @@ -491,8 +474,8 @@ function enrichRecordWithIndicatorData( ).flatMap((meta) => meta.entityNames) const uniqueNonEmptyEntityNames = uniq(allEntityNames).filter( - Boolean - ) as string[] + (name): name is string => !!name + ) const firstYIndicator = record.yVariableIds[0] @@ -630,22 +613,7 @@ export const getExplorerViewRecordsForExplorer = async ( explorerAdminServer: ExplorerAdminServer ): Promise => { const { slug } = explorerInfo - // Get explorer program and table definitions const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) - // TODO: why doesn't us-covid-data-explorer have tableSlugs or tableDefs? 
- const tableDefs = explorerProgram.tableSlugs - .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) - .filter((x) => x && x.url && x.slug) as TableDef[] - - // Fetch and process CSV table data - console.log( - `Fetching CSV table data for ${slug} and aggregating entities by column` - ) - const entitiesPerColumnPerTable = - await getEntitiesPerColumnPerTable(tableDefs) - console.log( - "Finished fetching CSV table data and aggregating entities by column" - ) console.log( `Creating ${explorerProgram.decisionMatrix.numRows} base records for explorer ${slug}` @@ -687,6 +655,20 @@ export const getExplorerViewRecordsForExplorer = async ( indicatorMetadataDictionary ) + const tableDefs = explorerProgram.tableSlugs + .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) + .filter((x) => x && x.url && x.slug) as TableDef[] + + // Fetch and process CSV table data + console.log( + `Fetching CSV table data for ${slug} and aggregating entities by column` + ) + const entitiesPerColumnPerTable = + await getEntitiesPerColumnPerTable(tableDefs) + console.log( + "Finished fetching CSV table data and aggregating entities by column" + ) + const enrichedCsvRecords = await enrichWithTableData( csvBaseRecords, entitiesPerColumnPerTable diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts index f8ebd9d505e..6c6346a2213 100644 --- a/baker/algolia/utils/types.ts +++ b/baker/algolia/utils/types.ts @@ -108,7 +108,7 @@ export type IndicatorUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { export type IndicatorEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { viewGrapherId: never ySlugs: string[] - tableSlug: string + tableSlug: never availableEntities: string[] titleLength: number } @@ -116,8 +116,7 @@ export type IndicatorEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { export type CsvUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { viewGrapherId: never ySlugs: string[] - // TODO: why are there nulls here? 
- tableSlug: string | null + tableSlug: string } export type CsvEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { diff --git a/baker/updateChartEntities.ts b/baker/updateChartEntities.ts index 9e9736c19dd..10264e28155 100644 --- a/baker/updateChartEntities.ts +++ b/baker/updateChartEntities.ts @@ -120,7 +120,7 @@ const obtainAvailableEntitiesForGrapherConfig = async ( } else return [] } -export const obtainAvailableEntitiesForAllGraphers = async ( +export const obtainAvailableEntitiesForGraphers = async ( trx: db.KnexReadonlyTransaction, // Optional subset of IDs to restrict data fetching to chartIds?: number[] @@ -196,7 +196,7 @@ const updateAvailableEntitiesForAllGraphers = async ( "--- Obtaining available entity ids for all published graphers ---" ) const availableEntitiesByChartId = - await obtainAvailableEntitiesForAllGraphers(trx) + await obtainAvailableEntitiesForGraphers(trx) console.log("--- Fetch stats ---") console.log( From 6688278037fc6cb8b0e273384d17b3806339fab3 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Thu, 7 Nov 2024 19:45:44 +0000 Subject: [PATCH 10/14] =?UTF-8?q?=F0=9F=90=9B=20add=20faceting=20attribute?= =?UTF-8?q?s=20to=20ExplorerViewsAndCharts=20algolia=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/configureAlgolia.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/baker/algolia/configureAlgolia.ts b/baker/algolia/configureAlgolia.ts index d230a16bf41..c2fd82757bf 100644 --- a/baker/algolia/configureAlgolia.ts +++ b/baker/algolia/configureAlgolia.ts @@ -191,6 +191,7 @@ export const configureAlgolia = async () => { disableExactOnAttributes: ["tags", "subtitle", "availableEntities"], disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"], disablePrefixOnAttributes: ["subtitle"], + attributesForFaceting: ["tags", "availableEntities"], }) const synonyms = [ From 6976b8e9feb22a1c585e9de9af9c03304b6dcd21 Mon Sep 17 00:00:00 2001 From: Ike Saunders 
Date: Thu, 7 Nov 2024 19:45:55 +0000 Subject: [PATCH 11/14] =?UTF-8?q?=E2=9C=A8=20improved=20explorerView=20sco?= =?UTF-8?q?re=20scaling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../indexExplorerViewsAndChartsToAlgolia.ts | 38 +++++++--------- baker/algolia/utils/explorerViews.ts | 44 +++++++++++-------- baker/algolia/utils/shared.ts | 25 +++++++++++ baker/algolia/utils/types.ts | 6 +++ 4 files changed, 71 insertions(+), 42 deletions(-) diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index f652b691b95..d2fd7859f1f 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -7,23 +7,18 @@ import { } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { - explorerViewRecordToChartRecord, getExplorerViewRecords, - scaleExplorerScores, + adaptExplorerViews, } from "./utils/explorerViews.js" +import { scaleRecordScores } from "./utils/shared.js" import { getChartsRecords } from "./utils/charts.js" import { getIndexName } from "../../site/search/searchClient.js" import { SearchIndexName } from "../../site/search/searchTypes.js" -import { ConvertedExplorerChartHit } from "./utils/types.js" // We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over. // If we standardize the record shape, we could have this be the only index and have a `type` field // to use in /search. 
const indexExplorerViewsAndChartsToAlgolia = async () => { - const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts) - console.log( - `Indexing explorer views and charts to the "${indexName}" index on Algolia` - ) if (!ALGOLIA_INDEXING) return if (BUGSNAG_NODE_API_KEY) { Bugsnag.start({ @@ -32,8 +27,11 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { autoTrackSessions: false, }) } + const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts) + console.log( + `Indexing explorer views and charts to the "${indexName}" index on Algolia` + ) const client = getAlgoliaClient() - if (!client) { await logErrorAndMaybeSendToBugsnag( `Failed indexing explorer views (Algolia client not initialized)` @@ -50,29 +48,23 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { } }, db.TransactionCloseMode.Close) - const convertedNonGrapherExplorerViews: ConvertedExplorerChartHit[] = [] - for (const view of explorerViews) { - if (!view.viewGrapherId) { - convertedNonGrapherExplorerViews.push( - explorerViewRecordToChartRecord(view) - ) - } - } + // Scale grapher scores between 0 and 10000, and explorer scores between 0 and 500 + // (Except for the first view of each explorer, which we set to 10000) + // This is because Graphers are generally higher quality than Explorers + const scaledGrapherViews = scaleRecordScores(grapherViews) + const scaledExplorerViews = adaptExplorerViews(explorerViews) - const scaledExplorerViews = scaleExplorerScores( - convertedNonGrapherExplorerViews, - grapherViews - ) - const records = [...scaledExplorerViews, ...grapherViews] + const records = [...scaledGrapherViews, ...scaledExplorerViews] const index = client.initIndex(indexName) console.log(`Indexing ${records.length} records`) await index.replaceAllObjects(records) console.log(`Indexing complete`) - } catch (e) { + } catch (error) { + console.log("Error: ", error) await logErrorAndMaybeSendToBugsnag({ name: `IndexExplorerViewsToAlgoliaError`, - message: 
`${e}`, + message: error, }) } } diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index 21e51208351..1f628210e84 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -41,7 +41,10 @@ import { CsvEnrichedExplorerViewRecord, ConvertedExplorerChartHit, } from "./types.js" -import { processAvailableEntities as processRecordAvailableEntities } from "./shared.js" +import { + processAvailableEntities as processRecordAvailableEntities, + scaleRecordScores, +} from "./shared.js" import { ChartRecord, ChartRecordType, @@ -74,23 +77,23 @@ export function explorerViewRecordToChartRecord( } /** - * Scale records' positive scores to be between 0 and 10000. + * Filter out Grapher views, scale their scores, and convert them to ChartRecords. + * Each explorer has a default view (whichever is defined first in the decision matrix) + * We scale these views' scores between 0 and MAX_SCORE, but the rest we scale between 0 and 500 + * to bury them under the (higher quality) grapher views in the data catalog. 
*/ -export function scaleRecordScores(records: ChartRecord[]): ChartRecord[] { - const scores = records.map((r) => r.score) - const maxScore = Math.max(...scores) - return records.map((record): ChartRecord => { - // For ExplorerView records, we want to keep negative scores, - // because they're intentionally downranked as near-duplicates of existing views - if (record.score < 0) return record - // A value between 0 and 1 - const normalized = record.score / maxScore - const scaled = Math.round(normalized * 10000) - return { - ...record, - score: scaled, - } - }) +export function adaptExplorerViews( + explorerViews: ExplorerViewFinalRecord[] +): ChartRecord[] { + const nonGrapherViews = explorerViews.filter((view) => !view.viewGrapherId) + const [firstViews, rest] = partition( + nonGrapherViews, + (view) => view.isFirstExplorerView + ) + return [ + ...scaleRecordScores(firstViews), + ...scaleRecordScores(rest, 500), + ].map(explorerViewRecordToChartRecord) } // Creates a search-ready string from a choice. 
@@ -223,7 +226,9 @@ function makeAggregator(entityNameSlug: string) { if (!result[columnSlug]) { result[columnSlug] = new Set() } - result[columnSlug].add(entityName) + if (entityName) { + result[columnSlug].add(entityName) + } } }) @@ -335,6 +340,7 @@ const createBaseRecord = ( tableSlug: matrix.selectedRow.tableSlug, ySlugs: matrix.selectedRow.ySlugs?.split(" ") || [], explorerSlug: explorerInfo.slug, + isFirstExplorerView: index === 0, } } @@ -437,7 +443,7 @@ async function enrichRecordWithTableData( const availableEntities = uniq( ySlugs.flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) - ) + ).filter((name): name is string => !!name) return { ...record, diff --git a/baker/algolia/utils/shared.ts b/baker/algolia/utils/shared.ts index 30911b3f1fd..4b1f92a6669 100644 --- a/baker/algolia/utils/shared.ts +++ b/baker/algolia/utils/shared.ts @@ -37,3 +37,28 @@ export const processAvailableEntities = ( ["desc", "desc", "asc"] ) } + +export const MAX_SCORE = 10000 + +/** + * Scale records' positive scores to be between 0 and MAX_SCORE. 
+ */ +export function scaleRecordScores( + records: T[], + max = MAX_SCORE +): T[] { + const scores = records.map((r) => r.score) + const maxScore = Math.max(...scores) + return records.map((record): T => { + // For ExplorerView records, we want to keep negative scores, + // because they're intentionally downranked as near-duplicates of existing views + if (record.score < 0) return record + // A value between 0 and 1 + const normalized = record.score / maxScore + const scaled = Math.round(normalized * max) + return { + ...record, + score: scaled, + } + }) +} diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts index 6c6346a2213..eaf60b0a337 100644 --- a/baker/algolia/utils/types.ts +++ b/baker/algolia/utils/types.ts @@ -87,6 +87,9 @@ export interface ExplorerViewBaseRecord { ySlugs: Array yVariableIds: Array explorerSlug: string + // True when the record is the first view specified in the explorer's config + // Used in order to downrank all other views for the same explorer in the data catalog + isFirstExplorerView: boolean } export type GrapherUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { @@ -159,6 +162,9 @@ export interface ExplorerViewFinalRecord { availableEntities: string[] // Only used to filter out these views from the data catalog (because we already index graphers) viewGrapherId?: number + // True when the record is the first view specified in the explorer's config + // Used in order to downrank all other views for the same explorer in the data catalog + isFirstExplorerView: boolean } // This is the final record we index to Algolia for the `explorer-views-and-charts` index From 5f03f99ddcaba02669358e6793ddfeda3c23eb10 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Thu, 7 Nov 2024 20:17:51 +0000 Subject: [PATCH 12/14] =?UTF-8?q?=E2=9C=A8=20add=20option=20to=20skip=20gr?= =?UTF-8?q?apher=20explorer=20views?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../indexExplorerViewsAndChartsToAlgolia.ts | 2 +- baker/algolia/utils/explorerViews.ts | 32 ++++++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index d2fd7859f1f..8d49784442a 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -43,7 +43,7 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { const { explorerViews, grapherViews } = await db.knexReadonlyTransaction(async (trx) => { return { - explorerViews: await getExplorerViewRecords(trx), + explorerViews: await getExplorerViewRecords(trx, true), grapherViews: await getChartsRecords(trx), } }, db.TransactionCloseMode.Close) diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index 1f628210e84..2d1f74ff8b8 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -77,17 +77,16 @@ export function explorerViewRecordToChartRecord( } /** - * Filter out Grapher views, scale their scores, and convert them to ChartRecords. + * Scale explorer record scores then convert them to ChartRecords. * Each explorer has a default view (whichever is defined first in the decision matrix) - * We scale these views' scores between 0 and MAX_SCORE, but the rest we scale between 0 and 500 + * We scale these default view scores between 0 and MAX_SCORE, but the rest we scale between 0 and 500 * to bury them under the (higher quality) grapher views in the data catalog. 
*/ export function adaptExplorerViews( explorerViews: ExplorerViewFinalRecord[] ): ChartRecord[] { - const nonGrapherViews = explorerViews.filter((view) => !view.viewGrapherId) const [firstViews, rest] = partition( - nonGrapherViews, + explorerViews, (view) => view.isFirstExplorerView ) return [ @@ -616,7 +615,8 @@ export const getExplorerViewRecordsForExplorer = async ( trx: db.KnexReadonlyTransaction, explorerInfo: MinimalExplorerInfo, pageviews: Record, - explorerAdminServer: ExplorerAdminServer + explorerAdminServer: ExplorerAdminServer, + skipGrapherViews: boolean ): Promise => { const { slug } = explorerInfo const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) @@ -634,11 +634,14 @@ export const getExplorerViewRecordsForExplorer = async ( (record) => record.viewGrapherId !== undefined ) as [GrapherUnenrichedExplorerViewRecord[], ExplorerViewBaseRecord[]] - const enrichedGrapherRecords = await enrichWithGrapherData( - trx, - grapherBaseRecords, - explorerInfo - ) + let enrichedGrapherRecords: GrapherEnrichedExplorerViewRecord[] = [] + if (!skipGrapherViews) { + enrichedGrapherRecords = await enrichWithGrapherData( + trx, + grapherBaseRecords, + explorerInfo + ) + } const [indicatorBaseRecords, csvBaseRecords] = partition( nonGrapherBaseRecords, @@ -723,9 +726,13 @@ async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { } export const getExplorerViewRecords = async ( - trx: db.KnexReadonlyTransaction + trx: db.KnexReadonlyTransaction, + skipGrapherViews = false ): Promise => { console.log("Getting explorer view records") + if (skipGrapherViews) { + console.log("(Skipping grapher views)") + } const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) const pageviews = await getAnalyticsPageviewsByUrlObj(trx) @@ -738,7 +745,8 @@ export const getExplorerViewRecords = async ( trx, explorerInfo, pageviews, - explorerAdminServer + explorerAdminServer, + skipGrapherViews ), { concurrency: 1 } 
).then((records) => records.flat()) From ed92b59db2f0b4034e1e46917454448df6f43ac0 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Thu, 7 Nov 2024 21:25:08 +0000 Subject: [PATCH 13/14] =?UTF-8?q?=E2=9C=A8=20further=20explorer=20view=20r?= =?UTF-8?q?anking=20enhancements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/configureAlgolia.ts | 2 +- .../algolia/indexExplorerViewsAndChartsToAlgolia.ts | 12 ++++++++---- baker/algolia/utils/explorerViews.ts | 6 +++--- baker/algolia/utils/shared.ts | 9 ++++----- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/baker/algolia/configureAlgolia.ts b/baker/algolia/configureAlgolia.ts index c2fd82757bf..abb42eab36a 100644 --- a/baker/algolia/configureAlgolia.ts +++ b/baker/algolia/configureAlgolia.ts @@ -175,10 +175,10 @@ export const configureAlgolia = async () => { ], ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"], customRanking: [ + "desc(score)", // For multiple explorer views with the same title, we want to avoid surfacing duplicates. // So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc. 
"asc(viewTitleIndexWithinExplorer)", - "desc(score)", "asc(titleLength)", ], attributesToSnippet: ["subtitle:24"], diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts index 8d49784442a..1614d199d29 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -48,10 +48,14 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { } }, db.TransactionCloseMode.Close) - // Scale grapher scores between 0 and 10000, and explorer scores between 0 and 500 - // (Except for the first view of each explorer, which we set to 10000) - // This is because Graphers are generally higher quality than Explorers - const scaledGrapherViews = scaleRecordScores(grapherViews) + // Scale grapher records and the default explorer views between 1000 and 10000, + // Scale the remaining explorer views between 0 and 1000. + // This is because Graphers are generally higher quality than Explorers and we don't want + // the data catalog to smother Grapher results with hundreds of low-quality Explorer results. + const scaledGrapherViews = scaleRecordScores( + grapherViews, + [1000, 10000] + ) const scaledExplorerViews = adaptExplorerViews(explorerViews) const records = [...scaledGrapherViews, ...scaledExplorerViews] diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index 2d1f74ff8b8..84c2037fd4f 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -79,7 +79,7 @@ export function explorerViewRecordToChartRecord( /** * Scale explorer record scores then convert them to ChartRecords. 
* Each explorer has a default view (whichever is defined first in the decision matrix) - * We scale these default view scores between 0 and MAX_SCORE, but the rest we scale between 0 and 500 + * We scale these default view scores between 1000 and 10000, but the rest we scale between 0 and 1000 * to bury them under the (higher quality) grapher views in the data catalog. */ export function adaptExplorerViews( @@ -90,8 +90,8 @@ export function adaptExplorerViews( (view) => view.isFirstExplorerView ) return [ - ...scaleRecordScores(firstViews), - ...scaleRecordScores(rest, 500), + ...scaleRecordScores(firstViews, [1000, 10000]), + ...scaleRecordScores(rest, [0, 1000]), ].map(explorerViewRecordToChartRecord) } diff --git a/baker/algolia/utils/shared.ts b/baker/algolia/utils/shared.ts index 4b1f92a6669..0ebc1aef230 100644 --- a/baker/algolia/utils/shared.ts +++ b/baker/algolia/utils/shared.ts @@ -38,16 +38,15 @@ export const processAvailableEntities = ( ) } -export const MAX_SCORE = 10000 - /** - * Scale records' positive scores to be between 0 and MAX_SCORE. + * Scale records' positive scores to be between two numbers.
*/ export function scaleRecordScores( records: T[], - max = MAX_SCORE + range: [number, number] ): T[] { const scores = records.map((r) => r.score) + const [min, max] = range const maxScore = Math.max(...scores) return records.map((record): T => { // For ExplorerView records, we want to keep negative scores, @@ -55,7 +54,7 @@ export function scaleRecordScores( if (record.score < 0) return record // A value between 0 and 1 const normalized = record.score / maxScore - const scaled = Math.round(normalized * max) + const scaled = Math.round(normalized * (max - min) + min) return { ...record, score: scaled, From c510e3c71159dc7181b8aa56b60f45a053b1ee59 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Fri, 8 Nov 2024 14:13:04 +0000 Subject: [PATCH 14/14] =?UTF-8?q?=F0=9F=90=9B=20restore=20empty=20line=20s?= =?UTF-8?q?kipping=20for=20parseDelimited?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/@ourworldindata/core-table/src/CoreTableUtils.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts index 0e27c90f9b6..618c41dc8f3 100644 --- a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts +++ b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts @@ -579,6 +579,7 @@ export const parseDelimited = ( const result = Papa.parse(str, { delimiter: delimiter ?? detectDelimiter(str), header: true, + skipEmptyLines: true, transformHeader: (header: string) => header.trim(), transform: (value: string) => value.trim(), })