diff --git a/Makefile b/Makefile index e8ed24f3e9e..faa6f072e78 100644 --- a/Makefile +++ b/Makefile @@ -297,9 +297,10 @@ update.chart-entities: itsJustJavascript reindex: itsJustJavascript @echo '==> Reindexing search in Algolia' node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js - node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js + node --enable-source-maps itsJustJavascript/baker/algolia/indexPagesToAlgolia.js node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js + node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js delete-algolia-index: itsJustJavascript @echo '==> Deleting Algolia index' diff --git a/adminSiteServer/apiRouter.ts b/adminSiteServer/apiRouter.ts index 231a2314cda..7cd65bb867e 100644 --- a/adminSiteServer/apiRouter.ts +++ b/adminSiteServer/apiRouter.ts @@ -125,7 +125,7 @@ import { denormalizeLatestCountryData } from "../baker/countryProfiles.js" import { indexIndividualGdocPost, removeIndividualGdocPostFromIndex, -} from "../baker/algolia/algoliaUtils.js" +} from "../baker/algolia/utils/pages.js" import { References } from "../adminSiteClient/ChartEditor.js" import { DeployQueueServer } from "../baker/DeployQueueServer.js" import { FunctionalRouter } from "./FunctionalRouter.js" diff --git a/baker/algolia/configureAlgolia.ts b/baker/algolia/configureAlgolia.ts index 84d43ca31fc..abb42eab36a 100644 --- a/baker/algolia/configureAlgolia.ts +++ b/baker/algolia/configureAlgolia.ts @@ -159,6 +159,41 @@ export const configureAlgolia = async () => { ], }) + const explorerViewsAndChartsIndex = client.initIndex( + getIndexName(SearchIndexName.ExplorerViewsAndCharts) + ) + + await explorerViewsAndChartsIndex.setSettings({ + ...baseSettings, + searchableAttributes: [ + "unordered(title)", + "unordered(slug)", + "unordered(variantName)", + 
"unordered(subtitle)", + "unordered(tags)", + "unordered(availableEntities)", + ], + ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"], + customRanking: [ + "desc(score)", + // For multiple explorer views with the same title, we want to avoid surfacing duplicates. + // So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc. + "asc(viewTitleIndexWithinExplorer)", + "asc(titleLength)", + ], + attributesToSnippet: ["subtitle:24"], + attributeForDistinct: "id", + optionalWords: ["vs"], + + // These lines below essentially demote matches in the `subtitle` and `availableEntities` fields: + // If we find a match (only) there, then it doesn't count towards `exact`, and is therefore ranked lower. + // We also disable prefix matching and typo tolerance on these. + disableExactOnAttributes: ["tags", "subtitle", "availableEntities"], + disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"], + disablePrefixOnAttributes: ["subtitle"], + attributesForFaceting: ["tags", "availableEntities"], + }) + const synonyms = [ ["owid", "our world in data"], ["kids", "children"], diff --git a/baker/algolia/indexChartsToAlgolia.ts b/baker/algolia/indexChartsToAlgolia.ts index c2de1064c34..30086cdca04 100644 --- a/baker/algolia/indexChartsToAlgolia.ts +++ b/baker/algolia/indexChartsToAlgolia.ts @@ -1,222 +1,23 @@ import * as db from "../../db/db.js" -import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" -import { getAlgoliaClient } from "./configureAlgolia.js" -import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js" -import { ChartRecord, SearchIndexName } from "../../site/search/searchTypes.js" import { - KeyChartLevel, - OwidGdocLinkType, - excludeNullish, - isNil, - countries, - orderBy, - removeTrailingParenthetical, - uniq, -} from "@ourworldindata/utils" -import { MarkdownTextWrap } from "@ourworldindata/components" -import { getAnalyticsPageviewsByUrlObj } from 
"../../db/model/Pageview.js" -import { getRelatedArticles } from "../../db/model/Post.js" + ALGOLIA_INDEXING, + BUGSNAG_NODE_API_KEY, +} from "../../settings/serverSettings.js" +import { getAlgoliaClient } from "./configureAlgolia.js" +import { SearchIndexName } from "../../site/search/searchTypes.js" import { getIndexName } from "../../site/search/searchClient.js" -import { getPublishedLinksTo } from "../../db/model/Link.js" - -const computeScore = (record: Omit): number => { - const { numRelatedArticles, views_7d } = record - return numRelatedArticles * 500 + views_7d -} - -const countriesWithVariantNames = new Set( - countries - .filter((country) => country.variantNames?.length || country.shortName) - .map((country) => country.name) -) - -const processAvailableEntities = (availableEntities: string[] | null) => { - if (!availableEntities) return [] - - // Algolia is a bit weird with synonyms: - // If we have a synonym "USA" -> "United States", and we search for "USA", - // then it seems that Algolia can only find that within `availableEntities` - // if "USA" is within the first 100-or-so entries of the array. - // So, the easy solution is to sort the entities to ensure that countries - // with variant names are at the top. - // Also, entities containing a hyphen like "low-income countries" can also - // only be found if they're within the first 100-or-so entries. 
- // - @marcelgerber, 2024-03-25 - return orderBy( - availableEntities, - [ - (entityName) => - countriesWithVariantNames.has( - removeTrailingParenthetical(entityName) - ), - (entityName) => entityName.includes("-"), - (entityName) => entityName, - ], - ["desc", "desc", "asc"] - ) -} - -interface RawChartRecordRow { - id: number - slug: string - title: string - variantName: string - subtitle: string - numDimensions: string - publishedAt: string - updatedAt: string - entityNames: string - tags: string - keyChartForTags: string -} - -interface ParsedChartRecordRow { - id: number - slug: string - title: string - variantName: string - subtitle: string - numDimensions: string - publishedAt: string - updatedAt: string - entityNames: string[] - tags: string[] - keyChartForTags: string[] -} - -const parseAndProcessChartRecords = ( - rawRecord: RawChartRecordRow -): ParsedChartRecordRow => { - let parsedEntities: string[] = [] - if (rawRecord.entityNames !== null) { - // This is a very rough way to check for the Algolia record size limit, but it's better than the update failing - // because we exceed the 20KB record size limit - if (rawRecord.entityNames.length < 12000) - parsedEntities = excludeNullish( - JSON.parse(rawRecord.entityNames as string) as (string | null)[] - ) as string[] - else { - console.info( - `Chart ${rawRecord.id} has too many entities, skipping its entities` - ) - } - } - const entityNames = processAvailableEntities(parsedEntities) - - const tags = JSON.parse(rawRecord.tags) - const keyChartForTags = JSON.parse( - rawRecord.keyChartForTags as string - ).filter((t: string | null) => t) - - return { - ...rawRecord, - entityNames, - tags, - keyChartForTags, - } -} - -const getChartsRecords = async ( - knex: db.KnexReadonlyTransaction -): Promise => { - const chartsToIndex = await db.knexRaw( - knex, - `-- sql - WITH indexable_charts_with_entity_names AS ( - SELECT c.id, - cc.slug, - cc.full ->> "$.title" AS title, - cc.full ->> "$.variantName" AS 
variantName, - cc.full ->> "$.subtitle" AS subtitle, - JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions, - c.publishedAt, - c.updatedAt, - JSON_ARRAYAGG(e.name) AS entityNames - FROM charts c - LEFT JOIN chart_configs cc ON c.configId = cc.id - LEFT JOIN charts_x_entities ce ON c.id = ce.chartId - LEFT JOIN entities e ON ce.entityId = e.id - WHERE cc.full ->> "$.isPublished" = 'true' - AND c.isIndexable IS TRUE - GROUP BY c.id - ) - SELECT c.id, - c.slug, - c.title, - c.variantName, - c.subtitle, - c.numDimensions, - c.publishedAt, - c.updatedAt, - c.entityNames, -- this array may contain null values, will have to filter these out - JSON_ARRAYAGG(t.name) AS tags, - JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out - FROM indexable_charts_with_entity_names c - LEFT JOIN chart_tags ct ON c.id = ct.chartId - LEFT JOIN tags t on ct.tagId = t.id - GROUP BY c.id - HAVING COUNT(t.id) >= 1 - ` - ) - - const parsedRows = chartsToIndex.map(parseAndProcessChartRecords) - - const pageviews = await getAnalyticsPageviewsByUrlObj(knex) - - const parentTagsByChildName = await db.getParentTagsByChildName(knex) - - const records: ChartRecord[] = [] - for (const c of parsedRows) { - // Our search currently cannot render explorers, so don't index them because - // otherwise they will fail when rendered in the search results - if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue - - const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? [] - const linksFromGdocs = await getPublishedLinksTo( - knex, - [c.slug], - OwidGdocLinkType.Grapher - ) - - const plaintextSubtitle = isNil(c.subtitle) - ? 
undefined - : new MarkdownTextWrap({ - text: c.subtitle, - fontSize: 10, // doesn't matter, but is a mandatory field - }).plaintext - - const parentTags = c.tags.flatMap( - // a chart can be tagged with a tag that isn't in the tag graph - (tag) => parentTagsByChildName[tag] || [] - ) - - const record = { - objectID: c.id.toString(), - chartId: c.id, - slug: c.slug, - title: c.title, - variantName: c.variantName, - subtitle: plaintextSubtitle, - availableEntities: c.entityNames, - numDimensions: parseInt(c.numDimensions), - publishedAt: c.publishedAt, - updatedAt: c.updatedAt, - tags: uniq([...c.tags, ...parentTags]), - keyChartForTags: c.keyChartForTags as string[], - titleLength: c.title.length, - // Number of references to this chart in all our posts and pages - numRelatedArticles: relatedArticles.length + linksFromGdocs.length, - views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 0, - } - const score = computeScore(record) - records.push({ ...record, score }) - } - - return records -} +import { getChartsRecords } from "./utils/charts.js" +import Bugsnag from "@bugsnag/js" const indexChartsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return + if (BUGSNAG_NODE_API_KEY) { + Bugsnag.start({ + apiKey: BUGSNAG_NODE_API_KEY, + context: "index-charts-to-algolia", + autoTrackSessions: false, + }) + } const client = getAlgoliaClient() if (!client) { diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts new file mode 100644 index 00000000000..1614d199d29 --- /dev/null +++ b/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts @@ -0,0 +1,83 @@ +import Bugsnag from "@bugsnag/js" +import * as db from "../../db/db.js" +import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" +import { + ALGOLIA_INDEXING, + BUGSNAG_NODE_API_KEY, +} from "../../settings/serverSettings.js" +import { getAlgoliaClient } from "./configureAlgolia.js" +import { + getExplorerViewRecords, + 
adaptExplorerViews, +} from "./utils/explorerViews.js" +import { scaleRecordScores } from "./utils/shared.js" +import { getChartsRecords } from "./utils/charts.js" +import { getIndexName } from "../../site/search/searchClient.js" +import { SearchIndexName } from "../../site/search/searchTypes.js" + +// Rebuilds the combined explorer-views-and-charts index: loads explorer view and chart records in one +// read-only transaction, rescales their scores, and replaces every object in the Algolia index. +// We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over. +// If we standardize the record shape, we could have this be the only index and have a `type` field +// to use in /search. +const indexExplorerViewsAndChartsToAlgolia = async () => { + if (!ALGOLIA_INDEXING) return + if (BUGSNAG_NODE_API_KEY) { + Bugsnag.start({ + apiKey: BUGSNAG_NODE_API_KEY, + context: "index-explorer-views-and-charts-to-algolia", + autoTrackSessions: false, + }) + } + const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts) + console.log( + `Indexing explorer views and charts to the "${indexName}" index on Algolia` + ) + const client = getAlgoliaClient() + if (!client) { + await logErrorAndMaybeSendToBugsnag( + `Failed indexing explorer views and charts (Algolia client not initialized)` + ) + return + } + + try { + const { explorerViews, grapherViews } = + await db.knexReadonlyTransaction(async (trx) => { + return { + explorerViews: await getExplorerViewRecords(trx, true), + grapherViews: await getChartsRecords(trx), + } + }, db.TransactionCloseMode.Close) + + // Scale grapher records and the default explorer views between 1000 and 10000, + // Scale the remaining explorer views between 0 and 1000. + // This is because Graphers are generally higher quality than Explorers and we don't want + // the data catalog to smother Grapher results with hundreds of low-quality Explorer results. 
+ const scaledGrapherViews = scaleRecordScores( + grapherViews, + [1000, 10000] + ) + const scaledExplorerViews = adaptExplorerViews(explorerViews) + + const records = [...scaledGrapherViews, ...scaledExplorerViews] + + const index = client.initIndex(indexName) + console.log(`Indexing ${records.length} records`) + await index.replaceAllObjects(records) + console.log(`Indexing complete`) + } catch (error) { + console.error("Error: ", error) + await logErrorAndMaybeSendToBugsnag({ + name: `IndexExplorerViewsAndChartsToAlgoliaError`, + message: `${error}`, + }) + } +} + +process.on("unhandledRejection", (e) => { + console.error(e) + process.exit(1) +}) + +void indexExplorerViewsAndChartsToAlgolia() diff --git a/baker/algolia/indexExplorerViewsToAlgolia.ts b/baker/algolia/indexExplorerViewsToAlgolia.ts index 7a637711986..4c8568e4a2a 100644 --- a/baker/algolia/indexExplorerViewsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsToAlgolia.ts @@ -1,12 +1,4 @@ import * as db from "../../db/db.js" -import { tsvFormat } from "d3-dsv" -import { - ExplorerChoiceParams, - ExplorerControlType, - GridBoolean, - DecisionMatrix, -} from "@ourworldindata/explorer" -import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" import { ALGOLIA_INDEXING, BUGSNAG_NODE_API_KEY, @@ -14,349 +6,9 @@ import { import { getAlgoliaClient } from "./configureAlgolia.js" import { getIndexName } from "../../site/search/searchClient.js" import { SearchIndexName } from "../../site/search/searchTypes.js" -import { groupBy, keyBy, orderBy, partition } from "lodash" -import { MarkdownTextWrap } from "@ourworldindata/components" -import { DbRawVariable } from "@ourworldindata/utils" -import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" import Bugsnag from "@bugsnag/js" - -export type ExplorerBlockGraphers = { - type: "graphers" - block: { - title?: string - subtitle?: string - grapherId?: number - }[] -} - -interface ExplorerViewEntry { - viewTitle: string - viewSubtitle: 
string - 
viewSettings: string[] - viewQueryParams: string - - viewGrapherId?: number - viewFirstYIndicator?: string | number // Variable ID or ETL path - - /** - * We often have several views with the same title within an explorer, e.g. "Population". - * In order to only display _one_ of these views in search results, we need a way to demote duplicates. - * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. - */ - viewTitleIndexWithinExplorer: number - - // Potential ranking criteria - viewIndexWithinExplorer: number - titleLength: number - numNonDefaultSettings: number - // viewViews_7d: number -} - -interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry { - explorerSlug: string - explorerTitle: string - explorerSubtitle: string - explorerViews_7d: number - viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug` - numViewsWithinExplorer: number - - score: number - - objectID?: string -} - -// Creates a search-ready string from a choice. -// Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. -// Imagine a "Per capita" checkbox, for example. If it's not ticked, then we don't want searches for "per capita" to wrongfully match it. -const explorerChoiceToViewSettings = ( - choices: ExplorerChoiceParams, - decisionMatrix: DecisionMatrix -): string[] => { - return Object.entries(choices).map(([choiceName, choiceValue]) => { - const choiceControlType = - decisionMatrix.choiceNameToControlTypeMap.get(choiceName) - if (choiceControlType === ExplorerControlType.Checkbox) - return choiceValue === GridBoolean.true ? choiceName : "" - else return choiceValue - }) -} - -const computeScore = ( - record: Omit & - Partial -) => - (record.explorerViews_7d ?? 
0) * 10 - - record.numNonDefaultSettings * 50 - - record.titleLength - -const getExplorerViewRecordsForExplorerSlug = async ( - trx: db.KnexReadonlyTransaction, - slug: string -): Promise => { - const explorerConfig = await trx - .table("explorers") - .select("config") - .where({ slug }) - .first() - .then((row) => JSON.parse(row.config) as any) - - const explorerGrapherBlock: ExplorerBlockGraphers = - explorerConfig.blocks.filter( - (block: any) => block.type === "graphers" - )[0] as ExplorerBlockGraphers - - if (explorerGrapherBlock === undefined) - throw new Error(`Explorer ${slug} has no grapher block`) - - // TODO: Maybe make DecisionMatrix accept JSON directly - const tsv = tsvFormat(explorerGrapherBlock.block) - const explorerDecisionMatrix = new DecisionMatrix(tsv) - - console.log( - `Processing explorer ${slug} (${explorerDecisionMatrix.numRows} rows)` - ) - - const defaultSettings = explorerDecisionMatrix.defaultSettings - - const records = explorerDecisionMatrix - .allDecisionsAsQueryParams() - .map((choice, i) => { - explorerDecisionMatrix.setValuesFromChoiceParams(choice) - - // Check which choices are non-default, i.e. are not the first available option in a dropdown/radio - const nonDefaultSettings = Object.entries( - explorerDecisionMatrix.availableChoiceOptions - ).filter(([choiceName, choiceOptions]) => { - // Keep only choices which are not the default, which is: - // - either the options marked as `default` in the decision matrix - // - or the first available option in the decision matrix - return ( - choiceOptions.length > 1 && - !(defaultSettings[choiceName] !== undefined - ? 
defaultSettings[choiceName] === choice[choiceName] - : choice[choiceName] === choiceOptions[0]) - ) - }) - - const record: Omit< - ExplorerViewEntry, - "viewTitleIndexWithinExplorer" | "titleLength" - > = { - viewTitle: explorerDecisionMatrix.selectedRow.title, - viewSubtitle: explorerDecisionMatrix.selectedRow.subtitle, - viewSettings: explorerChoiceToViewSettings( - choice, - explorerDecisionMatrix - ), - viewGrapherId: explorerDecisionMatrix.selectedRow.grapherId, - viewFirstYIndicator: - explorerDecisionMatrix.selectedRow.yVariableIds - ?.trim() - .split(" ") - .at(0), - viewQueryParams: explorerDecisionMatrix.toString(), - - viewIndexWithinExplorer: i, - numNonDefaultSettings: nonDefaultSettings.length, - } - return record - }) - - // Enrich `grapherId`-powered views with title/subtitle - const grapherIds = records - .filter((record) => record.viewGrapherId !== undefined) - .map((record) => record.viewGrapherId as number) - - if (grapherIds.length) { - console.log( - `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${slug}` - ) - const grapherIdToTitle = await trx - .select( - trx.raw("charts.id as id"), - trx.raw("chart_configs.full->>'$.title' as title"), - trx.raw("chart_configs.full->>'$.subtitle' as subtitle") - ) - .from("charts") - .join("chart_configs", { "charts.configId": "chart_configs.id" }) - .whereIn("charts.id", grapherIds) - .andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") - .then((rows) => keyBy(rows, "id")) - - for (const record of records) { - if (record.viewGrapherId !== undefined) { - const grapherInfo = grapherIdToTitle[record.viewGrapherId] - if (grapherInfo === undefined) { - console.warn( - `Grapher id ${record.viewGrapherId} not found for explorer ${slug}` - ) - continue - } - record.viewTitle = grapherInfo.title - record.viewSubtitle = grapherInfo.subtitle - } - } - } - - // Resolve the `yIndicatorIds` field - const yIndicatorIds = records - .map((record) => record.viewFirstYIndicator) - 
.filter((id) => id !== undefined) - .filter((id) => id !== "") - - if (yIndicatorIds.length) { - console.log( - `Fetching indicator metadata from ${yIndicatorIds.length} indicators for explorer ${slug}` - ) - - type IndicatorRecord = Pick< - DbRawVariable, - | "id" - | "catalogPath" - | "titlePublic" - | "display" - | "name" - | "descriptionShort" - > - // The `yIndicatorId` can be a variable ID or a catalog path, and we want to resolve both - const indicatorIdToTitle: IndicatorRecord[] = await trx - .table("variables") - .select( - "id", - "catalogPath", - "name", - "titlePublic", - "display", - "name", - "descriptionShort" - ) - .whereIn("id", yIndicatorIds) - .orWhereIn("catalogPath", yIndicatorIds) - - const indicatorsKeyedByIdAndCatalogPath = indicatorIdToTitle.reduce( - (acc, indicator) => { - acc[indicator.id] = indicator - if (indicator.catalogPath) - acc[indicator.catalogPath] = indicator - return acc - }, - {} as Record - ) - - for (const record of records) { - if (record.viewFirstYIndicator !== undefined) { - const indicatorInfo = - indicatorsKeyedByIdAndCatalogPath[ - record.viewFirstYIndicator - ] - if (indicatorInfo === undefined) { - console.warn( - `Indicator id ${record.viewFirstYIndicator} not found for explorer ${slug}` - ) - continue - } - - // This is the fallback chain for the grapher title. it's complicated. - record.viewTitle = - record.viewTitle ?? - indicatorInfo.titlePublic ?? - (indicatorInfo.display - ? JSON.parse(indicatorInfo.display).name - : undefined) ?? - indicatorInfo.name - record.viewSubtitle = - record.viewSubtitle ?? 
indicatorInfo.descriptionShort - } - } - } - - // Drop any views where we couldn't obtain a title, for whatever reason - const [recordsWithViewTitle, recordsWithNoViewTitle] = partition( - records, - (record) => record.viewTitle !== undefined - ) - - for (const record of recordsWithNoViewTitle) { - await logErrorAndMaybeSendToBugsnag({ - name: "ExplorerViewTitleMissing", - message: `Explorer ${slug} has a view with no title: ${record.viewQueryParams}.`, - }) - } - - // Remove Markdown from viewSubtitle; do this after fetching grapher info above, as it might also contain Markdown - const recordsWithTitleLength = recordsWithViewTitle.map((record) => { - if (record.viewSubtitle) { - record.viewSubtitle = new MarkdownTextWrap({ - text: record.viewSubtitle, - fontSize: 10, // doesn't matter, but is a mandatory field - }).plaintext - } - return { ...record, titleLength: record.viewTitle.length } - }) as Omit[] - - // Compute viewTitleIndexWithinExplorer: - // First, sort by score descending (ignoring views_7d, which is not relevant _within_ an explorer). - // Then, group by viewTitle. - // Finally, ungroup again, and keep track of the index of each element within the group. 
- const recordsSortedByScore = orderBy( - recordsWithTitleLength, - (record) => computeScore(record), - "desc" - ) - const recordsGroupedByViewTitle = groupBy(recordsSortedByScore, "viewTitle") - const recordsWithIndexWithinExplorer = Object.values( - recordsGroupedByViewTitle - ).flatMap((recordsGroup) => - recordsGroup.map((record, i) => ({ - ...record, - viewTitleIndexWithinExplorer: i, - })) - ) - - return recordsWithIndexWithinExplorer -} - -const getExplorerViewRecords = async ( - trx: db.KnexReadonlyTransaction -): Promise => { - const publishedExplorers = Object.values( - await db.getPublishedExplorersBySlug(trx) - ) - - const pageviews = await getAnalyticsPageviewsByUrlObj(trx) - - let records = [] as ExplorerViewEntryWithExplorerInfo[] - for (const explorerInfo of publishedExplorers) { - const explorerViewRecords = await getExplorerViewRecordsForExplorerSlug( - trx, - explorerInfo.slug - ) - - const explorerPageviews = - pageviews[`/explorers/${explorerInfo.slug}`]?.views_7d ?? 
0 - const unscoredRecords = explorerViewRecords.map( - (record, i): Omit => ({ - ...record, - explorerSlug: explorerInfo.slug, - explorerTitle: explorerInfo.title, - explorerSubtitle: explorerInfo.subtitle, - explorerViews_7d: explorerPageviews, - viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, - numViewsWithinExplorer: explorerViewRecords.length, - - objectID: `${explorerInfo.slug}-${i}`, - }) - ) - records = records.concat( - unscoredRecords.map((record) => ({ - ...record, - score: computeScore(record), - })) - ) - } - - return records -} +import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js" +import { getExplorerViewRecords } from "./utils/explorerViews.js" const indexExplorerViewsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return @@ -385,8 +37,11 @@ const indexExplorerViewsToAlgolia = async () => { getExplorerViewRecords, db.TransactionCloseMode.Close ) + console.log(`Indexing ${records.length} explorer views to Algolia`) await index.replaceAllObjects(records) + console.log(`Indexing complete`) } catch (e) { + console.error(e) await logErrorAndMaybeSendToBugsnag({ name: `IndexExplorerViewsToAlgoliaError`, message: `${e}`, diff --git a/baker/algolia/indexToAlgolia.tsx b/baker/algolia/indexPagesToAlgolia.tsx similarity index 88% rename from baker/algolia/indexToAlgolia.tsx rename to baker/algolia/indexPagesToAlgolia.tsx index 00e7044df9a..703f618c73b 100644 --- a/baker/algolia/indexToAlgolia.tsx +++ b/baker/algolia/indexPagesToAlgolia.tsx @@ -3,9 +3,9 @@ import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" import { getAlgoliaClient } from "./configureAlgolia.js" import { SearchIndexName } from "../../site/search/searchTypes.js" import { getIndexName } from "../../site/search/searchClient.js" -import { getPagesRecords } from "./algoliaUtils.js" +import { getPagesRecords } from "./utils/pages.js" -const indexToAlgolia = async () => { +const indexPagesToAlgolia = async () => { if (!ALGOLIA_INDEXING) 
return const client = getAlgoliaClient() @@ -31,4 +31,4 @@ process.on("unhandledRejection", (e) => { process.exit(1) }) -void indexToAlgolia() +void indexPagesToAlgolia() diff --git a/baker/algolia/utils/charts.ts b/baker/algolia/utils/charts.ts new file mode 100644 index 00000000000..51f36eb0280 --- /dev/null +++ b/baker/algolia/utils/charts.ts @@ -0,0 +1,159 @@ +import { isNil, uniq } from "lodash" +import { MarkdownTextWrap } from "@ourworldindata/components" +import { KeyChartLevel, OwidGdocLinkType } from "@ourworldindata/types" +import * as db from "../../../db/db.js" +import { + ChartRecord, + ChartRecordType, +} from "../../../site/search/searchTypes.js" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { getRelatedArticles } from "../../../db/model/Post.js" +import { getPublishedLinksTo } from "../../../db/model/Link.js" +import { isPathRedirectedToExplorer } from "../../../explorerAdminServer/ExplorerRedirects.js" +import { ParsedChartRecordRow, RawChartRecordRow } from "./types.js" +import { excludeNullish } from "@ourworldindata/utils" +import { processAvailableEntities } from "./shared.js" + +// Ranking score of a chart: every related article weighs as much as 500 pageviews in the last 7 days. +const computeChartScore = (record: Omit<ChartRecord, "score">): number => { + const { numRelatedArticles, views_7d } = record + return numRelatedArticles * 500 + views_7d +} + +// Parses the JSON-encoded entityNames, tags and keyChartForTags columns of a raw SQL row; null entries are dropped from entityNames and keyChartForTags. 
+const parseAndProcessChartRecords = ( + rawRecord: RawChartRecordRow +): ParsedChartRecordRow => { + let parsedEntities: string[] = [] + if (rawRecord.entityNames !== null) { + // This is a very rough way to check for the Algolia record size limit, but it's better than the update failing + // because we exceed the 20KB record size limit + if (rawRecord.entityNames.length < 12000) + parsedEntities = excludeNullish( + JSON.parse(rawRecord.entityNames as string) as (string | null)[] + ) as string[] + else { + console.info( + `Chart ${rawRecord.id} has too many entities, skipping its entities` + ) + } + } + const entityNames = processAvailableEntities(parsedEntities) + + const 
tags = JSON.parse(rawRecord.tags) + const keyChartForTags = JSON.parse( + rawRecord.keyChartForTags as string + ).filter((t: string | null) => t) + + return { + ...rawRecord, + entityNames, + tags, + keyChartForTags, + } +} + +/** + * Builds one search record for every published, indexable chart that has at least one tag. + * Charts whose `/grapher/` path redirects to an explorer are skipped. + */ +export const getChartsRecords = async ( + knex: db.KnexReadonlyTransaction +): Promise<ChartRecord[]> => { + console.log("Fetching charts to index") + const chartsToIndex = await db.knexRaw<RawChartRecordRow>( + knex, + `-- sql + WITH indexable_charts_with_entity_names AS ( + SELECT c.id, + cc.slug, + cc.full ->> "$.title" AS title, + cc.full ->> "$.variantName" AS variantName, + cc.full ->> "$.subtitle" AS subtitle, + JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions, + c.publishedAt, + c.updatedAt, + JSON_ARRAYAGG(e.name) AS entityNames + FROM charts c + LEFT JOIN chart_configs cc ON c.configId = cc.id + LEFT JOIN charts_x_entities ce ON c.id = ce.chartId + LEFT JOIN entities e ON ce.entityId = e.id + WHERE cc.full ->> "$.isPublished" = 'true' + AND c.isIndexable IS TRUE + GROUP BY c.id + ) + SELECT c.id, + c.slug, + c.title, + c.variantName, + c.subtitle, + c.numDimensions, + c.publishedAt, + c.updatedAt, + c.entityNames, -- this array may contain null values, will have to filter these out + JSON_ARRAYAGG(t.name) AS tags, + JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out + FROM indexable_charts_with_entity_names c + LEFT JOIN chart_tags ct ON c.id = ct.chartId + LEFT JOIN tags t on ct.tagId = t.id + GROUP BY c.id + HAVING COUNT(t.id) >= 1 + ` + ) + + const parsedRows = chartsToIndex.map(parseAndProcessChartRecords) + + const pageviews = await getAnalyticsPageviewsByUrlObj(knex) + + const parentTagsByChildName = await 
db.getParentTagsByChildName(knex) + + const records: ChartRecord[] = [] + for (const c of parsedRows) { + // Our search currently cannot render explorers, so don't index them because + // otherwise they will fail when rendered 
in the search results + if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue + + const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? [] + const linksFromGdocs = await getPublishedLinksTo( + knex, + [c.slug], + OwidGdocLinkType.Grapher + ) + + const plaintextSubtitle = isNil(c.subtitle) + ? undefined + : new MarkdownTextWrap({ + text: c.subtitle, + fontSize: 10, // doesn't matter, but is a mandatory field + }).plaintext + + const parentTags = c.tags.flatMap( + // a chart can be tagged with a tag that isn't in the tag graph + (tag) => parentTagsByChildName[tag] || [] + ) + + const record = { + objectID: c.id.toString(), + type: ChartRecordType.Chart, + chartId: c.id, + slug: c.slug, + title: c.title, + variantName: c.variantName, + subtitle: plaintextSubtitle, + availableEntities: c.entityNames, + numDimensions: parseInt(c.numDimensions, 10), + publishedAt: c.publishedAt, + updatedAt: c.updatedAt, + tags: uniq([...c.tags, ...parentTags]), + keyChartForTags: c.keyChartForTags as string[], + titleLength: c.title.length, + // Number of references to this chart in all our posts and pages + numRelatedArticles: relatedArticles.length + linksFromGdocs.length, + views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 
0, + } + const score = computeChartScore(record) + records.push({ ...record, score }) + } + + return records +} diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts new file mode 100644 index 00000000000..84c2037fd4f --- /dev/null +++ b/baker/algolia/utils/explorerViews.ts @@ -0,0 +1,755 @@ +import { + ExplorerChoiceParams, + ExplorerControlType, + GridBoolean, + DecisionMatrix, + TableDef, +} from "@ourworldindata/explorer" +import { at, get, groupBy, mapValues, orderBy, partition, uniq } from "lodash" +import { MarkdownTextWrap } from "@ourworldindata/components" +import { logErrorAndMaybeSendToBugsnag } from "../../../serverUtils/errorLog.js" +import { obtainAvailableEntitiesForGraphers } from "../../updateChartEntities.js" +import { fetchS3MetadataByPath } from "../../../db/model/Variable.js" +import { getVariableMetadataRoute } from "@ourworldindata/grapher" +import pMap from "p-map" +import { ExplorerAdminServer } from "../../../explorerAdminServer/ExplorerAdminServer.js" +import { GIT_CMS_DIR } from "../../../gitCms/GitCmsConstants.js" +import { parseDelimited } from "@ourworldindata/core-table" +import { + ColumnTypeNames, + CoreRow, + MinimalExplorerInfo, +} from "@ourworldindata/types" + +import * as db from "../../../db/db.js" +import { DATA_API_URL } from "../../../settings/serverSettings.js" +import { keyBy } from "@ourworldindata/utils" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { + CsvUnenrichedExplorerViewRecord, + EnrichedExplorerRecord, + EntitiesByColumnDictionary, + ExplorerIndicatorMetadataDictionary, + ExplorerIndicatorMetadataFromDb, + ExplorerViewFinalRecord, + ExplorerViewBaseRecord, + ExplorerViewGrapherInfo, + GrapherEnrichedExplorerViewRecord, + GrapherUnenrichedExplorerViewRecord, + IndicatorEnrichedExplorerViewRecord, + IndicatorUnenrichedExplorerViewRecord, + CsvEnrichedExplorerViewRecord, + ConvertedExplorerChartHit, +} from "./types.js" +import { + 
processAvailableEntities as processRecordAvailableEntities, + scaleRecordScores, +} from "./shared.js" +import { + ChartRecord, + ChartRecordType, +} from "../../../site/search/searchTypes.js" + +export function explorerViewRecordToChartRecord( + e: ExplorerViewFinalRecord +): ConvertedExplorerChartHit { + return { + type: ChartRecordType.ExplorerView, + objectID: e.objectID!, + chartId: -1, + slug: e.explorerSlug, + queryParams: e.viewQueryParams, + title: e.viewTitle, + subtitle: e.explorerSubtitle, + variantName: "", + keyChartForTags: [], + tags: e.tags, + availableEntities: e.availableEntities, + publishedAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + numDimensions: e.numNonDefaultSettings, + titleLength: e.titleLength, + numRelatedArticles: 0, + views_7d: e.explorerViews_7d, + viewTitleIndexWithinExplorer: e.viewTitleIndexWithinExplorer, + score: e.score, + } +} + +/** + * Scale explorer record scores then convert them to ChartRecords. + * Each explorer has a default view (whichever is defined first in the decision matrix) + * We scale these default view scores between 0 and 10000, but the rest we scale between 0 and 1000 + * to bury them under the (higher quality) grapher views in the data catalog. + */ +export function adaptExplorerViews( + explorerViews: ExplorerViewFinalRecord[] +): ChartRecord[] { + const [firstViews, rest] = partition( + explorerViews, + (view) => view.isFirstExplorerView + ) + return [ + ...scaleRecordScores(firstViews, [1000, 10000]), + ...scaleRecordScores(rest, [0, 1000]), + ].map(explorerViewRecordToChartRecord) +} + +// Creates a search-ready string from a choice. +// Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included. +// Imagine a "Per capita" checkbox, for example. If it's not ticked, then we don't want searches for "per capita" to wrongfully match it. 
+const explorerChoiceToViewSettings = ( + choices: ExplorerChoiceParams, + decisionMatrix: DecisionMatrix +): string[] => { + return Object.entries(choices).map(([choiceName, choiceValue]) => { + const choiceControlType = + decisionMatrix.choiceNameToControlTypeMap.get(choiceName) + if (choiceControlType === ExplorerControlType.Checkbox) + return choiceValue === GridBoolean.true ? choiceName : "" + else return choiceValue + }) +} + +/** + * Takes records with `yVariableIds` and fetches their metadata. + * First it fetches base metadata from the DB, then it fetches availableEntities from S3. + * Returns a dictionary of metadata by id (and path, when possible): + * ``` + * { + * 123: { id: 123, name: "GDP", entityNames: ["United States", "Canada"] }, + * "an/etl#path": { id: "an/etl#path", name: "GDP", entityNames: ["United States", "Canada"] } + * } + * ``` + */ +async function fetchIndicatorMetadata( + records: IndicatorUnenrichedExplorerViewRecord[], + trx: db.KnexReadonlyTransaction +): Promise { + function checkIsETLPath(idOrPath: string | number): idOrPath is string { + return typeof idOrPath === "string" + } + + const { etlPaths, ids } = records.reduce( + ({ etlPaths, ids }, record) => { + for (const yVariableId of record.yVariableIds) { + if (checkIsETLPath(yVariableId)) { + etlPaths.add(yVariableId) + } else { + ids.add(yVariableId) + } + } + return { etlPaths, ids } + }, + { etlPaths: new Set(), ids: new Set() } + ) + + const metadataFromDB = ( + await trx + .table("variables") + .select( + "id", + "catalogPath", + "name", + "titlePublic", + "display", + "descriptionShort" + ) + .whereIn("id", [...ids]) + .orWhereIn("catalogPath", [...etlPaths]) + ).map((row) => ({ + ...row, + display: row.display ? 
JSON.parse(row.display) : {}, + })) as ExplorerIndicatorMetadataFromDb[] + + const indicatorMetadataByIdAndPath = { + ...keyBy(metadataFromDB, "id"), + ...keyBy(metadataFromDB, "catalogPath"), + } as ExplorerIndicatorMetadataDictionary + + async function fetchEntitiesForId(id: number) { + const metadata = await fetchS3MetadataByPath( + getVariableMetadataRoute(DATA_API_URL, id) + ) + const entityNames = get(metadata, "dimensions.entities.values", []) + .map((value) => value.name) + .filter((name): name is string => !!name) + + const idEntry = indicatorMetadataByIdAndPath[id] + if (idEntry) { + idEntry.entityNames = entityNames + } + const path = metadata.catalogPath + if (path) { + const pathEntry = indicatorMetadataByIdAndPath[path] + if (pathEntry) { + pathEntry.entityNames = entityNames + } + } + } + + await pMap( + metadataFromDB.map((meta) => meta.id), + fetchEntitiesForId, + { concurrency: 10 } + ) + + return indicatorMetadataByIdAndPath +} + +/** Almost always `"country"`, but sometimes things like `"location"` */ +function getEntityNameSlug(tableDef: TableDef): string { + return ( + tableDef.columnDefinitions?.find( + (col) => col.type === ColumnTypeNames.EntityName + )?.slug || "country" + ) +} + +/** + * Returns an aggregator function that can be used to aggregate entities per column in a parsed CSV + * e.g. if there's a column named "gdp", this will return an object like `{ gdp: Set }` + * containing all the entities that have any data for gdp. 
+ */ +function makeAggregator(entityNameSlug: string) { + return ( + result: Record>, + row: Record + ) => { + const entityName = row[entityNameSlug] + Object.keys(row).forEach((columnSlug) => { + if (columnSlug === entityNameSlug || columnSlug === "year") return + + const value = row[columnSlug] + if (value) { + if (!result[columnSlug]) { + result[columnSlug] = new Set() + } + if (entityName) { + result[columnSlug].add(entityName) + } + } + }) + + return result + } +} + +/** + * Fetches the CSVs for all of an explorer's tables, parses them, and aggregates their entities per column. + * Returns an object like: + * ``` + * { + * almonds: { population: ["United States", "Canada"], food__tonnes: ["United States"] }, + * olives: { population: ["United States", "Canada"], food__tonnes: ["United States", "Greece"] }, + * } + * ``` + */ +async function getEntitiesPerColumnPerTable( + tableDefs: TableDef[] +): Promise { + return pMap( + tableDefs, + (tableDef) => { + console.log("Fetching CSV table data from", tableDef.url) + return fetch(tableDef.url!) 
+ .then((res) => res.text()) + .then((csv) => parseDelimited(csv)) + .then((parsed) => { + const entityNameSlug = getEntityNameSlug(tableDef) + const aggregateEntities = makeAggregator(entityNameSlug) + const entitiesPerColumn = parsed.reduce( + aggregateEntities, + {} + ) + + // Convert sets to arrays + const entityNamesAsArray = mapValues( + entitiesPerColumn, + (set) => Array.from(set) + ) as Record + + return { [tableDef.slug!]: entityNamesAsArray } + }) + }, + { + concurrency: 5, + } + // Merge all these objects together + ).then((results) => Object.assign({}, ...results)) +} + +const computeExplorerViewScore = (record: { + explorerViews_7d: number + numNonDefaultSettings: number + titleLength: number +}) => + (record.explorerViews_7d || 0) * 10 - + record.numNonDefaultSettings * 50 - + record.titleLength + +const parseYVariableIds = (matrixRow: CoreRow): (string | number)[] => { + return ( + matrixRow.yVariableIds + ?.trim() + .split(" ") + .map((idOrPath: string) => + isNaN(parseInt(idOrPath)) ? idOrPath : parseInt(idOrPath) + ) || [] + ) +} + +const getNonDefaultSettings = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix +): [string, any][] => { + const defaultSettings = matrix.defaultSettings + return Object.entries(matrix.availableChoiceOptions).filter( + ([choiceName, choiceOptions]) => { + return ( + choiceOptions.length > 1 && + !(defaultSettings[choiceName] !== undefined + ? 
defaultSettings[choiceName] === choice[choiceName] + : choice[choiceName] === choiceOptions[0]) + ) + } + ) +} + +const createBaseRecord = ( + choice: ExplorerChoiceParams, + matrix: DecisionMatrix, + index: number, + explorerInfo: MinimalExplorerInfo +): ExplorerViewBaseRecord => { + matrix.setValuesFromChoiceParams(choice) + const nonDefaultSettings = getNonDefaultSettings(choice, matrix) + const yVariableIds = parseYVariableIds(matrix.selectedRow) + + return { + availableEntities: [], + viewTitle: matrix.selectedRow.title, + viewSubtitle: matrix.selectedRow.subtitle, + viewSettings: explorerChoiceToViewSettings(choice, matrix), + viewGrapherId: matrix.selectedRow.grapherId, + yVariableIds, + viewQueryParams: matrix.toString(), + viewIndexWithinExplorer: index, + numNonDefaultSettings: nonDefaultSettings.length, + tableSlug: matrix.selectedRow.tableSlug, + ySlugs: matrix.selectedRow.ySlugs?.split(" ") || [], + explorerSlug: explorerInfo.slug, + isFirstExplorerView: index === 0, + } +} + +const createBaseRecords = ( + explorerInfo: MinimalExplorerInfo, + matrix: DecisionMatrix +): ExplorerViewBaseRecord[] => { + return matrix + .allDecisionsAsQueryParams() + .map((choice: ExplorerChoiceParams, index: number) => + createBaseRecord(choice, matrix, index, explorerInfo) + ) +} + +const fetchGrapherInfo = async ( + trx: db.KnexReadonlyTransaction, + grapherIds: number[] +): Promise> => { + return await trx + .select( + trx.raw("charts.id as id"), + trx.raw("chart_configs.full->>'$.title' as title"), + trx.raw("chart_configs.full->>'$.subtitle' as subtitle") + ) + .from("charts") + .join("chart_configs", { "charts.configId": "chart_configs.id" }) + .whereIn("charts.id", grapherIds) + .andWhereRaw("chart_configs.full->>'$.isPublished' = 'true'") + .then((rows) => keyBy(rows, "id")) +} + +async function enrichRecordWithGrapherInfo( + record: GrapherUnenrichedExplorerViewRecord, + grapherInfo: Record, + availableEntities: Map, + explorerInfo: MinimalExplorerInfo +): 
Promise { + const grapher = grapherInfo[record.viewGrapherId] + if (!grapher) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewGrapherMissing", + message: `Explorer with slug "${explorerInfo.slug}" has a view with a missing grapher: ${record.viewQueryParams}.`, + }) + return + } + + return { + ...record, + availableEntities: + availableEntities.get(record.viewGrapherId)?.availableEntities ?? + [], + viewTitle: grapher.title, + viewSubtitle: grapher.subtitle, + titleLength: grapher.title.length, + } +} + +const enrichWithGrapherData = async ( + trx: db.KnexReadonlyTransaction, + records: GrapherUnenrichedExplorerViewRecord[], + explorerInfo: MinimalExplorerInfo +): Promise => { + if (!records.length) return [] + const grapherIds = records.map((record) => record.viewGrapherId as number) + + console.log( + `Fetching grapher configs from ${grapherIds.length} graphers for explorer ${explorerInfo.slug}` + ) + const grapherInfo = await fetchGrapherInfo(trx, grapherIds) + const availableEntities = await obtainAvailableEntitiesForGraphers( + trx, + grapherIds + ) + + const enrichedRecords: GrapherEnrichedExplorerViewRecord[] = [] + for (const record of records) { + const enrichedRecord = await enrichRecordWithGrapherInfo( + record, + grapherInfo, + availableEntities, + explorerInfo + ) + if (enrichedRecord) enrichedRecords.push(enrichedRecord) + } + return enrichedRecords +} + +async function enrichRecordWithTableData( + record: CsvUnenrichedExplorerViewRecord, + entitiesPerColumnPerTable: EntitiesByColumnDictionary +): Promise { + const { tableSlug, ySlugs, viewTitle } = record + if (!tableSlug || !ySlugs?.length || !viewTitle) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewMissingData", + message: `Explorer with slug "${record.explorerSlug}" has a view with missing data: ${record.viewQueryParams}.`, + }) + return + } + + const availableEntities = uniq( + ySlugs.flatMap((ySlug) => entitiesPerColumnPerTable[tableSlug][ySlug]) + ).filter((name): 
name is string => !!name) + + return { + ...record, + availableEntities, + titleLength: viewTitle.length, + } +} + +async function enrichWithTableData( + records: CsvUnenrichedExplorerViewRecord[], + entitiesPerColumnPerTable: EntitiesByColumnDictionary +): Promise { + const enrichedRecords: CsvEnrichedExplorerViewRecord[] = [] + + for (const record of records) { + const enrichedRecord = await enrichRecordWithTableData( + record, + entitiesPerColumnPerTable + ) + if (enrichedRecord) { + enrichedRecords.push(enrichedRecord) + } + } + return enrichedRecords +} + +function enrichRecordWithIndicatorData( + record: IndicatorUnenrichedExplorerViewRecord, + indicatorMetadataDictionary: ExplorerIndicatorMetadataDictionary +): IndicatorEnrichedExplorerViewRecord { + const allEntityNames = at( + indicatorMetadataDictionary, + record.yVariableIds + ).flatMap((meta) => meta.entityNames) + + const uniqueNonEmptyEntityNames = uniq(allEntityNames).filter( + (name): name is string => !!name + ) + + const firstYIndicator = record.yVariableIds[0] + + const indicatorInfo = indicatorMetadataDictionary[firstYIndicator] + + const viewTitle = + record.viewTitle || + indicatorInfo.titlePublic || + indicatorInfo.display?.name || + (indicatorInfo.name as string) + + const viewSubtitle = + record.viewSubtitle || (indicatorInfo.descriptionShort as string) + + return { + ...record, + availableEntities: uniqueNonEmptyEntityNames, + viewTitle, + viewSubtitle, + titleLength: viewTitle.length, + } +} + +const enrichWithIndicatorMetadata = async ( + indicatorBaseRecords: IndicatorUnenrichedExplorerViewRecord[], + indicatorMetadataDictionary: ExplorerIndicatorMetadataDictionary +): Promise => { + return indicatorBaseRecords.map((indicatorBaseRecord) => + enrichRecordWithIndicatorData( + indicatorBaseRecord, + indicatorMetadataDictionary + ) + ) +} + +function processSubtitles( + records: EnrichedExplorerRecord[] +): EnrichedExplorerRecord[] { + return records.map((record) => { + // Remove markdown 
links from text + const viewSubtitle = record.viewSubtitle + ? new MarkdownTextWrap({ + text: record.viewSubtitle, + fontSize: 10, + }).plaintext + : undefined + return { + ...record, + viewSubtitle, + } as EnrichedExplorerRecord + }) +} + +async function processAvailableEntities( + records: EnrichedExplorerRecord[] +): Promise { + const processedRecords: EnrichedExplorerRecord[] = [] + for (const record of records) { + const availableEntities = processRecordAvailableEntities( + record.availableEntities + ) + if (!availableEntities) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerViewMissingData", + message: `Explorer with slug "${record.explorerSlug}" has a view with missing entities: ${record.viewQueryParams}.`, + }) + } else { + processedRecords.push({ + ...record, + availableEntities, + }) + } + } + return processedRecords +} + +async function finalizeRecords( + records: EnrichedExplorerRecord[], + slug: string, + pageviews: Record, + explorerInfo: MinimalExplorerInfo +): Promise { + const withCleanSubtitles = processSubtitles(records) + + const withCleanEntities = await processAvailableEntities(withCleanSubtitles) + + const withPageviews = withCleanEntities.map((record) => ({ + ...record, + explorerViews_7d: get(pageviews, [`/explorers/${slug}`, "views_7d"], 0), + })) + + const unsortedFinalRecords = withPageviews.map( + ( + record, + i + ): Omit => ({ + ...record, + viewSettings: record.viewSettings.filter((x): x is string => !!x), + viewTitle: record.viewTitle!, + viewSubtitle: record.viewSubtitle!, + explorerSlug: explorerInfo.slug, + explorerTitle: explorerInfo.title, + explorerSubtitle: explorerInfo.subtitle, + viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`, + numViewsWithinExplorer: withPageviews.length, + tags: explorerInfo.tags, + objectID: `${explorerInfo.slug}-${i}`, + score: computeExplorerViewScore(record), + }) + ) + + const sortedByScore = orderBy( + unsortedFinalRecords, + computeExplorerViewScore, + "desc" + ) 
+ + const groupedByTitle = groupBy(sortedByScore, "viewTitle") + + const indexedExplorerViewData = Object.values(groupedByTitle).flatMap( + (records) => + records.map((record, i) => ({ + ...record, + viewTitleIndexWithinExplorer: i, + })) + ) + + return indexedExplorerViewData +} + +export const getExplorerViewRecordsForExplorer = async ( + trx: db.KnexReadonlyTransaction, + explorerInfo: MinimalExplorerInfo, + pageviews: Record, + explorerAdminServer: ExplorerAdminServer, + skipGrapherViews: boolean +): Promise => { + const { slug } = explorerInfo + const explorerProgram = await explorerAdminServer.getExplorerFromSlug(slug) + + console.log( + `Creating ${explorerProgram.decisionMatrix.numRows} base records for explorer ${slug}` + ) + const baseRecords = createBaseRecords( + explorerInfo, + explorerProgram.decisionMatrix + ) + + const [grapherBaseRecords, nonGrapherBaseRecords] = partition( + baseRecords, + (record) => record.viewGrapherId !== undefined + ) as [GrapherUnenrichedExplorerViewRecord[], ExplorerViewBaseRecord[]] + + let enrichedGrapherRecords: GrapherEnrichedExplorerViewRecord[] = [] + if (!skipGrapherViews) { + enrichedGrapherRecords = await enrichWithGrapherData( + trx, + grapherBaseRecords, + explorerInfo + ) + } + + const [indicatorBaseRecords, csvBaseRecords] = partition( + nonGrapherBaseRecords, + (record) => record.yVariableIds.length > 0 + ) as [ + IndicatorUnenrichedExplorerViewRecord[], + CsvUnenrichedExplorerViewRecord[], + ] + + // Fetch and apply indicator metadata + console.log("Fetching indicator metadata for explorer", slug) + const indicatorMetadataDictionary = await fetchIndicatorMetadata( + indicatorBaseRecords, + trx + ) + console.log("Fetched indicator metadata for explorer", slug) + + const enrichedIndicatorRecords = await enrichWithIndicatorMetadata( + indicatorBaseRecords, + indicatorMetadataDictionary + ) + + const tableDefs = explorerProgram.tableSlugs + .map((tableSlug) => explorerProgram.getTableDef(tableSlug)) + .filter((x) 
=> x && x.url && x.slug) as TableDef[] + + // Fetch and process CSV table data + console.log( + `Fetching CSV table data for ${slug} and aggregating entities by column` + ) + const entitiesPerColumnPerTable = + await getEntitiesPerColumnPerTable(tableDefs) + console.log( + "Finished fetching CSV table data and aggregating entities by column" + ) + + const enrichedCsvRecords = await enrichWithTableData( + csvBaseRecords, + entitiesPerColumnPerTable + ) + + const enrichedRecords = [ + ...enrichedGrapherRecords, + ...enrichedIndicatorRecords, + ...enrichedCsvRecords, + ] + + // // Finalize records with titles, sorting, and grouping + return finalizeRecords(enrichedRecords, slug, pageviews, explorerInfo) +} + +async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { + const explorersBySlug = await db.getPublishedExplorersBySlug(trx) + // The DB query gets the tags for the explorer, but we need to add the parent tags as well. + // This isn't done in the query because it would require a recursive CTE. + // It's easier to write that query once, separately, and reuse it. 
+ const parentTags = await db.getParentTagsByChildName(trx) + const publishedExplorersWithTags = [] + + for (const explorer of Object.values(explorersBySlug)) { + if (!explorer.tags.length) { + await logErrorAndMaybeSendToBugsnag({ + name: "ExplorerTagMissing", + message: `Explorer "${explorer.slug}" has no tags.`, + }) + } + const tags = new Set() + for (const tag of explorer.tags) { + tags.add(tag) + for (const parentTag of parentTags[tag]) { + tags.add(parentTag) + } + } + + publishedExplorersWithTags.push({ + ...explorer, + tags: Array.from(tags), + }) + } + + return publishedExplorersWithTags +} + +export const getExplorerViewRecords = async ( + trx: db.KnexReadonlyTransaction, + skipGrapherViews = false +): Promise => { + console.log("Getting explorer view records") + if (skipGrapherViews) { + console.log("(Skipping grapher views)") + } + const publishedExplorersWithTags = await getExplorersWithInheritedTags(trx) + const pageviews = await getAnalyticsPageviewsByUrlObj(trx) + + const explorerAdminServer = new ExplorerAdminServer(GIT_CMS_DIR) + + const records = await pMap( + publishedExplorersWithTags, + (explorerInfo) => + getExplorerViewRecordsForExplorer( + trx, + explorerInfo, + pageviews, + explorerAdminServer, + skipGrapherViews + ), + { concurrency: 1 } + ).then((records) => records.flat()) + + return records +} diff --git a/baker/algolia/algoliaUtils.tsx b/baker/algolia/utils/pages.ts similarity index 91% rename from baker/algolia/algoliaUtils.tsx rename to baker/algolia/utils/pages.ts index 70767e36cab..24d6d2781cb 100644 --- a/baker/algolia/algoliaUtils.tsx +++ b/baker/algolia/utils/pages.ts @@ -1,6 +1,6 @@ -import * as db from "../../db/db.js" -import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js" -import { chunkParagraphs } from "../chunk.js" +import * as db from "../../../db/db.js" +import { ALGOLIA_INDEXING } from "../../../settings/serverSettings.js" +import { chunkParagraphs } from "../../chunk.js" import { countries, Country, @@ 
-17,36 +17,31 @@ import { DEFAULT_GDOC_FEATURED_IMAGE, DEFAULT_THUMBNAIL_FILENAME, } from "@ourworldindata/utils" -import { formatPost } from "../formatWordpressPost.js" +import { formatPost } from "../../formatWordpressPost.js" import ReactDOMServer from "react-dom/server.js" -import { getAlgoliaClient } from "./configureAlgolia.js" +import { getAlgoliaClient } from "../configureAlgolia.js" import { htmlToText } from "html-to-text" import { PageRecord, - PageType, SearchIndexName, -} from "../../site/search/searchTypes.js" -import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" -import { ArticleBlocks } from "../../site/gdocs/components/ArticleBlocks.js" +} from "../../../site/search/searchTypes.js" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { ArticleBlocks } from "../../../site/gdocs/components/ArticleBlocks.js" import React from "react" import { getFullPost, getPostTags, getPostsFromSnapshots, -} from "../../db/model/Post.js" -import { getIndexName } from "../../site/search/searchClient.js" +} from "../../../db/model/Post.js" +import { getIndexName } from "../../../site/search/searchClient.js" import { ObjectWithObjectID } from "@algolia/client-search" import { SearchIndex } from "algoliasearch" import { match, P } from "ts-pattern" -import { gdocFromJSON } from "../../db/model/Gdoc/GdocFactory.js" -import { formatUrls } from "../../site/formatting.js" +import { gdocFromJSON } from "../../../db/model/Gdoc/GdocFactory.js" +import { formatUrls } from "../../../site/formatting.js" +import { TypeAndImportance } from "./types.js" -interface TypeAndImportance { - type: PageType - importance: number -} - -const computeScore = (record: Omit): number => { +const computePageScore = (record: Omit): number => { const { importance, views_7d } = record return importance * 1000 + views_7d } @@ -70,7 +65,7 @@ function generateCountryRecords( documentType: "country-page" as const, thumbnailUrl: 
`/${DEFAULT_THUMBNAIL_FILENAME}`, } - const score = computeScore(record) + const score = computePageScore(record) return { ...record, score } }) } @@ -152,7 +147,7 @@ async function generateWordpressRecords( views_7d: pageviews[`/${post.path}`]?.views_7d ?? 0, documentType: "wordpress" as const, } - const score = computeScore(record) + const score = computePageScore(record) records.push({ ...record, score }) i += 1 } @@ -205,9 +200,13 @@ function generateGdocRecords( if (!gdoc.content.body) continue // Only rendering the blocks - not the page nav, title, byline, etc const renderedPostContent = ReactDOMServer.renderToStaticMarkup( -
- -
+ React.createElement( + "div", + null, + React.createElement(ArticleBlocks, { + blocks: gdoc.content.body, + }) + ) ) const chunks = generateChunksFromHtmlText(renderedPostContent) const postTypeAndImportance = getPostTypeAndImportance(gdoc) @@ -230,7 +229,7 @@ function generateGdocRecords( authors: gdoc.content.authors, thumbnailUrl, } - const score = computeScore(record) + const score = computePageScore(record) records.push({ ...record, score }) i += 1 } diff --git a/baker/algolia/utils/shared.ts b/baker/algolia/utils/shared.ts new file mode 100644 index 00000000000..0ebc1aef230 --- /dev/null +++ b/baker/algolia/utils/shared.ts @@ -0,0 +1,63 @@ +import { + countries, + orderBy, + removeTrailingParenthetical, +} from "@ourworldindata/utils" + +const countriesWithVariantNames = new Set( + countries + .filter((country) => country.variantNames?.length || country.shortName) + .map((country) => country.name) +) + +export const processAvailableEntities = ( + availableEntities: string[] | null +) => { + if (!availableEntities) return [] + + // Algolia is a bit weird with synonyms: + // If we have a synonym "USA" -> "United States", and we search for "USA", + // then it seems that Algolia can only find that within `availableEntities` + // if "USA" is within the first 100-or-so entries of the array. + // So, the easy solution is to sort the entities to ensure that countries + // with variant names are at the top. + // Also, entities containing a hyphen like "low-income countries" can also + // only be found if they're within the first 100-or-so entries. + // - @marcelgerber, 2024-03-25 + return orderBy( + availableEntities, + [ + (entityName) => + countriesWithVariantNames.has( + removeTrailingParenthetical(entityName) + ), + (entityName) => entityName.includes("-"), + (entityName) => entityName, + ], + ["desc", "desc", "asc"] + ) +} + +/** + * Scale records' positive scores to be between two numbers. 
+ */ +export function scaleRecordScores( + records: T[], + range: [number, number] +): T[] { + const scores = records.map((r) => r.score) + const [min, max] = range + const maxScore = Math.max(...scores) + return records.map((record): T => { + // For ExplorerView records, we want to keep negative scores, + // because they're intentionally downranked as near-duplicates of existing views + if (record.score < 0) return record + // A value between 0 and 1 + const normalized = record.score / maxScore + const scaled = Math.round(normalized * (max - min) + min) + return { + ...record, + score: scaled, + } + }) +} diff --git a/baker/algolia/utils/types.ts b/baker/algolia/utils/types.ts new file mode 100644 index 00000000000..eaf60b0a337 --- /dev/null +++ b/baker/algolia/utils/types.ts @@ -0,0 +1,173 @@ +import { DbEnrichedVariable } from "@ourworldindata/types" +import { ChartRecord, PageType } from "../../../site/search/searchTypes.js" + +/** Pages */ +export interface TypeAndImportance { + type: PageType + importance: number +} + +/** Charts */ +export interface RawChartRecordRow { + id: number + slug: string + title: string + variantName: string + subtitle: string + numDimensions: string + publishedAt: string + updatedAt: string + entityNames: string + tags: string + keyChartForTags: string +} + +export interface ParsedChartRecordRow { + id: number + slug: string + title: string + variantName: string + subtitle: string + numDimensions: string + publishedAt: string + updatedAt: string + entityNames: string[] + tags: string[] + keyChartForTags: string[] +} + +/** Explorers */ +export interface IndicatorMetadata { + entityNames: string[] + titlePublic?: string + display?: { name: string } + name: string + descriptionShort?: string +} + +export interface ExplorerViewGrapherInfo { + id: number + title: string + subtitle: string +} + +export type EntitiesByColumnDictionary = Record< + string, + Record +> + +export type ExplorerIndicatorMetadataFromDb = Pick< + 
DbEnrichedVariable, + | "id" + | "catalogPath" + | "name" + | "titlePublic" + | "display" + | "descriptionShort" +> + +export type ExplorerIndicatorMetadataDictionary = Record< + string | number, + ExplorerIndicatorMetadataFromDb & { + entityNames?: string[] + } +> + +export interface ExplorerViewBaseRecord { + availableEntities: string[] + numNonDefaultSettings: number + tableSlug?: string + viewGrapherId?: number + viewIndexWithinExplorer: number + viewQueryParams: string + // TODO: are nulls necessary here? + viewSettings: Array + viewSubtitle?: string + viewTitle?: string + ySlugs: Array + yVariableIds: Array + explorerSlug: string + // True when the record is the first view specified in the explorer's config + // Used in order to downrank all other views for the same explorer in the data catalog + isFirstExplorerView: boolean +} + +export type GrapherUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: number +} + +export type GrapherEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewTitle: string + viewSubtitle: string + titleLength: number +} + +export type IndicatorUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: [] + tableSlug: never +} + +export type IndicatorEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + tableSlug: never + availableEntities: string[] + titleLength: number +} + +export type CsvUnenrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + tableSlug: string +} + +export type CsvEnrichedExplorerViewRecord = ExplorerViewBaseRecord & { + viewGrapherId: never + ySlugs: string[] + tableSlug: string + titleLength: number +} + +export type EnrichedExplorerRecord = + | GrapherEnrichedExplorerViewRecord + | IndicatorEnrichedExplorerViewRecord + | CsvEnrichedExplorerViewRecord + +/** This is the final record we index to Algolia for the `explorer-views` index */ +export interface 
ExplorerViewFinalRecord { + objectID: string + explorerTitle: string + viewTitle: string + viewSettings: string[] + /** + * We often have several views with the same title within an explorer, e.g. "Population". + * In order to only display _one_ of these views in search results, we need a way to demote duplicates. + * This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc. + */ + viewTitleIndexWithinExplorer: number + score: number + viewIndexWithinExplorer: number + viewSubtitle: string + viewQueryParams: string + titleLength: number + numNonDefaultSettings: number + explorerSlug: string + explorerSubtitle: string + explorerViews_7d: number + viewTitleAndExplorerSlug: string + numViewsWithinExplorer: number + // These 2 aren't currently used in the explorer-views index (used in /search), but we need them in the data catalog + tags: string[] + availableEntities: string[] + // Only used to filter out these views from the data catalog (because we already index graphers) + viewGrapherId?: number + // True when the record is the first view specified in the explorer's config + // Used in order to downrank all other views for the same explorer in the data catalog + isFirstExplorerView: boolean +} + +// This is the final record we index to Algolia for the `explorer-views-and-charts` index +export type ConvertedExplorerChartHit = ChartRecord & { + viewTitleIndexWithinExplorer: number +} diff --git a/baker/updateChartEntities.ts b/baker/updateChartEntities.ts index 5ba84537674..10264e28155 100644 --- a/baker/updateChartEntities.ts +++ b/baker/updateChartEntities.ts @@ -120,8 +120,10 @@ const obtainAvailableEntitiesForGrapherConfig = async ( } else return [] } -const obtainAvailableEntitiesForAllGraphers = async ( - trx: db.KnexReadonlyTransaction +export const obtainAvailableEntitiesForGraphers = async ( + trx: db.KnexReadonlyTransaction, + // Optional subset of IDs to restrict data fetching to + chartIds?: 
number[] ) => { const entityNameToIdMap = await mapEntityNamesToEntityIds(trx) @@ -134,10 +136,17 @@ const obtainAvailableEntitiesForAllGraphers = async ( FROM charts c JOIN chart_configs cc ON c.configId = cc.id WHERE cc.full ->> "$.isPublished" = 'true' + ${chartIds && chartIds.length ? `AND c.id IN (${chartIds.join(",")})` : ""} ` ) - const availableEntitiesByChartId = new Map() + const availableEntitiesByChartId = new Map< + number, + { + availableEntities: string[] + availableEntityIds: number[] + } + >() await pMap( allPublishedGraphers, async (grapher) => { @@ -156,7 +165,10 @@ const obtainAvailableEntitiesForAllGraphers = async ( return [entityId] } ) - availableEntitiesByChartId.set(grapher.id, availableEntityIds) + availableEntitiesByChartId.set(grapher.id, { + availableEntities, + availableEntityIds, + }) console.log( grapher.id, @@ -184,7 +196,7 @@ const updateAvailableEntitiesForAllGraphers = async ( "--- Obtaining available entity ids for all published graphers ---" ) const availableEntitiesByChartId = - await obtainAvailableEntitiesForAllGraphers(trx) + await obtainAvailableEntitiesForGraphers(trx) console.log("--- Fetch stats ---") console.log( @@ -194,7 +206,10 @@ const updateAvailableEntitiesForAllGraphers = async ( console.log("--- Updating charts_x_entities ---") await trx.delete().from(ChartsXEntitiesTableName) // clears out the WHOLE table - for (const [chartId, availableEntityIds] of availableEntitiesByChartId) { + for (const [ + chartId, + { availableEntityIds }, + ] of availableEntitiesByChartId) { const rows = availableEntityIds.map((entityId) => ({ chartId, entityId, diff --git a/db/db.ts b/db/db.ts index 8f28b746806..8cc02389120 100644 --- a/db/db.ts +++ b/db/db.ts @@ -9,7 +9,6 @@ import { import { registerExitHandler } from "./cleanup.js" import { createTagGraph, keyBy } from "@ourworldindata/utils" import { - DbChartTagJoin, ImageMetadata, MinimalDataInsightInterface, OwidGdocType, @@ -28,6 +27,7 @@ import { OwidGdoc, DbPlainTag, 
TagGraphNode, + MinimalExplorerInfo, } from "@ourworldindata/types" import { groupBy, uniq } from "lodash" import { gdocFromJSON } from "./model/Gdoc/GdocFactory.js" @@ -197,7 +197,7 @@ export const getSlugsWithPublishedGdocsSuccessors = async ( export const getExplorerTags = async ( knex: KnexReadonlyTransaction -): Promise<{ slug: string; tags: DbChartTagJoin[] }[]> => { +): Promise<{ slug: string; tags: Pick[] }[]> => { return knexRaw<{ slug: string; tags: string }>( knex, `-- sql @@ -216,21 +216,14 @@ export const getExplorerTags = async ( ).then((rows) => rows.map((row) => ({ slug: row.slug, - tags: JSON.parse(row.tags) as DbChartTagJoin[], + tags: JSON.parse(row.tags) as Pick[], })) ) } export const getPublishedExplorersBySlug = async ( knex: KnexReadonlyTransaction -): Promise<{ - [slug: string]: { - slug: string - title: string - subtitle: string - tags: DbChartTagJoin[] - } -}> => { +): Promise> => { const tags = await getExplorerTags(knex) const tagsBySlug = keyBy(tags, "slug") return knexRaw( @@ -246,11 +239,14 @@ export const getPublishedExplorersBySlug = async ( isPublished = TRUE` ).then((rows) => { const processed = rows.map((row: any) => { + const tagsForExplorer = tagsBySlug[row.slug] return { slug: row.slug, title: row.title, subtitle: row.subtitle === "null" ? "" : row.subtitle, - tags: tagsBySlug[row.slug]?.tags ?? [], + tags: tagsForExplorer + ? tagsForExplorer.tags.map((tag) => tag.name) + : [], } }) return keyBy(processed, "slug") diff --git a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts index 6f1a9bdb9e8..618c41dc8f3 100644 --- a/packages/@ourworldindata/core-table/src/CoreTableUtils.ts +++ b/packages/@ourworldindata/core-table/src/CoreTableUtils.ts @@ -578,8 +578,8 @@ export const parseDelimited = ( const result = Papa.parse(str, { delimiter: delimiter ?? 
detectDelimiter(str), - skipEmptyLines: true, header: true, + skipEmptyLines: true, transformHeader: (header: string) => header.trim(), transform: (value: string) => value.trim(), }) diff --git a/packages/@ourworldindata/explorer/src/ExplorerProgram.ts b/packages/@ourworldindata/explorer/src/ExplorerProgram.ts index 07f998345f5..5535566d998 100644 --- a/packages/@ourworldindata/explorer/src/ExplorerProgram.ts +++ b/packages/@ourworldindata/explorer/src/ExplorerProgram.ts @@ -48,6 +48,7 @@ export interface TableDef { url?: string columnDefinitions?: OwidColumnDef[] inlineData?: string[][] + slug?: TableSlug } interface ExplorerGrapherInterface extends GrapherInterface { @@ -477,6 +478,7 @@ export class ExplorerProgram extends GridProgram { url, columnDefinitions, inlineData, + slug: tableSlug, } } } diff --git a/packages/@ourworldindata/explorer/src/index.ts b/packages/@ourworldindata/explorer/src/index.ts index 70eb37f3a43..036fc1450ef 100644 --- a/packages/@ourworldindata/explorer/src/index.ts +++ b/packages/@ourworldindata/explorer/src/index.ts @@ -11,6 +11,7 @@ export { EXPLORER_EMBEDDED_FIGURE_SELECTOR, ExplorerChartCreationMode, ExplorerContainerId, + ExplorerControlType, ExplorerControlTypeRegex, EXPLORERS_GIT_CMS_FOLDER, EXPLORERS_PREVIEW_ROUTE, @@ -23,7 +24,6 @@ export { type ExplorerChoice, type ExplorerChoiceOption, type ExplorerChoiceParams, - type ExplorerControlType, type ExplorerFullQueryParams, type ExplorersRouteResponse, type ExplorerStandardQueryParams, @@ -32,6 +32,7 @@ export { } from "./ExplorerConstants.js" export { + type TableDef, ExplorerProgram, EXPLORER_FILE_SUFFIX, makeFullPath, diff --git a/packages/@ourworldindata/types/src/dbTypes/Explorers.ts b/packages/@ourworldindata/types/src/dbTypes/Explorers.ts index cafe0a7809e..9ccd0ef3ea1 100644 --- a/packages/@ourworldindata/types/src/dbTypes/Explorers.ts +++ b/packages/@ourworldindata/types/src/dbTypes/Explorers.ts @@ -10,3 +10,11 @@ export interface DbInsertExplorer { } export type 
DbPlainExplorer = Required // TODO: add enriched type and type config properly + +/** A sparse set of explorer metadata. Currently used to begin Algolia indexing with */ +export type MinimalExplorerInfo = { + slug: string + title: string + subtitle: string + tags: string[] +} diff --git a/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts b/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts index 6f15c14f15a..8adff305b33 100644 --- a/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts +++ b/packages/@ourworldindata/types/src/gdocTypes/Gdoc.ts @@ -11,7 +11,6 @@ import { RawBlockText, RefDictionary, } from "./ArchieMlComponents.js" -import { DbChartTagJoin } from "../dbTypes/ChartTags.js" import { MinimalTag } from "../dbTypes/Tags.js" import { DbEnrichedLatestWork } from "../domainTypes/Author.js" @@ -42,7 +41,7 @@ export interface LinkedChart { title: string subtitle?: string thumbnail?: string - tags: DbChartTagJoin[] + tags: string[] tab?: GrapherTabOption indicatorId?: number // in case of a datapage } diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts index 4f0e8a12d1a..0335c469664 100644 --- a/packages/@ourworldindata/types/src/index.ts +++ b/packages/@ourworldindata/types/src/index.ts @@ -502,6 +502,7 @@ export { type DbPlainExplorer, type DbInsertExplorer, ExplorersTableName, + type MinimalExplorerInfo, } from "./dbTypes/Explorers.js" export { type DbPlainExplorerVariable, diff --git a/settings/clientSettings.ts b/settings/clientSettings.ts index 2fbccb0cb4c..73d9a362987 100644 --- a/settings/clientSettings.ts +++ b/settings/clientSettings.ts @@ -38,6 +38,9 @@ export const BAKED_SITE_EXPORTS_BASE_URL: string = export const GRAPHER_DYNAMIC_THUMBNAIL_URL: string = process.env.GRAPHER_DYNAMIC_THUMBNAIL_URL ?? `${BAKED_GRAPHER_URL}` +export const EXPLORER_DYNAMIC_THUMBNAIL_URL: string = + process.env.EXPLORER_DYNAMIC_THUMBNAIL_URL ?? 
`${BAKED_BASE_URL}/explorers` + export const GRAPHER_DYNAMIC_CONFIG_URL: string = process.env.GRAPHER_DYNAMIC_CONFIG_URL ?? `${BAKED_GRAPHER_URL}` diff --git a/site/DataCatalog/DataCatalogUtils.ts b/site/DataCatalog/DataCatalogUtils.ts index 040b3fbc2f5..7840b420882 100644 --- a/site/DataCatalog/DataCatalogUtils.ts +++ b/site/DataCatalog/DataCatalogUtils.ts @@ -4,7 +4,7 @@ import { SearchResponse, } from "instantsearch.js" import { getIndexName } from "../search/searchClient.js" -import { SearchIndexName } from "../search/searchTypes.js" +import { ChartRecordType, SearchIndexName } from "../search/searchTypes.js" import { TagGraphRoot } from "@ourworldindata/types" import { DataCatalogState } from "./DataCatalogState.js" import { countriesByName, Region } from "@ourworldindata/utils" @@ -13,13 +13,15 @@ import { SearchClient } from "algoliasearch" /** * Constants */ -const CHARTS_INDEX = getIndexName(SearchIndexName.Charts) +const CHARTS_INDEX = getIndexName(SearchIndexName.ExplorerViewsAndCharts) const DATA_CATALOG_ATTRIBUTES = [ "title", "slug", "availableEntities", "variantName", + "type", + "queryParams", ] /** @@ -44,6 +46,8 @@ export type IDataCatalogHit = { availableEntities: string[] objectID: string variantName: string | null + type: ChartRecordType + queryParams: string __position: number _highlightResult?: HitHighlightResult _snippetResult?: HitHighlightResult diff --git a/site/gdocs/components/ExplorerTiles.tsx b/site/gdocs/components/ExplorerTiles.tsx index 1f1208f0046..2884085df24 100644 --- a/site/gdocs/components/ExplorerTiles.tsx +++ b/site/gdocs/components/ExplorerTiles.tsx @@ -19,10 +19,9 @@ function ExplorerTile({ url }: { url: string }) { height={40} width={40} src={`${BAKED_BASE_URL}/images/tag-icons/${encodeURIComponent( - linkedChart.tags[0].name + linkedChart.tags[0] )}.svg`} - className="explorer-tile__icon" - alt={`Icon for topic ${linkedChart.tags[0].name}`} + alt={`Icon for topic ${linkedChart.tags[0]}`} loading="lazy" /> ) : null diff 
--git a/site/search/ChartHit.tsx b/site/search/ChartHit.tsx index a9b97fadeae..607f489322c 100644 --- a/site/search/ChartHit.tsx +++ b/site/search/ChartHit.tsx @@ -1,12 +1,14 @@ import React, { useEffect, useMemo, useState } from "react" import cx from "classnames" import { Region } from "@ourworldindata/utils" -import { IChartHit, SearchIndexName } from "./searchTypes.js" +import { ChartRecordType, IChartHit, SearchIndexName } from "./searchTypes.js" import { getEntityQueryStr, pickEntitiesForChartHit } from "./SearchUtils.js" import { HitAttributeHighlightResult } from "instantsearch.js" import { + BAKED_BASE_URL, BAKED_GRAPHER_EXPORTS_BASE_URL, BAKED_GRAPHER_URL, + EXPLORER_DYNAMIC_THUMBNAIL_URL, GRAPHER_DYNAMIC_THUMBNAIL_URL, } from "../../settings/clientSettings.js" import { getIndexName } from "./searchClient.js" @@ -21,6 +23,7 @@ import { } from "@ourworldindata/grapher" import { Highlight } from "react-instantsearch" import { IDataCatalogHit } from "../DataCatalog/DataCatalogUtils.js" +import { EXPLORERS_ROUTE_FOLDER } from "@ourworldindata/explorer" export function ChartHit({ hit, @@ -35,6 +38,7 @@ export function ChartHit({ }) { const [imgLoaded, setImgLoaded] = useState(false) const [imgError, setImgError] = useState(false) + const isExplorerView = hit.type === ChartRecordType.ExplorerView const entities = useMemo( () => @@ -51,10 +55,36 @@ export function ChartHit({ searchQueryRegionsMatches, ] ) - const queryStr = useMemo(() => getEntityQueryStr(entities), [entities]) - const previewUrl = queryStr - ? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${hit.slug}.svg${queryStr}` - : `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${hit.slug}.svg` + const entityQueryStr = useMemo( + () => getEntityQueryStr(entities), + [entities] + ) + + const fullQueryParams = isExplorerView + ? hit.queryParams! 
+ entityQueryStr.replace("?", "&") + : entityQueryStr + + function createExplorerViewThumbnailUrl( + slug: string, + fullQueryParams: string + ): string { + return `${EXPLORER_DYNAMIC_THUMBNAIL_URL}/${slug}.svg${fullQueryParams}` + } + function createGrapherThumbnailUrl( + slug: string, + fullQueryParams?: string + ): string { + return fullQueryParams + ? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${slug}.svg${fullQueryParams}` + : `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${slug}.svg` + } + const previewUrl = isExplorerView + ? createExplorerViewThumbnailUrl(hit.slug, fullQueryParams) + : createGrapherThumbnailUrl(hit.slug, fullQueryParams) + + const chartUrl = isExplorerView + ? `${BAKED_BASE_URL}/${EXPLORERS_ROUTE_FOLDER}/${hit.slug}${fullQueryParams}` + : `${BAKED_GRAPHER_URL}/${hit.slug}${fullQueryParams}` useEffect(() => { setImgLoaded(false) @@ -63,7 +93,7 @@ export function ChartHit({ return ( & { viewTitleIndexWithinExplorer: number } +export enum ChartRecordType { + Chart = "chart", + ExplorerView = "explorerView", +} + export interface ChartRecord { + type: ChartRecordType objectID: string chartId: number slug: string + queryParams?: string title: string subtitle: string | undefined variantName: string @@ -81,6 +88,7 @@ export enum SearchIndexName { ExplorerViews = "explorer-views", Charts = "charts", Pages = "pages", + ExplorerViewsAndCharts = "explorer-views-and-charts", } export type SearchCategoryFilter = SearchIndexName | "all" @@ -96,4 +104,6 @@ export const indexNameToSubdirectoryMap: Record = { [SearchIndexName.Pages]: "", [SearchIndexName.Charts]: "/grapher", [SearchIndexName.ExplorerViews]: "/explorers", + // n/a - charts and explorers have different subdirectories, so this needs to be resolved elsewhere + [SearchIndexName.ExplorerViewsAndCharts]: "", }