Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎉 Explorers in the Data Catalog #4100

Merged
merged 14 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,10 @@ update.chart-entities: itsJustJavascript
reindex: itsJustJavascript
@echo '==> Reindexing search in Algolia'
node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexPagesToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js

delete-algolia-index: itsJustJavascript
@echo '==> Deleting Algolia index'
Expand Down
2 changes: 1 addition & 1 deletion adminSiteServer/apiRouter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ import { denormalizeLatestCountryData } from "../baker/countryProfiles.js"
import {
indexIndividualGdocPost,
removeIndividualGdocPostFromIndex,
} from "../baker/algolia/algoliaUtils.js"
} from "../baker/algolia/utils/pages.js"
import { References } from "../adminSiteClient/ChartEditor.js"
import { DeployQueueServer } from "../baker/DeployQueueServer.js"
import { FunctionalRouter } from "./FunctionalRouter.js"
Expand Down
35 changes: 35 additions & 0 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,41 @@ export const configureAlgolia = async () => {
],
})

// Combined index of explorer views and charts, used by the data catalog search.
const explorerViewsAndChartsIndex = client.initIndex(
getIndexName(SearchIndexName.ExplorerViewsAndCharts)
)

await explorerViewsAndChartsIndex.setSettings({
...baseSettings,
// "unordered()" = a match counts equally wherever it occurs within the attribute.
searchableAttributes: [
"unordered(title)",
"unordered(slug)",
"unordered(variantName)",
"unordered(subtitle)",
"unordered(tags)",
"unordered(availableEntities)",
],
// Typo/word/exact criteria first; "custom" applies the customRanking below.
ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"],
customRanking: [
"desc(score)",
// For multiple explorer views with the same title, we want to avoid surfacing duplicates.
// So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc.
"asc(viewTitleIndexWithinExplorer)",
"asc(titleLength)",
],
attributesToSnippet: ["subtitle:24"],
// De-duplicates results sharing the same `id` when `distinct` is enabled at query time.
attributeForDistinct: "id",
optionalWords: ["vs"],

// These lines below essentially demote matches in the `subtitle` and `availableEntities` fields:
// If we find a match (only) there, then it doesn't count towards `exact`, and is therefore ranked lower.
// We also disable prefix matching and typo tolerance on these.
disableExactOnAttributes: ["tags", "subtitle", "availableEntities"],
disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"],
disablePrefixOnAttributes: ["subtitle"],
attributesForFaceting: ["tags", "availableEntities"],
})

const synonyms = [
["owid", "our world in data"],
["kids", "children"],
Expand Down
227 changes: 14 additions & 213 deletions baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -1,222 +1,23 @@
import * as db from "../../db/db.js"
import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js"
import { ChartRecord, SearchIndexName } from "../../site/search/searchTypes.js"
import {
KeyChartLevel,
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
uniq,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
import { getRelatedArticles } from "../../db/model/Post.js"
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { getPublishedLinksTo } from "../../db/model/Link.js"

// Ranking score for a chart record: each related article counts as much as
// 500 weekly pageviews, added to the chart's actual 7-day view count.
const computeScore = (record: Omit<ChartRecord, "score">): number =>
    record.numRelatedArticles * 500 + record.views_7d

// Names of countries that have variant spellings (e.g. "USA") or a short name,
// so they can be floated to the top of the entities list (see below).
const countriesWithVariantNames = new Set(
    countries
        .filter((c) => c.variantNames?.length || c.shortName)
        .map((c) => c.name)
)

// Sorts a chart's available entities into the order we index them.
//
// Algolia is a bit weird with synonyms:
// If we have a synonym "USA" -> "United States", and we search for "USA",
// then it seems that Algolia can only find that within `availableEntities`
// if "USA" is within the first 100-or-so entries of the array.
// So, the easy solution is to sort the entities to ensure that countries
// with variant names are at the top.
// Also, entities containing a hyphen like "low-income countries" can also
// only be found if they're within the first 100-or-so entries.
// - @marcelgerber, 2024-03-25
const processAvailableEntities = (availableEntities: string[] | null) => {
    if (!availableEntities) return []

    const hasVariantName = (entityName: string): boolean =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const containsHyphen = (entityName: string): boolean =>
        entityName.includes("-")

    // Variant-named countries first, then hyphenated entities, then alphabetical.
    return orderBy(
        availableEntities,
        [hasVariantName, containsHyphen, (entityName: string) => entityName],
        ["desc", "desc", "asc"]
    )
}

/**
 * A chart row as returned by the indexing SQL query: the aggregated columns
 * (`entityNames`, `tags`, `keyChartForTags`) are still JSON-serialized strings.
 */
interface RawChartRecordRow {
    id: number
    slug: string
    title: string
    variantName: string
    subtitle: string
    numDimensions: string
    publishedAt: string
    updatedAt: string
    entityNames: string
    tags: string
    keyChartForTags: string
}

/**
 * The same row after the JSON columns have been parsed into arrays.
 * Derived from RawChartRecordRow so the shared fields stay in sync.
 */
type ParsedChartRecordRow = Omit<
    RawChartRecordRow,
    "entityNames" | "tags" | "keyChartForTags"
> & {
    entityNames: string[]
    tags: string[]
    keyChartForTags: string[]
}

// Parses the JSON-string columns of a raw SQL row into arrays and sorts the
// entity names for indexing.
const parseAndProcessChartRecords = (
    rawRecord: RawChartRecordRow
): ParsedChartRecordRow => {
    // This is a very rough way to check for the Algolia record size limit, but
    // it's better than the update failing because we exceed the 20KB record
    // size limit.
    let rawEntities: (string | null)[] = []
    if (rawRecord.entityNames !== null) {
        if (rawRecord.entityNames.length < 12000) {
            rawEntities = JSON.parse(rawRecord.entityNames as string) as (
                | string
                | null
            )[]
        } else {
            console.info(
                `Chart ${rawRecord.id} has too many entities, skipping its entities`
            )
        }
    }

    return {
        ...rawRecord,
        entityNames: processAvailableEntities(
            excludeNullish(rawEntities) as string[]
        ),
        tags: JSON.parse(rawRecord.tags),
        keyChartForTags: JSON.parse(rawRecord.keyChartForTags as string).filter(
            (t: string | null) => t
        ),
    }
}

/**
 * Builds the Algolia records for all published, indexable charts.
 *
 * One SQL pass collects the chart config, its entities and its tags; each row
 * is then enriched with pageviews, related articles and parent tags before a
 * ranking score is computed.
 */
const getChartsRecords = async (
knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
// Two aggregation levels: the CTE groups entity names per chart, the outer
// query groups tags — aggregating both in one GROUP BY would duplicate rows.
// Charts without any tag are excluded by the HAVING clause.
const chartsToIndex = await db.knexRaw<RawChartRecordRow>(
knex,
`-- sql
WITH indexable_charts_with_entity_names AS (
SELECT c.id,
cc.slug,
cc.full ->> "$.title" AS title,
cc.full ->> "$.variantName" AS variantName,
cc.full ->> "$.subtitle" AS subtitle,
JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions,
c.publishedAt,
c.updatedAt,
JSON_ARRAYAGG(e.name) AS entityNames
FROM charts c
LEFT JOIN chart_configs cc ON c.configId = cc.id
LEFT JOIN charts_x_entities ce ON c.id = ce.chartId
LEFT JOIN entities e ON ce.entityId = e.id
WHERE cc.full ->> "$.isPublished" = 'true'
AND c.isIndexable IS TRUE
GROUP BY c.id
)
SELECT c.id,
c.slug,
c.title,
c.variantName,
c.subtitle,
c.numDimensions,
c.publishedAt,
c.updatedAt,
c.entityNames, -- this array may contain null values, will have to filter these out
JSON_ARRAYAGG(t.name) AS tags,
JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out
FROM indexable_charts_with_entity_names c
LEFT JOIN chart_tags ct ON c.id = ct.chartId
LEFT JOIN tags t on ct.tagId = t.id
GROUP BY c.id
HAVING COUNT(t.id) >= 1
`
)

// Turn the JSON-string columns into arrays and sort the entity names.
const parsedRows = chartsToIndex.map(parseAndProcessChartRecords)

const pageviews = await getAnalyticsPageviewsByUrlObj(knex)

const parentTagsByChildName = await db.getParentTagsByChildName(knex)

const records: ChartRecord[] = []
// NOTE(review): the two awaits inside this loop run once per chart (N+1
// queries). Acceptable for a batch job, but batching would speed indexing up.
for (const c of parsedRows) {
// Our search currently cannot render explorers, so don't index them because
// otherwise they will fail when rendered in the search results
if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue

const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? []
const linksFromGdocs = await getPublishedLinksTo(
knex,
[c.slug],
OwidGdocLinkType.Grapher
)

// Strip markdown from the subtitle so Algolia snippets plain text.
const plaintextSubtitle = isNil(c.subtitle)
? undefined
: new MarkdownTextWrap({
text: c.subtitle,
fontSize: 10, // doesn't matter, but is a mandatory field
}).plaintext

const parentTags = c.tags.flatMap(
// a chart can be tagged with a tag that isn't in the tag graph
(tag) => parentTagsByChildName[tag] || []
)

const record = {
objectID: c.id.toString(),
chartId: c.id,
slug: c.slug,
title: c.title,
variantName: c.variantName,
subtitle: plaintextSubtitle,
availableEntities: c.entityNames,
numDimensions: parseInt(c.numDimensions),
publishedAt: c.publishedAt,
updatedAt: c.updatedAt,
tags: uniq([...c.tags, ...parentTags]),
keyChartForTags: c.keyChartForTags as string[],
titleLength: c.title.length,
// Number of references to this chart in all our posts and pages
numRelatedArticles: relatedArticles.length + linksFromGdocs.length,
views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 0,
}
// Score is derived from the record itself, so compute it last.
const score = computeScore(record)
records.push({ ...record, score })
}

return records
}
import { getChartsRecords } from "./utils/charts.js"
import Bugsnag from "@bugsnag/js"

const indexChartsToAlgolia = async () => {
if (!ALGOLIA_INDEXING) return
if (BUGSNAG_NODE_API_KEY) {
Bugsnag.start({
apiKey: BUGSNAG_NODE_API_KEY,
context: "index-explorer-views-to-algolia",
autoTrackSessions: false,
})
}

const client = getAlgoliaClient()
if (!client) {
Expand Down
81 changes: 81 additions & 0 deletions baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import Bugsnag from "@bugsnag/js"
import * as db from "../../db/db.js"
import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js"
import {
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import {
getExplorerViewRecords,
adaptExplorerViews,
} from "./utils/explorerViews.js"
import { scaleRecordScores } from "./utils/shared.js"
import { getChartsRecords } from "./utils/charts.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"

// We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over.
// If we standardize the record shape, we could have this be the only index and have a `type` field
// to use in /search.
/**
 * Fetches explorer-view and chart records in one read-only transaction and
 * replaces the combined Algolia index with them.
 */
const indexExplorerViewsAndChartsToAlgolia = async () => {
    if (!ALGOLIA_INDEXING) return
    if (BUGSNAG_NODE_API_KEY) {
        Bugsnag.start({
            apiKey: BUGSNAG_NODE_API_KEY,
            // Was "index-explorer-views-to-algolia" — a stale copy-paste from the
            // explorer-views-only indexer; report under this script's own name.
            context: "index-explorer-views-and-charts-to-algolia",
            autoTrackSessions: false,
        })
    }
    const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts)
    console.log(
        `Indexing explorer views and charts to the "${indexName}" index on Algolia`
    )
    const client = getAlgoliaClient()
    if (!client) {
        await logErrorAndMaybeSendToBugsnag(
            `Failed indexing explorer views and charts (Algolia client not initialized)`
        )
        return
    }

    try {
        // Single transaction so both record sets see a consistent DB snapshot.
        const { explorerViews, grapherViews } =
            await db.knexReadonlyTransaction(async (trx) => {
                return {
                    explorerViews: await getExplorerViewRecords(trx, true),
                    grapherViews: await getChartsRecords(trx),
                }
            }, db.TransactionCloseMode.Close)

        // Scale grapher records and the default explorer views between 1000 and 10000,
        // Scale the remaining explorer views between 0 and 1000.
        // This is because Graphers are generally higher quality than Explorers and we don't want
        // the data catalog to smother Grapher results with hundreds of low-quality Explorer results.
        const scaledGrapherViews = scaleRecordScores(
            grapherViews,
            [1000, 10000]
        )
        // NOTE(review): presumably adaptExplorerViews applies the explorer-side
        // scaling described above — confirm in ./utils/explorerViews.ts.
        const scaledExplorerViews = adaptExplorerViews(explorerViews)

        const records = [...scaledGrapherViews, ...scaledExplorerViews]

        const index = client.initIndex(indexName)
        console.log(`Indexing ${records.length} records`)
        // replaceAllObjects swaps in a fresh index atomically, so stale records disappear.
        await index.replaceAllObjects(records)
        console.log(`Indexing complete`)
    } catch (error) {
        console.log("Error: ", error)
        await logErrorAndMaybeSendToBugsnag({
            // Renamed from "IndexExplorerViewsToAlgoliaError" to match this script.
            name: `IndexExplorerViewsAndChartsToAlgoliaError`,
            message: error,
        })
    }
}

// Fail the process loudly if any promise rejection goes unhandled.
function exitOnUnhandledRejection(reason: unknown): void {
    console.error(reason)
    process.exit(1)
}
process.on("unhandledRejection", exitOnUnhandledRejection)

// Fire-and-forget entry point; `void` marks the promise as intentionally unawaited.
void indexExplorerViewsAndChartsToAlgolia()
Loading