Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎉 Explorers in the Data Catalog #4100

Merged
merged 14 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,10 @@ update.chart-entities: itsJustJavascript
reindex: itsJustJavascript
@echo '==> Reindexing search in Algolia'
node --enable-source-maps itsJustJavascript/baker/algolia/configureAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexPagesToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js

delete-algolia-index: itsJustJavascript
@echo '==> Deleting Algolia index'
Expand Down
2 changes: 1 addition & 1 deletion adminSiteServer/apiRouter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ import { denormalizeLatestCountryData } from "../baker/countryProfiles.js"
import {
indexIndividualGdocPost,
removeIndividualGdocPostFromIndex,
} from "../baker/algolia/algoliaUtils.js"
} from "../baker/algolia/utils/pages.js"
import { References } from "../adminSiteClient/ChartEditor.js"
import { DeployQueueServer } from "../baker/DeployQueueServer.js"
import { FunctionalRouter } from "./FunctionalRouter.js"
Expand Down
35 changes: 35 additions & 0 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,41 @@ export const configureAlgolia = async () => {
],
})

// Combined index of explorer views and charts, used by the data catalog search.
const explorerViewsAndChartsIndex = client.initIndex(
getIndexName(SearchIndexName.ExplorerViewsAndCharts)
)

await explorerViewsAndChartsIndex.setSettings({
...baseSettings,
// "unordered()" = a match counts equally wherever it occurs within the attribute.
searchableAttributes: [
"unordered(title)",
"unordered(slug)",
"unordered(variantName)",
"unordered(subtitle)",
"unordered(tags)",
"unordered(availableEntities)",
],
// Typo/word/exact criteria first; "custom" applies the customRanking below.
ranking: ["typo", "words", "exact", "attribute", "custom", "proximity"],
customRanking: [
"desc(score)",
// For multiple explorer views with the same title, we want to avoid surfacing duplicates.
// So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc.
"asc(viewTitleIndexWithinExplorer)",
"asc(titleLength)",
],
attributesToSnippet: ["subtitle:24"],
// De-duplicates results sharing the same `id` when `distinct` is enabled at query time.
attributeForDistinct: "id",
optionalWords: ["vs"],

// These lines below essentially demote matches in the `subtitle` and `availableEntities` fields:
// If we find a match (only) there, then it doesn't count towards `exact`, and is therefore ranked lower.
// We also disable prefix matching and typo tolerance on these.
disableExactOnAttributes: ["tags", "subtitle", "availableEntities"],
disableTypoToleranceOnAttributes: ["subtitle", "availableEntities"],
disablePrefixOnAttributes: ["subtitle"],
attributesForFaceting: ["tags", "availableEntities"],
})

const synonyms = [
["owid", "our world in data"],
["kids", "children"],
Expand Down
227 changes: 14 additions & 213 deletions baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -1,222 +1,23 @@
import * as db from "../../db/db.js"
import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { isPathRedirectedToExplorer } from "../../explorerAdminServer/ExplorerRedirects.js"
import { ChartRecord, SearchIndexName } from "../../site/search/searchTypes.js"
import {
KeyChartLevel,
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
uniq,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
import { getRelatedArticles } from "../../db/model/Post.js"
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { getPublishedLinksTo } from "../../db/model/Link.js"

// Ranking score for a chart record: each related article counts as much as
// 500 weekly pageviews, added to the chart's actual 7-day view count.
const computeScore = (record: Omit<ChartRecord, "score">): number =>
    record.numRelatedArticles * 500 + record.views_7d

// Names of countries that have variant spellings (e.g. "USA") or a short name,
// so they can be floated to the top of the entities list (see below).
const countriesWithVariantNames = new Set(
    countries
        .filter((c) => c.variantNames?.length || c.shortName)
        .map((c) => c.name)
)

// Sorts a chart's available entities into the order we index them.
//
// Algolia is a bit weird with synonyms:
// If we have a synonym "USA" -> "United States", and we search for "USA",
// then it seems that Algolia can only find that within `availableEntities`
// if "USA" is within the first 100-or-so entries of the array.
// So, the easy solution is to sort the entities to ensure that countries
// with variant names are at the top.
// Also, entities containing a hyphen like "low-income countries" can also
// only be found if they're within the first 100-or-so entries.
// - @marcelgerber, 2024-03-25
const processAvailableEntities = (availableEntities: string[] | null) => {
    if (!availableEntities) return []

    const hasVariantName = (entityName: string): boolean =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const containsHyphen = (entityName: string): boolean =>
        entityName.includes("-")

    // Variant-named countries first, then hyphenated entities, then alphabetical.
    return orderBy(
        availableEntities,
        [hasVariantName, containsHyphen, (entityName: string) => entityName],
        ["desc", "desc", "asc"]
    )
}

/**
 * A chart row as returned by the indexing SQL query: the aggregated columns
 * (`entityNames`, `tags`, `keyChartForTags`) are still JSON-serialized strings.
 */
interface RawChartRecordRow {
    id: number
    slug: string
    title: string
    variantName: string
    subtitle: string
    numDimensions: string
    publishedAt: string
    updatedAt: string
    entityNames: string
    tags: string
    keyChartForTags: string
}

/**
 * The same row after the JSON columns have been parsed into arrays.
 * Derived from RawChartRecordRow so the shared fields stay in sync.
 */
type ParsedChartRecordRow = Omit<
    RawChartRecordRow,
    "entityNames" | "tags" | "keyChartForTags"
> & {
    entityNames: string[]
    tags: string[]
    keyChartForTags: string[]
}

// Parses the JSON-string columns of a raw SQL row into arrays and sorts the
// entity names for indexing.
const parseAndProcessChartRecords = (
    rawRecord: RawChartRecordRow
): ParsedChartRecordRow => {
    // This is a very rough way to check for the Algolia record size limit, but
    // it's better than the update failing because we exceed the 20KB record
    // size limit.
    let rawEntities: (string | null)[] = []
    if (rawRecord.entityNames !== null) {
        if (rawRecord.entityNames.length < 12000) {
            rawEntities = JSON.parse(rawRecord.entityNames as string) as (
                | string
                | null
            )[]
        } else {
            console.info(
                `Chart ${rawRecord.id} has too many entities, skipping its entities`
            )
        }
    }

    return {
        ...rawRecord,
        entityNames: processAvailableEntities(
            excludeNullish(rawEntities) as string[]
        ),
        tags: JSON.parse(rawRecord.tags),
        keyChartForTags: JSON.parse(rawRecord.keyChartForTags as string).filter(
            (t: string | null) => t
        ),
    }
}

/**
 * Builds the Algolia records for all published, indexable charts.
 *
 * One SQL pass collects the chart config, its entities and its tags; each row
 * is then enriched with pageviews, related articles and parent tags before a
 * ranking score is computed.
 */
const getChartsRecords = async (
knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
// Two aggregation levels: the CTE groups entity names per chart, the outer
// query groups tags — aggregating both in one GROUP BY would duplicate rows.
// Charts without any tag are excluded by the HAVING clause.
const chartsToIndex = await db.knexRaw<RawChartRecordRow>(
knex,
`-- sql
WITH indexable_charts_with_entity_names AS (
SELECT c.id,
cc.slug,
cc.full ->> "$.title" AS title,
cc.full ->> "$.variantName" AS variantName,
cc.full ->> "$.subtitle" AS subtitle,
JSON_LENGTH(cc.full ->> "$.dimensions") AS numDimensions,
c.publishedAt,
c.updatedAt,
JSON_ARRAYAGG(e.name) AS entityNames
FROM charts c
LEFT JOIN chart_configs cc ON c.configId = cc.id
LEFT JOIN charts_x_entities ce ON c.id = ce.chartId
LEFT JOIN entities e ON ce.entityId = e.id
WHERE cc.full ->> "$.isPublished" = 'true'
AND c.isIndexable IS TRUE
GROUP BY c.id
)
SELECT c.id,
c.slug,
c.title,
c.variantName,
c.subtitle,
c.numDimensions,
c.publishedAt,
c.updatedAt,
c.entityNames, -- this array may contain null values, will have to filter these out
JSON_ARRAYAGG(t.name) AS tags,
JSON_ARRAYAGG(IF(ct.keyChartLevel = ${KeyChartLevel.Top}, t.name, NULL)) AS keyChartForTags -- this results in an array that contains null entries, will have to filter them out
FROM indexable_charts_with_entity_names c
LEFT JOIN chart_tags ct ON c.id = ct.chartId
LEFT JOIN tags t on ct.tagId = t.id
GROUP BY c.id
HAVING COUNT(t.id) >= 1
`
)

// Turn the JSON-string columns into arrays and sort the entity names.
const parsedRows = chartsToIndex.map(parseAndProcessChartRecords)

const pageviews = await getAnalyticsPageviewsByUrlObj(knex)

const parentTagsByChildName = await db.getParentTagsByChildName(knex)

const records: ChartRecord[] = []
// NOTE(review): the two awaits inside this loop run once per chart (N+1
// queries). Acceptable for a batch job, but batching would speed indexing up.
for (const c of parsedRows) {
// Our search currently cannot render explorers, so don't index them because
// otherwise they will fail when rendered in the search results
if (isPathRedirectedToExplorer(`/grapher/${c.slug}`)) continue

const relatedArticles = (await getRelatedArticles(knex, c.id)) ?? []
const linksFromGdocs = await getPublishedLinksTo(
knex,
[c.slug],
OwidGdocLinkType.Grapher
)

// Strip markdown from the subtitle so Algolia snippets plain text.
const plaintextSubtitle = isNil(c.subtitle)
? undefined
: new MarkdownTextWrap({
text: c.subtitle,
fontSize: 10, // doesn't matter, but is a mandatory field
}).plaintext

const parentTags = c.tags.flatMap(
// a chart can be tagged with a tag that isn't in the tag graph
(tag) => parentTagsByChildName[tag] || []
)

const record = {
objectID: c.id.toString(),
chartId: c.id,
slug: c.slug,
title: c.title,
variantName: c.variantName,
subtitle: plaintextSubtitle,
availableEntities: c.entityNames,
numDimensions: parseInt(c.numDimensions),
publishedAt: c.publishedAt,
updatedAt: c.updatedAt,
tags: uniq([...c.tags, ...parentTags]),
keyChartForTags: c.keyChartForTags as string[],
titleLength: c.title.length,
// Number of references to this chart in all our posts and pages
numRelatedArticles: relatedArticles.length + linksFromGdocs.length,
views_7d: pageviews[`/grapher/${c.slug}`]?.views_7d ?? 0,
}
// Score is derived from the record itself, so compute it last.
const score = computeScore(record)
records.push({ ...record, score })
}

return records
}
import { getChartsRecords } from "./utils/charts.js"
import Bugsnag from "@bugsnag/js"

const indexChartsToAlgolia = async () => {
if (!ALGOLIA_INDEXING) return
if (BUGSNAG_NODE_API_KEY) {
Bugsnag.start({
apiKey: BUGSNAG_NODE_API_KEY,
context: "index-explorer-views-to-algolia",
autoTrackSessions: false,
})
}

const client = getAlgoliaClient()
if (!client) {
Expand Down
81 changes: 81 additions & 0 deletions baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import Bugsnag from "@bugsnag/js"
import * as db from "../../db/db.js"
import { logErrorAndMaybeSendToBugsnag } from "../../serverUtils/errorLog.js"
import {
ALGOLIA_INDEXING,
BUGSNAG_NODE_API_KEY,
} from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import {
getExplorerViewRecords,
adaptExplorerViews,
} from "./utils/explorerViews.js"
import { scaleRecordScores } from "./utils/shared.js"
import { getChartsRecords } from "./utils/charts.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"

// We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over.
// If we standardize the record shape, we could have this be the only index and have a `type` field
// to use in /search.
/**
 * Fetches explorer-view and chart records in one read-only transaction and
 * replaces the combined Algolia index with them.
 */
const indexExplorerViewsAndChartsToAlgolia = async () => {
    if (!ALGOLIA_INDEXING) return
    if (BUGSNAG_NODE_API_KEY) {
        Bugsnag.start({
            apiKey: BUGSNAG_NODE_API_KEY,
            // Was "index-explorer-views-to-algolia" — a stale copy-paste from the
            // explorer-views-only indexer; report under this script's own name.
            context: "index-explorer-views-and-charts-to-algolia",
            autoTrackSessions: false,
        })
    }
    const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts)
    console.log(
        `Indexing explorer views and charts to the "${indexName}" index on Algolia`
    )
    const client = getAlgoliaClient()
    if (!client) {
        await logErrorAndMaybeSendToBugsnag(
            `Failed indexing explorer views and charts (Algolia client not initialized)`
        )
        return
    }

    try {
        // Single transaction so both record sets see a consistent DB snapshot.
        const { explorerViews, grapherViews } =
            await db.knexReadonlyTransaction(async (trx) => {
                return {
                    explorerViews: await getExplorerViewRecords(trx, true),
                    grapherViews: await getChartsRecords(trx),
                }
            }, db.TransactionCloseMode.Close)

        // Scale grapher records and the default explorer views between 1000 and 10000,
        // Scale the remaining explorer views between 0 and 1000.
        // This is because Graphers are generally higher quality than Explorers and we don't want
        // the data catalog to smother Grapher results with hundreds of low-quality Explorer results.
        const scaledGrapherViews = scaleRecordScores(
            grapherViews,
            [1000, 10000]
        )
        // NOTE(review): presumably adaptExplorerViews applies the explorer-side
        // scaling described above — confirm in ./utils/explorerViews.ts.
        const scaledExplorerViews = adaptExplorerViews(explorerViews)

        const records = [...scaledGrapherViews, ...scaledExplorerViews]

        const index = client.initIndex(indexName)
        console.log(`Indexing ${records.length} records`)
        // replaceAllObjects swaps in a fresh index atomically, so stale records disappear.
        await index.replaceAllObjects(records)
        console.log(`Indexing complete`)
    } catch (error) {
        console.log("Error: ", error)
        await logErrorAndMaybeSendToBugsnag({
            // Renamed from "IndexExplorerViewsToAlgoliaError" to match this script.
            name: `IndexExplorerViewsAndChartsToAlgoliaError`,
            message: error,
        })
    }
}

// Fail the process loudly if any promise rejection goes unhandled.
function exitOnUnhandledRejection(reason: unknown): void {
    console.error(reason)
    process.exit(1)
}
process.on("unhandledRejection", exitOnUnhandledRejection)

// Fire-and-forget entry point; `void` marks the promise as intentionally unawaited.
void indexExplorerViewsAndChartsToAlgolia()
Loading