Skip to content

Commit

Permalink
Add landing page block content to search index
Browse files Browse the repository at this point in the history
- Landing pages have "blocks" rather than a body, and
  by convention we want anything in a block that has the
  key `content:` to be put into the search index.
  Because blocks can be arbitrarily nested, we use the
  JSONPath $.details.blocks..content.
- However, some blocks (govspeak ones) may be structured
  to contain multiple content types (eg if they were marked
  up as content-type: text/govspeak, publishing-api will
  have automatically created a rendered text/html and will
  present both of these to the search api in the
  message_queue message). This means that this block will
  be matched by the JSONPath 3 times - once for the array
  that contains the different content items (since the
  key for the whole thing is `content:`), and once for each
  actual content item inside that array (since those keys
  are also `content:`).
- In order to prevent these keys appearing 3 times in the
  search index (once processed normally by BodyContent's
  matcher for content_type: "text/html", then once again
  for that content item's content: key and once for the
  govspeak content), every time we add a structured array
  like this we add the content items to an ignore set.
  When we get the secondary matches (which aren't arrays),
  we check them against the ignore set. If they're present,
  we delete them from the ignore set and continue without
  presenting them to the search index.
  • Loading branch information
KludgeKML committed Oct 23, 2024
1 parent 004f78f commit 3f4d8c8
Show file tree
Hide file tree
Showing 3 changed files with 322 additions and 1 deletion.
26 changes: 25 additions & 1 deletion app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,41 @@ def content
values_from_json_paths = INDEXABLE_CONTENT_VALUES_JSON_PATHS.map do |item|
item.on(document_hash).map { |body| BodyContent.new(body).html_content }
end

values_from_parts = document_hash.dig(:details, :parts)&.map do |part|
# Add the part title as a heading to help the search model better understand the structure
# of the content
["<h1>#{part[:title]}</h1>", BodyContent.new(part[:body]).html_content]
end

[*values_from_json_paths, *values_from_parts]
[*values_from_json_paths, *values_from_parts, *values_from_blocks(document_hash)]
.flatten
.compact_blank
.join(INDEXABLE_CONTENT_SEPARATOR)
.truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE)
end

# Gathers indexable content from a landing page's blocks.
#
# Every value stored under a `content` key anywhere beneath `details.blocks` is
# looked up via JSONPath and passed through BodyContent.
#
# A `content` value that is an Array (a block offering several content types)
# is matched once as a whole and then again for each entry's own `content` key.
# The whole Array is handed to BodyContent; the nested values are remembered so
# that their individual matches can be skipped instead of being indexed again.
# NOTE(review): this relies on the Array match being yielded before its nested
# matches - confirm against JsonPath's traversal order.
#
# document_hash - the document payload, read with symbol keys.
#
# Returns an Array of BodyContent#html_content results, one per match that was
# not skipped (empty when nothing matched).
def values_from_blocks(document_hash)
  content_matches = JsonPath.new("$.details.blocks..content", use_symbols: true).on(document_hash)
  return [] if content_matches.none?

  pending_skips = []
  content_matches.each_with_object([]) do |found, collected|
    if found.is_a?(Array)
      found.each do |entry|
        nested = entry[:content]
        pending_skips << nested if nested.present?
      end
      collected << BodyContent.new(found).html_content
      next
    end

    skip_at = pending_skips.index(found)
    if skip_at.nil?
      collected << BodyContent.new(found).html_content
    else
      # Consume exactly one pending skip, so the same text appearing in an
      # unrelated block later on is still indexed.
      pending_skips.delete_at(skip_at)
    end
  end
end
end
end
265 changes: 265 additions & 0 deletions spec/fixtures/files/message_queue/landing_page_message.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
{
"title": "Landing Page Fixture",
"public_updated_at": "2024-10-21T23:00:00Z",
"publishing_app": "whitehall",
"rendering_app": "frontend",
"update_type": "minor",
"phase": "live",
"analytics_identifier": null,
"document_type": "landing_page",
"schema_name": "landing_page",
"first_published_at": "2024-10-21T23:00:00Z",
"base_path": "/landing-page/search-test",
"description": "Landing page description",
"details": {
"blocks": [
{
"type": "hero",
"image": {
"alt": "Placeholder alt text",
"sources": {
"mobile": "landing_page/placeholder/mobile.png"
}
},
"hero_content": {
"blocks": [
{
"type": "govspeak",
"content": [
{
"content": "## Govspeak in Hero Example\n\nTHIS SHOULD NOT APPEAR!\n",
"content_type": "text/govspeak"
},
{
"content_type": "text/html",
"content": "\u003ch2 id=\"govspeak-in-hero-example\"\u003eGovspeak in Hero Example\u003c/h2\u003e\n\n\u003cp\u003eMore text\u003c/p\u003e\n"
}
],
"inverse": true
}
]
}
},
{
"type": "featured",
"image": {
"alt": "example alt text",
"sources": {
"mobile": "landing_page/placeholder/mobile.png"
}
},
"featured_content": {
"blocks": [
{
"type": "govspeak",
"content": "\u003ch2\u003eGovspeak in featured example\u003c/h2\u003e\n\u003cp\u003eLorem ipsum.\u003c/p\u003e\n",
"inverse": true
}
]
}
},
{
"type": "header",
"content": "Header block example"
},
{
"type": "tabs",
"tab_items": [
{
"id": "tab-1",
"label": "Item 1",
"content": "Content in tab example one"
},
{
"id": "tab-2",
"label": "Item 2",
"content": "Content in tab example two"
}
]
},
{
"type": "govspeak",
"content": "\u003cp\u003eHand-crafted Govspeak example\u003c/p\u003e"
},
{
"type": "two_column_layout",
"theme": "two_thirds_one_third",
"blocks": [
{
"type": "govspeak",
"content": "\u003cp\u003eGovspeak in two column example left\u003c/p\u003e"
}
]
}
]
},
"routes": [
{
"path": "/landing-page/search-test",
"type": "exact"
}
],
"redirects": [],
"content_id": "4423de24-06d2-454c-8fc1-2bd9c43087f0",
"locale": "en",
"expanded_links": {
"taxons": [
{
"content_id": "e48ab80a-de80-4e83-bf59-26316856a5f9",
"title": "Government",
"locale": "en",
"analytics_identifier": null,
"api_path": "/api/content/government/all",
"base_path": "/government/all",
"document_type": "taxon",
"public_updated_at": "2018-09-16T20:29:39Z",
"schema_name": "taxon",
"withdrawn": false,
"description": "",
"details": {
"internal_name": "Government",
"notes_for_editors": "",
"visible_to_departmental_editors": true
},
"phase": "live",
"links": {
"root_taxon": [
{
"content_id": "f3bbdec2-0e62-4520-a7fd-6ffd5d36e03a",
"title": "GOV.UK homepage",
"locale": "en",
"analytics_identifier": null,
"api_path": "/api/content/",
"base_path": "/",
"document_type": "homepage",
"public_updated_at": "2023-06-28T09:32:34Z",
"schema_name": "homepage",
"withdrawn": false,
"links": {}
}
]
}
}
],
"organisations": [
{
"content_id": "af07d5a5-df63-4ddc-9383-6a666845ebe9",
"title": "Government Digital Service",
"locale": "en",
"analytics_identifier": "OT1056",
"api_path": "/api/content/government/organisations/government-digital-service",
"base_path": "/government/organisations/government-digital-service",
"document_type": "organisation",
"schema_name": "organisation",
"withdrawn": false,
"details": {
"acronym": "GDS",
"logo": {
"crest": "single-identity",
"formatted_title": "Government Digital Service"
},
"brand": "department-for-science-innovation-and-technology",
"default_news_image": null,
"organisation_govuk_status": {
"url": null,
"status": "live",
"updated_at": null
}
},
"links": {}
}
],
"original_primary_publishing_organisation": [
{
"content_id": "af07d5a5-df63-4ddc-9383-6a666845ebe9",
"title": "Government Digital Service",
"locale": "en",
"analytics_identifier": "OT1056",
"api_path": "/api/content/government/organisations/government-digital-service",
"base_path": "/government/organisations/government-digital-service",
"document_type": "organisation",
"schema_name": "organisation",
"withdrawn": false,
"details": {
"acronym": "GDS",
"logo": {
"crest": "single-identity",
"formatted_title": "Government Digital Service"
},
"brand": "department-for-science-innovation-and-technology",
"default_news_image": null,
"organisation_govuk_status": {
"url": null,
"status": "live",
"updated_at": null
}
},
"links": {}
}
],
"primary_publishing_organisation": [
{
"content_id": "af07d5a5-df63-4ddc-9383-6a666845ebe9",
"title": "Government Digital Service",
"locale": "en",
"analytics_identifier": "OT1056",
"api_path": "/api/content/government/organisations/government-digital-service",
"base_path": "/government/organisations/government-digital-service",
"document_type": "organisation",
"schema_name": "organisation",
"withdrawn": false,
"details": {
"acronym": "GDS",
"logo": {
"crest": "single-identity",
"formatted_title": "Government Digital Service"
},
"brand": "department-for-science-innovation-and-technology",
"default_news_image": null,
"organisation_govuk_status": {
"url": null,
"status": "live",
"updated_at": null
}
},
"links": {}
}
],
"available_translations": [
{
"title": "Search test",
"public_updated_at": "2024-10-21T23:00:00Z",
"analytics_identifier": null,
"document_type": "landing_page",
"schema_name": "landing_page",
"base_path": "/landing-page/search-test",
"api_path": "/api/content/landing-page/search-test",
"withdrawn": false,
"content_id": "4423de24-06d2-454c-8fc1-2bd9c43087f0",
"locale": "en"
}
]
},
"user_journey_document_supertype": "thing",
"email_document_supertype": "other",
"government_document_supertype": "other",
"content_purpose_subgroup": "other",
"content_purpose_supergroup": "other",
"publishing_request_id": "21-1729602038.916-10.1.21.246-2547",
"govuk_request_id": null,
"links": {
"taxons": [
"e48ab80a-de80-4e83-bf59-26316856a5f9"
],
"organisations": [
"af07d5a5-df63-4ddc-9383-6a666845ebe9"
],
"original_primary_publishing_organisation": [
"af07d5a5-df63-4ddc-9383-6a666845ebe9"
],
"primary_publishing_organisation": [
"af07d5a5-df63-4ddc-9383-6a666845ebe9"
]
},
"payload_version": "12345"
}
32 changes: 32 additions & 0 deletions spec/integration/document_synchronization_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,38 @@
end
end

# Landing pages carry their text in nested `details.blocks` rather than a body, so this
# example pins down what is passed as `content:` to DiscoveryEngine::Sync::Put for the
# landing page fixture.
describe "for a 'landing_page' message" do
let(:payload) { json_fixture_as_hash("message_queue/landing_page_message.json") }

it "is added to Discovery Engine through the Put service" do
expect(DiscoveryEngine::Sync::Put).to have_received(:new).with(
"4423de24-06d2-454c-8fc1-2bd9c43087f0",
{
content_id: "4423de24-06d2-454c-8fc1-2bd9c43087f0",
content_purpose_supergroup: "other",
debug: {
# NOTE(review): presumably the clock is frozen by shared setup outside this hunk - confirm
last_synced_at: "1989-12-13T01:02:03+00:00",
# The fixture supplies payload_version as the string "12345"; an Integer is expected here
payload_version: 12_345,
},
description: "Landing page description",
document_type: "landing_page",
is_historic: 0,
link: "/landing-page/search-test",
locale: "en",
organisations: %w[government-digital-service],
part_of_taxonomy_tree: %w[f3bbdec2-0e62-4520-a7fd-6ffd5d36e03a e48ab80a-de80-4e83-bf59-26316856a5f9],
public_timestamp: 1_729_551_600,
public_timestamp_datetime: "2024-10-21T23:00:00Z",
title: "Landing Page Fixture",
url: "https://www.gov.uk/landing-page/search-test",
},
# Expected content is the title and description followed by each block's `content` value
# in fixture order. The hero block's content array contributes only its text/html item
# (the text/govspeak item containing "THIS SHOULD NOT APPEAR!" is absent), and image alt
# text is not included.
content: "Landing Page Fixture\nLanding page description\n<h2 id=\"govspeak-in-hero-example\">Govspeak in Hero Example</h2>\n\n<p>More text</p>\n\n<h2>Govspeak in featured example</h2>\n<p>Lorem ipsum.</p>\n\nHeader block example\nContent in tab example one\nContent in tab example two\n<p>Hand-crafted Govspeak example</p>\n<p>Govspeak in two column example left</p>",
payload_version: 12_345,
)
expect(put_service).to have_received(:call)
end
end

describe "for an 'external_content' message" do
let(:payload) { json_fixture_as_hash("message_queue/external_content_message.json") }

Expand Down

0 comments on commit 3f4d8c8

Please sign in to comment.