diff --git a/app/models/concerns/publishing_api/content.rb b/app/models/concerns/publishing_api/content.rb index ce2d6d1..2bc6210 100644 --- a/app/models/concerns/publishing_api/content.rb +++ b/app/models/concerns/publishing_api/content.rb @@ -3,15 +3,45 @@ module Content # All the possible keys in the message hash that can contain the primary unstructured document # content that we want to index, represented as JsonPath path strings. INDEXABLE_CONTENT_VALUES_JSON_PATHS = %w[ + $.details.acronym + $.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number'] + $.details.body + $.details.contact_groups[*].title $.details.description + $.details.hidden_search_terms $.details.introduction $.details.introductory_paragraph - $.details.contact_groups[*].title - $.details.title - $.details.summary - $.details.body - $.details.need_to_know + $.details.licence_short_description + $.details.metadata.aircraft_type + $.details.metadata.authors + $.details.metadata.business_sizes + $.details.metadata.business_stages + $.details.metadata.hidden_indexable_content + $.details.metadata.industries + $.details.metadata.keyword + $.details.metadata.licence_transaction_industry + $.details.metadata.project_code + $.details.metadata.reference_number + $.details.metadata.regions + $.details.metadata.registration + $.details.metadata.research_document_type + $.details.metadata.result + $.details.metadata.stage + $.details.metadata.theme + $.details.metadata.tribunal_decision_categories_name + $.details.metadata.tribunal_decision_category_name + $.details.metadata.tribunal_decision_country_name + $.details.metadata.tribunal_decision_judges_name + $.details.metadata.tribunal_decision_landmark_name + $.details.metadata.tribunal_decision_sub_categories_name + $.details.metadata.tribunal_decision_sub_category_name + $.details.metadata.types_of_support + $.details.metadata.virus_strain + $.details.metadata.year_adopted $.details.more_information + $.details.need_to_know + $.details.summary + $.details.title ].map { JsonPath.new(_1, use_symbols: true) }.freeze INDEXABLE_CONTENT_SEPARATOR = "\n".freeze @@ -34,7 +64,7 @@ def content [*values_from_json_paths, *values_from_parts] .flatten - .compact + .compact_blank .join(INDEXABLE_CONTENT_SEPARATOR) .truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE) end diff --git a/app/models/concerns/publishing_api/metadata.rb b/app/models/concerns/publishing_api/metadata.rb index a2b547a..a877d37 100644 --- a/app/models/concerns/publishing_api/metadata.rb +++ b/app/models/concerns/publishing_api/metadata.rb @@ -1,49 +1,11 @@ module PublishingApi module Metadata - # All the possible keys in the message hash that can contain additional keywords or other text - # that should be searchable but doesn't form part of the primary document content, represented - # as JsonPath path strings. - ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS = %w[ - $.details.acronym - $.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number'] - $.details.hidden_search_terms - $.details.licence_short_description - $.details.metadata.aircraft_type - $.details.metadata.authors - $.details.metadata.business_sizes - $.details.metadata.business_stages - $.details.metadata.hidden_indexable_content - $.details.metadata.industries - $.details.metadata.keyword - $.details.metadata.licence_transaction_industry - $.details.metadata.project_code - $.details.metadata.reference_number - $.details.metadata.regions - $.details.metadata.registration - $.details.metadata.research_document_type - $.details.metadata.result - $.details.metadata.stage - $.details.metadata.theme - $.details.metadata.tribunal_decision_categories_name - $.details.metadata.tribunal_decision_category_name - $.details.metadata.tribunal_decision_country_name - $.details.metadata.tribunal_decision_judges_name - $.details.metadata.tribunal_decision_landmark_name - $.details.metadata.tribunal_decision_sub_categories_name - $.details.metadata.tribunal_decision_sub_category_name - $.details.metadata.types_of_support - $.details.metadata.virus_strain - $.details.metadata.year_adopted - ].map { JsonPath.new(_1, use_symbols: true) }.freeze - ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR = "\n".freeze - # Extracts a hash of structured metadata about this document. def metadata { content_id: document_hash[:content_id], title: document_hash[:title], description: document_hash[:description], - additional_searchable_text:, link:, url:, public_timestamp:, @@ -75,14 +37,6 @@ def url Plek.website_root + link end - def additional_searchable_text - values = ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS.map { _1.on(document_hash) } - values - .flatten - .compact_blank - .join(ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR) - end - def public_timestamp return nil unless document_hash[:public_updated_at] diff --git a/app/models/publishing_api/body_content.rb b/app/models/publishing_api/body_content.rb index 408df0c..a6acc83 100644 --- a/app/models/publishing_api/body_content.rb +++ b/app/models/publishing_api/body_content.rb @@ -2,10 +2,14 @@ module PublishingApi class BodyContent def initialize(raw_content) @content = case raw_content - when String + in String raw_content - when Array - raw_content.find { _1[:content_type] == "text/html" }&.dig(:content) + in [*, { content_type: "text/html", content: html_content }, *] + html_content + in Array + raw_content.join(" ") if raw_content.all? { _1.is_a?(String) } + else + nil end end diff --git a/spec/integration/document_synchronization_spec.rb b/spec/integration/document_synchronization_spec.rb index c12dd05..dcc8dda 100644 --- a/spec/integration/document_synchronization_spec.rb +++ b/spec/integration/document_synchronization_spec.rb @@ -159,7 +159,6 @@ content_id: "6ba90ae6-972d-4d48-ad66-693bbb31496d", title: "Legal Aid Agency", description: "We provide civil and criminal legal aid and advice in England and Wales to help people deal with their legal problems. LAA is an executive agency, sponsored by the Ministry of Justice .", - additional_searchable_text: "LAA", link: "/government/organisations/legal-aid-agency", url: "http://www.dev.gov.uk/government/organisations/legal-aid-agency", public_timestamp: 1_695_391_634, @@ -169,7 +168,7 @@ content_purpose_supergroup: "other", locale: "en", }, - content: a_string_starting_with("
We provide civil"), + content: a_string_starting_with("LAA\n
We provide civil"), payload_version: 12_345, ) end @@ -185,11 +184,6 @@ content_id: "5d315ee8-7631-11e4-a3cb-005056011aef", title: "Directgov 2010 and beyond: revolution not evolution, a report by Martha Lane Fox", description: "A report from the Digital Champion Martha Lane Fox with recommendations for the future of Directgov.", - additional_searchable_text: <<~TEXT.chomp, - Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude - Francis Maude's reply to Martha Lane Fox's letter - Directgov Strategic Review - Executive Summary - TEXT link: "/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox", url: "http://www.dev.gov.uk/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox", public_timestamp: 1_290_470_400, @@ -200,7 +194,12 @@ part_of_taxonomy_tree: %w[f3caf326-fe33-410f-b7f4-553f4011c81e], locale: "en", }, - content: a_string_starting_with("
A report from the Digital"), + content: a_string_starting_with(<<~TEXT.chomp), + Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude + Francis Maude's reply to Martha Lane Fox's letter + Directgov Strategic Review - Executive Summary +
A report from the Digital + TEXT payload_version: 54_321, ) end @@ -240,7 +239,6 @@ content_id: "526d5caf-221b-4c7b-9e74-b3e0b189fc8d", title: "Brighton & Hove City Council", description: "Website of Brighton & Hove City Council", - additional_searchable_text: "Brighton & Hove City Council", link: "https://www.brighton-hove.gov.uk", url: "https://www.brighton-hove.gov.uk", public_timestamp: 1_695_912_979, @@ -249,7 +247,7 @@ content_purpose_supergroup: "other", locale: "en", }, - content: "", + content: "Brighton & Hove City Council", payload_version: 17, ) end diff --git a/spec/models/concerns/publishing_api/content_spec.rb b/spec/models/concerns/publishing_api/content_spec.rb index 81faeaa..cc681a4 100644 --- a/spec/models/concerns/publishing_api/content_spec.rb +++ b/spec/models/concerns/publishing_api/content_spec.rb @@ -8,14 +8,14 @@ let(:document_hash) do { details: { - description: "a", - introduction: "b", - introductory_paragraph: "c", - title: "d", - summary: "e", - body: "f", - need_to_know: "g", - more_information: "h", + body: "a", + description: "b", + introduction: "c", + introductory_paragraph: "d", + more_information: "e", + need_to_know: "f", + summary: "g", + title: "h", }, } end @@ -76,13 +76,13 @@ let(:document_hash) do { details: { - body: "a" * 1.1.megabytes, + body: "a" * 600.kilobytes, }, } end it "truncates the content" do - expect(extracted_content.bytesize).to be <= 1.megabyte + expect(extracted_content.bytesize).to be <= 500.kilobytes end end diff --git a/spec/models/concerns/publishing_api/metadata_spec.rb b/spec/models/concerns/publishing_api/metadata_spec.rb index 183cd3a..f64cfb7 100644 --- a/spec/models/concerns/publishing_api/metadata_spec.rb +++ b/spec/models/concerns/publishing_api/metadata_spec.rb @@ -28,24 +28,6 @@ it { is_expected.to eq("Lorem ipsum dolor sit amet.") } end - describe "additional_searchable_text" do - subject(:additional_searchable_text) { extracted_metadata[:additional_searchable_text] } - - let(:document_hash) do - { - details: { - acronym: "BA", - metadata: { - registration: "G-CIVY", - aircraft_type: "Boeing 747-436", - }, - }, - } - end - - it { is_expected.to eq("BA\nBoeing 747-436\nG-CIVY") } - end - describe "link" do subject(:extracted_link) { extracted_metadata[:link] } diff --git a/spec/models/publishing_api/body_content_spec.rb b/spec/models/publishing_api/body_content_spec.rb index 9e10add..e324306 100644 --- a/spec/models/publishing_api/body_content_spec.rb +++ b/spec/models/publishing_api/body_content_spec.rb @@ -75,4 +75,26 @@ it { is_expected.to be_nil } end end + + context "when the content is an array of strings" do + let(:content) { %w[Hello world] } + + describe "#html_content" do + subject(:html_content) { body_content.html_content } + + it { is_expected.to eq("Hello world") } + end + + describe "#text_content" do + subject(:text_content) { body_content.text_content } + + it { is_expected.to eq("Hello world") } + end + + describe "#summarized_text_content" do + subject(:summarized_text_content) { body_content.summarized_text_content(length: 6) } + + it { is_expected.to eq("Hello…") } + end + end end