Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove additional_searchable_text metadata field #126

Merged
merged 1 commit into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,45 @@ module Content
# All the possible keys in the message hash that can contain the primary unstructured document
# content that we want to index, represented as JsonPath path strings.
INDEXABLE_CONTENT_VALUES_JSON_PATHS = %w[
$.details.acronym
$.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number']
$.details.body
$.details.contact_groups[*].title
$.details.description
$.details.hidden_search_terms
$.details.introduction
$.details.introductory_paragraph
$.details.contact_groups[*].title
$.details.title
$.details.summary
$.details.body
$.details.need_to_know
$.details.licence_short_description
$.details.metadata.aircraft_type
$.details.metadata.authors
$.details.metadata.business_sizes
$.details.metadata.business_stages
$.details.metadata.hidden_indexable_content
$.details.metadata.industries
$.details.metadata.keyword
$.details.metadata.licence_transaction_industry
$.details.metadata.project_code
$.details.metadata.reference_number
$.details.metadata.regions
$.details.metadata.registration
$.details.metadata.research_document_type
$.details.metadata.result
$.details.metadata.stage
$.details.metadata.theme
$.details.metadata.tribunal_decision_categories_name
$.details.metadata.tribunal_decision_category_name
$.details.metadata.tribunal_decision_country_name
$.details.metadata.tribunal_decision_judges_name
$.details.metadata.tribunal_decision_landmark_name
$.details.metadata.tribunal_decision_sub_categories_name
$.details.metadata.tribunal_decision_sub_category_name
$.details.metadata.types_of_support
$.details.metadata.virus_strain
$.details.metadata.year_adopted
$.details.more_information
$.details.need_to_know
$.details.summary
$.details.title
].map { JsonPath.new(_1, use_symbols: true) }.freeze
INDEXABLE_CONTENT_SEPARATOR = "\n".freeze

Expand All @@ -34,7 +64,7 @@ def content

[*values_from_json_paths, *values_from_parts]
.flatten
.compact
.compact_blank
.join(INDEXABLE_CONTENT_SEPARATOR)
.truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE)
end
Expand Down
46 changes: 0 additions & 46 deletions app/models/concerns/publishing_api/metadata.rb
Original file line number Diff line number Diff line change
@@ -1,49 +1,11 @@
module PublishingApi
module Metadata
# All the possible keys in the message hash that can contain additional keywords or other text
# that should be searchable but doesn't form part of the primary document content, represented
# as JsonPath path strings.
ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS = %w[
$.details.acronym
$.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number']
$.details.hidden_search_terms
$.details.licence_short_description
$.details.metadata.aircraft_type
$.details.metadata.authors
$.details.metadata.business_sizes
$.details.metadata.business_stages
$.details.metadata.hidden_indexable_content
$.details.metadata.industries
$.details.metadata.keyword
$.details.metadata.licence_transaction_industry
$.details.metadata.project_code
$.details.metadata.reference_number
$.details.metadata.regions
$.details.metadata.registration
$.details.metadata.research_document_type
$.details.metadata.result
$.details.metadata.stage
$.details.metadata.theme
$.details.metadata.tribunal_decision_categories_name
$.details.metadata.tribunal_decision_category_name
$.details.metadata.tribunal_decision_country_name
$.details.metadata.tribunal_decision_judges_name
$.details.metadata.tribunal_decision_landmark_name
$.details.metadata.tribunal_decision_sub_categories_name
$.details.metadata.tribunal_decision_sub_category_name
$.details.metadata.types_of_support
$.details.metadata.virus_strain
$.details.metadata.year_adopted
].map { JsonPath.new(_1, use_symbols: true) }.freeze
ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR = "\n".freeze

# Extracts a hash of structured metadata about this document.
def metadata
{
content_id: document_hash[:content_id],
title: document_hash[:title],
description: document_hash[:description],
additional_searchable_text:,
link:,
url:,
public_timestamp:,
Expand Down Expand Up @@ -75,14 +37,6 @@ def url
Plek.website_root + link
end

def additional_searchable_text
values = ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS.map { _1.on(document_hash) }
values
.flatten
.compact_blank
.join(ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR)
end

def public_timestamp
return nil unless document_hash[:public_updated_at]

Expand Down
10 changes: 7 additions & 3 deletions app/models/publishing_api/body_content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ module PublishingApi
class BodyContent
def initialize(raw_content)
@content = case raw_content
when String
in String
raw_content
when Array
raw_content.find { _1[:content_type] == "text/html" }&.dig(:content)
in [*, { content_type: "text/html", content: html_content }, *]
html_content
in Array
raw_content.join(" ") if raw_content.all? { _1.is_a?(String) }
else
nil
end
end

Expand Down
18 changes: 8 additions & 10 deletions spec/integration/document_synchronization_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@
content_id: "6ba90ae6-972d-4d48-ad66-693bbb31496d",
title: "Legal Aid Agency",
description: "We provide civil and criminal legal aid and advice in England and Wales to help people deal with their legal problems. LAA is an executive agency, sponsored by the Ministry of Justice .",
additional_searchable_text: "LAA",
link: "/government/organisations/legal-aid-agency",
url: "http://www.dev.gov.uk/government/organisations/legal-aid-agency",
public_timestamp: 1_695_391_634,
Expand All @@ -169,7 +168,7 @@
content_purpose_supergroup: "other",
locale: "en",
},
content: a_string_starting_with("<div class=\"govspeak\"><p>We provide civil"),
content: a_string_starting_with("LAA\n<div class=\"govspeak\"><p>We provide civil"),
payload_version: 12_345,
)
end
Expand All @@ -185,11 +184,6 @@
content_id: "5d315ee8-7631-11e4-a3cb-005056011aef",
title: "Directgov 2010 and beyond: revolution not evolution, a report by Martha Lane Fox",
description: "A report from the Digital Champion Martha Lane Fox with recommendations for the future of Directgov.",
additional_searchable_text: <<~TEXT.chomp,
Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude
Francis Maude's reply to Martha Lane Fox's letter
Directgov Strategic Review - Executive Summary
TEXT
link: "/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox",
url: "http://www.dev.gov.uk/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox",
public_timestamp: 1_290_470_400,
Expand All @@ -200,7 +194,12 @@
part_of_taxonomy_tree: %w[f3caf326-fe33-410f-b7f4-553f4011c81e],
locale: "en",
},
content: a_string_starting_with("<div class=\"govspeak\"><p>A report from the Digital"),
content: a_string_starting_with(<<~TEXT.chomp),
Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude
Francis Maude's reply to Martha Lane Fox's letter
Directgov Strategic Review - Executive Summary
<div class=\"govspeak\"><p>A report from the Digital
TEXT
payload_version: 54_321,
)
end
Expand Down Expand Up @@ -240,7 +239,6 @@
content_id: "526d5caf-221b-4c7b-9e74-b3e0b189fc8d",
title: "Brighton & Hove City Council",
description: "Website of Brighton & Hove City Council",
additional_searchable_text: "Brighton & Hove City Council",
link: "https://www.brighton-hove.gov.uk",
url: "https://www.brighton-hove.gov.uk",
public_timestamp: 1_695_912_979,
Expand All @@ -249,7 +247,7 @@
content_purpose_supergroup: "other",
locale: "en",
},
content: "",
content: "Brighton & Hove City Council",
payload_version: 17,
)
end
Expand Down
20 changes: 10 additions & 10 deletions spec/models/concerns/publishing_api/content_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
let(:document_hash) do
{
details: {
description: "a",
introduction: "b",
introductory_paragraph: "c",
title: "d",
summary: "e",
body: "f",
need_to_know: "g",
more_information: "h",
body: "a",
description: "b",
introduction: "c",
introductory_paragraph: "d",
more_information: "e",
need_to_know: "f",
summary: "g",
title: "h",
},
}
end
Expand Down Expand Up @@ -76,13 +76,13 @@
let(:document_hash) do
{
details: {
body: "a" * 1.1.megabytes,
body: "a" * 600.kilobytes,
},
}
end

it "truncates the content" do
expect(extracted_content.bytesize).to be <= 1.megabyte
expect(extracted_content.bytesize).to be <= 500.kilobytes
end
end

Expand Down
18 changes: 0 additions & 18 deletions spec/models/concerns/publishing_api/metadata_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,24 +28,6 @@
it { is_expected.to eq("Lorem ipsum dolor sit amet.") }
end

describe "additional_searchable_text" do
subject(:additional_searchable_text) { extracted_metadata[:additional_searchable_text] }

let(:document_hash) do
{
details: {
acronym: "BA",
metadata: {
registration: "G-CIVY",
aircraft_type: "Boeing 747-436",
},
},
}
end

it { is_expected.to eq("BA\nBoeing 747-436\nG-CIVY") }
end

describe "link" do
subject(:extracted_link) { extracted_metadata[:link] }

Expand Down
22 changes: 22 additions & 0 deletions spec/models/publishing_api/body_content_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,26 @@
it { is_expected.to be_nil }
end
end

context "when the content is an array of strings" do
let(:content) { %w[Hello world] }

describe "#html_content" do
subject(:html_content) { body_content.html_content }

it { is_expected.to eq("Hello world") }
end

describe "#text_content" do
subject(:text_content) { body_content.text_content }

it { is_expected.to eq("Hello world") }
end

describe "#summarized_text_content" do
subject(:summarized_text_content) { body_content.summarized_text_content(length: 6) }

it { is_expected.to eq("Hello…") }
end
end
end