Skip to content

Commit

Permalink
Merge pull request #126 from alphagov/additional-data-body
Browse files Browse the repository at this point in the history
Remove `additional_searchable_text` metadata field
  • Loading branch information
csutter authored Nov 24, 2023
2 parents fc3d352 + 3ad5abd commit 2113953
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 93 deletions.
42 changes: 36 additions & 6 deletions app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,45 @@ module Content
# All the possible keys in the message hash that can contain the primary unstructured document
# content that we want to index, represented as JsonPath path strings.
INDEXABLE_CONTENT_VALUES_JSON_PATHS = %w[
$.details.acronym
$.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number']
$.details.body
$.details.contact_groups[*].title
$.details.description
$.details.hidden_search_terms
$.details.introduction
$.details.introductory_paragraph
$.details.contact_groups[*].title
$.details.title
$.details.summary
$.details.body
$.details.need_to_know
$.details.licence_short_description
$.details.metadata.aircraft_type
$.details.metadata.authors
$.details.metadata.business_sizes
$.details.metadata.business_stages
$.details.metadata.hidden_indexable_content
$.details.metadata.industries
$.details.metadata.keyword
$.details.metadata.licence_transaction_industry
$.details.metadata.project_code
$.details.metadata.reference_number
$.details.metadata.regions
$.details.metadata.registration
$.details.metadata.research_document_type
$.details.metadata.result
$.details.metadata.stage
$.details.metadata.theme
$.details.metadata.tribunal_decision_categories_name
$.details.metadata.tribunal_decision_category_name
$.details.metadata.tribunal_decision_country_name
$.details.metadata.tribunal_decision_judges_name
$.details.metadata.tribunal_decision_landmark_name
$.details.metadata.tribunal_decision_sub_categories_name
$.details.metadata.tribunal_decision_sub_category_name
$.details.metadata.types_of_support
$.details.metadata.virus_strain
$.details.metadata.year_adopted
$.details.more_information
$.details.need_to_know
$.details.summary
$.details.title
].map { JsonPath.new(_1, use_symbols: true) }.freeze
INDEXABLE_CONTENT_SEPARATOR = "\n".freeze

Expand All @@ -34,7 +64,7 @@ def content

[*values_from_json_paths, *values_from_parts]
.flatten
.compact
.compact_blank
.join(INDEXABLE_CONTENT_SEPARATOR)
.truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE)
end
Expand Down
46 changes: 0 additions & 46 deletions app/models/concerns/publishing_api/metadata.rb
Original file line number Diff line number Diff line change
@@ -1,49 +1,11 @@
module PublishingApi
module Metadata
# All the possible keys in the message hash that can contain additional keywords or other text
# that should be searchable but doesn't form part of the primary document content, represented
# as JsonPath path strings.
ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS = %w[
$.details.acronym
$.details.attachments[*]['title','isbn','unique_reference','command_paper_number','hoc_paper_number']
$.details.hidden_search_terms
$.details.licence_short_description
$.details.metadata.aircraft_type
$.details.metadata.authors
$.details.metadata.business_sizes
$.details.metadata.business_stages
$.details.metadata.hidden_indexable_content
$.details.metadata.industries
$.details.metadata.keyword
$.details.metadata.licence_transaction_industry
$.details.metadata.project_code
$.details.metadata.reference_number
$.details.metadata.regions
$.details.metadata.registration
$.details.metadata.research_document_type
$.details.metadata.result
$.details.metadata.stage
$.details.metadata.theme
$.details.metadata.tribunal_decision_categories_name
$.details.metadata.tribunal_decision_category_name
$.details.metadata.tribunal_decision_country_name
$.details.metadata.tribunal_decision_judges_name
$.details.metadata.tribunal_decision_landmark_name
$.details.metadata.tribunal_decision_sub_categories_name
$.details.metadata.tribunal_decision_sub_category_name
$.details.metadata.types_of_support
$.details.metadata.virus_strain
$.details.metadata.year_adopted
].map { JsonPath.new(_1, use_symbols: true) }.freeze
ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR = "\n".freeze

# Extracts a hash of structured metadata about this document.
def metadata
{
content_id: document_hash[:content_id],
title: document_hash[:title],
description: document_hash[:description],
additional_searchable_text:,
link:,
url:,
public_timestamp:,
Expand Down Expand Up @@ -75,14 +37,6 @@ def url
Plek.website_root + link
end

def additional_searchable_text
values = ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS.map { _1.on(document_hash) }
values
.flatten
.compact_blank
.join(ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR)
end

def public_timestamp
return nil unless document_hash[:public_updated_at]

Expand Down
10 changes: 7 additions & 3 deletions app/models/publishing_api/body_content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ module PublishingApi
class BodyContent
def initialize(raw_content)
@content = case raw_content
when String
in String
raw_content
when Array
raw_content.find { _1[:content_type] == "text/html" }&.dig(:content)
in [*, { content_type: "text/html", content: html_content }, *]
html_content
in Array
raw_content.join(" ") if raw_content.all? { _1.is_a?(String) }
else
nil
end
end

Expand Down
18 changes: 8 additions & 10 deletions spec/integration/document_synchronization_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@
content_id: "6ba90ae6-972d-4d48-ad66-693bbb31496d",
title: "Legal Aid Agency",
description: "We provide civil and criminal legal aid and advice in England and Wales to help people deal with their legal problems. LAA is an executive agency, sponsored by the Ministry of Justice .",
additional_searchable_text: "LAA",
link: "/government/organisations/legal-aid-agency",
url: "http://www.dev.gov.uk/government/organisations/legal-aid-agency",
public_timestamp: 1_695_391_634,
Expand All @@ -169,7 +168,7 @@
content_purpose_supergroup: "other",
locale: "en",
},
content: a_string_starting_with("<div class=\"govspeak\"><p>We provide civil"),
content: a_string_starting_with("LAA\n<div class=\"govspeak\"><p>We provide civil"),
payload_version: 12_345,
)
end
Expand All @@ -185,11 +184,6 @@
content_id: "5d315ee8-7631-11e4-a3cb-005056011aef",
title: "Directgov 2010 and beyond: revolution not evolution, a report by Martha Lane Fox",
description: "A report from the Digital Champion Martha Lane Fox with recommendations for the future of Directgov.",
additional_searchable_text: <<~TEXT.chomp,
Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude
Francis Maude's reply to Martha Lane Fox's letter
Directgov Strategic Review - Executive Summary
TEXT
link: "/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox",
url: "http://www.dev.gov.uk/government/publications/directgov-2010-and-beyond-revolution-not-evolution-a-report-by-martha-lane-fox",
public_timestamp: 1_290_470_400,
Expand All @@ -200,7 +194,12 @@
part_of_taxonomy_tree: %w[f3caf326-fe33-410f-b7f4-553f4011c81e],
locale: "en",
},
content: a_string_starting_with("<div class=\"govspeak\"><p>A report from the Digital"),
content: a_string_starting_with(<<~TEXT.chomp),
Directgov 2010 and Beyond: Revolution Not Evolution - Letter from Martha Lane Fox to Francis Maude
Francis Maude's reply to Martha Lane Fox's letter
Directgov Strategic Review - Executive Summary
<div class=\"govspeak\"><p>A report from the Digital
TEXT
payload_version: 54_321,
)
end
Expand Down Expand Up @@ -240,7 +239,6 @@
content_id: "526d5caf-221b-4c7b-9e74-b3e0b189fc8d",
title: "Brighton & Hove City Council",
description: "Website of Brighton & Hove City Council",
additional_searchable_text: "Brighton & Hove City Council",
link: "https://www.brighton-hove.gov.uk",
url: "https://www.brighton-hove.gov.uk",
public_timestamp: 1_695_912_979,
Expand All @@ -249,7 +247,7 @@
content_purpose_supergroup: "other",
locale: "en",
},
content: "",
content: "Brighton & Hove City Council",
payload_version: 17,
)
end
Expand Down
20 changes: 10 additions & 10 deletions spec/models/concerns/publishing_api/content_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
let(:document_hash) do
{
details: {
description: "a",
introduction: "b",
introductory_paragraph: "c",
title: "d",
summary: "e",
body: "f",
need_to_know: "g",
more_information: "h",
body: "a",
description: "b",
introduction: "c",
introductory_paragraph: "d",
more_information: "e",
need_to_know: "f",
summary: "g",
title: "h",
},
}
end
Expand Down Expand Up @@ -76,13 +76,13 @@
let(:document_hash) do
{
details: {
body: "a" * 1.1.megabytes,
body: "a" * 600.kilobytes,
},
}
end

it "truncates the content" do
expect(extracted_content.bytesize).to be <= 1.megabyte
expect(extracted_content.bytesize).to be <= 500.kilobytes
end
end

Expand Down
18 changes: 0 additions & 18 deletions spec/models/concerns/publishing_api/metadata_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,24 +28,6 @@
it { is_expected.to eq("Lorem ipsum dolor sit amet.") }
end

describe "additional_searchable_text" do
subject(:additional_searchable_text) { extracted_metadata[:additional_searchable_text] }

let(:document_hash) do
{
details: {
acronym: "BA",
metadata: {
registration: "G-CIVY",
aircraft_type: "Boeing 747-436",
},
},
}
end

it { is_expected.to eq("BA\nBoeing 747-436\nG-CIVY") }
end

describe "link" do
subject(:extracted_link) { extracted_metadata[:link] }

Expand Down
22 changes: 22 additions & 0 deletions spec/models/publishing_api/body_content_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,26 @@
it { is_expected.to be_nil }
end
end

context "when the content is an array of strings" do
let(:content) { %w[Hello world] }

describe "#html_content" do
subject(:html_content) { body_content.html_content }

it { is_expected.to eq("Hello world") }
end

describe "#text_content" do
subject(:text_content) { body_content.text_content }

it { is_expected.to eq("Hello world") }
end

describe "#summarized_text_content" do
subject(:summarized_text_content) { body_content.summarized_text_content(length: 6) }

it { is_expected.to eq("Hello…") }
end
end
end

0 comments on commit 2113953

Please sign in to comment.