Skip to content

Commit

Permalink
Merge pull request #123 from alphagov/fix-body-content-parsing
Browse files Browse the repository at this point in the history
Refactor parsing of body content
  • Loading branch information
csutter authored Nov 24, 2023
2 parents cd16c39 + 143f53d commit 72f46ea
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 25 deletions.
2 changes: 1 addition & 1 deletion app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def content
values_from_parts = document_hash.dig(:details, :parts)&.map do
# Add the part title as a heading to help the search model better understand the structure
# of the content
["<h1>#{_1[:title]}</h1>", ContentWithMultipleTypes.new(_1[:body]).html_content]
["<h1>#{_1[:title]}</h1>", BodyContent.new(_1[:body]).html_content]
end

[*values_from_json_paths, *values_from_parts]
Expand Down
2 changes: 1 addition & 1 deletion app/models/concerns/publishing_api/metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def parts
{
slug: _1[:slug],
title: _1[:title],
body: ContentWithMultipleTypes.new(_1[:body]).summarized_text_content,
body: BodyContent.new(_1[:body]).summarized_text_content,
}
end
end
Expand Down
33 changes: 33 additions & 0 deletions app/models/publishing_api/body_content.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
module PublishingApi
  # Wraps the raw body value of a document (or of one of its parts) and exposes it as HTML, as
  # plain text, or as a truncated plain-text summary.
  #
  # The raw value may be:
  # - a String, which is used as the HTML content unchanged; or
  # - an Array of typed content entries (each responding to [:content_type] and dig(:content)),
  #   of which the first "text/html" entry supplies the HTML content.
  # Any other value (including nil, or an Array without a "text/html" entry) yields no content.
  class BodyContent
    # The HTML content extracted from the raw value, or nil if there is none.
    attr_reader :html_content

    # raw_content - the raw body value (see the class comment for the supported shapes).
    def initialize(raw_content)
      @html_content = extract_html(raw_content)
    end

    # Returns the HTML content converted to text by Loofah and passed through #squish, or nil
    # when there is no HTML content.
    def text_content
      html = html_content
      return unless html

      document = Loofah.document(html)
      document.to_text(encode_special_chars: false).squish
    end

    # Returns the text content shortened with String#truncate using the given length, omission
    # and separator, or nil when there is no text content.
    def summarized_text_content(length: 75, omission: "…", separator: " ")
      full_text = text_content
      return if full_text.nil?

      full_text.truncate(length, omission:, separator:)
    end

  private

    # Picks the HTML out of the raw value; implicitly returns nil for unsupported shapes.
    def extract_html(raw_content)
      if raw_content.is_a?(String)
        raw_content
      elsif raw_content.is_a?(Array)
        html_entry = raw_content.find { |entry| entry[:content_type] == "text/html" }
        html_entry&.dig(:content)
      end
    end
  end
end
22 changes: 0 additions & 22 deletions app/models/publishing_api/content_with_multiple_types.rb

This file was deleted.

2 changes: 1 addition & 1 deletion spec/models/concerns/publishing_api/metadata_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@
it "contains the expected body with HTML stripped and truncated" do
expect(extracted_parts.map { _1[:body] }).to eq([
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur…",
"",
nil,
])
end
end
Expand Down
78 changes: 78 additions & 0 deletions spec/models/publishing_api/body_content_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Exercises BodyContent against the three shapes of raw content it is given here: a plain string,
# typed content that includes a text/html entry, and typed content without one.
RSpec.describe PublishingApi::BodyContent do
  subject(:body_content) { described_class.new(raw_content) }

  context "when the content is a plain string" do
    let(:raw_content) { "Hello, world!" }

    describe "#html_content" do
      specify { expect(body_content.html_content).to eq("Hello, world!") }
    end

    describe "#text_content" do
      specify { expect(body_content.text_content).to eq("Hello, world!") }
    end

    describe "#summarized_text_content" do
      specify { expect(body_content.summarized_text_content(length: 6)).to eq("Hello…") }
    end
  end

  context "when the content is an array of typed content that includes text/html content" do
    let(:json_entry) { { content_type: "application/json", content: '{"foo": "bar"}' } }
    let(:html_entry) { { content_type: "text/html", content: "<blink>Hello, world!</blink>" } }
    # The non-HTML entry comes first so that the HTML entry has to be searched for.
    let(:raw_content) { [json_entry, html_entry] }

    describe "#html_content" do
      specify { expect(body_content.html_content).to eq("<blink>Hello, world!</blink>") }
    end

    describe "#text_content" do
      specify { expect(body_content.text_content).to eq("Hello, world!") }
    end

    describe "#summarized_text_content" do
      specify { expect(body_content.summarized_text_content(length: 6)).to eq("Hello…") }
    end
  end

  context "when the content is an array of typed content that doesn't include text/html content" do
    let(:json_entry) { { content_type: "application/json", content: '{"foo": "bar"}' } }
    let(:raw_content) { [json_entry] }

    describe "#html_content" do
      specify { expect(body_content.html_content).to be_nil }
    end

    describe "#text_content" do
      specify { expect(body_content.text_content).to be_nil }
    end

    describe "#summarized_text_content" do
      specify { expect(body_content.summarized_text_content).to be_nil }
    end
  end
end

0 comments on commit 72f46ea

Please sign in to comment.