Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor parsing of body content #123

Merged
merged 1 commit into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def content
values_from_parts = document_hash.dig(:details, :parts)&.map do
# Add the part title as a heading to help the search model better understand the structure
# of the content
["<h1>#{_1[:title]}</h1>", ContentWithMultipleTypes.new(_1[:body]).html_content]
["<h1>#{_1[:title]}</h1>", BodyContent.new(_1[:body]).html_content]
end

[*values_from_json_paths, *values_from_parts]
Expand Down
2 changes: 1 addition & 1 deletion app/models/concerns/publishing_api/metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def parts
{
slug: _1[:slug],
title: _1[:title],
body: ContentWithMultipleTypes.new(_1[:body]).summarized_text_content,
body: BodyContent.new(_1[:body]).summarized_text_content,
}
end
end
Expand Down
33 changes: 33 additions & 0 deletions app/models/publishing_api/body_content.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
module PublishingApi
  # Wraps the raw body of a piece of content and exposes it as HTML, as plain
  # text, and as a truncated plain-text summary.
  class BodyContent
    # @param raw_content [String, Array<Hash>, Object] either the body itself
    #   (a String), or a list of hashes carrying :content_type and :content
    #   keys, from which the first "text/html" entry is used. Any other input
    #   (or a list without a "text/html" entry) leaves the content as nil.
    def initialize(raw_content)
      @content =
        if raw_content.is_a?(String)
          raw_content
        elsif raw_content.is_a?(Array)
          html_part = raw_content.find { |part| part[:content_type] == "text/html" }
          html_part&.dig(:content)
        end
    end

    # @return [String, nil] the body as selected by the constructor, unmodified
    def html_content
      content
    end

    # @return [String, nil] the body run through Loofah's text conversion
    #   (special characters left unencoded) and then `squish`ed; nil when
    #   there is no HTML content
    def text_content
      html = html_content
      return nil unless html

      plain_text = Loofah.document(html).to_text(encode_special_chars: false)
      plain_text.squish
    end

    # @param length [Integer] passed as the length argument to String#truncate
    # @param omission [String] passed through to String#truncate
    # @param separator [String] passed through to String#truncate
    # @return [String, nil] the truncated text content; nil when there is no
    #   text content
    def summarized_text_content(length: 75, omission: "…", separator: " ")
      text = text_content
      return nil if text.nil?

      text.truncate(length, omission: omission, separator: separator)
    end

    private

    attr_reader :content
  end
end
22 changes: 0 additions & 22 deletions app/models/publishing_api/content_with_multiple_types.rb

This file was deleted.

2 changes: 1 addition & 1 deletion spec/models/concerns/publishing_api/metadata_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@
it "contains the expected body with HTML stripped and truncated" do
expect(extracted_parts.map { _1[:body] }).to eq([
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur…",
"",
nil,
])
end
end
Expand Down
78 changes: 78 additions & 0 deletions spec/models/publishing_api/body_content_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Exercises BodyContent for each supported shape of raw content: a plain
# string, typed content with a text/html entry, and typed content without one.
RSpec.describe PublishingApi::BodyContent do
  subject(:body_content) { described_class.new(content) }

  context "when the content is a plain string" do
    let(:content) { "Hello, world!" }

    describe "#html_content" do
      it { expect(body_content.html_content).to eq("Hello, world!") }
    end

    describe "#text_content" do
      it { expect(body_content.text_content).to eq("Hello, world!") }
    end

    describe "#summarized_text_content" do
      it { expect(body_content.summarized_text_content(length: 6)).to eq("Hello…") }
    end
  end

  context "when the content is an array of typed content that includes text/html content" do
    let(:content) do
      [
        { content_type: "application/json", content: '{"foo": "bar"}' },
        { content_type: "text/html", content: "<blink>Hello, world!</blink>" },
      ]
    end

    describe "#html_content" do
      # The HTML entry is returned verbatim, markup included.
      it { expect(body_content.html_content).to eq("<blink>Hello, world!</blink>") }
    end

    describe "#text_content" do
      it { expect(body_content.text_content).to eq("Hello, world!") }
    end

    describe "#summarized_text_content" do
      it { expect(body_content.summarized_text_content(length: 6)).to eq("Hello…") }
    end
  end

  context "when the content is an array of typed content that doesn't include text/html content" do
    let(:content) do
      [
        { content_type: "application/json", content: '{"foo": "bar"}' },
      ]
    end

    describe "#html_content" do
      it { expect(body_content.html_content).to be_nil }
    end

    describe "#text_content" do
      it { expect(body_content.text_content).to be_nil }
    end

    describe "#summarized_text_content" do
      it { expect(body_content.summarized_text_content).to be_nil }
    end
  end
end