From 143f53dd764e30547ef1c78534a5d01c58ff6dad Mon Sep 17 00:00:00 2001 From: Christian Sutter Date: Fri, 24 Nov 2023 12:15:04 +0000 Subject: [PATCH] Refactor parsing of body content We've previously had `ContentWithMultipleTypes`, but we also want to be able to handle content that could just come through as a plain string. - Add `BodyContent` to encapsulate and handle a piece of content that could be a string or an array of "typed" content values - Change to use `BodyContent` throughout --- app/models/concerns/publishing_api/content.rb | 2 +- .../concerns/publishing_api/metadata.rb | 2 +- app/models/publishing_api/body_content.rb | 33 ++++++++ .../content_with_multiple_types.rb | 22 ------ .../concerns/publishing_api/metadata_spec.rb | 2 +- .../publishing_api/body_content_spec.rb | 78 +++++++++++++++++++ 6 files changed, 114 insertions(+), 25 deletions(-) create mode 100644 app/models/publishing_api/body_content.rb delete mode 100644 app/models/publishing_api/content_with_multiple_types.rb create mode 100644 spec/models/publishing_api/body_content_spec.rb diff --git a/app/models/concerns/publishing_api/content.rb b/app/models/concerns/publishing_api/content.rb index 1695ae4..74b8ba7 100644 --- a/app/models/concerns/publishing_api/content.rb +++ b/app/models/concerns/publishing_api/content.rb @@ -27,7 +27,7 @@ def content values_from_parts = document_hash.dig(:details, :parts)&.map do # Add the part title as a heading to help the search model better understand the structure # of the content - ["

#{_1[:title]}

", ContentWithMultipleTypes.new(_1[:body]).html_content] + ["

#{_1[:title]}

", BodyContent.new(_1[:body]).html_content] end [*values_from_json_paths, *values_from_parts] diff --git a/app/models/concerns/publishing_api/metadata.rb b/app/models/concerns/publishing_api/metadata.rb index 097abe8..a2b547a 100644 --- a/app/models/concerns/publishing_api/metadata.rb +++ b/app/models/concerns/publishing_api/metadata.rb @@ -117,7 +117,7 @@ def parts { slug: _1[:slug], title: _1[:title], - body: ContentWithMultipleTypes.new(_1[:body]).summarized_text_content, + body: BodyContent.new(_1[:body]).summarized_text_content, } end end diff --git a/app/models/publishing_api/body_content.rb b/app/models/publishing_api/body_content.rb new file mode 100644 index 0000000..408df0c --- /dev/null +++ b/app/models/publishing_api/body_content.rb @@ -0,0 +1,33 @@ +module PublishingApi + class BodyContent + def initialize(raw_content) + @content = case raw_content + when String + raw_content + when Array + raw_content.find { _1[:content_type] == "text/html" }&.dig(:content) + end + end + + def html_content + content + end + + def text_content + return nil unless html_content + + Loofah + .document(html_content) + .to_text(encode_special_chars: false) + .squish + end + + def summarized_text_content(length: 75, omission: "…", separator: " ") + text_content&.truncate(length, omission:, separator:) + end + + private + + attr_reader :content + end +end diff --git a/app/models/publishing_api/content_with_multiple_types.rb b/app/models/publishing_api/content_with_multiple_types.rb deleted file mode 100644 index 7dc7fd6..0000000 --- a/app/models/publishing_api/content_with_multiple_types.rb +++ /dev/null @@ -1,22 +0,0 @@ -module PublishingApi - class ContentWithMultipleTypes - def initialize(content_with_multiple_types) - @content_with_multiple_types = content_with_multiple_types - end - - def html_content - @content_with_multiple_types.find { _1[:content_type] == "text/html" }&.dig(:content) - end - - def text_content - Loofah - .document(html_content) - .to_text(encode_special_chars: false) - .squish - end - - def summarized_text_content(length: 75, omission: "…", separator: " ") - text_content.truncate(length, omission:, separator:) - end - end -end diff --git a/spec/models/concerns/publishing_api/metadata_spec.rb b/spec/models/concerns/publishing_api/metadata_spec.rb index dbc5d77..183cd3a 100644 --- a/spec/models/concerns/publishing_api/metadata_spec.rb +++ b/spec/models/concerns/publishing_api/metadata_spec.rb @@ -291,7 +291,7 @@ it "contains the expected body with HTML stripped and truncated" do expect(extracted_parts.map { _1[:body] }).to eq([ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur…", - "", + nil, ]) end end diff --git a/spec/models/publishing_api/body_content_spec.rb b/spec/models/publishing_api/body_content_spec.rb new file mode 100644 index 0000000..9e10add --- /dev/null +++ b/spec/models/publishing_api/body_content_spec.rb @@ -0,0 +1,78 @@ +RSpec.describe PublishingApi::BodyContent do + subject(:body_content) { described_class.new(content) } + + context "when the content is a plain string" do + let(:content) { "Hello, world!" } + + describe "#html_content" do + subject(:html_content) { body_content.html_content } + + it { is_expected.to eq("Hello, world!") } + end + + describe "#text_content" do + subject(:text_content) { body_content.text_content } + + it { is_expected.to eq("Hello, world!") } + end + + describe "#summarized_text_content" do + subject(:summarized_text_content) { body_content.summarized_text_content(length: 6) } + + it { is_expected.to eq("Hello…") } + end + end + + context "when the content is an array of typed content that includes text/html content" do + let(:content) do + [ + { content_type: "application/json", content: '{"foo": "bar"}' }, + { content_type: "text/html", content: "Hello, world!" }, + ] + end + + describe "#html_content" do + subject(:html_content) { body_content.html_content } + + it { is_expected.to eq("Hello, world!") } + end + + describe "#text_content" do + subject(:text_content) { body_content.text_content } + + it { is_expected.to eq("Hello, world!") } + end + + describe "#summarized_text_content" do + subject(:summarized_text_content) { body_content.summarized_text_content(length: 6) } + + it { is_expected.to eq("Hello…") } + end + end + + context "when the content is an array of typed content that doesn't include text/html content" do + let(:content) do + [ + { content_type: "application/json", content: '{"foo": "bar"}' }, + ] + end + + describe "#html_content" do + subject(:html_content) { body_content.html_content } + + it { is_expected.to be_nil } + end + + describe "#text_content" do + subject(:text_content) { body_content.text_content } + + it { is_expected.to be_nil } + end + + describe "#summarized_text_content" do + subject(:summarized_text_content) { body_content.summarized_text_content } + + it { is_expected.to be_nil } + end + end +end