From 529921b3bb224450f517dbcb2d12f62f70785715 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 25 Oct 2024 17:26:26 +0100 Subject: [PATCH 01/15] Allow tokens to be consumed during serialization --- src/rewritable_units/tokens/attributes.rs | 10 +++++----- src/rewritable_units/tokens/doctype.rs | 4 ++-- src/rewritable_units/tokens/mod.rs | 16 ++++++++-------- src/rewritable_units/tokens/start_tag.rs | 2 +- src/transform_stream/dispatcher.rs | 4 ++-- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index c55e18b9..57fbb87a 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -124,9 +124,9 @@ impl<'i> Attribute<'i> { } } -impl Serialize for Attribute<'_> { +impl Serialize for &Attribute<'_> { #[inline] - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { match self.raw.as_ref() { Some(raw) => output_handler(raw), None => { @@ -254,14 +254,14 @@ impl<'i> Deref for Attributes<'i> { } } -impl Serialize for Attributes<'_> { +impl Serialize for &Attributes<'_> { #[inline] - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { if !self.is_empty() { let last = self.len() - 1; for (idx, attr) in self.iter().enumerate() { - attr.to_bytes(output_handler); + attr.into_bytes(output_handler); if idx != last { output_handler(b" "); diff --git a/src/rewritable_units/tokens/doctype.rs b/src/rewritable_units/tokens/doctype.rs index 1dbf6d7a..06829995 100644 --- a/src/rewritable_units/tokens/doctype.rs +++ b/src/rewritable_units/tokens/doctype.rs @@ -112,9 +112,9 @@ impl<'i> Doctype<'i> { impl_user_data!(Doctype<'_>); -impl Serialize for Doctype<'_> { +impl Serialize for &Doctype<'_> { #[inline] - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: 
&mut dyn FnMut(&[u8])) { if !self.removed() { output_handler(&self.raw); } diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index e2af3f8f..17a3a4c8 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -9,14 +9,14 @@ pub use self::capturer::*; // Pub only for integration tests pub trait Serialize { - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])); + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])); } macro_rules! impl_serialize { ($Token:ident) => { impl crate::rewritable_units::Serialize for $Token<'_> { #[inline] - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { let Mutations { content_before, replacement, @@ -70,13 +70,13 @@ pub enum Token<'i> { impl Serialize for Token<'_> { #[inline] - fn to_bytes(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { match self { - Token::TextChunk(t) => t.to_bytes(output_handler), - Token::Comment(t) => t.to_bytes(output_handler), - Token::StartTag(t) => t.to_bytes(output_handler), - Token::EndTag(t) => t.to_bytes(output_handler), - Token::Doctype(t) => t.to_bytes(output_handler), + Token::TextChunk(t) => t.into_bytes(output_handler), + Token::Comment(t) => t.into_bytes(output_handler), + Token::StartTag(t) => t.into_bytes(output_handler), + Token::EndTag(t) => t.into_bytes(output_handler), + Token::Doctype(t) => t.into_bytes(output_handler), } } } diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 0f1fae6e..4363260a 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -149,7 +149,7 @@ impl<'i> StartTag<'i> { if !self.attributes.is_empty() { output_handler(b" "); - self.attributes.to_bytes(output_handler); + self.attributes.into_bytes(output_handler); // NOTE: attributes can be modified the way that 
// last attribute has an unquoted value. We always diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index a337b496..05ecbb82 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -162,7 +162,7 @@ where transform_controller.handle_token(&mut token)?; if emission_enabled { - token.to_bytes(&mut |c| output_sink.handle_chunk(c)); + token.into_bytes(&mut |c| output_sink.handle_chunk(c)); } } } @@ -278,7 +278,7 @@ where transform_controller.handle_token(&mut token)?; if emission_enabled { - token.to_bytes(&mut |c| output_sink.handle_chunk(c)); + token.into_bytes(&mut |c| output_sink.handle_chunk(c)); } } From 46fe4e43648702d00492deb497028ea35bfb0416 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 25 Oct 2024 20:59:57 +0100 Subject: [PATCH 02/15] Make to_bytes fallible --- src/rewritable_units/tokens/attributes.rs | 9 ++++++--- src/rewritable_units/tokens/comment.rs | 7 ++++++- src/rewritable_units/tokens/doctype.rs | 4 +++- src/rewritable_units/tokens/end_tag.rs | 7 ++++++- src/rewritable_units/tokens/mod.rs | 10 ++++++---- src/rewritable_units/tokens/start_tag.rs | 9 +++++++-- src/rewritable_units/tokens/text_chunk.rs | 7 ++++++- src/transform_stream/dispatcher.rs | 4 ++-- 8 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index 57fbb87a..20153744 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -1,4 +1,5 @@ use crate::base::Bytes; +use crate::errors::RewritingError; use crate::parser::AttributeBuffer; use crate::rewritable_units::Serialize; use encoding_rs::Encoding; @@ -126,7 +127,7 @@ impl<'i> Attribute<'i> { impl Serialize for &Attribute<'_> { #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { match self.raw.as_ref() { 
Some(raw) => output_handler(raw), None => { @@ -136,6 +137,7 @@ impl Serialize for &Attribute<'_> { output_handler(b"\""); } } + Ok(()) } } @@ -256,17 +258,18 @@ impl<'i> Deref for Attributes<'i> { impl Serialize for &Attributes<'_> { #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { if !self.is_empty() { let last = self.len() - 1; for (idx, attr) in self.iter().enumerate() { - attr.into_bytes(output_handler); + attr.into_bytes(output_handler)?; if idx != last { output_handler(b" "); } } } + Ok(()) } } diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index e324c4cd..6ff06909 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -1,5 +1,6 @@ use super::{Mutations, Token}; use crate::base::Bytes; +use crate::errors::RewritingError; use encoding_rs::Encoding; use std::any::Any; use std::fmt::{self, Debug}; @@ -191,10 +192,14 @@ impl<'i> Comment<'i> { } #[inline] - fn serialize_from_parts(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn serialize_from_parts( + &self, + output_handler: &mut dyn FnMut(&[u8]), + ) -> Result<(), RewritingError> { output_handler(b""); + Ok(()) } } diff --git a/src/rewritable_units/tokens/doctype.rs b/src/rewritable_units/tokens/doctype.rs index 06829995..1a340455 100644 --- a/src/rewritable_units/tokens/doctype.rs +++ b/src/rewritable_units/tokens/doctype.rs @@ -1,4 +1,5 @@ use crate::base::Bytes; +use crate::errors::RewritingError; use crate::rewritable_units::{Serialize, Token}; use encoding_rs::Encoding; use std::any::Any; @@ -114,10 +115,11 @@ impl_user_data!(Doctype<'_>); impl Serialize for &Doctype<'_> { #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { if !self.removed() { output_handler(&self.raw); } + Ok(()) } } 
diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 7b349860..ed271368 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -1,5 +1,6 @@ use super::{Mutations, Token}; use crate::base::Bytes; +use crate::errors::RewritingError; use crate::rewritable_units::ContentType; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -99,10 +100,14 @@ impl<'i> EndTag<'i> { } #[inline] - fn serialize_from_parts(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn serialize_from_parts( + &self, + output_handler: &mut dyn FnMut(&[u8]), + ) -> Result<(), RewritingError> { output_handler(b""); + Ok(()) } } diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index 17a3a4c8..73971257 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -2,6 +2,7 @@ mod attributes; mod capturer; use super::Mutations; +use crate::errors::RewritingError; pub(super) use self::attributes::Attributes; pub use self::attributes::{Attribute, AttributeNameError}; @@ -9,14 +10,14 @@ pub use self::capturer::*; // Pub only for integration tests pub trait Serialize { - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])); + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError>; } macro_rules! impl_serialize { ($Token:ident) => { impl crate::rewritable_units::Serialize for $Token<'_> { #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { let Mutations { content_before, replacement, @@ -32,7 +33,7 @@ macro_rules! impl_serialize { if !removed { match self.raw() { Some(raw) => output_handler(raw), - None => self.serialize_from_parts(output_handler), + None => self.serialize_from_parts(output_handler)?, } } else if !replacement.is_empty() { output_handler(replacement); @@ -41,6 +42,7 @@ macro_rules! 
impl_serialize { if !content_after.is_empty() { output_handler(content_after); } + Ok(()) } } }; @@ -70,7 +72,7 @@ pub enum Token<'i> { impl Serialize for Token<'_> { #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) { + fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { match self { Token::TextChunk(t) => t.into_bytes(output_handler), Token::Comment(t) => t.into_bytes(output_handler), diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 4363260a..aa44fcc2 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -1,6 +1,7 @@ use super::{Attribute, AttributeNameError, Attributes}; use super::{Mutations, Serialize, Token}; use crate::base::Bytes; +use crate::errors::RewritingError; use crate::html::Namespace; use crate::rewritable_units::ContentType; use encoding_rs::Encoding; @@ -142,14 +143,17 @@ impl<'i> StartTag<'i> { } #[inline] - fn serialize_from_parts(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn serialize_from_parts( + &self, + output_handler: &mut dyn FnMut(&[u8]), + ) -> Result<(), RewritingError> { output_handler(b"<"); output_handler(&self.name); if !self.attributes.is_empty() { output_handler(b" "); - self.attributes.into_bytes(output_handler); + self.attributes.into_bytes(output_handler)?; // NOTE: attributes can be modified the way that // last attribute has an unquoted value. 
We always @@ -166,6 +170,7 @@ impl<'i> StartTag<'i> { } else { output_handler(b">"); } + Ok(()) } #[cfg(test)] diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 2ff14380..8b6155de 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -1,5 +1,6 @@ use super::{Mutations, Token}; use crate::base::Bytes; +use crate::errors::RewritingError; use crate::html::TextType; use encoding_rs::Encoding; use std::any::Any; @@ -275,10 +276,14 @@ impl<'i> TextChunk<'i> { } #[inline] - fn serialize_from_parts(&self, output_handler: &mut dyn FnMut(&[u8])) { + fn serialize_from_parts( + &self, + output_handler: &mut dyn FnMut(&[u8]), + ) -> Result<(), RewritingError> { if !self.text.is_empty() { output_handler(&Bytes::from_str(&self.text, self.encoding)); } + Ok(()) } } diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index 05ecbb82..98c882b0 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -162,7 +162,7 @@ where transform_controller.handle_token(&mut token)?; if emission_enabled { - token.into_bytes(&mut |c| output_sink.handle_chunk(c)); + token.into_bytes(&mut |c| output_sink.handle_chunk(c))?; } } } @@ -278,7 +278,7 @@ where transform_controller.handle_token(&mut token)?; if emission_enabled { - token.into_bytes(&mut |c| output_sink.handle_chunk(c)); + token.into_bytes(&mut |c| output_sink.handle_chunk(c))?; } } From c4fe758a9810216cd5fb3299285b298aa60aa65a Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 5 Nov 2024 13:00:30 +0000 Subject: [PATCH 03/15] Avoid dummy raw() --- src/rewritable_units/tokens/attributes.rs | 15 +++++++-------- src/rewritable_units/tokens/comment.rs | 20 ++++++++------------ src/rewritable_units/tokens/end_tag.rs | 20 ++++++++------------ src/rewritable_units/tokens/mod.rs | 16 ++++++++-------- src/rewritable_units/tokens/start_tag.rs | 14 +++++--------- 
src/rewritable_units/tokens/text_chunk.rs | 11 +---------- 6 files changed, 37 insertions(+), 59 deletions(-) diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index 20153744..d36dea30 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -128,14 +128,13 @@ impl<'i> Attribute<'i> { impl Serialize for &Attribute<'_> { #[inline] fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { - match self.raw.as_ref() { - Some(raw) => output_handler(raw), - None => { - output_handler(&self.name); - output_handler(b"=\""); - self.value.replace_byte((b'"', b"""), output_handler); - output_handler(b"\""); - } + if let Some(raw) = self.raw.as_ref() { + output_handler(raw) + } else { + output_handler(&self.name); + output_handler(b"=\""); + self.value.replace_byte((b'"', b"""), output_handler); + output_handler(b"\""); } Ok(()) } diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index 6ff06909..baec2144 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -187,18 +187,14 @@ impl<'i> Comment<'i> { } #[inline] - const fn raw(&self) -> Option<&Bytes<'_>> { - self.raw.as_ref() - } - - #[inline] - fn serialize_from_parts( - &self, - output_handler: &mut dyn FnMut(&[u8]), - ) -> Result<(), RewritingError> { - output_handler(b""); + fn serialize_self(&self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { + if let Some(raw) = &self.raw { + output_handler(raw); + } else { + output_handler(b""); + } Ok(()) } } diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index ed271368..1eb29498 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -95,18 +95,14 @@ impl<'i> EndTag<'i> { } #[inline] - const fn raw(&self) -> Option<&Bytes<'_>> { - self.raw.as_ref() - } - - 
#[inline] - fn serialize_from_parts( - &self, - output_handler: &mut dyn FnMut(&[u8]), - ) -> Result<(), RewritingError> { - output_handler(b""); + fn serialize_self(&self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { + if let Some(raw) = &self.raw { + output_handler(raw); + } else { + output_handler(b""); + } Ok(()) } } diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index 73971257..bfc398f7 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -16,8 +16,11 @@ pub trait Serialize { macro_rules! impl_serialize { ($Token:ident) => { impl crate::rewritable_units::Serialize for $Token<'_> { - #[inline] - fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { + #[inline(always)] + fn into_bytes( + self, + output_handler: &mut dyn FnMut(&[u8]), + ) -> Result<(), RewritingError> { let Mutations { content_before, replacement, @@ -31,10 +34,7 @@ macro_rules! impl_serialize { } if !removed { - match self.raw() { - Some(raw) => output_handler(raw), - None => self.serialize_from_parts(output_handler)?, - } + self.serialize_self(output_handler)?; } else if !replacement.is_empty() { output_handler(replacement); } @@ -64,9 +64,9 @@ pub use self::text_chunk::TextChunk; #[derive(Debug)] pub enum Token<'i> { TextChunk(TextChunk<'i>), - Comment(Comment<'i>), StartTag(StartTag<'i>), EndTag(EndTag<'i>), + Comment(Comment<'i>), Doctype(Doctype<'i>), } @@ -75,9 +75,9 @@ impl Serialize for Token<'_> { fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { match self { Token::TextChunk(t) => t.into_bytes(output_handler), - Token::Comment(t) => t.into_bytes(output_handler), Token::StartTag(t) => t.into_bytes(output_handler), Token::EndTag(t) => t.into_bytes(output_handler), + Token::Comment(t) => t.into_bytes(output_handler), Token::Doctype(t) => t.into_bytes(output_handler), } } diff --git 
a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index aa44fcc2..67ef5a01 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -137,16 +137,12 @@ impl<'i> StartTag<'i> { self.mutations.remove(); } - #[inline] - const fn raw(&self) -> Option<&Bytes<'_>> { - self.raw.as_ref() - } + fn serialize_self(&self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { + if let Some(raw) = &self.raw { + output_handler(raw); + return Ok(()); + } - #[inline] - fn serialize_from_parts( - &self, - output_handler: &mut dyn FnMut(&[u8]), - ) -> Result<(), RewritingError> { output_handler(b"<"); output_handler(&self.name); diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 8b6155de..d84d383d 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -270,16 +270,7 @@ impl<'i> TextChunk<'i> { } #[inline] - #[allow(clippy::unused_self)] - const fn raw(&self) -> Option<&Bytes<'_>> { - None - } - - #[inline] - fn serialize_from_parts( - &self, - output_handler: &mut dyn FnMut(&[u8]), - ) -> Result<(), RewritingError> { + fn serialize_self(&self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { if !self.text.is_empty() { output_handler(&Bytes::from_str(&self.text, self.encoding)); } From f54ee05715a8f89607a6024794c6824a0258b191 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 25 Oct 2024 18:39:09 +0100 Subject: [PATCH 04/15] Store mutations as text chunks --- src/lib.rs | 2 +- src/rewritable_units/element.rs | 44 +++++++-- src/rewritable_units/mod.rs | 3 +- src/rewritable_units/mutations.rs | 103 +++++++++++++++------- src/rewritable_units/tokens/comment.rs | 10 ++- src/rewritable_units/tokens/end_tag.rs | 12 ++- src/rewritable_units/tokens/mod.rs | 40 ++++----- src/rewritable_units/tokens/start_tag.rs | 12 ++- src/rewritable_units/tokens/text_chunk.rs | 17 ++-- 9 
files changed, 159 insertions(+), 84 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f78fa3b3..77dbf9cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -206,7 +206,7 @@ cfg_if! { }; pub use self::rewritable_units::{ - EndTag, Serialize, StartTag, Token, TokenCaptureFlags, Mutations + EndTag, Serialize, StartTag, Token, TokenCaptureFlags, }; pub use self::memory::SharedMemoryLimiter; diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index c68631f5..248c4666 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -1,4 +1,4 @@ -use super::{Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag}; +use super::{Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag, StringChunk}; use crate::base::Bytes; use crate::rewriter::{HandlerTypes, LocalHandlerTypes}; use encoding_rs::Encoding; @@ -89,7 +89,9 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { #[inline] fn remove_content(&mut self) { self.start_tag.mutations.content_after.clear(); - self.end_tag_mutations_mut().content_before.clear(); + if let Some(end) = &mut self.end_tag_mutations { + end.content_before.clear(); + } self.should_remove_content = true; } @@ -232,7 +234,10 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { - self.start_tag.mutations.before(content, content_type); + self.start_tag + .mutations + .content_before + .push_back((content, content_type).into()); } /// Inserts `content` after the element. 
@@ -265,11 +270,16 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { + self.after_chunk((content, content_type).into()); + } + + fn after_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { - self.end_tag_mutations_mut().after(content, content_type); + &mut self.end_tag_mutations_mut().content_after } else { - self.start_tag.mutations.after(content, content_type); + &mut self.start_tag.mutations.content_after } + .push_front(chunk); } /// Prepends `content` to the element's inner content, i.e. inserts content right after @@ -309,8 +319,12 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn prepend(&mut self, content: &str, content_type: ContentType) { + self.prepend_chunk((content, content_type).into()); + } + + fn prepend_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { - self.start_tag.mutations.after(content, content_type); + self.start_tag.mutations.content_after.push_front(chunk); } } @@ -351,8 +365,12 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn append(&mut self, content: &str, content_type: ContentType) { + self.append_chunk((content, content_type).into()); + } + + fn append_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { - self.end_tag_mutations_mut().before(content, content_type); + self.end_tag_mutations_mut().content_before.push_back(chunk); } } @@ -392,9 +410,13 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn set_inner_content(&mut self, content: &str, content_type: ContentType) { + self.set_inner_content_chunk((content, content_type).into()); + } + + fn set_inner_content_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { self.remove_content(); - self.start_tag.mutations.after(content, content_type); + self.start_tag.mutations.content_after.push_front(chunk); } } @@ -427,7 +449,11 @@ impl<'r, 't, H: 
HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.start_tag.mutations.replace(content, content_type); + self.replace_chunk((content, content_type).into()); + } + + fn replace_chunk(&mut self, chunk: StringChunk) { + self.start_tag.mutations.replace(chunk); if self.can_have_content { self.remove_content(); diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index da5521d7..c7b282ee 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -2,7 +2,8 @@ use std::any::Any; pub use self::document_end::*; pub use self::element::*; -pub use self::mutations::{ContentType, Mutations}; +pub use self::mutations::ContentType; +pub(crate) use self::mutations::{Mutations, StringChunk}; pub use self::tokens::*; /// Data that can be attached to a rewritable unit by a user and shared between content handler diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index a8118bcc..82cdec43 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -1,7 +1,11 @@ use crate::base::Bytes; use encoding_rs::Encoding; +use std::error::Error as StdError; + +type BoxResult = Result<(), Box>; /// The type of inserted content. +#[derive(Copy, Clone)] pub enum ContentType { /// HTML content type. The rewriter will insert the content as is. 
Html, @@ -17,7 +21,7 @@ pub(super) fn content_to_bytes( content: &str, content_type: ContentType, encoding: &'static Encoding, - mut output_handler: &mut dyn FnMut(&[u8]), + output_handler: &mut dyn FnMut(&[u8]), ) { let bytes = Bytes::from_str(content, encoding); @@ -27,68 +31,99 @@ pub(super) fn content_to_bytes( (b'<', b"<"), (b'>', b">"), (b'&', b"&"), - &mut output_handler, + &mut *output_handler, ), } } -pub struct Mutations { - pub content_before: Vec, - pub replacement: Vec, - pub content_after: Vec, +pub(crate) struct Mutations { + pub content_before: DynamicString, + pub replacement: DynamicString, + pub content_after: DynamicString, pub removed: bool, - encoding: &'static Encoding, + pub encoding: &'static Encoding, } impl Mutations { #[inline] - pub fn new(encoding: &'static Encoding) -> Self { - Mutations { - content_before: Vec::default(), - replacement: Vec::default(), - content_after: Vec::default(), + #[must_use] + pub const fn new(encoding: &'static Encoding) -> Self { + Self { + content_before: DynamicString::new(), + replacement: DynamicString::new(), + content_after: DynamicString::new(), removed: false, encoding, } } #[inline] - pub fn before(&mut self, content: &str, content_type: ContentType) { - content_to_bytes(content, content_type, self.encoding, &mut |c| { - self.content_before.extend_from_slice(c); - }); + pub fn replace(&mut self, chunk: StringChunk) { + self.remove(); + self.replacement.clear(); + self.replacement.push_back(chunk); } #[inline] - pub fn after(&mut self, content: &str, content_type: ContentType) { - let mut pos = 0; - - content_to_bytes(content, content_type, self.encoding, &mut |c| { - self.content_after.splice(pos..pos, c.iter().cloned()); + pub fn remove(&mut self) { + self.removed = true; + } - pos += c.len(); - }); + #[inline] + pub const fn removed(&self) -> bool { + self.removed } +} +impl From<(&str, ContentType)> for StringChunk { #[inline] - pub fn replace(&mut self, content: &str, content_type: ContentType) 
{ - let mut replacement = Vec::default(); + fn from((content, content_type): (&str, ContentType)) -> Self { + Self::Buffer(Box::from(content), content_type) + } +} + +pub(crate) enum StringChunk { + Buffer(Box, ContentType), +} - content_to_bytes(content, content_type, self.encoding, &mut |c| { - replacement.extend_from_slice(c); - }); +#[derive(Default)] +pub(crate) struct DynamicString { + chunks: Vec, +} - self.replacement = replacement; - self.remove(); +impl DynamicString { + #[inline] + pub const fn new() -> Self { + Self { chunks: vec![] } } #[inline] - pub fn remove(&mut self) { - self.removed = true; + pub fn clear(&mut self) { + self.chunks.clear(); } #[inline] - pub fn removed(&self) -> bool { - self.removed + pub fn push_front(&mut self, chunk: StringChunk) { + self.chunks.insert(0, chunk); + } + + #[inline] + pub fn push_back(&mut self, chunk: StringChunk) { + self.chunks.push(chunk); + } + + pub fn into_bytes( + self, + encoding: &'static Encoding, + output_handler: &mut dyn FnMut(&[u8]), + ) -> BoxResult { + for chunk in self.chunks { + match chunk { + StringChunk::Buffer(content, content_type) => { + content_to_bytes(&content, content_type, encoding, output_handler); + } + }; + } + Ok(()) } } diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index baec2144..29073261 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -106,7 +106,9 @@ impl<'i> Comment<'i> { /// ``` #[inline] pub fn before(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.before(content, content_type); + self.mutations + .content_before + .push_back((content, content_type).into()); } /// Inserts `content` after the comment. 
@@ -138,7 +140,9 @@ impl<'i> Comment<'i> { /// ``` #[inline] pub fn after(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.after(content, content_type); + self.mutations + .content_after + .push_front((content, content_type).into()); } /// Replaces the comment with the `content`. @@ -170,7 +174,7 @@ impl<'i> Comment<'i> { /// ``` #[inline] pub fn replace(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.replace(content, content_type); + self.mutations.replace((content, content_type).into()); } /// Removes the comment. diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 1eb29498..8723c7a5 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -1,7 +1,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; -use crate::rewritable_units::ContentType; +use crate::html_content::ContentType; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -69,7 +69,9 @@ impl<'i> EndTag<'i> { /// Consequent calls to the method append `content` to the previously inserted content. #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { - self.mutations.before(content, content_type); + self.mutations + .content_before + .push_back((content, content_type).into()); } /// Inserts `content` after the end tag. @@ -77,7 +79,9 @@ impl<'i> EndTag<'i> { /// Consequent calls to the method prepend `content` to the previously inserted content. #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { - self.mutations.after(content, content_type); + self.mutations + .content_after + .push_front((content, content_type).into()); } /// Replaces the end tag with `content`. @@ -85,7 +89,7 @@ impl<'i> EndTag<'i> { /// Consequent calls to the method overwrite previous replacement content. 
#[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.mutations.replace(content, content_type); + self.mutations.replace((content, content_type).into()); } /// Removes the end tag. diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index bfc398f7..c0555a02 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -16,33 +16,29 @@ pub trait Serialize { macro_rules! impl_serialize { ($Token:ident) => { impl crate::rewritable_units::Serialize for $Token<'_> { - #[inline(always)] + #[inline] fn into_bytes( - self, + mut self, output_handler: &mut dyn FnMut(&[u8]), - ) -> Result<(), RewritingError> { - let Mutations { - content_before, - replacement, - content_after, - removed, - .. - } = &self.mutations; + ) -> Result<(), crate::errors::RewritingError> { + let content_before = ::std::mem::take(&mut self.mutations.content_before); + content_before + .into_bytes(self.mutations.encoding, output_handler) + .map_err(crate::errors::RewritingError::ContentHandlerError)?; - if !content_before.is_empty() { - output_handler(content_before); + if !self.mutations.removed { + self.serialize_self(encoder.output_handler())?; + } else { + self.mutations + .replacement + .into_bytes(self.mutations.encoding, output_handler) + .map_err(crate::errors::RewritingError::ContentHandlerError)?; } - if !removed { - self.serialize_self(output_handler)?; - } else if !replacement.is_empty() { - output_handler(replacement); - } - - if !content_after.is_empty() { - output_handler(content_after); - } - Ok(()) + self.mutations + .content_after + .into_bytes(self.mutations.encoding, output_handler) + .map_err(crate::errors::RewritingError::ContentHandlerError) } } }; diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 67ef5a01..429b6f1c 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -3,7 +3,7 @@ 
use super::{Mutations, Serialize, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::Namespace; -use crate::rewritable_units::ContentType; +use crate::html_content::ContentType; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -112,7 +112,9 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method append `content` to the previously inserted content. #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { - self.mutations.before(content, content_type); + self.mutations + .content_before + .push_back((content, content_type).into()); } /// Inserts `content` after the start tag. @@ -120,7 +122,9 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method prepend `content` to the previously inserted content. #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { - self.mutations.after(content, content_type); + self.mutations + .content_after + .push_front((content, content_type).into()); } /// Replaces the start tag with `content`. @@ -128,7 +132,7 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method overwrite previous replacement content. #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.mutations.replace(content, content_type); + self.mutations.replace((content, content_type).into()); } /// Removes the start tag. diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index d84d383d..6d0b60a7 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -2,6 +2,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::TextType; +use crate::html_content::ContentType; use encoding_rs::Encoding; use std::any::Any; use std::borrow::Cow; @@ -184,8 +185,10 @@ impl<'i> TextChunk<'i> { /// assert_eq!(html, r#"
Hello world
"#); /// ``` #[inline] - pub fn before(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.before(content, content_type); + pub fn before(&mut self, content: &str, content_type: ContentType) { + self.mutations + .content_before + .push_back((content, content_type).into()); } /// Inserts `content` after the text chunk. @@ -218,8 +221,10 @@ impl<'i> TextChunk<'i> { /// assert_eq!(html, r#"
FooQuxBar
"#); /// ``` #[inline] - pub fn after(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.after(content, content_type); + pub fn after(&mut self, content: &str, content_type: ContentType) { + self.mutations + .content_after + .push_front((content, content_type).into()); } /// Replaces the text chunk with the `content`. @@ -252,8 +257,8 @@ impl<'i> TextChunk<'i> { /// assert_eq!(html, r#"
Qux
"#); /// ``` #[inline] - pub fn replace(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.replace(content, content_type); + pub fn replace(&mut self, content: &str, content_type: ContentType) { + self.mutations.replace((content, content_type).into()); } /// Removes the text chunk. From 553d4b75c32bd8ae49a756ac814f6a694cf1185b Mon Sep 17 00:00:00 2001 From: Kornel Date: Thu, 31 Oct 2024 16:50:38 +0000 Subject: [PATCH 05/15] Avoid heap allocations in text encoding --- src/base/bytes.rs | 75 ----------- src/html/mod.rs | 63 +++++++++ src/lib.rs | 5 + src/rewritable_units/document_end.rs | 7 +- src/rewritable_units/mod.rs | 1 + src/rewritable_units/mutations.rs | 30 +---- src/rewritable_units/text_encoder.rs | 155 ++++++++++++++++++++++ src/rewritable_units/tokens/attributes.rs | 3 +- src/rewritable_units/tokens/mod.rs | 10 +- 9 files changed, 240 insertions(+), 109 deletions(-) create mode 100644 src/rewritable_units/text_encoder.rs diff --git a/src/base/bytes.rs b/src/base/bytes.rs index d5ea253f..b2591145 100644 --- a/src/base/bytes.rs +++ b/src/base/bytes.rs @@ -1,6 +1,5 @@ use super::Range; use encoding_rs::{Encoding, WINDOWS_1252}; -use memchr::{memchr, memchr3}; use std::borrow::Cow; use std::fmt::{self, Debug}; use std::ops::Deref; @@ -84,80 +83,6 @@ impl<'b> Bytes<'b> { } } -macro_rules! impl_replace_byte { - ($self:tt, $output_handler:ident, $impls:ident) => { - let mut tail: &[u8] = $self; - - loop { - match $impls!(@find tail) { - Some(pos) => { - let replacement = $impls!(@get_replacement tail, pos); - let chunk = &tail[..pos]; - - if !chunk.is_empty() { - $output_handler(chunk); - } - - $output_handler(&replacement); - tail = &tail[pos + 1..]; - } - None => { - if !tail.is_empty() { - $output_handler(&tail); - } - break; - } - } - } - }; -} - -impl<'b> Bytes<'b> { - #[inline] - pub fn replace_byte(&self, (needle, repl): (u8, &[u8]), output_handler: &mut dyn FnMut(&[u8])) { - macro_rules! 
impls { - (@find $tail:ident) => { - memchr(needle, $tail) - }; - - (@get_replacement $tail:ident, $pos:ident) => { - repl - }; - } - - impl_replace_byte!(self, output_handler, impls); - } - - #[inline] - pub fn replace_byte3( - &self, - (needle1, repl1): (u8, &[u8]), - (needle2, repl2): (u8, &[u8]), - (needle3, repl3): (u8, &[u8]), - output_handler: &mut dyn FnMut(&[u8]), - ) { - macro_rules! impls { - (@find $tail:ident) => { - memchr3(needle1, needle2, needle3, $tail) - }; - - (@get_replacement $tail:ident, $pos:ident) => {{ - let matched = $tail[$pos]; - - if matched == needle1 { - repl1 - } else if matched == needle2 { - repl2 - } else { - repl3 - } - }}; - } - - impl_replace_byte!(self, output_handler, impls); - } -} - impl<'b> From> for Bytes<'b> { #[inline] fn from(bytes: Cow<'b, [u8]>) -> Self { diff --git a/src/html/mod.rs b/src/html/mod.rs index 2e23a89b..73dfe336 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -1,3 +1,6 @@ +use crate::base::Bytes; +use memchr::{memchr, memchr3}; + #[macro_use] mod tag; @@ -9,3 +12,63 @@ pub use self::local_name::{LocalName, LocalNameHash}; pub use self::namespace::Namespace; pub use self::tag::Tag; pub use self::text_type::TextType; + +/// Convert text to HTML +#[inline] +pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) { + loop { + if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) { + let Some((chunk_before, (matched, rest))) = content + .split_at_checked(pos) + .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?))) + else { + return; + }; + content = rest; + let matched = matched.as_bytes()[0]; + + if !chunk_before.is_empty() { + (output_handler)(chunk_before); + } + (output_handler)(match matched { + b'<' => "<", + b'>' => ">", + _ => "&", + }); + } else { + if !content.is_empty() { + (output_handler)(content); + } + return; + } + } +} + +/// Replace `"` with `"` ONLY, leaving `&` unescaped +pub(crate) fn escape_double_quotes_only( + content: 
&Bytes<'_>, + output_handler: &mut dyn FnMut(&[u8]), +) { + let mut content = &**content; + loop { + if let Some(pos) = memchr(b'"', content) { + let Some((chunk_before, rest)) = content + .split_at_checked(pos) + .and_then(|(before, rest)| Some((before, rest.get(1..)?))) + else { + return; + }; + content = rest; + + if !chunk_before.is_empty() { + (output_handler)(chunk_before); + } + (output_handler)(b"""); + } else { + if !content.is_empty() { + (output_handler)(content); + } + return; + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 77dbf9cc..a780160b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,6 +156,7 @@ pub mod test_utils { impl Output { #[must_use] + #[inline] pub fn new(encoding: &'static Encoding) -> Self { Self { bytes: Vec::default(), @@ -164,6 +165,8 @@ pub mod test_utils { } } + #[inline] + #[track_caller] pub fn push(&mut self, chunk: &[u8]) { if chunk.is_empty() { self.finalizing_chunk_received = true; @@ -179,6 +182,8 @@ pub mod test_utils { } impl From for String { + #[inline] + #[track_caller] fn from(output: Output) -> Self { assert!( output.finalizing_chunk_received, diff --git a/src/rewritable_units/document_end.rs b/src/rewritable_units/document_end.rs index 70952739..5d87c98b 100644 --- a/src/rewritable_units/document_end.rs +++ b/src/rewritable_units/document_end.rs @@ -1,4 +1,4 @@ -use super::mutations::content_to_bytes; +use super::text_encoder::StreamingHandlerSink; use super::ContentType; use encoding_rs::Encoding; @@ -50,9 +50,10 @@ impl<'a> DocumentEnd<'a> { /// ``` #[inline] pub fn append(&mut self, content: &str, content_type: ContentType) { - content_to_bytes(content, content_type, self.encoding, &mut |c: &[u8]| { + StreamingHandlerSink::new(self.encoding, &mut |c| { self.output_sink.handle_chunk(c); - }); + }) + .write_str(content, content_type); } } diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index c7b282ee..e6496e1b 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ 
-84,6 +84,7 @@ mod mutations; mod document_end; mod element; +mod text_encoder; mod tokens; #[cfg(test)] diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index 82cdec43..6c639a34 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -1,4 +1,4 @@ -use crate::base::Bytes; +use super::text_encoder::StreamingHandlerSink; use encoding_rs::Encoding; use std::error::Error as StdError; @@ -16,26 +16,6 @@ pub enum ContentType { Text, } -#[inline] -pub(super) fn content_to_bytes( - content: &str, - content_type: ContentType, - encoding: &'static Encoding, - output_handler: &mut dyn FnMut(&[u8]), -) { - let bytes = Bytes::from_str(content, encoding); - - match content_type { - ContentType::Html => output_handler(&bytes), - ContentType::Text => bytes.replace_byte3( - (b'<', b"<"), - (b'>', b">"), - (b'&', b"&"), - &mut *output_handler, - ), - } -} - pub(crate) struct Mutations { pub content_before: DynamicString, pub replacement: DynamicString, @@ -112,15 +92,11 @@ impl DynamicString { self.chunks.push(chunk); } - pub fn into_bytes( - self, - encoding: &'static Encoding, - output_handler: &mut dyn FnMut(&[u8]), - ) -> BoxResult { + pub fn encode(self, sink: &mut StreamingHandlerSink<'_>) -> BoxResult { for chunk in self.chunks { match chunk { StringChunk::Buffer(content, content_type) => { - content_to_bytes(&content, content_type, encoding, output_handler); + sink.write_str(&content, content_type); } }; } diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs new file mode 100644 index 00000000..05477443 --- /dev/null +++ b/src/rewritable_units/text_encoder.rs @@ -0,0 +1,155 @@ +use super::ContentType; +use crate::html::escape_body_text; +use encoding_rs::{CoderResult, Encoder, Encoding, UTF_8}; + +/// Used to write chunks of text or markup in streaming mutation handlers. 
+/// +/// Argument to [`StreamingHandler::write_all()`](crate::html_content::StreamingHandler::write_all). +pub struct StreamingHandlerSink<'output_handler> { + non_utf8_encoder: Option, + + /// ```compile_fail + /// use lol_html::html_content::StreamingHandlerSink; + /// struct IsSend(T); + /// let x: IsSend>; + /// ``` + /// + /// ```compile_fail + /// use lol_html::html_content::StreamingHandlerSink; + /// struct IsSync(T); + /// let x: IsSync>; + /// ``` + output_handler: &'output_handler mut dyn FnMut(&[u8]), +} + +impl<'output_handler> StreamingHandlerSink<'output_handler> { + #[inline(always)] + pub(crate) fn new( + encoding: &'static Encoding, + output_handler: &'output_handler mut dyn FnMut(&[u8]), + ) -> Self { + Self { + non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), + output_handler, + } + } + + /// Writes the given UTF-8 string to the output, converting the encoding and [escaping](ContentType) if necessary. + /// + /// It may be called multiple times. The strings will be concatenated together. 
+ #[inline] + pub fn write_str(&mut self, content: &str, content_type: ContentType) { + match content_type { + ContentType::Html => self.write_html(content), + ContentType::Text => self.write_body_text(content), + } + } + + pub(crate) fn write_html(&mut self, html: &str) { + if let Some(encoder) = &mut self.non_utf8_encoder { + encoder.encode(html, self.output_handler); + } else if !html.is_empty() { + (self.output_handler)(html.as_bytes()); + } + } + + /// For text content, not attributes + pub(crate) fn write_body_text(&mut self, plaintext: &str) { + if let Some(encoder) = &mut self.non_utf8_encoder { + escape_body_text(plaintext, &mut |chunk| { + debug_assert!(!chunk.is_empty()); + encoder.encode(chunk, self.output_handler); + }); + } else { + escape_body_text(plaintext, &mut |chunk| { + debug_assert!(!chunk.is_empty()); + (self.output_handler)(chunk.as_bytes()); + }); + } + } + + #[inline] + pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) { + &mut self.output_handler + } +} + +enum Buffer { + Heap(Vec), + Stack([u8; 63]), // leave a byte for the tag +} + +struct TextEncoder { + encoder: Encoder, + buffer: Buffer, +} + +impl TextEncoder { + #[inline] + pub fn new(encoding: &'static Encoding) -> Self { + debug_assert!(encoding != UTF_8); + debug_assert!(encoding.is_ascii_compatible()); + Self { + encoder: encoding.new_encoder(), + buffer: Buffer::Stack([0; 63]), + } + } + + /// This is more efficient than `Bytes::from_str`, because it can output non-UTF-8/non-ASCII encodings + /// without heap allocations. 
+ /// It also avoids methods that have UB: https://github.com/hsivonen/encoding_rs/issues/79 + #[inline(never)] + fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) { + loop { + debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it + let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes()); + if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) { + if !ascii.is_empty() { + (output_handler)(ascii.as_bytes()); + } + if remainder.is_empty() { + return; + } + content = remainder; + } + + let buffer = match &mut self.buffer { + Buffer::Heap(buf) => buf.as_mut_slice(), + // Long non-ASCII content could take lots of roundtrips through the encoder + buf if content.len() >= 1 << 20 => { + *buf = Buffer::Heap(vec![0; 4096]); + match buf { + Buffer::Heap(buf) => buf.as_mut(), + _ => unreachable!(), + } + } + Buffer::Stack(buf) => buf.as_mut_slice(), + }; + + let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false); + if written > 0 && written <= buffer.len() { + (output_handler)(&buffer[..written]); + } + if read >= content.len() { + return; + } + content = &content[read..]; + match result { + CoderResult::InputEmpty => { + debug_assert!(content.is_empty()); + return; + } + CoderResult::OutputFull => { + match &mut self.buffer { + Buffer::Heap(buf) if buf.len() >= 1024 => { + if written == 0 { + panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes + } + } + buf => *buf = Buffer::Heap(vec![0; 1024]), + } + } + } + } + } +} diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index d36dea30..7218e690 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -1,5 +1,6 @@ use crate::base::Bytes; use crate::errors::RewritingError; +use crate::html::escape_double_quotes_only; use crate::parser::AttributeBuffer; use 
crate::rewritable_units::Serialize; use encoding_rs::Encoding; @@ -133,7 +134,7 @@ impl Serialize for &Attribute<'_> { } else { output_handler(&self.name); output_handler(b"=\""); - self.value.replace_byte((b'"', b"""), output_handler); + escape_double_quotes_only(&self.value, output_handler); output_handler(b"\""); } Ok(()) diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index c0555a02..53d677cf 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -21,9 +21,13 @@ macro_rules! impl_serialize { mut self, output_handler: &mut dyn FnMut(&[u8]), ) -> Result<(), crate::errors::RewritingError> { + let mut encoder = crate::rewritable_units::text_encoder::StreamingHandlerSink::new( + self.mutations.encoding, + output_handler, + ); let content_before = ::std::mem::take(&mut self.mutations.content_before); content_before - .into_bytes(self.mutations.encoding, output_handler) + .encode(&mut encoder) .map_err(crate::errors::RewritingError::ContentHandlerError)?; if !self.mutations.removed { @@ -31,13 +35,13 @@ macro_rules! 
impl_serialize { } else { self.mutations .replacement - .into_bytes(self.mutations.encoding, output_handler) + .encode(&mut encoder) .map_err(crate::errors::RewritingError::ContentHandlerError)?; } self.mutations .content_after - .into_bytes(self.mutations.encoding, output_handler) + .encode(&mut encoder) .map_err(crate::errors::RewritingError::ContentHandlerError) } } From 693e0958529a33902b9354d0af9ea4ec0de82376 Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 5 Nov 2024 15:58:42 +0000 Subject: [PATCH 06/15] Make mutations struct smaller --- src/rewritable_units/element.rs | 33 +++++++----- src/rewritable_units/mutations.rs | 64 ++++++++++++++++------- src/rewritable_units/tokens/comment.rs | 10 ++-- src/rewritable_units/tokens/end_tag.rs | 10 ++-- src/rewritable_units/tokens/mod.rs | 39 ++++++++------ src/rewritable_units/tokens/start_tag.rs | 10 ++-- src/rewritable_units/tokens/text_chunk.rs | 10 ++-- src/rewriter/handlers_dispatcher.rs | 2 +- 8 files changed, 118 insertions(+), 60 deletions(-) diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index 248c4666..d02a80eb 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -1,3 +1,4 @@ +use super::mutations::MutationsInner; use super::{Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag, StringChunk}; use crate::base::Bytes; use crate::rewriter::{HandlerTypes, LocalHandlerTypes}; @@ -88,19 +89,18 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { #[inline] fn remove_content(&mut self) { - self.start_tag.mutations.content_after.clear(); - if let Some(end) = &mut self.end_tag_mutations { + self.start_tag.mutations.mutate().content_after.clear(); + if let Some(end) = self.end_tag_mutations.as_mut().and_then(|m| m.if_mutated()) { end.content_before.clear(); } self.should_remove_content = true; } #[inline] - fn end_tag_mutations_mut(&mut self) -> &mut Mutations { - let encoding = self.encoding; - + fn end_tag_mutations_mut(&mut self) 
-> &mut MutationsInner { self.end_tag_mutations - .get_or_insert_with(|| Mutations::new(encoding)) + .get_or_insert_with(Mutations::new) + .mutate() } /// Returns the tag name of the element. @@ -236,6 +236,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { pub fn before(&mut self, content: &str, content_type: ContentType) { self.start_tag .mutations + .mutate() .content_before .push_back((content, content_type).into()); } @@ -277,7 +278,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { if self.can_have_content { &mut self.end_tag_mutations_mut().content_after } else { - &mut self.start_tag.mutations.content_after + &mut self.start_tag.mutations.mutate().content_after } .push_front(chunk); } @@ -324,7 +325,11 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { fn prepend_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { - self.start_tag.mutations.content_after.push_front(chunk); + self.start_tag + .mutations + .mutate() + .content_after + .push_front(chunk); } } @@ -416,7 +421,11 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { fn set_inner_content_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { self.remove_content(); - self.start_tag.mutations.content_after.push_front(chunk); + self.start_tag + .mutations + .mutate() + .content_after + .push_front(chunk); } } @@ -453,7 +462,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } fn replace_chunk(&mut self, chunk: StringChunk) { - self.start_tag.mutations.replace(chunk); + self.start_tag.mutations.mutate().replace(chunk); if self.can_have_content { self.remove_content(); @@ -464,7 +473,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// Removes the element and its inner content. 
#[inline] pub fn remove(&mut self) { - self.start_tag.mutations.remove(); + self.start_tag.mutations.mutate().remove(); if self.can_have_content { self.remove_content(); @@ -497,7 +506,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn remove_and_keep_content(&mut self) { - self.start_tag.mutations.remove(); + self.start_tag.remove(); if self.can_have_content { self.end_tag_mutations_mut().remove(); diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index 6c639a34..a5b57f82 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -1,5 +1,4 @@ use super::text_encoder::StreamingHandlerSink; -use encoding_rs::Encoding; use std::error::Error as StdError; type BoxResult = Result<(), Box>; @@ -16,27 +15,14 @@ pub enum ContentType { Text, } -pub(crate) struct Mutations { +pub(crate) struct MutationsInner { pub content_before: DynamicString, pub replacement: DynamicString, pub content_after: DynamicString, pub removed: bool, - pub encoding: &'static Encoding, } -impl Mutations { - #[inline] - #[must_use] - pub const fn new(encoding: &'static Encoding) -> Self { - Self { - content_before: DynamicString::new(), - replacement: DynamicString::new(), - content_after: DynamicString::new(), - removed: false, - encoding, - } - } - +impl MutationsInner { #[inline] pub fn replace(&mut self, chunk: StringChunk) { self.remove(); @@ -48,10 +34,52 @@ impl Mutations { pub fn remove(&mut self) { self.removed = true; } +} + +pub(crate) struct Mutations { + inner: Option>, +} + +impl Mutations { + #[inline] + #[must_use] + pub const fn new() -> Self { + Self { inner: None } + } + + #[inline] + pub fn take(&mut self) -> Option> { + self.inner.take() + } + + #[inline] + pub fn if_mutated(&mut self) -> Option<&mut MutationsInner> { + self.inner.as_deref_mut() + } + + #[inline] + pub fn mutate(&mut self) -> &mut MutationsInner { + #[inline(never)] + fn alloc_content(inner: &mut Option>) -> &mut 
MutationsInner { + inner.get_or_insert_with(move || { + Box::new(MutationsInner { + content_before: DynamicString::new(), + replacement: DynamicString::new(), + content_after: DynamicString::new(), + removed: false, + }) + }) + } + + match &mut self.inner { + Some(inner) => inner, + uninit => alloc_content(uninit), + } + } #[inline] - pub const fn removed(&self) -> bool { - self.removed + pub fn removed(&self) -> bool { + self.inner.as_ref().is_some_and(|inner| inner.removed) } } diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index 29073261..5202365a 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -43,7 +43,7 @@ impl<'i> Comment<'i> { text, raw: Some(raw), encoding, - mutations: Mutations::new(encoding), + mutations: Mutations::new(), user_data: Box::new(()), }) } @@ -107,6 +107,7 @@ impl<'i> Comment<'i> { #[inline] pub fn before(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { self.mutations + .mutate() .content_before .push_back((content, content_type).into()); } @@ -141,6 +142,7 @@ impl<'i> Comment<'i> { #[inline] pub fn after(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { self.mutations + .mutate() .content_after .push_front((content, content_type).into()); } @@ -174,13 +176,15 @@ impl<'i> Comment<'i> { /// ``` #[inline] pub fn replace(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { - self.mutations.replace((content, content_type).into()); + self.mutations + .mutate() + .replace((content, content_type).into()); } /// Removes the comment. #[inline] pub fn remove(&mut self) { - self.mutations.remove(); + self.mutations.mutate().remove(); } /// Returns `true` if the comment has been replaced or removed. 
diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 8723c7a5..3cff346b 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -27,7 +27,7 @@ impl<'i> EndTag<'i> { name, raw: Some(raw), encoding, - mutations: Mutations::new(encoding), + mutations: Mutations::new(), }) } @@ -70,6 +70,7 @@ impl<'i> EndTag<'i> { #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_before .push_back((content, content_type).into()); } @@ -80,6 +81,7 @@ impl<'i> EndTag<'i> { #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_after .push_front((content, content_type).into()); } @@ -89,13 +91,15 @@ impl<'i> EndTag<'i> { /// Consequent calls to the method overwrite previous replacement content. #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.mutations.replace((content, content_type).into()); + self.mutations + .mutate() + .replace((content, content_type).into()); } /// Removes the end tag. #[inline] pub fn remove(&mut self) { - self.mutations.remove(); + self.mutations.mutate().remove(); } #[inline] diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index 53d677cf..37f5372c 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -22,27 +22,32 @@ macro_rules! 
impl_serialize { output_handler: &mut dyn FnMut(&[u8]), ) -> Result<(), crate::errors::RewritingError> { let mut encoder = crate::rewritable_units::text_encoder::StreamingHandlerSink::new( - self.mutations.encoding, + self.encoding, output_handler, ); - let content_before = ::std::mem::take(&mut self.mutations.content_before); - content_before - .encode(&mut encoder) - .map_err(crate::errors::RewritingError::ContentHandlerError)?; + match self.mutations.take() { + None => self.serialize_self(encoder.output_handler()), + Some(mutations) => { + mutations + .content_before + .encode(&mut encoder) + .map_err(crate::errors::RewritingError::ContentHandlerError)?; - if !self.mutations.removed { - self.serialize_self(encoder.output_handler())?; - } else { - self.mutations - .replacement - .encode(&mut encoder) - .map_err(crate::errors::RewritingError::ContentHandlerError)?; - } + if !mutations.removed { + self.serialize_self(encoder.output_handler())?; + } else { + mutations + .replacement + .encode(&mut encoder) + .map_err(crate::errors::RewritingError::ContentHandlerError)?; + } - self.mutations - .content_after - .encode(&mut encoder) - .map_err(crate::errors::RewritingError::ContentHandlerError) + mutations + .content_after + .encode(&mut encoder) + .map_err(crate::errors::RewritingError::ContentHandlerError) + } + } } } }; diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 429b6f1c..e7849a55 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -38,7 +38,7 @@ impl<'i> StartTag<'i> { self_closing, raw: Some(raw), encoding, - mutations: Mutations::new(encoding), + mutations: Mutations::new(), }) } @@ -113,6 +113,7 @@ impl<'i> StartTag<'i> { #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_before .push_back((content, content_type).into()); } @@ -123,6 +124,7 @@ impl<'i> StartTag<'i> { #[inline] pub fn 
after(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_after .push_front((content, content_type).into()); } @@ -132,13 +134,15 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method overwrite previous replacement content. #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.mutations.replace((content, content_type).into()); + self.mutations + .mutate() + .replace((content, content_type).into()); } /// Removes the start tag. #[inline] pub fn remove(&mut self) { - self.mutations.remove(); + self.mutations.mutate().remove(); } fn serialize_self(&self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 6d0b60a7..7a6589d0 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -82,7 +82,7 @@ impl<'i> TextChunk<'i> { text_type, last_in_text_node, encoding, - mutations: Mutations::new(encoding), + mutations: Mutations::new(), user_data: Box::new(()), }) } @@ -187,6 +187,7 @@ impl<'i> TextChunk<'i> { #[inline] pub fn before(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_before .push_back((content, content_type).into()); } @@ -223,6 +224,7 @@ impl<'i> TextChunk<'i> { #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { self.mutations + .mutate() .content_after .push_front((content, content_type).into()); } @@ -258,13 +260,15 @@ impl<'i> TextChunk<'i> { /// ``` #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.mutations.replace((content, content_type).into()); + self.mutations + .mutate() + .replace((content, content_type).into()); } /// Removes the text chunk. 
#[inline] pub fn remove(&mut self) { - self.mutations.remove(); + self.mutations.mutate().remove(); } /// Returns `true` if the text chunk has been replaced or removed. diff --git a/src/rewriter/handlers_dispatcher.rs b/src/rewriter/handlers_dispatcher.rs index 9eb49869..760ddb62 100644 --- a/src/rewriter/handlers_dispatcher.rs +++ b/src/rewriter/handlers_dispatcher.rs @@ -232,7 +232,7 @@ impl<'h, H: HandlerTypes> ContentHandlersDispatcher<'h, H> { current_element_data: Option<&mut ElementDescriptor>, ) -> HandlerResult { if self.matched_elements_with_removed_content > 0 { - start_tag.mutations.remove(); + start_tag.remove(); } let mut element = Element::new(start_tag, self.next_element_can_have_content); From ce6ff7140c9002427b4fa83c1a586ab4246fbd9d Mon Sep 17 00:00:00 2001 From: Kornel Date: Thu, 31 Oct 2024 16:50:38 +0000 Subject: [PATCH 07/15] Streaming content mutations --- fuzz/test_case/src/lib.rs | 13 +-- src/lib.rs | 2 +- src/rewritable_units/document_end.rs | 3 +- src/rewritable_units/element.rs | 102 +++++++++++++++++++++- src/rewritable_units/mod.rs | 3 +- src/rewritable_units/mutations.rs | 53 ++++++++++- src/rewritable_units/text_encoder.rs | 28 ++++++ src/rewritable_units/tokens/comment.rs | 49 ++++++++++- src/rewritable_units/tokens/end_tag.rs | 38 +++++++- src/rewritable_units/tokens/start_tag.rs | 35 +++++++- src/rewritable_units/tokens/text_chunk.rs | 35 +++++++- src/rewriter/settings.rs | 45 ++++++++++ 12 files changed, 387 insertions(+), 19 deletions(-) diff --git a/fuzz/test_case/src/lib.rs b/fuzz/test_case/src/lib.rs index 70f01632..98a2d96f 100644 --- a/fuzz/test_case/src/lib.rs +++ b/fuzz/test_case/src/lib.rs @@ -11,7 +11,8 @@ use std::ffi::{CStr, CString}; use encoding_rs::*; use lol_html::html_content::ContentType; -use lol_html::{comments, doc_comments, doc_text, element, text, HtmlRewriter, MemorySettings, Settings}; +use lol_html::{comments, doc_comments, doc_text, element, streaming, text}; +use lol_html::{HtmlRewriter, 
MemorySettings, Settings}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); @@ -111,10 +112,12 @@ fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) { &format!(""), ContentType::Html, ); - el.set_inner_content( - &format!(""), - ContentType::Html, - ); + + let replaced = format!(""); + el.streaming_set_inner_content(streaming!(move |sink| { + sink.write_str(&replaced, ContentType::Html); + Ok(()) + })); Ok(()) }), diff --git a/src/lib.rs b/src/lib.rs index a780160b..b7be6475 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -96,7 +96,7 @@ pub mod errors { pub mod html_content { pub use super::rewritable_units::{ Attribute, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag, - TextChunk, UserData, + StreamingHandler, StreamingHandlerSink, TextChunk, UserData, }; pub use super::html::TextType; diff --git a/src/rewritable_units/document_end.rs b/src/rewritable_units/document_end.rs index 5d87c98b..4458157c 100644 --- a/src/rewritable_units/document_end.rs +++ b/src/rewritable_units/document_end.rs @@ -1,5 +1,4 @@ -use super::text_encoder::StreamingHandlerSink; -use super::ContentType; +use super::{ContentType, StreamingHandlerSink}; use encoding_rs::Encoding; use crate::transform_stream::OutputSink; diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index d02a80eb..d0c6e452 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -1,5 +1,8 @@ use super::mutations::MutationsInner; -use super::{Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag, StringChunk}; +use super::{ + Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag, StreamingHandler, + StringChunk, +}; use crate::base::Bytes; use crate::rewriter::{HandlerTypes, LocalHandlerTypes}; use encoding_rs::Encoding; @@ -241,6 +244,19 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { .push_back((content, content_type).into()); } + /// Inserts content from a 
[`StreamingHandler`] before the element. + /// + /// Consequent calls to the method append to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_before(&mut self, string_writer: Box) { + self.start_tag + .mutations + .mutate() + .content_before + .push_back(string_writer.into()); + } + /// Inserts `content` after the element. /// /// Consequent calls to the method prepend `content` to the previously inserted content. @@ -283,6 +299,16 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { .push_front(chunk); } + /// Inserts content from a [`StreamingHandler`] after the element. + /// + /// Consequent calls to the method prepend to the previously inserted content. + /// + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_after(&mut self, string_writer: Box) { + self.after_chunk(string_writer.into()); + } + /// Prepends `content` to the element's inner content, i.e. inserts content right after /// the element's start tag. /// @@ -333,6 +359,20 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } } + /// Prepends content from a [`StreamingHandler`] to the element's inner content, + /// i.e. inserts content right after the element's start tag. + /// + /// Consequent calls to the method prepend to the previously inserted content. + /// A call to the method doesn't make any effect if the element is an [empty element]. + /// + /// [empty element]: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element + /// + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_prepend(&mut self, string_writer: Box) { + self.prepend_chunk(string_writer.into()); + } + /// Appends `content` to the element's inner content, i.e. inserts content right before /// the element's end tag. 
/// @@ -379,6 +419,19 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } } + /// Appends content from a [`StreamingHandler`] to the element's inner content, + /// i.e. inserts content right before the element's end tag. + /// + /// Consequent calls to the method append to the previously inserted content. + /// A call to the method doesn't make any effect if the element is an [empty element]. + /// + /// [empty element]: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_append(&mut self, string_writer: Box) { + self.append_chunk(string_writer.into()); + } + /// Replaces inner content of the element with `content`. /// /// Consequent calls to the method overwrite previously inserted content. @@ -429,6 +482,19 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } } + /// Replaces inner content of the element with content from a [`StreamingHandler`]. + /// + /// Consequent calls to the method overwrite previously inserted content. + /// A call to the method doesn't make any effect if the element is an [empty element]. + /// + /// [empty element]: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element + /// + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_set_inner_content(&mut self, string_writer: Box) { + self.set_inner_content_chunk(string_writer.into()); + } + /// Replaces the element and its inner content with `content`. /// /// Consequent calls to the method overwrite previously inserted content. @@ -470,6 +536,16 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } } + /// Replaces the element and its inner content with content from a [`StreamingHandler`]. + /// + /// Consequent calls to the method overwrite previously inserted content. + /// + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
+ pub fn streaming_replace(&mut self, string_writer: Box) { + self.replace_chunk(string_writer.into()); + } + /// Removes the element and its inner content. #[inline] pub fn remove(&mut self) { @@ -638,6 +714,7 @@ mod tests { use crate::rewritable_units::test_utils::*; use crate::*; use encoding_rs::{Encoding, EUC_JP, UTF_8}; + use rewritable_units::StreamingHandlerSink; fn rewrite_element( html: &[u8], @@ -660,7 +737,11 @@ mod tests { el.before("[before: should be removed]", ContentType::Text); el.after("[after: should be removed]", ContentType::Text); el.append("[append: should be removed]", ContentType::Text); - el.before("[before: should be removed]", ContentType::Text); + el.streaming_before(Box::new(|sink: &mut StreamingHandlerSink<'_>| { + sink.write_str("[before:", ContentType::Text); + sink.write_str(" should be removed]", ContentType::Text); + Ok(()) + })); Ok(()) }), ], @@ -1133,7 +1214,10 @@ mod tests { encoded("
HiRemoveŴ
") { let output = rewrite_element(&html, enc, "span", |el| { - el.prepend("", ContentType::Html); + el.streaming_prepend(streaming!(|s| { + s.write_str("", ContentType::Html); + Ok(()) + })); el.append("", ContentType::Html); el.set_inner_content("", ContentType::Html); el.set_inner_content("", ContentType::Text); @@ -1267,7 +1351,17 @@ mod tests { #[test] fn self_closing_element() { let output = rewrite_element(b"Hi", UTF_8, "foo", |el| { - el.after("", ContentType::Html); + el.after("->", ContentType::Html); + el.streaming_after(streaming!(|sink| { + sink.write_str("er-", ContentType::Html); + Ok(()) + })); + el.after("t", ContentType::Html); + el.streaming_after(streaming!(|sink| { + sink.write_str("af", ContentType::Html); + Ok(()) + })); + el.after("", ContentType::Html); - c.replace("", ContentType::Text); + c.streaming_replace(streaming!(|h| { + h.write_str("", ContentType::Text); + Ok(()) + })); assert!(c.removed()); }, diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 3cff346b..fbd7ee32 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -1,7 +1,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; -use crate::html_content::ContentType; +use crate::html_content::{ContentType, StreamingHandler}; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -96,6 +96,42 @@ impl<'i> EndTag<'i> { .replace((content, content_type).into()); } + /// Inserts content from a [`StreamingHandler`] before the end tag. + /// + /// Consequent calls to the method append to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + #[inline] + pub fn streaming_before(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_before + .push_back(string_writer.into()); + } + + /// Inserts content from a [`StreamingHandler`] after the end tag. 
+ /// + /// Consequent calls to the method prepend to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + #[inline] + pub fn streaming_after(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_after + .push_front(string_writer.into()); + } + + /// Replaces the end tag with content from a [`StreamingHandler`]. + /// + /// Consequent calls to the method overwrite previous replacement content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + #[inline] + pub fn streaming_replace(&mut self, string_writer: Box) { + self.mutations.mutate().replace(string_writer.into()); + } + /// Removes the end tag. #[inline] pub fn remove(&mut self) { diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index e7849a55..5b637d71 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -3,7 +3,7 @@ use super::{Mutations, Serialize, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::Namespace; -use crate::html_content::ContentType; +use crate::html_content::{ContentType, StreamingHandler}; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -139,6 +139,39 @@ impl<'i> StartTag<'i> { .replace((content, content_type).into()); } + /// Inserts content from a [`StreamingHandler`] before the start tag. + /// + /// Consequent calls to the method append to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_before(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_before + .push_back(string_writer.into()); + } + + /// Inserts content from a [`StreamingHandler`] after the start tag. + /// + /// Consequent calls to the method prepend to the previously inserted content. 
+ /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_after(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_after + .push_front(string_writer.into()); + } + + /// Replaces the start tag with the content from a [`StreamingHandler`]. + /// + /// Consequent calls to the method overwrite previous replacement content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_replace(&mut self, string_writer: Box) { + self.mutations.mutate().replace(string_writer.into()); + } + /// Removes the start tag. #[inline] pub fn remove(&mut self) { diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 7a6589d0..7af680a1 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -2,7 +2,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::TextType; -use crate::html_content::ContentType; +use crate::html_content::{ContentType, StreamingHandler}; use encoding_rs::Encoding; use std::any::Any; use std::borrow::Cow; @@ -265,6 +265,39 @@ impl<'i> TextChunk<'i> { .replace((content, content_type).into()); } + /// Inserts content from a [`StreamingHandler`] before the text chunk. + /// + /// Consequent calls to the method append to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_before(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_before + .push_back(string_writer.into()); + } + + /// Inserts content from a [`StreamingHandler`] after the text chunk. + /// + /// Consequent calls to the method prepend to the previously inserted content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure.
+ pub fn streaming_after(&mut self, string_writer: Box) { + self.mutations + .mutate() + .content_after + .push_front(string_writer.into()); + } + + /// Replaces the text chunk with the content from a [`StreamingHandler`]. + /// + /// Consequent calls to the method overwrite previous replacement content. + /// + /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. + pub fn streaming_replace(&mut self, string_writer: Box) { + self.mutations.mutate().replace(string_writer.into()); + } + /// Removes the text chunk. #[inline] pub fn remove(&mut self) { diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index ea5451ca..92d1d5d2 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -503,6 +503,51 @@ macro_rules! comments { }}; } +/// A convenience macro to construct a `StreamingHandler` from a closure. +/// +/// For use with [`Element::streaming_replace`], etc. +/// +/// ```rust +/// use lol_html::{element, streaming, RewriteStrSettings}; +/// use lol_html::html_content::ContentType; +/// +/// RewriteStrSettings { +/// element_content_handlers: vec![ +/// element!("div", |element| { +/// element.streaming_replace(streaming!(|sink| { +/// sink.write_str("…", ContentType::Html); +/// sink.write_str("…", ContentType::Html); +/// Ok(()) +/// })); +/// Ok(()) +/// }) +/// ], +/// ..RewriteStrSettings::default() +/// }; +/// ``` + +#[macro_export(local_inner_macros)] +macro_rules! streaming { + ($closure:expr) => {{ + use ::std::error::Error; + use $crate::html_content::StreamingHandlerSink; + // Without this rust won't be able to always infer the type of the handler. 
+ #[inline(always)] + const fn streaming_macro_type_hint( + handler_closure: StreamingHandler, + ) -> StreamingHandler + where + StreamingHandler: + FnOnce(&mut StreamingHandlerSink<'_>) -> Result<(), Box> + 'static + Send, + { + handler_closure + } + + Box::new(streaming_macro_type_hint($closure)) + as Box + }}; +} + #[doc(hidden)] #[macro_export] macro_rules! __document_content_handler { From 6dae0c68fa3651a54a3aec8ba6672e5219153ed5 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 1 Nov 2024 13:55:17 +0000 Subject: [PATCH 08/15] C API for streaming content mutations --- c-api/c-tests/src/test.c | 6 +- c-api/c-tests/src/test_element_api.c | 109 +++++++++++++ c-api/cbindgen.toml | 16 ++ c-api/include/lol_html.h | 228 +++++++++++++++++++++++++++ c-api/src/comment.rs | 49 ++---- c-api/src/doctype.rs | 13 +- c-api/src/document_end.rs | 12 +- c-api/src/element.rs | 136 ++++------------ c-api/src/errors.rs | 9 ++ c-api/src/lib.rs | 128 +++++++++++++-- c-api/src/streaming.rs | 85 ++++++++++ c-api/src/text_chunk.rs | 55 ++----- 12 files changed, 626 insertions(+), 220 deletions(-) create mode 100644 c-api/cbindgen.toml create mode 100644 c-api/src/streaming.rs diff --git a/c-api/c-tests/src/test.c b/c-api/c-tests/src/test.c index 70ce42dc..7ea22ef4 100644 --- a/c-api/c-tests/src/test.c +++ b/c-api/c-tests/src/test.c @@ -16,5 +16,9 @@ int run_tests() { subtest("Element API", element_api_test); subtest("Document end API", document_end_api_test); subtest("Memory limiting", test_memory_limiting); - return done_testing(); + int res = done_testing(); + if (res) { + fprintf(stderr, "\nSome tests have failed\n"); + } + return res; } diff --git a/c-api/c-tests/src/test_element_api.c b/c-api/c-tests/src/test_element_api.c index 70236b18..a8171123 100644 --- a/c-api/c-tests/src/test_element_api.c +++ b/c-api/c-tests/src/test_element_api.c @@ -238,6 +238,114 @@ static void test_insert_content_around_element(lol_html_selector_t *selector, vo ); } 
+//------------------------------------------------------------------------- +EXPECT_OUTPUT( + streaming_mutations_output_sink, + "&before
Hi
&after\xf0\x9f\x98\x82", + &EXPECTED_USER_DATA, + sizeof(EXPECTED_USER_DATA) +); + +static void loltest_drop(void *user_data) { + int *drops = user_data; + (*drops)++; +} + +static int loltest_write_all_callback_before(lol_html_streaming_sink_t *sink, void *user_data) { + int *counter = user_data; + ok(*counter >= 100 && *counter <= 103); + + const char *before = "&before"; + return lol_html_streaming_sink_write_str(sink, before, strlen(before), false); +} + +static int loltest_write_all_callback_after(lol_html_streaming_sink_t *sink, void *user_data) { + int *counter = user_data; + ok(*counter >= 100 && *counter <= 103); + + const char *after = "&after"; + const char emoji[] = {0xf0,0x9f,0x98,0x82}; + return lol_html_streaming_sink_write_str(sink, after, strlen(after), false) || + lol_html_streaming_sink_write_str(sink, emoji, 4, false); +} + +static int loltest_write_all_callback_prepend(lol_html_streaming_sink_t *sink, void *user_data) { + int *counter = user_data; + ok(*counter >= 100 && *counter <= 103); + + const char *prepend1 = ""; + return lol_html_streaming_sink_write_str(sink, prepend1, strlen(prepend1), true) || + lol_html_streaming_sink_write_str(sink, prepend2, strlen(prepend2), true); +} + +static int loltest_write_all_callback_append(lol_html_streaming_sink_t *sink, void *user_data) { + int *counter = user_data; + ok(*counter >= 100 && *counter <= 103); + + const char *append = ""; + return lol_html_streaming_sink_write_str(sink, append, strlen(append), true); +} + +static lol_html_rewriter_directive_t streaming_mutations_around_element( + lol_html_element_t *element, + void *user_data +) { + note("Stream before/prepend"); + ok(!lol_html_element_streaming_before(element, &(lol_html_streaming_handler_t){ + .write_all_callback = loltest_write_all_callback_before, + .user_data = user_data, + .drop_callback = loltest_drop, + })); + ok(!lol_html_element_streaming_prepend(element, &(lol_html_streaming_handler_t){ + .write_all_callback = 
loltest_write_all_callback_prepend, + .user_data = user_data, + // tests null drop callback + })); + note("Stream after/append"); + ok(!lol_html_element_streaming_append(element, &(lol_html_streaming_handler_t){ + .write_all_callback = loltest_write_all_callback_append, + .user_data = user_data, + .drop_callback = loltest_drop, + })); + ok(!lol_html_element_streaming_after(element, &(lol_html_streaming_handler_t){ + .write_all_callback = loltest_write_all_callback_after, + .user_data = user_data, + .drop_callback = loltest_drop, + })); + + return LOL_HTML_CONTINUE; +} + +static void test_streaming_mutations_around_element(lol_html_selector_t *selector, void *user_data) { + UNUSED(user_data); + lol_html_rewriter_builder_t *builder = lol_html_rewriter_builder_new(); + + int drop_count = 100; + + int err = lol_html_rewriter_builder_add_element_content_handlers( + builder, + selector, + &streaming_mutations_around_element, + &drop_count, + NULL, + NULL, + NULL, + NULL + ); + + ok(!err); + + run_rewriter( + builder, + "
Hi
", + streaming_mutations_output_sink, + user_data + ); + + ok(drop_count == 103); // one has no drop callback on purpose +} + //------------------------------------------------------------------------- EXPECT_OUTPUT( set_element_inner_content_output_sink, @@ -706,6 +814,7 @@ void element_api_test() { test_iterate_attributes(selector, &user_data); test_get_and_modify_attributes(selector, &user_data); test_insert_content_around_element(selector, &user_data); + test_streaming_mutations_around_element(selector, &user_data); lol_html_selector_free(selector); } diff --git a/c-api/cbindgen.toml b/c-api/cbindgen.toml new file mode 100644 index 00000000..bfa2aae8 --- /dev/null +++ b/c-api/cbindgen.toml @@ -0,0 +1,16 @@ +# To generate a header: +# +# cargo expand > tmp.rs +# cbindgen tmp.rs + +language = "C" +tab_width = 4 +documentation = true +documentation_style = "c99" +documentation_length = "full" + +[export] +prefix = "lol_html_" + +[export.mangle] +rename_types = "SnakeCase" diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index 54e5e5c1..69e9fcf5 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -30,6 +30,7 @@ typedef struct lol_html_Element lol_html_element_t; typedef struct lol_html_AttributesIterator lol_html_attributes_iterator_t; typedef struct lol_html_Attribute lol_html_attribute_t; typedef struct lol_html_Selector lol_html_selector_t; +typedef struct lol_html_CStreamingHandlerSink lol_html_streaming_sink_t; // Library-allocated UTF8 string fat pointer. // @@ -116,6 +117,30 @@ typedef lol_html_rewriter_directive_t (*lol_html_end_tag_handler_t)( void *user_data ); +// For use with streaming content handlers. +// +// Safety: the user data and the callbacks must be safe to use from a different thread (e.g. can't rely on thread-local storage). +// It doesn't have to be `Sync`, it will be used only by one thread at a time. +// +// Handler functions copy this struct. It can (and should) be created on the stack. 
+typedef struct lol_html_CStreamingHandler { + // Anything you like + void *user_data; + // Called when the handler is supposed to produce its output. Return `0` for success. + // The `sink` argument is guaranteed non-`NULL`. It is valid only for the duration of this call, and can only be used on the same thread. + // The sink is for [`lol_html_streaming_sink_write_str`]. + // `user_data` comes from this struct. + // + // `write_all_callback` must not be `NULL`. + int (*write_all_callback)(lol_html_streaming_sink_t *sink, void *user_data); + // Called exactly once, after the last use of this handler. + // It may be `NULL`. + // `user_data` comes from this struct. + void (*drop_callback)(void *user_data); + // *Always* initialize to `NULL`. + void *reserved; +} lol_html_streaming_handler_t; + // Selector //--------------------------------------------------------------------- @@ -792,6 +817,209 @@ int lol_html_doc_end_append( bool is_html ); + + +//[`Element::streaming_prepend`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`element` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_prepend(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`Element::streaming_append`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). 
+// +//`element` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_append(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`Element::streaming_before`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`element` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_before(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`Element::streaming_after`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`element` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_after(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`Element::streaming_set_inner_content`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`element` +// must be valid and non-`NULL`. 
If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_set_inner_content(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`Element::streaming_replace`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`element` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_replace(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +//[`EndTag::streaming_before`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`end_tag` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_end_tag_streaming_before(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + +//[`EndTag::streaming_after`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`end_tag` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. 
+int lol_html_end_tag_streaming_after(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + +//[`EndTag::streaming_replace`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`end_tag` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_end_tag_streaming_replace(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + + +//[`TextChunk::streaming_before`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`text_chunk` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_text_chunk_streaming_before(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +//[`TextChunk::streaming_after`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`text_chunk` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. 
+int lol_html_text_chunk_streaming_after(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +//[`TextChunk::streaming_replace`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +//`text_chunk` +// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_text_chunk_streaming_replace(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +// Write another piece of UTF-8 data to the output. Returns `0` on success, and `-1` if it wasn't valid UTF-8. +// All pointers must be non-NULL. +int lol_html_streaming_sink_write_str(lol_html_streaming_sink_t *sink, + const char *string_utf8, + size_t string_utf8_len, + bool is_html); + + #if defined(__cplusplus) } // extern C #endif diff --git a/c-api/src/comment.rs b/c-api/src/comment.rs index 81c4e056..25707318 100644 --- a/c-api/src/comment.rs +++ b/c-api/src/comment.rs @@ -19,45 +19,16 @@ pub extern "C" fn lol_html_comment_text_set( 0 } -#[no_mangle] -pub extern "C" fn lol_html_comment_before( - comment: *mut Comment, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { comment.before(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_comment_after( - comment: *mut Comment, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! 
{ comment.after(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_comment_replace( - comment: *mut Comment, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { comment.replace(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_comment_remove(comment: *mut Comment) { - to_ref_mut!(comment).remove(); -} - -#[no_mangle] -pub extern "C" fn lol_html_comment_is_removed(comment: *const Comment) -> bool { - to_ref!(comment).removed() -} +impl_content_mutation_handlers! { comment: Comment [ + lol_html_comment_before => before, + lol_html_comment_after => after, + lol_html_comment_replace => replace, + @VOID lol_html_comment_remove => remove, + @BOOL lol_html_comment_is_removed => removed, + @STREAM lol_html_comment_streaming_before => streaming_before, + @STREAM lol_html_comment_streaming_after => streaming_after, + @STREAM lol_html_comment_streaming_replace => streaming_replace, +] } #[no_mangle] pub extern "C" fn lol_html_comment_user_data_set(comment: *mut Comment, user_data: *mut c_void) { diff --git a/c-api/src/doctype.rs b/c-api/src/doctype.rs index cfa30e4c..3afcf38e 100644 --- a/c-api/src/doctype.rs +++ b/c-api/src/doctype.rs @@ -25,12 +25,7 @@ pub extern "C" fn lol_html_doctype_user_data_get(doctype: *const Doctype) -> *mu get_user_data!(doctype) } -#[no_mangle] -pub extern "C" fn lol_html_doctype_remove(doctype: *mut Doctype) { - to_ref_mut!(doctype).remove(); -} - -#[no_mangle] -pub extern "C" fn lol_html_doctype_is_removed(doctype: *const Doctype) -> bool { - to_ref!(doctype).removed() -} +impl_content_mutation_handlers! 
{ doctype: Doctype [ + @VOID lol_html_doctype_remove => remove, + @BOOL lol_html_doctype_is_removed => removed, +] } diff --git a/c-api/src/document_end.rs b/c-api/src/document_end.rs index 6bd9eee5..566d7f2e 100644 --- a/c-api/src/document_end.rs +++ b/c-api/src/document_end.rs @@ -1,11 +1,5 @@ use super::*; -#[no_mangle] -pub extern "C" fn lol_html_doc_end_append( - document_end: *mut DocumentEnd, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { document_end.append(content, content_len, is_html) } -} +impl_content_mutation_handlers! { doc_end: DocumentEnd [ + lol_html_doc_end_append => append, +] } diff --git a/c-api/src/element.rs b/c-api/src/element.rs index 897b70a6..28f1f20b 100644 --- a/c-api/src/element.rs +++ b/c-api/src/element.rs @@ -29,16 +29,6 @@ pub extern "C" fn lol_html_element_tag_name_set( 0 } -#[no_mangle] -pub extern "C" fn lol_html_element_is_self_closing(element: *mut Element) -> bool { - to_ref!(element).is_self_closing() -} - -#[no_mangle] -pub extern "C" fn lol_html_element_can_have_content(element: *mut Element) -> bool { - to_ref!(element).can_have_content() -} - #[no_mangle] pub extern "C" fn lol_html_element_namespace_uri_get(element: *mut Element) -> *const c_char { let element = to_ref!(element); @@ -157,80 +147,25 @@ pub extern "C" fn lol_html_element_remove_attribute( 0 } -#[no_mangle] -pub extern "C" fn lol_html_element_before( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { element.before(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_prepend( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! 
{ element.prepend(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_append( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { element.append(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_after( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { element.after(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_set_inner_content( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { element.set_inner_content(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_replace( - element: *mut Element, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { element.replace(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_element_remove(element: *mut Element) { - to_ref_mut!(element).remove(); -} - -#[no_mangle] -pub extern "C" fn lol_html_element_remove_and_keep_content(element: *mut Element) { - to_ref_mut!(element).remove_and_keep_content(); -} - -#[no_mangle] -pub extern "C" fn lol_html_element_is_removed(element: *mut Element) -> bool { - to_ref_mut!(element).removed() -} +impl_content_mutation_handlers! 
{ element: Element [ + lol_html_element_prepend => prepend, + lol_html_element_append => append, + lol_html_element_before => before, + lol_html_element_after => after, + lol_html_element_set_inner_content => set_inner_content, + lol_html_element_replace => replace, + @VOID lol_html_element_remove => remove, + @VOID lol_html_element_remove_and_keep_content => remove_and_keep_content, + @BOOL lol_html_element_is_removed => removed, + @BOOL lol_html_element_is_self_closing => is_self_closing, + @BOOL lol_html_element_can_have_content => can_have_content, + @STREAM lol_html_element_streaming_prepend => streaming_prepend, + @STREAM lol_html_element_streaming_append => streaming_append, + @STREAM lol_html_element_streaming_before => streaming_before, + @STREAM lol_html_element_streaming_after => streaming_after, + @STREAM lol_html_element_streaming_set_inner_content => streaming_set_inner_content, + @STREAM lol_html_element_streaming_replace => streaming_replace, +] } #[no_mangle] pub extern "C" fn lol_html_element_user_data_set(element: *mut Element, user_data: *mut c_void) { @@ -274,30 +209,15 @@ pub extern "C" fn lol_html_element_clear_end_tag_handlers(element: *mut Element) } } -#[no_mangle] -pub extern "C" fn lol_html_end_tag_before( - end_tag: *mut EndTag, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { end_tag.before(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_end_tag_after( - end_tag: *mut EndTag, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { end_tag.after(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_end_tag_remove(end_tag: *mut EndTag) { - to_ref_mut!(end_tag).remove(); -} +impl_content_mutation_handlers! 
{ end_tag: EndTag [ + lol_html_end_tag_before => before, + lol_html_end_tag_after => after, + lol_html_end_tag_replace => replace, + @VOID lol_html_end_tag_remove => remove, + @STREAM lol_html_end_tag_streaming_before => streaming_before, + @STREAM lol_html_end_tag_streaming_after => streaming_after, + @STREAM lol_html_end_tag_streaming_replace => streaming_replace, +] } #[no_mangle] pub extern "C" fn lol_html_end_tag_name_get(end_tag: *mut EndTag) -> Str { diff --git a/c-api/src/errors.rs b/c-api/src/errors.rs index 9e7b4f87..27413e8c 100644 --- a/c-api/src/errors.rs +++ b/c-api/src/errors.rs @@ -11,3 +11,12 @@ pub extern "C" fn lol_html_take_last_error() -> Str { Str::from_opt(err.map(|e| e.to_string())) } + +#[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] +pub enum CStreamingHandlerError { + #[error("Not all fields of the struct were initialized")] + Uninitialized, + + #[error("write_all_callback reported error: {0}")] + HandlerError(c_int), +} diff --git a/c-api/src/lib.rs b/c-api/src/lib.rs index b41d4662..04523d7d 100644 --- a/c-api/src/lib.rs +++ b/c-api/src/lib.rs @@ -1,3 +1,4 @@ +pub use crate::streaming::CStreamingHandler; use libc::{c_char, c_int, c_void, size_t}; use lol_html::html_content::*; use lol_html::*; @@ -81,6 +82,94 @@ macro_rules! unwrap_or_ret_null { }; } +macro_rules! impl_content_mutation_handlers { + ($name:ident: $typ:ty [ $($(@$kind:ident)? $fn_name:ident => $method:ident),+$(,)? ]) => { + $( + // stable Rust can't concatenate idents, so fn_name must be written out manually, + // but it is possible to compare concatenated strings. 
+ #[cfg(debug_assertions)] + const _: () = { + let expected_fn_name_prefix = concat!("lol_html_", stringify!($name), "_").as_bytes(); + let fn_name = stringify!($fn_name).as_bytes(); + // removed vs is_removed prevents exact comparison + assert!(fn_name.len() >= expected_fn_name_prefix.len() + (stringify!($method).len()), stringify!($fn_name)); + let mut i = 0; + while i < expected_fn_name_prefix.len() { + assert!(expected_fn_name_prefix[i] == fn_name[i], stringify!($fn_name)); + i += 1; + } + }; + impl_content_mutation_handlers! { IMPL $($kind)? $name: $typ, $fn_name => $method } + )+ + }; + (IMPL $name:ident: $typ:ty, $fn_name:ident => $method:ident) => { + #[doc = concat!("[`", stringify!($typ), "::", stringify!($method), "`]")] + /// + /// The `content` must be a valid UTF-8 string. It's copied immediately. + /// If `is_html` is `true`, then the `content` will be written without HTML-escaping. + /// + #[doc = concat!("`", stringify!($name), "`")] + /// must be valid and non-`NULL`. If `content` is `NULL`, an error will be reported. + /// + /// Returns 0 on success. + #[no_mangle] + pub unsafe extern "C" fn $fn_name( + $name: *mut $typ, + content: *const c_char, + content_len: size_t, + is_html: bool, + ) -> c_int { + content_insertion_fn_body! { $name.$method(content, content_len, is_html) } + } + }; + (IMPL STREAM $name:ident: $typ:ty, $fn_name:ident => $method:ident) => { + #[doc = concat!("[`", stringify!($typ), "::", stringify!($method), "`]")] + /// + /// The [`CStreamingHandler`] contains callbacks that will be called + /// when the content needs to be written. + /// + /// `streaming_writer` is copied immediately, and doesn't have a stable address. + /// `streaming_writer` may be used from another thread (`Send`), but it's only going + /// to be used by one thread at a time (`!Sync`). + /// + #[doc = concat!("`", stringify!($name), "`")] + /// must be valid and non-`NULL`. If `streaming_writer` is `NULL`, an error will be reported. 
+ /// + /// Returns 0 on success. + #[no_mangle] + pub unsafe extern "C" fn $fn_name( + $name: *mut $typ, + streaming_writer: *mut CStreamingHandler, + ) -> c_int { + content_insertion_fn_body! { $name.$method(streaming_writer) } + } + }; + (IMPL VOID $name:ident: $typ:ty, $fn_name:ident => $method:ident) => { + #[doc = concat!("[`", stringify!($typ), "::", stringify!($method), "`]")] + /// + #[doc = concat!("`", stringify!($name), "`")] + /// must be valid and non-`NULL`. + #[no_mangle] + pub unsafe extern "C" fn $fn_name( + $name: *mut $typ, + ) { + to_ref_mut!($name).$method(); + } + }; + (IMPL BOOL $name:ident: $typ:ty, $fn_name:ident => $method:ident) => { + #[doc = concat!("[`", stringify!($typ), "::", stringify!($method), "`]")] + /// + #[doc = concat!("`", stringify!($name), "`")] + /// must be valid and non-`NULL`. Returns `_Bool`. + #[no_mangle] + pub unsafe extern "C" fn $fn_name( + $name: *mut $typ, + ) -> bool { + to_ref_mut!($name).$method() + } + }; +} + macro_rules! content_insertion_fn_body { ($target:ident.$method:ident($content:ident, $content_len:ident, $is_html:ident)) => {{ let target = to_ref_mut!($target); @@ -97,6 +186,24 @@ macro_rules! content_insertion_fn_body { 0 }}; + ($target:ident.$method:ident($handler:expr)) => {{ + let handler_ptr: *mut CStreamingHandler = $handler; + if unsafe { handler_ptr.as_ref() }.is_none_or(|handler| !handler.reserved.is_null()) { + // we can't even safely call drop callback on this + return -1; + } + // Taking ownership of the CStreamingHandler + let handler: Box<CStreamingHandler> = Box::new(unsafe { handler_ptr.read() }); + if handler.write_all_callback.is_none() { + return -1; + } + if let Some(target) = unsafe { $target.as_mut() } { + target.$method(handler); + 0 + } else { + -1 + } + }}; } macro_rules! get_user_data { @@ -109,16 +216,17 @@ macro_rules!
get_user_data { }; } -mod comment; -mod doctype; -mod document_end; -mod element; -mod errors; -mod rewriter; -mod rewriter_builder; -mod selector; -mod string; -mod text_chunk; +pub mod comment; +pub mod doctype; +pub mod document_end; +pub mod element; +pub mod errors; +pub mod rewriter; +pub mod rewriter_builder; +pub mod selector; +pub mod streaming; +pub mod string; +pub mod text_chunk; pub use self::string::Str; diff --git a/c-api/src/streaming.rs b/c-api/src/streaming.rs new file mode 100644 index 00000000..76a1003a --- /dev/null +++ b/c-api/src/streaming.rs @@ -0,0 +1,85 @@ +use super::*; +use crate::errors::CStreamingHandlerError; +use lol_html::html_content::StreamingHandler; +use lol_html::html_content::StreamingHandlerSink; + +/// Opaque type from C's perspective +pub type CStreamingHandlerSink<'tmp> = StreamingHandlerSink<'tmp>; + +/// Write another piece of UTF-8 data to the output. Returns `0` on success, and `-1` if it wasn't valid UTF-8. +/// All pointers must be non-NULL. +#[no_mangle] +pub unsafe extern "C" fn lol_html_streaming_sink_write_str( + sink: *mut CStreamingHandlerSink<'_>, + string_utf8: *const c_char, + string_utf8_len: size_t, + is_html: bool, +) -> c_int { + let sink = to_ref_mut!(sink); + let content = unwrap_or_ret_err_code! { to_str!(string_utf8, string_utf8_len) }; + let is_html = if is_html { + ContentType::Html + } else { + ContentType::Text + }; + + sink.write_str(content, is_html); + 0 +} + +/// Safety: the user data and the callbacks must be safe to use from a different thread (e.g. can't rely on thread-local storage). +/// It doesn't have to be `Sync`, it will be used only by one thread at a time. +/// +/// Handler functions copy this struct. It can (and should) be created on the stack. +#[repr(C)] +pub struct CStreamingHandler { + /// Anything you like + pub user_data: *mut c_void, + /// Called when the handler is supposed to produce its output. Return `0` for success. + /// The `sink` argument is guaranteed non-`NULL`. 
It is valid only for the duration of this call, and can only be used on the same thread. + /// The sink is for [`lol_html_streaming_sink_write_str`]. + /// `user_data` comes from this struct. + /// `write_all_callback` must not be `NULL`. + pub write_all_callback: Option< + unsafe extern "C" fn(sink: &mut CStreamingHandlerSink<'_>, user_data: *mut c_void) -> c_int, + >, + /// Called exactly once, after the last use of this handler. + /// `user_data` comes from this struct. + /// May be `NULL`. + pub drop_callback: Option<unsafe extern "C" fn(user_data: *mut c_void)>, + /// *Always* initialize to `NULL`. + pub reserved: *mut c_void, +} + +// It's up to C to obey this +unsafe impl Send for CStreamingHandler {} + +impl StreamingHandler for CStreamingHandler { + fn write_all( + self: Box<Self>, + sink: &mut StreamingHandlerSink<'_>, + ) -> Result<(), Box<(dyn std::error::Error + Send + Sync)>> { + if !self.reserved.is_null() { + return Err(CStreamingHandlerError::Uninitialized.into()); + } + let cb = self + .write_all_callback + .ok_or(CStreamingHandlerError::Uninitialized)?; + let res = unsafe { (cb)(sink, self.user_data) }; + if res == 0 { + Ok(()) + } else { + Err(CStreamingHandlerError::HandlerError(res).into()) + } + } +} + +impl Drop for CStreamingHandler { + fn drop(&mut self) { + if let Some(cb) = self.drop_callback { + unsafe { + cb(self.user_data); + } + } + } +} diff --git a/c-api/src/text_chunk.rs b/c-api/src/text_chunk.rs index 76074b3a..31cb03ae 100644 --- a/c-api/src/text_chunk.rs +++ b/c-api/src/text_chunk.rs @@ -22,50 +22,17 @@ pub extern "C" fn lol_html_text_chunk_content_get(chunk: *mut TextChunk) -> Text TextChunkContent::new(to_ref!(chunk)) } -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_is_last_in_text_node(chunk: *mut TextChunk) -> bool { - to_ref!(chunk).last_in_text_node() -} - -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_before( - chunk: *mut TextChunk, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body!
{ chunk.before(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_after( - chunk: *mut TextChunk, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { chunk.after(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_replace( - chunk: *mut TextChunk, - content: *const c_char, - content_len: size_t, - is_html: bool, -) -> c_int { - content_insertion_fn_body! { chunk.replace(content, content_len, is_html) } -} - -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_remove(chunk: *mut TextChunk) { - to_ref_mut!(chunk).remove(); -} - -#[no_mangle] -pub extern "C" fn lol_html_text_chunk_is_removed(chunk: *const TextChunk) -> bool { - to_ref!(chunk).removed() -} +impl_content_mutation_handlers! { text_chunk: TextChunk [ + lol_html_text_chunk_before => before, + lol_html_text_chunk_after => after, + lol_html_text_chunk_replace => replace, + @VOID lol_html_text_chunk_remove => remove, + @BOOL lol_html_text_chunk_is_removed => removed, + @BOOL lol_html_text_chunk_is_last_in_text_node => last_in_text_node, + @STREAM lol_html_text_chunk_streaming_before => streaming_before, + @STREAM lol_html_text_chunk_streaming_after => streaming_after, + @STREAM lol_html_text_chunk_streaming_replace => streaming_replace, +] } #[no_mangle] pub extern "C" fn lol_html_text_chunk_user_data_set(chunk: *mut TextChunk, user_data: *mut c_void) { From bd1a5dc20fe2478b19cf4ef5374454eb5b4febcb Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 1 Nov 2024 13:55:17 +0000 Subject: [PATCH 09/15] Support for incomplete UTF-8 writes --- c-api/c-tests/src/test_element_api.c | 9 +- c-api/include/lol_html.h | 18 ++- c-api/src/streaming.rs | 33 +++- src/lib.rs | 4 +- src/rewritable_units/element.rs | 2 +- src/rewritable_units/mod.rs | 2 +- src/rewritable_units/text_encoder.rs | 215 +++++++++++++++++++++++++-- 7 files changed, 266 insertions(+), 17 deletions(-) diff --git 
a/c-api/c-tests/src/test_element_api.c b/c-api/c-tests/src/test_element_api.c index a8171123..95f9a7e5 100644 --- a/c-api/c-tests/src/test_element_api.c +++ b/c-api/c-tests/src/test_element_api.c @@ -266,7 +266,10 @@ static int loltest_write_all_callback_after(lol_html_streaming_sink_t *sink, voi const char *after = "&after"; const char emoji[] = {0xf0,0x9f,0x98,0x82}; return lol_html_streaming_sink_write_str(sink, after, strlen(after), false) || - lol_html_streaming_sink_write_str(sink, emoji, 4, false); + lol_html_streaming_sink_write_utf8_chunk(sink, &emoji[0], 1, false) || + lol_html_streaming_sink_write_utf8_chunk(sink, &emoji[1], 1, false) || + lol_html_streaming_sink_write_utf8_chunk(sink, &emoji[2], 1, false) || + lol_html_streaming_sink_write_utf8_chunk(sink, &emoji[3], 1, false); } static int loltest_write_all_callback_prepend(lol_html_streaming_sink_t *sink, void *user_data) { @@ -275,8 +278,8 @@ static int loltest_write_all_callback_prepend(lol_html_streaming_sink_t *sink, v const char *prepend1 = ""; - return lol_html_streaming_sink_write_str(sink, prepend1, strlen(prepend1), true) || - lol_html_streaming_sink_write_str(sink, prepend2, strlen(prepend2), true); + return lol_html_streaming_sink_write_utf8_chunk(sink, prepend1, strlen(prepend1), true) || + lol_html_streaming_sink_write_utf8_chunk(sink, prepend2, strlen(prepend2), true); } static int loltest_write_all_callback_append(lol_html_streaming_sink_t *sink, void *user_data) { diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index 69e9fcf5..27e37931 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -128,7 +128,7 @@ typedef struct lol_html_CStreamingHandler { void *user_data; // Called when the handler is supposed to produce its output. Return `0` for success. // The `sink` argument is guaranteed non-`NULL`. It is valid only for the duration of this call, and can only be used on the same thread. - // The sink is for [`lol_html_streaming_sink_write_str`]. 
+ // The sink is for [`lol_html_streaming_sink_write_str`] and [`lol_html_streaming_sink_write_utf8_chunk`]. // `user_data` comes from this struct. // // `write_all_callback` must not be `NULL`. @@ -1019,6 +1019,22 @@ int lol_html_streaming_sink_write_str(lol_html_streaming_sink_t *sink, size_t string_utf8_len, bool is_html); +// [`StreamingHandlerSink::write_utf8_chunk`] +// +// Writes as much of the given UTF-8 fragment as possible, converting the encoding and HTML-escaping if `is_html` is `false`. +// +// The `bytes_utf8` doesn't need to be a complete UTF-8 string, as long as consecutive calls to this function create a valid UTF-8 string. +// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. +// +// Other functions like [`lol_html_streaming_sink_write_str`] should not be called after a +// `lol_html_streaming_sink_write_utf8_chunk` call with an incomplete UTF-8 sequence. +// +// Returns `0` on success, and `-1` if it wasn't valid UTF-8. +// All pointers must be non-`NULL`. +int lol_html_streaming_sink_write_utf8_chunk(lol_html_streaming_sink_t *sink, + const char *bytes_utf8, + size_t bytes_utf8_len, + bool is_html); #if defined(__cplusplus) } // extern C diff --git a/c-api/src/streaming.rs b/c-api/src/streaming.rs index 76a1003a..426edffd 100644 --- a/c-api/src/streaming.rs +++ b/c-api/src/streaming.rs @@ -27,6 +27,37 @@ pub unsafe extern "C" fn lol_html_streaming_sink_write_str( 0 } +/// [`StreamingHandlerSink::write_utf8_chunk`] +/// +/// Writes as much of the given UTF-8 fragment as possible, converting the encoding and HTML-escaping if `is_html` is `false`. +/// +/// The `bytes_utf8` doesn't need to be a complete UTF-8 string, as long as consecutive calls to this function create a valid UTF-8 string. +/// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. 
+/// +/// Other functions like [`lol_html_streaming_sink_write_str`] should not be called after a +/// `lol_html_streaming_sink_write_utf8_chunk` call with an incomplete UTF-8 sequence. +/// +/// Returns `0` on success, and `-1` if it wasn't valid UTF-8. +/// All pointers must be non-`NULL`. +#[no_mangle] +pub unsafe extern "C" fn lol_html_streaming_sink_write_utf8_chunk( + sink: *mut CStreamingHandlerSink<'_>, + bytes_utf8: *const c_char, + bytes_utf8_len: size_t, + is_html: bool, +) -> c_int { + let sink = to_ref_mut!(sink); + let content = to_bytes!(bytes_utf8, bytes_utf8_len); + let is_html = if is_html { + ContentType::Html + } else { + ContentType::Text + }; + + unwrap_or_ret_err_code! { sink.write_utf8_chunk(content, is_html) }; + 0 +} + /// Safety: the user data and the callbacks must be safe to use from a different thread (e.g. can't rely on thread-local storage). /// It doesn't have to be `Sync`, it will be used only by one thread at a time. /// @@ -37,7 +68,7 @@ pub struct CStreamingHandler { pub user_data: *mut c_void, /// Called when the handler is supposed to produce its output. Return `0` for success. /// The `sink` argument is guaranteed non-`NULL`. It is valid only for the duration of this call, and can only be used on the same thread. - /// The sink is for [`lol_html_streaming_sink_write_str`]. + /// The sink is for [`lol_html_streaming_sink_write_str`] and [`lol_html_streaming_sink_write_utf8_chunk`]. /// `user_data` comes from this struct. /// `write_all_callback` must not be `NULL`. 
pub write_all_callback: Option< diff --git a/src/lib.rs b/src/lib.rs index b7be6475..ea67156d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,7 +87,9 @@ pub mod send { pub mod errors { pub use super::memory::MemoryLimitExceededError; pub use super::parser::ParsingAmbiguityError; - pub use super::rewritable_units::{AttributeNameError, CommentTextError, TagNameError}; + pub use super::rewritable_units::{ + AttributeNameError, CommentTextError, TagNameError, Utf8Error, + }; pub use super::rewriter::RewritingError; pub use super::selectors_vm::SelectorError; } diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index d0c6e452..38d4b8c8 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -1215,7 +1215,7 @@ mod tests { { let output = rewrite_element(&html, enc, "span", |el| { el.streaming_prepend(streaming!(|s| { - s.write_str("", ContentType::Html); + s.write_utf8_chunk(b"", ContentType::Html)?; Ok(()) })); el.append("", ContentType::Html); diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index 8ac912b3..069a3fda 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -4,7 +4,7 @@ pub use self::document_end::*; pub use self::element::*; pub use self::mutations::{ContentType, StreamingHandler}; pub(crate) use self::mutations::{Mutations, StringChunk}; -pub use self::text_encoder::StreamingHandlerSink; +pub use self::text_encoder::{StreamingHandlerSink, Utf8Error}; pub use self::tokens::*; /// Data that can be attached to a rewritable unit by a user and shared between content handler diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index 2c72795e..b584c7c7 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -1,11 +1,26 @@ use super::ContentType; use crate::html::escape_body_text; -use encoding_rs::{CoderResult, Encoder, Encoding, UTF_8}; +use encoding_rs::{CoderResult, Decoder, 
DecoderResult, Encoder, Encoding, UTF_8}; +use thiserror::Error; + +/// Input contained non-UTF-8 byte sequence +/// +/// [`StreamingHandlerSink::write_utf8_chunk`] will not fail on an incomplete UTF-8 sequence at the end of the chunk, +/// but it will report errors if incomplete UTF-8 sequences are within the chunk, or the next call starts with +/// bytes that don't match the previous call's trailing bytes. +#[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] +#[error("Invalid UTF-8")] +pub struct Utf8Error; /// Used to write chunks of text or markup in streaming mutation handlers. /// /// Argument to [`StreamingHandler::write_all()`](crate::html_content::StreamingHandler::write_all). pub struct StreamingHandlerSink<'output_handler> { + incomplete_utf8: Option<IncompleteUtf8Resync>, + inner: StreamingHandlerSinkInner<'output_handler>, +} + +struct StreamingHandlerSinkInner<'output_handler> { non_utf8_encoder: Option<TextEncoder>, /// ```compile_fail @@ -29,8 +44,11 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { output_handler: &'output_handler mut dyn FnMut(&[u8]), ) -> Self { Self { - non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), - output_handler, + incomplete_utf8: None, + inner: StreamingHandlerSinkInner { + non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), + output_handler, + }, } } @@ -39,6 +57,51 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { /// It may be called multiple times. The strings will be concatenated together.
#[inline] pub fn write_str(&mut self, content: &str, content_type: ContentType) { + if self + .incomplete_utf8 + .as_mut() + .is_some_and(|d| d.discard_incomplete()) + { + // too late to report the error to the caller of write_utf8_chunk + self.inner.write_html("\u{FFFD}"); + } + self.inner.write_str(content, content_type); + } + + #[inline] + pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) { + &mut self.inner.output_handler + } + + /// Writes as much of the given UTF-8 fragment as possible, converting the encoding and [escaping](ContentType) if necessary. + /// + /// The `content` doesn't need to be a complete UTF-8 string, as long as consecutive calls to `write_utf8_chunk` create a valid UTF-8 string. + /// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. + /// + /// Other methods like `write_str` should not be called after a `write_utf8_chunk` call with an incomplete UTF-8 sequence. + #[inline] + pub fn write_utf8_chunk( + &mut self, + mut content: &[u8], + content_type: ContentType, + ) -> Result<(), Utf8Error> { + let decoder = self + .incomplete_utf8 + .get_or_insert_with(IncompleteUtf8Resync::new); + while !content.is_empty() { + let (valid_chunk, rest) = decoder.utf8_bytes_to_slice(content, false)?; + content = rest; + if !valid_chunk.is_empty() { + self.inner.write_str(valid_chunk, content_type); + } + } + Ok(()) + } +} + +impl<'output_handler> StreamingHandlerSinkInner<'output_handler> { + #[inline] + pub(crate) fn write_str(&mut self, content: &str, content_type: ContentType) { match content_type { ContentType::Html => self.write_html(content), ContentType::Text => self.write_body_text(content), @@ -67,11 +130,6 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { }); } } - - #[inline] - pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) { - &mut self.output_handler - } } enum Buffer { @@ -154,6 +212,78 @@ impl TextEncoder { } } +struct
IncompleteUtf8Resync { + decoder: Decoder, + buffer: String, +} + +impl IncompleteUtf8Resync { + pub fn new() -> Self { + Self { + decoder: UTF_8.new_decoder_without_bom_handling(), + buffer: "\0".repeat(1024), + } + } + + pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>( + &'buf mut self, + content: &'src [u8], + is_last: bool, + ) -> Result<(&'buf str, &'src [u8]), Utf8Error> { + let (result, read, written) = + self.decoder + .decode_to_str_without_replacement(content, &mut self.buffer, is_last); + + match result { + DecoderResult::InputEmpty => {} + DecoderResult::OutputFull => { + if written == 0 { + panic!("encoding_rs infinite loop"); // the buffer is always large enough + } + } + DecoderResult::Malformed(_, _) => return Err(Utf8Error), + } + + let written = &self.buffer[..written]; + let remaining = &content[read..]; + Ok((written, remaining)) + } + + /// True if there were incomplete invalid bytes in the buffer + pub fn discard_incomplete(&mut self) -> bool { + match self.utf8_bytes_to_slice(b"", true) { + Ok((valid_chunk, rest)) => { + debug_assert!(rest.is_empty()); + debug_assert!(valid_chunk.is_empty()); // this can't happen in UTF-8 after empty write + false + } + Err(_) => true, + } + } +} + +#[test] +fn utf8_fragments() { + let text = "🐈°文字化けしない ▀▄ ɯopuɐɹ ⓤⓝⓘⓒⓞⓓⓔ and ascii 🐳 sʇuıodǝpoɔ ✴"; + for with_zero_writes in [false, true] { + for len in 1..9 { + let mut out = Vec::new(); + let mut handler = |ch: &[u8]| out.extend_from_slice(ch); + let mut t = StreamingHandlerSink::new(UTF_8, &mut handler); + for (nth, chunk) in text.as_bytes().chunks(len).enumerate() { + let msg = + format!("{len} at {nth} '{chunk:?}'; with_zero_writes={with_zero_writes}"); + if with_zero_writes { + t.write_utf8_chunk(b"", ContentType::Text).expect(&msg); + } + t.write_utf8_chunk(chunk, ContentType::Html).expect(&msg); + } + drop(t); + assert_eq!(String::from_utf8_lossy(&out), text, "{len}"); + } + } +} + #[test] fn long_text() { let mut written = 0; @@ -174,10 +304,77 @@ fn 
long_text() { let mut t = StreamingHandlerSink::new(encoding_rs::ISO_8859_2, &mut handler); let mut s = "ą0ą1ą2ą3ą4ą5ą6ą7ą8ą9".repeat(128); + let mut split_point = 1; while s.len() <= 1 << 17 { s.push_str(&s.clone()); expected += s.chars().count(); - t.write_str(&s, ContentType::Text); + let (a, b) = s.as_bytes().split_at(split_point); + split_point += 13; + t.write_utf8_chunk(a, ContentType::Text).unwrap(); + t.write_utf8_chunk(b, ContentType::Html).unwrap(); } assert_eq!(expected, written); } + +#[test] +fn invalid_utf8_fragments() { + #[rustfmt::skip] + let broken_utf8 = &[ + &b"\x31\x32\x33\xED\xA0\x80\x31"[..], b"\x31\x32\x33\xEF\x80", b"\x31\x32\x33\xEF\x80\xF0\x3c", + b"\x37\x38\x39\xFE", b"\x37\x38\xFE", b"\x37\xFF", b"\x3c\x23\x24\xFE\x3C", b"\x3C\x23\xFE\x3C\x3C", + b"\x3C\x3D\xE0\x80\x3C", b"\x3C\x3D\xE0\x80\xAF\x3C", b"\x3C\x3D\xE0\x80\xE0\x80\x3C", + b"\x3C\x3D\xED\xA0\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x80\x3C", + b"\x3C\x3D\xF7\xBF\xBF\xBF\x3C", b"\x3C\x3D\xFF\x3C", b"\x7F", b"\x80", b"\x80\x3C", + b"\x80\x81\x82\x83\x84\x85\x86\x87", b"\x80\xBF", b"\x80\xBF\x80", b"\x80\xBF\x80\xBF", + b"\x80\xBF\x80\xBF\x80", b"\x80\xBF\x80\xBF\x80\xBF", b"\x81", b"\x81\x3C", + b"\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F", b"\x90\x91\x92\x93\x94\x95\x96\x97", b"\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F", + b"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7", b"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF", b"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7", + b"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", b"\xBF", b"\xC0", b"\xC0\x3C\xC1\x3C\xC2\x3C\xC3\x3C", b"\xC0\x80", + b"\xC0\xAF", b"\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", b"\xC1\x3C", b"\xC1\xBF", b"\xC1\xBF", b"\xC2\x00", + b"\xC2\x41\x42", b"\xC2\x7F", b"\xC2\xC0", b"\xC2\xFF", b"\xC4\x3C\xC5\x3C\xC6\x3C\xC7\x3C", + b"\xC8\x3C\xC9\x3C\xCA\x3C\xCB\x3C", b"\xCC\x3C\xCD\x3C\xCE\x3C\xCF\x3C", b"\xD0\x3C\xD1\x3C\xD2\x3C\xD3\x3C", + b"\xD4\x3C\xD5\x3C\xD6\x3C\xD7\x3C", b"\xD8\x3C\xD9\x3C\xDA\x3C\xDB\x3C", b"\xDC\x3C\xDD\x3C\xDE\x3C\xDF\x3C", + b"\xDF", 
b"\xDF\x00", b"\xDF\x7F", b"\xDF\xC0", b"\xDF\xFF", b"\xE0\x3C\xE1\x3C\xE2\x3C\xE3\x3C", b"\xE0\x80", + b"\xE0\x80\x00", b"\xE0\x80\x7F", b"\xE0\x80\x80", b"\xE0\x80\xAF", b"\xE0\x80\xC0", b"\xE0\x80\xFF", + b"\xE0\x81\xBF", b"\xE0\x9F\xBF", b"\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41", + b"\xE4\x3C\xE5\x3C\xE6\x3C\xE7\x3C", b"\xE8\x3C\xE9\x3C\xEA\x3C\xEB\x3C", b"\xEC\x3C\xED\x3C\xEE\x3C\xEF\x3C", + b"\xED\x80\x00", b"\xED\x80\x7F", b"\xED\x80\xC0", b"\xED\x80\xFF", b"\xED\xA0\x80", b"\xED\xA0\x80\x35", + b"\xED\xA0\x80\xED\xB0\x80", b"\xED\xA0\x80\xED\xBF\xBF", b"\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41", + b"\xED\xAD\xBF", b"\xED\xAD\xBF\xED\xB0\x80", b"\xED\xAD\xBF\xED\xBF\xBF", b"\xED\xAE\x80", + b"\xED\xAE\x80\xED\xB0\x80", b"\xED\xAE\x80\xED\xBF\xBF", b"\xED\xAF\xBF", b"\xED\xAF\xBF\xED\xB0\x80", + b"\xED\xAF\xBF\xED\xBF\xBF", b"\xED\xB0\x80", b"\xED\xBE\x80", b"\xED\xBF\xBF", b"\xEF\xBF", + b"\xF0\x3C\xF1\x3C", b"\xF0\x80\x80", b"\xF0\x80\x80\x80", b"\xF0\x80\x80\xAF", b"\xF0\x80\x81\xBF", + b"\xF0\x8F\xBF\xBF", b"\xF0\x90\x80\x00", b"\xF0\x90\x80\x7F", b"\xF0\x90\x80\xC0", b"\xF0\x90\x80\xFF", + b"\xF1\x80\x80\x00", b"\xF1\x80\x80\x7F", b"\xF1\x80\x80\xC0", b"\xF1\x80\x80\xFF", b"\xF2\x3C\xF3\x3C", + b"\xF4\x3C\xF5\x3C", b"\xF4\x80\x80\x00", b"\xF4\x80\x80\x7F", b"\xF4\x80\x80\xC0", b"\xF4\x80\x80\xFF", + b"\xF4\x90\x80\x80", b"\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", b"\xF5\x3C", b"\xF6\x3C\xF7\x3C", + b"\xF7\xBF\xBF", b"\xF7\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF\xBF", + b"\xF7\xBF\xBF\xBF\xBF\xBF\xBF", b"\xF8\x3C", b"\xF8\x80\x80\x80", b"\xF8\x80\x80\x80\xAF", + b"\xF8\x87\xBF\xBF\xBF", b"\xF8\x88\x80\x80\x80", b"\xF9\x3C", b"\xFA\x3C", b"\xFB\x3C", b"\xFB\xBF\xBF\xBF", + b"\xFC\x3C", b"\xFC\x80\x80\x80\x80", b"\xFC\x80\x80\x80\x80\xAF", b"\xFC\x84\x80\x80\x80\x80", b"\xFD\x3C", + b"\xFD\xBF\xBF\xBF\xBF", b"\xFE", b"\xFF", b"\xFF\x3C" + ]; + + for bad in broken_utf8 { + 'next: for len in 1..bad.len() { + let mut handler = |ch: &[u8]| { + 
assert!( + !std::str::from_utf8(ch).unwrap().contains('<'), + "{ch:x?} of {bad:x?}" + ) + }; + let mut t = StreamingHandlerSink::new(UTF_8, &mut handler); + for chunk in bad.chunks(len) { + if t.write_utf8_chunk(chunk, ContentType::Text).is_err() { + continue 'next; + } + } + // An ASCII write forces flush of an incomplete sequence + assert!( + t.write_utf8_chunk(b"<", ContentType::Text).is_err(), + "Shouldn't have allowed {bad:?} {}", + String::from_utf8_lossy(bad) + ); + } + } +} From 59164a04edb590cbedb015740b8e9d4c528d8747 Mon Sep 17 00:00:00 2001 From: Kornel Date: Thu, 7 Nov 2024 11:30:26 +0000 Subject: [PATCH 10/15] Don't use encoding_rs for UTF-8 sync --- src/rewritable_units/text_encoder.rs | 119 ++++++++++++++++++--------- 1 file changed, 81 insertions(+), 38 deletions(-) diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index b584c7c7..3fd002d0 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -1,6 +1,6 @@ use super::ContentType; use crate::html::escape_body_text; -use encoding_rs::{CoderResult, Decoder, DecoderResult, Encoder, Encoding, UTF_8}; +use encoding_rs::{CoderResult, Encoder, Encoding, UTF_8}; use thiserror::Error; /// Input contained non-UTF-8 byte sequence @@ -16,7 +16,7 @@ pub struct Utf8Error; /// /// Argument to [`StreamingHandler::write_all()`](crate::html_content::StreamingHandler::write_all). 
pub struct StreamingHandlerSink<'output_handler> { - incomplete_utf8: Option<IncompleteUtf8Resync>, + incomplete_utf8: IncompleteUtf8Resync, inner: StreamingHandlerSinkInner<'output_handler>, } @@ -44,7 +44,7 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { output_handler: &'output_handler mut dyn FnMut(&[u8]), ) -> Self { Self { - incomplete_utf8: None, + incomplete_utf8: IncompleteUtf8Resync::new(), inner: StreamingHandlerSinkInner { non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), output_handler, @@ -57,11 +57,7 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { /// It may be called multiple times. The strings will be concatenated together. #[inline] pub fn write_str(&mut self, content: &str, content_type: ContentType) { - if self - .incomplete_utf8 - .as_mut() - .is_some_and(|d| d.discard_incomplete()) - { + if self.incomplete_utf8.discard_incomplete() { // too late to report the error to the caller of write_utf8_chunk self.inner.write_html("\u{FFFD}"); } @@ -85,11 +81,8 @@ impl<'output_handler> StreamingHandlerSink<'output_handler> { mut content: &[u8], content_type: ContentType, ) -> Result<(), Utf8Error> { - let decoder = self - .incomplete_utf8 - .get_or_insert_with(IncompleteUtf8Resync::new); while !content.is_empty() { - let (valid_chunk, rest) = decoder.utf8_bytes_to_slice(content, false)?; + let (valid_chunk, rest) = self.incomplete_utf8.utf8_bytes_to_slice(content)?; content = rest; if !valid_chunk.is_empty() { self.inner.write_str(valid_chunk, content_type); @@ -212,52 +205,86 @@ impl TextEncoder { } } +const fn is_continuation_byte(b: u8) -> bool { + (b >> 6) == 0b10 +} + +const fn utf8_width(b: u8) -> u8 { + b.leading_ones() as _ +} + struct IncompleteUtf8Resync { - decoder: Decoder, - buffer: String, + bytes: [u8; 4], + len: u8, } impl IncompleteUtf8Resync { pub fn new() -> Self { Self { - decoder: UTF_8.new_decoder_without_bom_handling(), - buffer: "\0".repeat(1024), + bytes: [0; 4], + len: 0, } } pub fn
utf8_bytes_to_slice<'buf, 'src: 'buf>( &'buf mut self, - content: &'src [u8], - is_last: bool, + mut content: &'src [u8], ) -> Result<(&'buf str, &'src [u8]), Utf8Error> { - let (result, read, written) = - self.decoder - .decode_to_str_without_replacement(content, &mut self.buffer, is_last); + if self.len > 0 { + let mut found_end_byte = false; + while let Some((&next_byte, rest)) = content.split_first() { + if is_continuation_byte(next_byte) { + if let Some(buf) = self.bytes.get_mut(self.len as usize) { + *buf = next_byte; + self.len += 1; + content = rest; + continue; + } + } + found_end_byte = true; + break; + } - match result { - DecoderResult::InputEmpty => {} - DecoderResult::OutputFull => { - if written == 0 { - panic!("encoding_rs infinite loop"); // the buffer is always large enough + if found_end_byte || self.len >= utf8_width(self.bytes[0]) { + let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?; + self.len = 0; + std::str::from_utf8(char_buf) + .map_err(|_| Utf8Error) + .map(|ch| (ch, content)) + } else { + debug_assert!(content.is_empty()); + Ok(("", b"")) + } + } else { + match std::str::from_utf8(content) { + Ok(src) => Ok((src, b"")), + // error_len means invalid bytes somewhere, not just incomplete 1-3 bytes at the end + Err(err) if err.error_len().is_some() => Err(Utf8Error), + Err(err) => { + let (valid, invalid) = content + .split_at_checked(err.valid_up_to()) + .ok_or(Utf8Error)?; + self.bytes + .get_mut(..invalid.len()) + .ok_or(Utf8Error)? 
+ .copy_from_slice(invalid); + self.len = invalid.len() as _; + // valid_up_to promises it is valid + debug_assert!(std::str::from_utf8(valid).is_ok()); + let valid = unsafe { std::str::from_utf8_unchecked(valid) }; + Ok((valid, b"")) } } - DecoderResult::Malformed(_, _) => return Err(Utf8Error), } - - let written = &self.buffer[..written]; - let remaining = &content[read..]; - Ok((written, remaining)) } /// True if there were incomplete invalid bytes in the buffer pub fn discard_incomplete(&mut self) -> bool { - match self.utf8_bytes_to_slice(b"", true) { - Ok((valid_chunk, rest)) => { - debug_assert!(rest.is_empty()); - debug_assert!(valid_chunk.is_empty()); // this can't happen in UTF-8 after empty write - false - } - Err(_) => true, + if self.len > 0 { + self.len = 0; + true + } else { + false } } } @@ -284,6 +311,22 @@ fn utf8_fragments() { } } +#[test] +fn chars() { + let boundaries = "🐈°文字化けしない" + .as_bytes() + .iter() + .map(|&ch| { + if is_continuation_byte(ch) { + '.' + } else { + (b'0' + utf8_width(ch)) as char + } + }) + .collect::(); + assert_eq!("4...2.3..3..3..3..3..3..3..", boundaries); +} + #[test] fn long_text() { let mut written = 0; From 7fd8c21ebd90a07d1bb79f0bf631d0c8f114bdfd Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 22 Nov 2024 16:27:39 +0000 Subject: [PATCH 11/15] Clarifications --- src/html/mod.rs | 9 ++- src/rewritable_units/text_encoder.rs | 117 ++++++++++++++++++--------- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/src/html/mod.rs b/src/html/mod.rs index 73dfe336..7a33027e 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -18,12 +18,13 @@ pub use self::text_type::TextType; pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) { loop { if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) { - let Some((chunk_before, (matched, rest))) = content - .split_at_checked(pos) - .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?))) - else { + let 
Some((chunk_before, rest)) = content.split_at_checked(pos) else { return; }; + let Some((matched, rest)) = rest.split_at_checked(1) else { + return; + }; + content = rest; let matched = matched.as_bytes()[0]; diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index 3fd002d0..c4da2c76 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> { } } +/// Temporary buffer used for encoding_rs output enum Buffer { + /// Stack buffer avoids heap allocation, and lets go back quickly to the ASCII fast path. + Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size + /// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up Heap(Vec), - Stack([u8; 63]), // leave a byte for the tag +} + +impl Buffer { + /// Arbitrary limit when to switch from a small on-stack buffer to heap allocation + const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20; + + /// Arbitrary, about a page size + const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096; + + fn buffer_for_length(&mut self, content_len: usize) -> &mut [u8] { + let buffer = match self { + Buffer::Heap(buf) => buf.as_mut_slice(), + // Long non-ASCII content could take lots of roundtrips through the encoder + buf if content_len >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => { + *buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]); + match buf { + Buffer::Heap(buf) => buf.as_mut(), + _ => unreachable!(), + } + } + Buffer::Stack(buf) => buf.as_mut_slice(), + }; + buffer + } } struct TextEncoder { @@ -152,6 +179,7 @@ impl TextEncoder { #[inline(never)] fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) { loop { + // First, fast path for ASCII-only prefix debug_assert!(!self.encoder.has_pending_state()); // 
ASCII-compatible encodings are not supposed to have it let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes()); if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) { @@ -164,20 +192,12 @@ impl TextEncoder { content = remainder; } - let buffer = match &mut self.buffer { - Buffer::Heap(buf) => buf.as_mut_slice(), - // Long non-ASCII content could take lots of roundtrips through the encoder - buf if content.len() >= 1 << 20 => { - *buf = Buffer::Heap(vec![0; 4096]); - match buf { - Buffer::Heap(buf) => buf.as_mut(), - _ => unreachable!(), - } - } - Buffer::Stack(buf) => buf.as_mut_slice(), - }; + // Now the content starts with non-ASCII byte, so encoding_rs may need a buffer to convert to. + let buffer = self.buffer.buffer_for_length(content.len()); + // last == true is needed only for the stateful ISO-JP encoding, which this library doesn't allow let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false); + if written > 0 && written <= buffer.len() { (output_handler)(&buffer[..written]); } @@ -185,20 +205,21 @@ impl TextEncoder { return; } content = &content[read..]; + match result { CoderResult::InputEmpty => { debug_assert!(content.is_empty()); return; } + // we've made progress, and can try again without growing the buffer + CoderResult::OutputFull if written > 0 => {} CoderResult::OutputFull => { - match &mut self.buffer { - Buffer::Heap(buf) if buf.len() >= 1024 => { - if written == 0 { - panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes - } - } - buf => *buf = Buffer::Heap(vec![0; 1024]), - } + // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug. 
+ assert!( + buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE, + "encoding_rs infinite loop" + ); + self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]); } } } @@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 { b.leading_ones() as _ } +/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments struct IncompleteUtf8Resync { - bytes: [u8; 4], - len: u8, + /// Buffers an incomplete UTF-8 sequence + char_bytes: [u8; 4], + /// Number of bytes in `bytes` + char_len: u8, } impl IncompleteUtf8Resync { pub fn new() -> Self { Self { - bytes: [0; 4], - len: 0, + char_bytes: [0; 4], + char_len: 0, } } + /// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes. + /// + /// Call `discard_incomplete()` after the last write to flush any partially-written chars. pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>( &'buf mut self, mut content: &'src [u8], ) -> Result<(&'buf str, &'src [u8]), Utf8Error> { - if self.len > 0 { - let mut found_end_byte = false; + // There may be incomplete char buffered from previous write, that must be continued now + if self.char_len > 0 { + let mut must_emit_now = false; while let Some((&next_byte, rest)) = content.split_first() { if is_continuation_byte(next_byte) { - if let Some(buf) = self.bytes.get_mut(self.len as usize) { + if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) { *buf = next_byte; - self.len += 1; + self.char_len += 1; content = rest; continue; } + // overlong sequences fall here, and will be checked when the char_bytes is flushed } - found_end_byte = true; + must_emit_now = true; break; } - if found_end_byte || self.len >= utf8_width(self.bytes[0]) { - let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?; - self.len = 0; - std::str::from_utf8(char_buf) - .map_err(|_| Utf8Error) - .map(|ch| (ch, content)) + if self.char_len >= utf8_width(self.char_bytes[0]) { + must_emit_now = true; + } + + if must_emit_now { + let char_buf 
= self + .char_bytes + .get(..self.char_len as usize) + .ok_or(Utf8Error)?; + self.char_len = 0; + let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?; + Ok((ch, content)) } else { + // a partial write has ended without fully completing a char (it's possible to write 1 byte at a time) debug_assert!(content.is_empty()); Ok(("", b"")) } @@ -264,11 +300,12 @@ impl IncompleteUtf8Resync { let (valid, invalid) = content .split_at_checked(err.valid_up_to()) .ok_or(Utf8Error)?; - self.bytes + // save the incomplete bytes from the end for the next write + self.char_bytes .get_mut(..invalid.len()) .ok_or(Utf8Error)? .copy_from_slice(invalid); - self.len = invalid.len() as _; + self.char_len = invalid.len() as _; // valid_up_to promises it is valid debug_assert!(std::str::from_utf8(valid).is_ok()); let valid = unsafe { std::str::from_utf8_unchecked(valid) }; @@ -280,8 +317,8 @@ impl IncompleteUtf8Resync { /// True if there were incomplete invalid bytes in the buffer pub fn discard_incomplete(&mut self) -> bool { - if self.len > 0 { - self.len = 0; + if self.char_len > 0 { + self.char_len = 0; true } else { false From b26119fd46aab590efab3ef798bc174ea3700fda Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 27 Nov 2024 17:12:53 +0000 Subject: [PATCH 12/15] Remove unsafe --- src/lib.rs | 1 + src/rewritable_units/mutations.rs | 3 --- src/rewritable_units/text_encoder.rs | 5 ++--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ea67156d..19d15b17 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ //! [Cloudflare Workers]: https://www.cloudflare.com/en-gb/products/cloudflare-workers/ //! [`HtmlRewriter`]: struct.HtmlRewriter.html //! 
[`rewrite_str`]: fn.rewrite_str.html +#![forbid(unsafe_code)] #![allow(clippy::default_trait_access)] #![allow(clippy::module_name_repetitions)] #![allow(clippy::redundant_pub_crate)] diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index 1dd1a33b..f5e89e04 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -150,9 +150,6 @@ pub trait StreamingHandler: Send { // Safety: due to lack of Sync, this trait must not have `&self` methods } -/// Avoid requring `StreamingHandler` to be `Sync`. -/// It only has a method taking exclusive ownership, so there's no sharing possible. -unsafe impl Sync for StringChunk {} impl RefUnwindSafe for StringChunk {} impl UnwindSafe for StringChunk {} diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index c4da2c76..29d615ce 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -306,9 +306,8 @@ impl IncompleteUtf8Resync { .ok_or(Utf8Error)? 
.copy_from_slice(invalid); self.char_len = invalid.len() as _; - // valid_up_to promises it is valid - debug_assert!(std::str::from_utf8(valid).is_ok()); - let valid = unsafe { std::str::from_utf8_unchecked(valid) }; + // valid_up_to promises it is always valid + let valid = std::str::from_utf8(valid).map_err(|_| Utf8Error)?; Ok((valid, b"")) } } From 9cac48c43a2e6e29f6882cfebe583286137f0a6e Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 27 Nov 2024 17:18:34 +0000 Subject: [PATCH 13/15] Avoid unnecessary From when constructing StringChunks --- src/rewritable_units/element.rs | 24 +++++++++++------------ src/rewritable_units/mutations.rs | 20 ++++++------------- src/rewritable_units/tokens/comment.rs | 21 +++++++++++--------- src/rewritable_units/tokens/end_tag.rs | 15 ++++++++------ src/rewritable_units/tokens/start_tag.rs | 15 ++++++++------ src/rewritable_units/tokens/text_chunk.rs | 15 ++++++++------ 6 files changed, 57 insertions(+), 53 deletions(-) diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index 38d4b8c8..acc7a88e 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -241,7 +241,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { .mutations .mutate() .content_before - .push_back((content, content_type).into()); + .push_back(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] before the element. @@ -254,7 +254,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { .mutations .mutate() .content_before - .push_back(string_writer.into()); + .push_back(StringChunk::Stream(string_writer)); } /// Inserts `content` after the element. 
@@ -287,7 +287,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn after(&mut self, content: &str, content_type: ContentType) { - self.after_chunk((content, content_type).into()); + self.after_chunk(StringChunk::from_str(content, content_type)); } fn after_chunk(&mut self, chunk: StringChunk) { @@ -306,7 +306,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_after(&mut self, string_writer: Box) { - self.after_chunk(string_writer.into()); + self.after_chunk(StringChunk::Stream(string_writer)); } /// Prepends `content` to the element's inner content, i.e. inserts content right after @@ -346,7 +346,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn prepend(&mut self, content: &str, content_type: ContentType) { - self.prepend_chunk((content, content_type).into()); + self.prepend_chunk(StringChunk::from_str(content, content_type)); } fn prepend_chunk(&mut self, chunk: StringChunk) { @@ -370,7 +370,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_prepend(&mut self, string_writer: Box) { - self.prepend_chunk(string_writer.into()); + self.prepend_chunk(StringChunk::Stream(string_writer)); } /// Appends `content` to the element's inner content, i.e. inserts content right before @@ -410,7 +410,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn append(&mut self, content: &str, content_type: ContentType) { - self.append_chunk((content, content_type).into()); + self.append_chunk(StringChunk::from_str(content, content_type)); } fn append_chunk(&mut self, chunk: StringChunk) { @@ -429,7 +429,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
pub fn streaming_append(&mut self, string_writer: Box) { - self.append_chunk(string_writer.into()); + self.append_chunk(StringChunk::Stream(string_writer)); } /// Replaces inner content of the element with `content`. @@ -468,7 +468,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn set_inner_content(&mut self, content: &str, content_type: ContentType) { - self.set_inner_content_chunk((content, content_type).into()); + self.set_inner_content_chunk(StringChunk::from_str(content, content_type)); } fn set_inner_content_chunk(&mut self, chunk: StringChunk) { @@ -492,7 +492,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_set_inner_content(&mut self, string_writer: Box) { - self.set_inner_content_chunk(string_writer.into()); + self.set_inner_content_chunk(StringChunk::Stream(string_writer)); } /// Replaces the element and its inner content with `content`. @@ -524,7 +524,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// ``` #[inline] pub fn replace(&mut self, content: &str, content_type: ContentType) { - self.replace_chunk((content, content_type).into()); + self.replace_chunk(StringChunk::from_str(content, content_type)); } fn replace_chunk(&mut self, chunk: StringChunk) { @@ -543,7 +543,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_replace(&mut self, string_writer: Box) { - self.replace_chunk(string_writer.into()); + self.replace_chunk(StringChunk::Stream(string_writer)); } /// Removes the element and its inner content. 
diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index f5e89e04..7e041f30 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -84,18 +84,17 @@ impl Mutations { } } -impl From<(&str, ContentType)> for StringChunk { - #[inline] - fn from((content, content_type): (&str, ContentType)) -> Self { - Self::Buffer(Box::from(content), content_type) - } -} - pub(crate) enum StringChunk { Buffer(Box, ContentType), Stream(Box), } +impl StringChunk { + pub(crate) fn from_str(content: impl Into>, content_type: ContentType) -> Self { + Self::Buffer(content.into(), content_type) + } +} + #[derive(Default)] pub(crate) struct DynamicString { chunks: Vec, @@ -172,10 +171,3 @@ where (self)(sink) } } - -impl From> for StringChunk { - #[inline] - fn from(writer: Box) -> Self { - Self::Stream(writer) - } -} diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index 190ee857..def1748f 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -2,6 +2,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html_content::StreamingHandler; +use crate::rewritable_units::StringChunk; use encoding_rs::Encoding; use std::any::Any; use std::fmt::{self, Debug}; @@ -110,7 +111,7 @@ impl<'i> Comment<'i> { self.mutations .mutate() .content_before - .push_back((content, content_type).into()); + .push_back(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] before the comment. @@ -119,11 +120,11 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
#[inline] - pub fn streaming_before(&mut self, handler: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.mutations .mutate() .content_before - .push_back(handler.into()); + .push_back(StringChunk::Stream(string_writer)); } /// Inserts `content` after the comment. @@ -158,7 +159,7 @@ impl<'i> Comment<'i> { self.mutations .mutate() .content_after - .push_front((content, content_type).into()); + .push_front(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] after the comment. @@ -167,11 +168,11 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_after(&mut self, handler: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.mutations .mutate() .content_after - .push_front(handler.into()); + .push_front(StringChunk::Stream(string_writer)); } /// Replaces the comment with the `content`. @@ -205,7 +206,7 @@ impl<'i> Comment<'i> { pub fn replace(&mut self, content: &str, content_type: crate::rewritable_units::ContentType) { self.mutations .mutate() - .replace((content, content_type).into()); + .replace(StringChunk::from_str(content, content_type)); } /// Replaces the comment with the content from a [`StreamingHandler`]. @@ -214,8 +215,10 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_replace(&mut self, handler: Box) { - self.mutations.mutate().replace(handler.into()); + pub fn streaming_replace(&mut self, string_writer: Box) { + self.mutations + .mutate() + .replace(StringChunk::Stream(string_writer)); } /// Removes the comment. 
diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index fbd7ee32..4afffc5d 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -2,6 +2,7 @@ use super::{Mutations, Token}; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html_content::{ContentType, StreamingHandler}; +use crate::rewritable_units::StringChunk; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -72,7 +73,7 @@ impl<'i> EndTag<'i> { self.mutations .mutate() .content_before - .push_back((content, content_type).into()); + .push_back(StringChunk::from_str(content, content_type)); } /// Inserts `content` after the end tag. @@ -83,7 +84,7 @@ impl<'i> EndTag<'i> { self.mutations .mutate() .content_after - .push_front((content, content_type).into()); + .push_front(StringChunk::from_str(content, content_type)); } /// Replaces the end tag with `content`. @@ -93,7 +94,7 @@ impl<'i> EndTag<'i> { pub fn replace(&mut self, content: &str, content_type: ContentType) { self.mutations .mutate() - .replace((content, content_type).into()); + .replace(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] before the end tag. @@ -106,7 +107,7 @@ impl<'i> EndTag<'i> { self.mutations .mutate() .content_before - .push_back(string_writer.into()); + .push_back(StringChunk::Stream(string_writer)); } /// Inserts content from a [`StreamingHandler`] after the end tag. @@ -119,7 +120,7 @@ impl<'i> EndTag<'i> { self.mutations .mutate() .content_after - .push_front(string_writer.into()); + .push_front(StringChunk::Stream(string_writer)); } /// Replaces the end tag with content from a [`StreamingHandler`]. @@ -129,7 +130,9 @@ impl<'i> EndTag<'i> { /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
#[inline] pub fn streaming_replace(&mut self, string_writer: Box) { - self.mutations.mutate().replace(string_writer.into()); + self.mutations + .mutate() + .replace(StringChunk::Stream(string_writer)); } /// Removes the end tag. diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 5b637d71..1f96ec29 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -4,6 +4,7 @@ use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::Namespace; use crate::html_content::{ContentType, StreamingHandler}; +use crate::rewritable_units::StringChunk; use encoding_rs::Encoding; use std::fmt::{self, Debug}; @@ -115,7 +116,7 @@ impl<'i> StartTag<'i> { self.mutations .mutate() .content_before - .push_back((content, content_type).into()); + .push_back(StringChunk::from_str(content, content_type)); } /// Inserts `content` after the start tag. @@ -126,7 +127,7 @@ impl<'i> StartTag<'i> { self.mutations .mutate() .content_after - .push_front((content, content_type).into()); + .push_front(StringChunk::from_str(content, content_type)); } /// Replaces the start tag with `content`. @@ -136,7 +137,7 @@ impl<'i> StartTag<'i> { pub fn replace(&mut self, content: &str, content_type: ContentType) { self.mutations .mutate() - .replace((content, content_type).into()); + .replace(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] before the start tag. @@ -148,7 +149,7 @@ impl<'i> StartTag<'i> { self.mutations .mutate() .content_before - .push_back(string_writer.into()); + .push_back(StringChunk::Stream(string_writer)); } /// Inserts content from a [`StreamingHandler`] after the start tag. @@ -160,7 +161,7 @@ impl<'i> StartTag<'i> { self.mutations .mutate() .content_after - .push_front(string_writer.into()); + .push_front(StringChunk::Stream(string_writer)); } /// Replaces the start tag with the content from a [`StreamingHandler`]. 
@@ -169,7 +170,9 @@ impl<'i> StartTag<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_replace(&mut self, string_writer: Box) { - self.mutations.mutate().replace(string_writer.into()); + self.mutations + .mutate() + .replace(StringChunk::Stream(string_writer)); } /// Removes the start tag. diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 7af680a1..0c653cd7 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -3,6 +3,7 @@ use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::TextType; use crate::html_content::{ContentType, StreamingHandler}; +use crate::rewritable_units::StringChunk; use encoding_rs::Encoding; use std::any::Any; use std::borrow::Cow; @@ -189,7 +190,7 @@ impl<'i> TextChunk<'i> { self.mutations .mutate() .content_before - .push_back((content, content_type).into()); + .push_back(StringChunk::from_str(content, content_type)); } /// Inserts `content` after the text chunk. @@ -226,7 +227,7 @@ impl<'i> TextChunk<'i> { self.mutations .mutate() .content_after - .push_front((content, content_type).into()); + .push_front(StringChunk::from_str(content, content_type)); } /// Replaces the text chunk with the `content`. @@ -262,7 +263,7 @@ impl<'i> TextChunk<'i> { pub fn replace(&mut self, content: &str, content_type: ContentType) { self.mutations .mutate() - .replace((content, content_type).into()); + .replace(StringChunk::from_str(content, content_type)); } /// Inserts content from a [`StreamingHandler`] before the text chunk. @@ -274,7 +275,7 @@ impl<'i> TextChunk<'i> { self.mutations .mutate() .content_before - .push_back(string_writer.into()); + .push_back(StringChunk::Stream(string_writer)); } /// Inserts content from a [`StreamingHandler`] after the text chunk. 
@@ -286,7 +287,7 @@ impl<'i> TextChunk<'i> { self.mutations .mutate() .content_after - .push_front(string_writer.into()); + .push_front(StringChunk::Stream(string_writer)); } /// Replaces the text chunk with the content from a [`StreamingHandler`]. @@ -295,7 +296,9 @@ impl<'i> TextChunk<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. pub fn streaming_replace(&mut self, string_writer: Box) { - self.mutations.mutate().replace(string_writer.into()); + self.mutations + .mutate() + .replace(StringChunk::Stream(string_writer)); } /// Removes the text chunk. From 50b9d52e7638a856bf464f0a89c1970b194587c0 Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 27 Nov 2024 17:31:03 +0000 Subject: [PATCH 14/15] Allow non-Send StreamingHandler to exist --- src/rewritable_units/element.rs | 12 ++++++------ src/rewritable_units/mutations.rs | 6 +++--- src/rewritable_units/tokens/comment.rs | 6 +++--- src/rewritable_units/tokens/end_tag.rs | 6 +++--- src/rewritable_units/tokens/start_tag.rs | 6 +++--- src/rewritable_units/tokens/text_chunk.rs | 6 +++--- src/rewriter/settings.rs | 4 ++-- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index acc7a88e..a96edf71 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -249,7 +249,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// Consequent calls to the method append to the previously inserted content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_before(&mut self, string_writer: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.start_tag .mutations .mutate() @@ -305,7 +305,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
- pub fn streaming_after(&mut self, string_writer: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.after_chunk(StringChunk::Stream(string_writer)); } @@ -369,7 +369,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_prepend(&mut self, string_writer: Box) { + pub fn streaming_prepend(&mut self, string_writer: Box) { self.prepend_chunk(StringChunk::Stream(string_writer)); } @@ -428,7 +428,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// [empty element]: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_append(&mut self, string_writer: Box) { + pub fn streaming_append(&mut self, string_writer: Box) { self.append_chunk(StringChunk::Stream(string_writer)); } @@ -491,7 +491,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_set_inner_content(&mut self, string_writer: Box) { + pub fn streaming_set_inner_content(&mut self, string_writer: Box) { self.set_inner_content_chunk(StringChunk::Stream(string_writer)); } @@ -542,7 +542,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { /// /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
- pub fn streaming_replace(&mut self, string_writer: Box) { + pub fn streaming_replace(&mut self, string_writer: Box) { self.replace_chunk(StringChunk::Stream(string_writer)); } diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs index 7e041f30..409a499d 100644 --- a/src/rewritable_units/mutations.rs +++ b/src/rewritable_units/mutations.rs @@ -86,7 +86,7 @@ impl Mutations { pub(crate) enum StringChunk { Buffer(Box, ContentType), - Stream(Box), + Stream(Box), } impl StringChunk { @@ -137,7 +137,7 @@ impl DynamicString { } /// A callback used to write content asynchronously. -pub trait StreamingHandler: Send { +pub trait StreamingHandler { /// This method is called only once, and is expected to write content /// by calling the [`sink.write_str()`](StreamingHandlerSink::write_str) one or more times. /// @@ -152,7 +152,7 @@ pub trait StreamingHandler: Send { impl RefUnwindSafe for StringChunk {} impl UnwindSafe for StringChunk {} -impl From for Box +impl From for Box where F: FnOnce(&mut StreamingHandlerSink<'_>) -> BoxResult + Send + 'static, { diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index def1748f..4ef4681e 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -120,7 +120,7 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_before(&mut self, string_writer: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.mutations .mutate() .content_before @@ -168,7 +168,7 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
#[inline] - pub fn streaming_after(&mut self, string_writer: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.mutations .mutate() .content_after @@ -215,7 +215,7 @@ impl<'i> Comment<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_replace(&mut self, string_writer: Box) { + pub fn streaming_replace(&mut self, string_writer: Box) { self.mutations .mutate() .replace(StringChunk::Stream(string_writer)); diff --git a/src/rewritable_units/tokens/end_tag.rs b/src/rewritable_units/tokens/end_tag.rs index 4afffc5d..235ca64f 100644 --- a/src/rewritable_units/tokens/end_tag.rs +++ b/src/rewritable_units/tokens/end_tag.rs @@ -103,7 +103,7 @@ impl<'i> EndTag<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_before(&mut self, string_writer: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.mutations .mutate() .content_before @@ -116,7 +116,7 @@ impl<'i> EndTag<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_after(&mut self, string_writer: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.mutations .mutate() .content_after @@ -129,7 +129,7 @@ impl<'i> EndTag<'i> { /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. #[inline] - pub fn streaming_replace(&mut self, string_writer: Box) { + pub fn streaming_replace(&mut self, string_writer: Box) { self.mutations .mutate() .replace(StringChunk::Stream(string_writer)); diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 1f96ec29..a808622a 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -145,7 +145,7 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method append to the previously inserted content. 
/// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_before(&mut self, string_writer: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.mutations .mutate() .content_before @@ -157,7 +157,7 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method prepend to the previously inserted content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_after(&mut self, string_writer: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.mutations .mutate() .content_after @@ -169,7 +169,7 @@ impl<'i> StartTag<'i> { /// Consequent calls to the method overwrite previous replacement content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_replace(&mut self, string_writer: Box) { + pub fn streaming_replace(&mut self, string_writer: Box) { self.mutations .mutate() .replace(StringChunk::Stream(string_writer)); diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 0c653cd7..10ad1b3f 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -271,7 +271,7 @@ impl<'i> TextChunk<'i> { /// Consequent calls to the method append `content` to the previously inserted content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_before(&mut self, string_writer: Box) { + pub fn streaming_before(&mut self, string_writer: Box) { self.mutations .mutate() .content_before @@ -283,7 +283,7 @@ impl<'i> TextChunk<'i> { /// Consequent calls to the method prepend to the previously inserted content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. 
- pub fn streaming_after(&mut self, string_writer: Box) { + pub fn streaming_after(&mut self, string_writer: Box) { self.mutations .mutate() .content_after @@ -295,7 +295,7 @@ impl<'i> TextChunk<'i> { /// Consequent calls to the method overwrite previous replacement content. /// /// Use the [`streaming!`] macro to make a `StreamingHandler` from a closure. - pub fn streaming_replace(&mut self, string_writer: Box) { + pub fn streaming_replace(&mut self, string_writer: Box) { self.mutations .mutate() .replace(StringChunk::Stream(string_writer)); diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index 92d1d5d2..e12114cf 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -538,13 +538,13 @@ macro_rules! streaming { ) -> StreamingHandler where StreamingHandler: - FnOnce(&mut StreamingHandlerSink<'_>) -> Result<(), Box> + 'static + Send, + FnOnce(&mut StreamingHandlerSink<'_>) -> Result<(), Box> + 'static, { handler_closure } Box::new(streaming_macro_type_hint($closure)) - as Box + as Box }}; } From d927fdd82f5b77a03e5952bca7bb6d0a2b840ca7 Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 27 Nov 2024 18:36:23 +0000 Subject: [PATCH 15/15] Fix unsafe --- src/base/debug_trace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/debug_trace.rs b/src/base/debug_trace.rs index cb734e65..a843d63d 100644 --- a/src/base/debug_trace.rs +++ b/src/base/debug_trace.rs @@ -11,7 +11,7 @@ cfg_if! { $({ use std::char; - print!(": {:?}", $ch.map(|ch| unsafe { char::from_u32_unchecked(ch as u32) })); + print!(": {:?}", $ch.map(|ch| char::from_u32(ch as u32).unwrap_or('\u{fffd}') )); })* println!();