diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fb50e08..dc749e54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
+### Fixed
+ - [741](https://github.com/thoth-pub/thoth/pull/741) - Harden JATS rich-text handling by rejecting malformed or nested markup and abstract line breaks on write, and normalise Crossref abstract output to avoid invalid nested `jats:p` and `jats:break` elements
## [[1.0.2]](https://github.com/thoth-pub/thoth/releases/tag/v1.0.2) - 2026-04-03
### Security
diff --git a/Cargo.lock b/Cargo.lock
index 77fc055a..35def495 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5010,6 +5010,7 @@ dependencies = [
"lazy_static",
"log",
"pulldown-cmark",
+ "quick-xml",
"rand 0.9.2",
"regex",
"scraper",
diff --git a/thoth-api/Cargo.toml b/thoth-api/Cargo.toml
index 8c59df80..f932a28a 100644
--- a/thoth-api/Cargo.toml
+++ b/thoth-api/Cargo.toml
@@ -46,6 +46,7 @@ jsonwebtoken = { version = "10.3.0", optional = true }
juniper = { version = "0.16.1", features = ["chrono", "schema-language", "uuid"] }
lazy_static = "1.5.0"
pulldown-cmark = "0.13.0"
+quick-xml = "0.36"
rand = { version = "0.9.0", optional = true }
regex = "1.11.1"
scraper = "0.20.0"
@@ -64,4 +65,4 @@ log = "0.4.26"
[dev-dependencies]
fs2 = "0.4.3"
-tokio = { version = "1.44", features = ["macros", "rt"] }
+tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread"] }
diff --git a/thoth-api/src/graphql/tests.rs b/thoth-api/src/graphql/tests.rs
index 572b2080..7179322b 100644
--- a/thoth-api/src/graphql/tests.rs
+++ b/thoth-api/src/graphql/tests.rs
@@ -1195,11 +1195,19 @@ fn patch_title(title: &Title) -> PatchTitle {
}
}
+fn append_to_jats_paragraph_content(content: &str, suffix: &str) -> String {
+ if let Some((head, tail)) = content.rsplit_once("
") {
+ format!("{head}{suffix}{tail}")
+ } else {
+ format!("{content}{suffix}")
+ }
+}
+
fn patch_abstract(abstract_item: &Abstract) -> PatchAbstract {
PatchAbstract {
abstract_id: abstract_item.abstract_id,
work_id: abstract_item.work_id,
- content: format!("{} Updated", abstract_item.content),
+ content: append_to_jats_paragraph_content(&abstract_item.content, " Updated"),
locale_code: abstract_item.locale_code,
abstract_type: abstract_item.abstract_type,
canonical: abstract_item.canonical,
@@ -1210,7 +1218,7 @@ fn patch_biography(biography: &Biography) -> PatchBiography {
PatchBiography {
biography_id: biography.biography_id,
contribution_id: biography.contribution_id,
- content: format!("{} Updated", biography.content),
+ content: append_to_jats_paragraph_content(&biography.content, " Updated"),
canonical: biography.canonical,
locale_code: biography.locale_code,
}
@@ -2551,7 +2559,7 @@ query LinkedRelations($reviewId: Uuid!, $endorsementId: Uuid!) {
}
#[test]
-fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
+fn graphql_markup_mutations_accept_valid_jatsxml_but_reject_breaks_and_markup_like_plain_text() {
let (_guard, pool) = test_db::setup_test_db();
let schema = create_schema();
let superuser = test_db::test_superuser("user-jats-xml-mutations");
@@ -2597,12 +2605,17 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
);
let abstract_item = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap();
- update_with_data_and_markup(
- &schema,
- &context,
- "updateAbstract",
- "PatchAbstract",
- "abstractId",
+ let abstract_query = r#"
+ mutation UpdateAbstract($data: PatchAbstract!, $markup: MarkupFormat!) {
+ updateAbstract(data: $data, markupFormat: $markup) {
+ abstractId
+ }
+ }
+ "#;
+ let mut abstract_vars = Variables::new();
+ insert_var(
+ &mut abstract_vars,
+ "data",
PatchAbstract {
abstract_id: abstract_item.abstract_id,
work_id: abstract_item.work_id,
@@ -2613,22 +2626,28 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
abstract_type: abstract_item.abstract_type,
canonical: abstract_item.canonical,
},
- MarkupFormat::PlainText,
);
-
- let stored_abstract = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap();
- assert_eq!(
- stored_abstract.content,
- "First lineSecond line with E=mc^2 and user@example.org and https://example.org
"
+ insert_var(&mut abstract_vars, "markup", MarkupFormat::PlainText);
+ let (_, abstract_errors) =
+ juniper::execute_sync(abstract_query, None, &schema, &abstract_vars, &context)
+ .expect("GraphQL execution should succeed with validation errors");
+ assert!(
+ !abstract_errors.is_empty(),
+ "Expected abstract validation error"
);
let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
- update_with_data_and_markup(
- &schema,
- &context,
- "updateBiography",
- "PatchBiography",
- "biographyId",
+ let biography_query = r#"
+ mutation UpdateBiography($data: PatchBiography!, $markup: MarkupFormat!) {
+ updateBiography(data: $data, markupFormat: $markup) {
+ biographyId
+ }
+ }
+ "#;
+ let mut biography_vars = Variables::new();
+ insert_var(
+ &mut biography_vars,
+ "data",
PatchBiography {
biography_id: biography.biography_id,
contribution_id: biography.contribution_id,
@@ -2636,13 +2655,14 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() {
canonical: biography.canonical,
locale_code: biography.locale_code,
},
- MarkupFormat::JatsXml,
);
-
- let stored_biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
- assert_eq!(
- stored_biography.content,
- "Bio linex^2 bio@example.org https://bio.example.org
"
+ insert_var(&mut biography_vars, "markup", MarkupFormat::JatsXml);
+ let (_, biography_errors) =
+ juniper::execute_sync(biography_query, None, &schema, &biography_vars, &context)
+ .expect("GraphQL execution should succeed with validation errors");
+ assert!(
+ !biography_errors.is_empty(),
+ "Expected biography validation error"
);
}
@@ -3111,7 +3131,7 @@ fn graphql_mutations_cover_all() {
"PatchAbstract",
"abstractId",
patch_abstract(&abstract_item),
- MarkupFormat::PlainText,
+ MarkupFormat::JatsXml,
);
let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap();
@@ -3122,7 +3142,7 @@ fn graphql_mutations_cover_all() {
"PatchBiography",
"biographyId",
patch_biography(&biography),
- MarkupFormat::PlainText,
+ MarkupFormat::JatsXml,
);
let work = Work::from_id(pool.as_ref(), &seed.book_work_id).unwrap();
diff --git a/thoth-api/src/markup/ast.rs b/thoth-api/src/markup/ast.rs
index 4870a173..8376c4a6 100644
--- a/thoth-api/src/markup/ast.rs
+++ b/thoth-api/src/markup/ast.rs
@@ -61,7 +61,7 @@ fn push_node_to_top(stack: &mut [Node], node: Node) {
}
}
-fn normalize_text_segments(text: &str) -> Vec {
+fn normalise_text_segments(text: &str) -> Vec {
let pattern = regex::Regex::new(
r"(?x)
(?P\$[^$\n]+\$)
@@ -114,22 +114,22 @@ fn normalize_text_segments(text: &str) -> Vec {
result
}
-fn normalize_node(node: Node) -> Node {
+fn normalise_node(node: Node) -> Node {
match node {
- Node::Document(children) => Node::Document(normalize_children(children)),
- Node::Paragraph(children) => Node::Paragraph(normalize_children(children)),
- Node::Bold(children) => Node::Bold(normalize_children(children)),
- Node::Italic(children) => Node::Italic(normalize_children(children)),
- Node::Underline(children) => Node::Underline(normalize_children(children)),
- Node::Strikethrough(children) => Node::Strikethrough(normalize_children(children)),
- Node::Code(children) => Node::Code(normalize_children(children)),
- Node::Superscript(children) => Node::Superscript(normalize_children(children)),
- Node::Subscript(children) => Node::Subscript(normalize_children(children)),
- Node::SmallCaps(children) => Node::SmallCaps(normalize_children(children)),
- Node::List(children) => Node::List(normalize_children(children)),
- Node::ListItem(children) => Node::ListItem(normalize_children(children)),
+ Node::Document(children) => Node::Document(normalise_children(children)),
+ Node::Paragraph(children) => Node::Paragraph(normalise_children(children)),
+ Node::Bold(children) => Node::Bold(normalise_children(children)),
+ Node::Italic(children) => Node::Italic(normalise_children(children)),
+ Node::Underline(children) => Node::Underline(normalise_children(children)),
+ Node::Strikethrough(children) => Node::Strikethrough(normalise_children(children)),
+ Node::Code(children) => Node::Code(normalise_children(children)),
+ Node::Superscript(children) => Node::Superscript(normalise_children(children)),
+ Node::Subscript(children) => Node::Subscript(normalise_children(children)),
+ Node::SmallCaps(children) => Node::SmallCaps(normalise_children(children)),
+ Node::List(children) => Node::List(normalise_children(children)),
+ Node::ListItem(children) => Node::ListItem(normalise_children(children)),
Node::Link { url, text } => {
- let text = normalize_children(text);
+ let text = normalise_children(text);
let plain = inline_text_to_plain_text(&text);
if url.starts_with("mailto:") {
let email = url.trim_start_matches("mailto:");
@@ -147,7 +147,7 @@ fn normalize_node(node: Node) -> Node {
}
}
Node::Text(text) => {
- let segments = normalize_text_segments(&text);
+ let segments = normalise_text_segments(&text);
if segments.len() == 1 {
segments.into_iter().next().unwrap()
} else {
@@ -158,21 +158,21 @@ fn normalize_node(node: Node) -> Node {
}
}
-fn normalize_children(children: Vec) -> Vec {
- let mut normalized = Vec::new();
+fn normalise_children(children: Vec) -> Vec {
+ let mut normalised = Vec::new();
for child in children {
- match normalize_node(child) {
- Node::Document(grandchildren) => normalized.extend(grandchildren),
- node => normalized.push(node),
+ match normalise_node(child) {
+ Node::Document(grandchildren) => normalised.extend(grandchildren),
+ node => normalised.push(node),
}
}
- normalized
+ normalised
}
-fn normalize_inline_root(result: Node) -> Node {
- match normalize_node(result) {
+fn normalise_inline_root(result: Node) -> Node {
+ match normalise_node(result) {
Node::Document(children) => {
if children.len() > 1 {
let all_inline = children.iter().all(is_inline_node);
@@ -206,6 +206,296 @@ fn normalize_inline_root(result: Node) -> Node {
}
}
+fn has_visible_text(text: &str) -> bool {
+ !text.trim().is_empty()
+}
+
+fn has_visible_content(node: &Node) -> bool {
+ match node {
+ Node::Document(children)
+ | Node::Paragraph(children)
+ | Node::Bold(children)
+ | Node::Italic(children)
+ | Node::Underline(children)
+ | Node::Strikethrough(children)
+ | Node::Code(children)
+ | Node::Superscript(children)
+ | Node::Subscript(children)
+ | Node::SmallCaps(children)
+ | Node::List(children)
+ | Node::ListItem(children) => children.iter().any(has_visible_content),
+ Node::Link { text, .. } => text.iter().any(has_visible_content),
+ Node::Text(text) => has_visible_text(text),
+ Node::InlineFormula(tex) => !tex.trim().is_empty(),
+ Node::Email(email) => !email.trim().is_empty(),
+ Node::Uri(uri) => !uri.trim().is_empty(),
+ Node::Break => false,
+ }
+}
+
+fn flush_crossref_inline_buffer(
+ inline_buffer: &mut Vec,
+ normalised: &mut Vec,
+) -> ThothResult<()> {
+ if inline_buffer.iter().any(has_visible_content) {
+ let children = std::mem::take(inline_buffer);
+ normalised.extend(normalise_crossref_paragraph_children(children)?);
+ } else {
+ inline_buffer.clear();
+ }
+ Ok(())
+}
+
+fn normalise_crossref_inline_node(node: Node) -> ThothResult> {
+ match node {
+ Node::Document(children) => {
+ let mut normalised = Vec::new();
+ for child in children {
+ normalised.extend(normalise_crossref_inline_node(child)?);
+ }
+ Ok(normalised)
+ }
+ Node::Paragraph(_) | Node::List(_) | Node::ListItem(_) => Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain nested block elements inside paragraphs."
+ .to_string(),
+ )),
+ Node::Break => Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain line breaks; use separate paragraphs instead."
+ .to_string(),
+ )),
+ Node::Bold(children) => Ok(vec![Node::Bold(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::Italic(children) => Ok(vec![Node::Italic(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::Underline(children) => Ok(vec![Node::Underline(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::Strikethrough(children) => Ok(vec![Node::Strikethrough(
+ normalise_crossref_inline_children(children)?,
+ )]),
+ Node::Code(children) => Ok(vec![Node::Code(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::Superscript(children) => Ok(vec![Node::Superscript(
+ normalise_crossref_inline_children(children)?,
+ )]),
+ Node::Subscript(children) => Ok(vec![Node::Subscript(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::SmallCaps(children) => Ok(vec![Node::SmallCaps(normalise_crossref_inline_children(
+ children,
+ )?)]),
+ Node::Link { url, text } => Ok(vec![Node::Link {
+ url,
+ text: normalise_crossref_inline_children(text)?,
+ }]),
+ Node::InlineFormula(tex) => Ok(vec![Node::InlineFormula(tex)]),
+ Node::Email(email) => Ok(vec![Node::Email(email)]),
+ Node::Uri(uri) => Ok(vec![Node::Uri(uri)]),
+ Node::Text(text) => Ok(vec![Node::Text(text)]),
+ }
+}
+
+fn normalise_crossref_inline_children(children: Vec) -> ThothResult> {
+ let mut normalised = Vec::new();
+ for child in children {
+ normalised.extend(normalise_crossref_inline_node(child)?);
+ }
+ Ok(normalised)
+}
+
+fn normalise_crossref_paragraph_children(children: Vec) -> ThothResult> {
+ let mut normalised = Vec::new();
+ let mut segment = Vec::new();
+
+ for child in children {
+ match child {
+ Node::Break => {
+ if segment.iter().any(has_visible_content) {
+ normalised.push(Node::Paragraph(std::mem::take(&mut segment)));
+ }
+ }
+ other => {
+ segment.extend(normalise_crossref_inline_node(other)?);
+ }
+ }
+ }
+
+ if segment.iter().any(has_visible_content) {
+ normalised.push(Node::Paragraph(segment));
+ }
+
+ Ok(normalised)
+}
+
+fn normalise_crossref_list_item(children: Vec) -> ThothResult {
+ let mut normalised = Vec::new();
+ let mut inline_buffer = Vec::new();
+
+ for child in children {
+ match child {
+ Node::Document(grandchildren) => {
+ for grandchild in grandchildren {
+ match grandchild {
+ Node::Paragraph(paragraph_children) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ normalised.extend(normalise_crossref_paragraph_children(
+ paragraph_children,
+ )?);
+ }
+ inline if is_inline_node(&inline) => inline_buffer.push(inline),
+ Node::Break => {
+ return Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain line breaks; use separate paragraphs instead."
+ .to_string(),
+ ))
+ }
+ _ => {
+ return Err(ThothError::RequestError(
+ "Crossref abstract lists can only contain inline content or paragraphs."
+ .to_string(),
+ ))
+ }
+ }
+ }
+ }
+ Node::Paragraph(paragraph_children) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ normalised.extend(normalise_crossref_paragraph_children(paragraph_children)?);
+ }
+ inline if is_inline_node(&inline) => inline_buffer.push(inline),
+ Node::Break => return Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain line breaks; use separate paragraphs instead."
+ .to_string(),
+ )),
+ _ => {
+ return Err(ThothError::RequestError(
+ "Crossref abstract lists can only contain inline content or paragraphs."
+ .to_string(),
+ ));
+ }
+ }
+ }
+
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ Ok(Node::ListItem(normalised))
+}
+
+/// Normalise stored abstract markup into the subset we safely emit to Crossref.
+pub fn normalise_crossref_abstract_ast(node: Node) -> ThothResult {
+ let mut normalised = Vec::new();
+ let mut inline_buffer = Vec::new();
+
+ let top_level_children = match node {
+ Node::Document(children) => children,
+ other => vec![other],
+ };
+
+ for child in top_level_children {
+ match child {
+ Node::Document(grandchildren) => {
+ for grandchild in grandchildren {
+ match grandchild {
+ Node::Document(_) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ let Node::Document(children) =
+ normalise_crossref_abstract_ast(grandchild)?
+ else {
+ unreachable!();
+ };
+ normalised.extend(children);
+ }
+ Node::Paragraph(paragraph_children) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ normalised.extend(normalise_crossref_paragraph_children(
+ paragraph_children,
+ )?);
+ }
+ Node::List(items) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ let normalised_items = items
+ .into_iter()
+ .map(|item| match item {
+ Node::ListItem(children) => normalise_crossref_list_item(children),
+ _ => Err(ThothError::RequestError(
+ "Crossref abstract lists must contain list-item elements."
+ .to_string(),
+ )),
+ })
+ .collect::>>()?;
+ normalised.push(Node::List(normalised_items));
+ }
+ Node::Break => {
+ return Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain line breaks; use separate paragraphs instead."
+ .to_string(),
+ ))
+ }
+ Node::ListItem(_) => {
+ return Err(ThothError::RequestError(
+ "Crossref abstract lists must contain list-item elements."
+ .to_string(),
+ ))
+ }
+ other => {
+ if is_inline_node(&other) {
+ inline_buffer.push(other);
+ } else {
+ return Err(ThothError::RequestError(
+ "Crossref abstracts contain unsupported block structure."
+ .to_string(),
+ ));
+ }
+ }
+ }
+ }
+ }
+ Node::Paragraph(paragraph_children) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ normalised.extend(normalise_crossref_paragraph_children(paragraph_children)?);
+ }
+ Node::List(items) => {
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ let normalised_items = items
+ .into_iter()
+ .map(|item| match item {
+ Node::ListItem(children) => normalise_crossref_list_item(children),
+ _ => Err(ThothError::RequestError(
+ "Crossref abstract lists must contain list-item elements.".to_string(),
+ )),
+ })
+ .collect::>>()?;
+ normalised.push(Node::List(normalised_items));
+ }
+ Node::Break => return Err(ThothError::RequestError(
+ "Crossref abstracts cannot contain line breaks; use separate paragraphs instead."
+ .to_string(),
+ )),
+ Node::ListItem(_) => {
+ return Err(ThothError::RequestError(
+ "Crossref abstract lists must contain list-item elements.".to_string(),
+ ));
+ }
+ other => {
+ if is_inline_node(&other) {
+ inline_buffer.push(other);
+ } else {
+ return Err(ThothError::RequestError(
+ "Crossref abstracts contain unsupported block structure.".to_string(),
+ ));
+ }
+ }
+ }
+ }
+
+ flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?;
+ normalised.retain(has_visible_content);
+
+ Ok(Node::Document(normalised))
+}
+
// Convert Markdown string to AST
pub fn markdown_to_ast(markdown: &str) -> Node {
let parser = Parser::new(markdown);
@@ -247,7 +537,7 @@ pub fn markdown_to_ast(markdown: &str) -> Node {
}
let result = stack.pop().unwrap_or_else(|| Node::Document(vec![]));
- normalize_inline_root(result)
+ normalise_inline_root(result)
}
// Convert HTML string to AST
@@ -322,7 +612,7 @@ pub fn html_to_ast(html: &str) -> Node {
// If there's a body tag, parse its contents, otherwise parse the whole document
if let Some(body_element) = document.select(&body_selector).next() {
- normalize_inline_root(parse_element_to_node(body_element))
+ normalise_inline_root(parse_element_to_node(body_element))
} else {
// If no body tag, create a document node with all top-level elements
let mut children = Vec::new();
@@ -333,7 +623,7 @@ pub fn html_to_ast(html: &str) -> Node {
}
let result = Node::Document(children);
- normalize_inline_root(result)
+ normalise_inline_root(result)
}
}
@@ -353,7 +643,7 @@ pub fn plain_text_to_ast(text: &str) -> Node {
if paragraphs.len() == 1 {
let lines: Vec<&str> = paragraphs[0].split('\n').collect();
if lines.len() == 1 {
- let parsed_nodes = normalize_text_segments(lines[0]);
+ let parsed_nodes = normalise_text_segments(lines[0]);
if parsed_nodes.len() == 1 {
parsed_nodes[0].clone()
} else {
@@ -362,7 +652,7 @@ pub fn plain_text_to_ast(text: &str) -> Node {
} else {
let mut children = Vec::new();
for (index, line) in lines.iter().enumerate() {
- children.extend(normalize_text_segments(line));
+ children.extend(normalise_text_segments(line));
if index + 1 < lines.len() {
children.push(Node::Break);
}
@@ -375,7 +665,7 @@ pub fn plain_text_to_ast(text: &str) -> Node {
let lines: Vec<&str> = paragraph.split('\n').collect();
let mut children = Vec::new();
for (index, line) in lines.iter().enumerate() {
- children.extend(normalize_text_segments(line));
+ children.extend(normalise_text_segments(line));
if index + 1 < lines.len() {
children.push(Node::Break);
}
@@ -511,7 +801,7 @@ pub fn jats_to_ast(jats: &str) -> Node {
.unwrap();
let email_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap();
let uri_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap();
- let normalized_jats = uri_pattern
+ let normalised_jats = uri_pattern
.replace_all(
&email_pattern.replace_all(
&inline_formula_pattern.replace_all(
@@ -608,12 +898,12 @@ pub fn jats_to_ast(jats: &str) -> Node {
}
}
- let document = Html::parse_document(&normalized_jats);
+ let document = Html::parse_document(&normalised_jats);
let body_selector = Selector::parse("body").unwrap();
// If there's a body tag, parse its contents, otherwise parse the whole document
if let Some(body_element) = document.select(&body_selector).next() {
- normalize_inline_root(parse_jats_element_to_node(body_element))
+ normalise_inline_root(parse_jats_element_to_node(body_element))
} else {
// If no body tag, create a document node with all top-level elements
let mut children = Vec::new();
@@ -628,10 +918,10 @@ pub fn jats_to_ast(jats: &str) -> Node {
// Otherwise, wrap in document
match &children[0] {
Node::Text(_) => children.into_iter().next().unwrap(),
- _ => normalize_inline_root(Node::Document(children)),
+ _ => normalise_inline_root(Node::Document(children)),
}
} else {
- normalize_inline_root(Node::Document(children))
+ normalise_inline_root(Node::Document(children))
}
}
}
@@ -1191,7 +1481,11 @@ fn validate_title_content(node: &Node) -> ThothResult<()> {
}
}
Node::Paragraph(children) => {
- // Paragraphs are allowed in titles, but only for grouping inline elements
+ if children.iter().any(|child| !is_inline_node(child)) {
+ return Err(ThothError::RequestError(
+ "Titles cannot contain nested block elements inside paragraphs.".to_string(),
+ ));
+ }
for child in children {
validate_title_content(child)?;
}
@@ -1242,8 +1536,18 @@ fn validate_abstract_content(node: &Node) -> ThothResult<()> {
validate_abstract_content(child)?;
}
}
- Node::Paragraph(children)
- | Node::Bold(children)
+ Node::Paragraph(children) => {
+ if children.iter().any(|child| !is_inline_node(child)) {
+ return Err(ThothError::RequestError(
+ "Abstracts and biographies cannot contain nested block elements inside paragraphs."
+ .to_string(),
+ ));
+ }
+ for child in children {
+ validate_abstract_content(child)?;
+ }
+ }
+ Node::Bold(children)
| Node::Italic(children)
| Node::Underline(children)
| Node::Strikethrough(children)
@@ -1255,7 +1559,13 @@ fn validate_abstract_content(node: &Node) -> ThothResult<()> {
validate_abstract_content(child)?;
}
}
- Node::Break | Node::InlineFormula(_) | Node::Email(_) | Node::Uri(_) => {}
+ Node::Break => {
+ return Err(ThothError::RequestError(
+ "Line breaks are not allowed in abstracts or biographies. Use separate paragraphs instead."
+ .to_string(),
+ ))
+ }
+ Node::InlineFormula(_) | Node::Email(_) | Node::Uri(_) => {}
Node::List(children) | Node::ListItem(children) => {
for child in children {
validate_abstract_content(child)?;
@@ -2062,6 +2372,24 @@ mod tests {
assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_ok());
}
+ #[test]
+ fn test_validate_abstract_content_disallows_breaks() {
+ let ast = Node::Document(vec![Node::Paragraph(vec![
+ Node::Text("First line".to_string()),
+ Node::Break,
+ Node::Text("Second line".to_string()),
+ ])]);
+ assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_err());
+ }
+
+ #[test]
+ fn test_validate_abstract_content_disallows_nested_block_elements_inside_paragraphs() {
+ let ast = Node::Document(vec![Node::Paragraph(vec![Node::List(vec![
+ Node::ListItem(vec![Node::Text("Item".to_string())]),
+ ])])]);
+ assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_err());
+ }
+
#[test]
fn test_contains_disallowed_title_elements_html() {
let content = "Title with
";
diff --git a/thoth-api/src/markup/mod.rs b/thoth-api/src/markup/mod.rs
index 42d5275a..62b1867c 100644
--- a/thoth-api/src/markup/mod.rs
+++ b/thoth-api/src/markup/mod.rs
@@ -6,7 +6,7 @@ pub mod ast;
use ast::{
ast_to_html, ast_to_jats, ast_to_markdown, ast_to_plain_text, html_to_ast, jats_to_ast,
- markdown_to_ast, plain_text_ast_to_jats, plain_text_to_ast,
+ markdown_to_ast, normalise_crossref_abstract_ast, plain_text_ast_to_jats, plain_text_to_ast,
strip_structural_elements_from_ast_for_conversion, validate_ast_content,
};
@@ -53,40 +53,187 @@ fn looks_like_markup(content: &str) -> bool {
.is_match(content)
}
-fn validate_jats_subset(content: &str) -> ThothResult<()> {
- let allowed_tags = [
- "p",
- "break",
- "bold",
- "italic",
- "underline",
- "strike",
- "monospace",
- "sup",
- "sub",
- "sc",
- "list",
- "list-item",
- "ext-link",
- "inline-formula",
- "tex-math",
- "email",
- "uri",
- "article",
- "body",
- "sec",
- "div",
- ];
-
- let tag_pattern = regex::Regex::new(r"?\s*([A-Za-z][A-Za-z0-9:-]*)\b").unwrap();
- for captures in tag_pattern.captures_iter(content) {
- let tag_name = captures.get(1).unwrap().as_str();
- if !allowed_tags.contains(&tag_name) {
- return Err(ThothError::RequestError(format!(
- "Unsupported JATS element: <{}>",
- tag_name
- )));
+fn validate_jats_subset(content: &str, conversion_limit: ConversionLimit) -> ThothResult<()> {
+ use quick_xml::events::Event;
+ use quick_xml::Reader;
+
+ fn local_tag_name(raw_name: &str) -> &str {
+ raw_name.rsplit(':').next().unwrap_or(raw_name)
+ }
+
+ let allowed_tags: &[&str] = match conversion_limit {
+ ConversionLimit::Title => &[
+ "bold",
+ "italic",
+ "underline",
+ "strike",
+ "monospace",
+ "sup",
+ "sub",
+ "sc",
+ "ext-link",
+ "inline-formula",
+ "tex-math",
+ "email",
+ "uri",
+ ],
+ ConversionLimit::Abstract | ConversionLimit::Biography => &[
+ "p",
+ "bold",
+ "italic",
+ "underline",
+ "strike",
+ "monospace",
+ "sup",
+ "sub",
+ "sc",
+ "list",
+ "list-item",
+ "ext-link",
+ "inline-formula",
+ "tex-math",
+ "email",
+ "uri",
+ ],
+ };
+
+ let wrapped = format!(
+ r#"{}"#,
+ content
+ );
+ let mut reader = Reader::from_str(&wrapped);
+ let mut buf = Vec::new();
+ let mut open_tags: Vec = Vec::new();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(e)) => {
+ let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
+ let tag_name = local_tag_name(&raw_name).to_string();
+ if raw_name != "root" {
+ if !allowed_tags.contains(&tag_name.as_str()) {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS element: <{}>",
+ tag_name
+ )));
+ }
+
+ if matches!(
+ conversion_limit,
+ ConversionLimit::Abstract | ConversionLimit::Biography
+ ) && matches!(open_tags.last().map(String::as_str), Some("p"))
+ && matches!(tag_name.as_str(), "p" | "list" | "list-item")
+ {
+ return Err(ThothError::RequestError(
+ "Abstracts and biographies cannot contain nested block elements inside paragraphs."
+ .to_string(),
+ ));
+ }
+ }
+
+ for attr in e.attributes() {
+ let attr = attr.map_err(|err| {
+ ThothError::RequestError(format!("Invalid JATS attribute: {}", err))
+ })?;
+ let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
+ if raw_name == "root" {
+ if key != "xmlns:xlink" {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute: {}",
+ key
+ )));
+ }
+ continue;
+ }
+
+ if raw_name.ends_with("ext-link") {
+ if key != "xlink:href" {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute on ext-link: {}",
+ key
+ )));
+ }
+ } else {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute on {}: {}",
+ raw_name, key
+ )));
+ }
+ }
+ if raw_name != "root" {
+ open_tags.push(tag_name);
+ }
+ }
+ Ok(Event::Empty(e)) => {
+ let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
+ let tag_name = local_tag_name(&raw_name).to_string();
+ if raw_name != "root" {
+ if !allowed_tags.contains(&tag_name.as_str()) {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS element: <{}>",
+ tag_name
+ )));
+ }
+
+ if matches!(
+ conversion_limit,
+ ConversionLimit::Abstract | ConversionLimit::Biography
+ ) && matches!(open_tags.last().map(String::as_str), Some("p"))
+ && matches!(tag_name.as_str(), "p" | "list" | "list-item")
+ {
+ return Err(ThothError::RequestError(
+ "Abstracts and biographies cannot contain nested block elements inside paragraphs."
+ .to_string(),
+ ));
+ }
+ }
+
+ for attr in e.attributes() {
+ let attr = attr.map_err(|err| {
+ ThothError::RequestError(format!("Invalid JATS attribute: {}", err))
+ })?;
+ let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
+ if raw_name == "root" {
+ if key != "xmlns:xlink" {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute: {}",
+ key
+ )));
+ }
+ continue;
+ }
+
+ if raw_name.ends_with("ext-link") {
+ if key != "xlink:href" {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute on ext-link: {}",
+ key
+ )));
+ }
+ } else {
+ return Err(ThothError::RequestError(format!(
+ "Unsupported JATS attribute on {}: {}",
+ raw_name, key
+ )));
+ }
+ }
+ }
+ Ok(Event::End(e)) => {
+ let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
+ if raw_name != "root" {
+ open_tags.pop();
+ }
+ }
+ Ok(Event::Eof) => break,
+ Err(err) => {
+ return Err(ThothError::RequestError(format!(
+ "Invalid JATS XML: {}",
+ err
+ )))
+ }
+ _ => {}
}
+ buf.clear();
}
let email_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap();
@@ -150,7 +297,14 @@ pub fn validate_format(content: &str, format: &MarkupFormat) -> ThothResult<()>
return Err(ThothError::UnsupportedFileFormatError);
}
}
- MarkupFormat::PlainText => {}
+ MarkupFormat::PlainText => {
+ if looks_like_markup(content) {
+ return Err(ThothError::RequestError(
+ "Plain text input cannot contain markup tags. Use HTML or JATS XML instead."
+ .to_string(),
+ ));
+ }
+ }
}
Ok(())
}
@@ -164,7 +318,7 @@ pub fn convert_to_jats(
if format == MarkupFormat::JatsXml {
let content_looks_like_jats = looks_like_markup(&content);
let ast = if content_looks_like_jats {
- validate_jats_subset(&content)?;
+ validate_jats_subset(&content, conversion_limit)?;
jats_to_ast(&content)
} else {
plain_text_to_ast(&content)
@@ -244,6 +398,17 @@ pub fn convert_to_jats(
}
}
+/// Normalise stored abstract-like markup into the subset we safely emit to Crossref.
+pub fn normalise_crossref_abstract_jats(content: &str) -> ThothResult {
+ let ast = if looks_like_markup(content) {
+ jats_to_ast(content)
+ } else {
+ plain_text_to_ast(content)
+ };
+ let normalised = normalise_crossref_abstract_ast(ast)?;
+ Ok(ast_to_jats(&normalised))
+}
+
/// Convert from JATS XML to specified format using a specific tag name
pub fn convert_from_jats(
jats_xml: &str,
@@ -277,7 +442,7 @@ pub fn convert_from_jats(
});
}
- // Read paths need to tolerate legacy stored markup and normalize it on the fly.
+ // Read paths need to tolerate legacy stored markup and normalise it on the fly.
let ast = jats_to_ast(jats_xml);
// For title conversion, strip structural elements before validation
@@ -479,33 +644,25 @@ mod tests {
}
#[test]
- fn test_markdown_formula_email_uri_and_break_conversion() {
+ fn test_markdown_formula_email_uri_and_break_conversion_is_rejected_for_abstracts() {
let input = "Formula $E=mc^2$ \n ";
- let output = convert_to_jats(
+ assert!(convert_to_jats(
input.to_string(),
MarkupFormat::Markdown,
ConversionLimit::Abstract,
)
- .unwrap();
- assert!(output.contains("E=mc^2"));
- assert!(output.contains("user@example.org"));
- assert!(output.contains("https://example.org"));
- assert!(output.contains(""));
+ .is_err());
}
#[test]
- fn test_html_break_formula_email_and_uri_conversion() {
+ fn test_html_break_formula_email_and_uri_conversion_is_rejected_for_biographies() {
let input = r#"Line
E=mc^2 user@example.org https://example.org
"#;
- let output = convert_to_jats(
+ assert!(convert_to_jats(
input.to_string(),
MarkupFormat::Html,
ConversionLimit::Biography,
)
- .unwrap();
- assert_eq!(
- output,
- "LineE=mc^2 user@example.org https://example.org
"
- );
+ .is_err());
}
#[test]
@@ -551,6 +708,53 @@ mod tests {
)
.is_err());
}
+
+ #[test]
+ fn test_plain_text_rejects_markup_like_input() {
+ let input = "<p>Hello</p>";
+ assert!(convert_to_jats(
+ input.to_string(),
+ MarkupFormat::PlainText,
+ ConversionLimit::Abstract,
+ )
+ .is_err());
+ }
+
+ #[test]
+ fn test_jatsxml_rejects_legacy_html_tags() {
+ let input = "Hello";
+ assert!(convert_to_jats(
+ input.to_string(),
+ MarkupFormat::JatsXml,
+ ConversionLimit::Abstract,
+ )
+ .is_err());
+ }
+
+ #[test]
+ fn test_jatsxml_rejects_nested_paragraphs() {
+ let input = "<p><p>Hello</p></p>";
+ assert!(convert_to_jats(
+ input.to_string(),
+ MarkupFormat::JatsXml,
+ ConversionLimit::Abstract,
+ )
+ .is_err());
+ }
+
+ #[test]
+ fn test_normalise_crossref_abstract_jats_splits_breaks_and_removes_empty_paragraphs() {
+ let input = "<p>First line<break/>Second line</p>";
+ let output = normalise_crossref_abstract_jats(input).unwrap();
+ assert_eq!(output, "<p>First line</p><p>Second line</p>");
+ }
+
+ #[test]
+ fn test_normalise_crossref_abstract_jats_flattens_inline_only_content() {
+ let input = "This has <bold>bold</bold> text.";
+ let output = normalise_crossref_abstract_jats(input).unwrap();
+ assert_eq!(output, "<p>This has <bold>bold</bold> text.</p>");
+ }
// --- convert_to_jats tests end ---
// --- convert_from_jats tests start ---
diff --git a/thoth-errors/Cargo.toml b/thoth-errors/Cargo.toml
index f00b62e3..125fee7f 100644
--- a/thoth-errors/Cargo.toml
+++ b/thoth-errors/Cargo.toml
@@ -14,7 +14,7 @@ chrono = "0.4.40"
csv = "1.3.1"
deadpool-redis = "0.20.0"
dialoguer = { version = "0.11.0", features = ["password"] }
-diesel = "2.2.8"
+diesel = { version = "2.2.8", features = ["postgres", "r2d2"] }
juniper = "0.16.1"
marc = { version = "3.1.1", features = ["xml"] }
phf = { version = "0.11", features = ["macros"] }
diff --git a/thoth-export-server/Cargo.toml b/thoth-export-server/Cargo.toml
index 5048cff7..ca65c35d 100644
--- a/thoth-export-server/Cargo.toml
+++ b/thoth-export-server/Cargo.toml
@@ -10,7 +10,7 @@ readme = "README.md"
build = "build.rs"
[dependencies]
-thoth-api = { version = "=1.0.2", path = "../thoth-api" }
+thoth-api = { version = "=1.0.2", path = "../thoth-api", features = ["backend"] }
thoth-errors = { version = "=1.0.2", path = "../thoth-errors" }
thoth-client = { version = "=1.0.2", path = "../thoth-client" }
actix-web = "4.10"
diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs
index 2a0d26ff..f0bcb04f 100644
--- a/thoth-export-server/src/xml/doideposit_crossref.rs
+++ b/thoth-export-server/src/xml/doideposit_crossref.rs
@@ -1,6 +1,7 @@
use chrono::Utc;
use regex::Regex;
use std::io::Write;
+use thoth_api::markup::normalise_crossref_abstract_jats;
use thoth_api::model::IdentifierWithDomain;
use thoth_client::{
AbstractType, ContributionType, Funding, PublicationType, Reference, RelationType, Work,
@@ -392,12 +393,18 @@ fn write_abstract_content(
abstract_type: &str,
w: &mut EventWriter,
) -> ThothResult<()> {
+ let normalised_content = normalise_crossref_abstract_jats(abstract_content).map_err(|err| {
+ ThothError::IncompleteMetadataRecord(
+ DEPOSIT_ERROR.to_string(),
+ format!("Invalid Crossref abstract markup: {}", err),
+ )
+ })?;
write_full_element_block(
"jats:abstract",
Some(vec![("abstract-type", abstract_type)]),
w,
|w| {
- write_jats_content(abstract_content, w)?;
+ write_jats_content(&normalised_content, w)?;
Ok(())
},
)
@@ -409,6 +416,12 @@ fn write_abstract_content_with_locale_code(
locale_code: &str,
w: &mut EventWriter,
) -> ThothResult<()> {
+ let normalised_content = normalise_crossref_abstract_jats(abstract_content).map_err(|err| {
+ ThothError::IncompleteMetadataRecord(
+ DEPOSIT_ERROR.to_string(),
+ format!("Invalid Crossref abstract markup: {}", err),
+ )
+ })?;
write_full_element_block(
"jats:abstract",
Some(vec![
@@ -416,7 +429,7 @@ fn write_abstract_content_with_locale_code(
("xml:lang", locale_code),
]),
w,
- |w| write_jats_content(abstract_content, w),
+ |w| write_jats_content(&normalised_content, w),
)
}
@@ -2449,6 +2462,63 @@ mod tests {
assert!(output.contains(r#""#));
// Should not contain any paragraph elements
assert!(!output.contains(r#"<jats:p>"#));
+
+ // Nested paragraph wrappers should be flattened before writing.
+ let mut buffer = Vec::new();
+ let mut writer = xml::writer::EmitterConfig::new()
+ .perform_indent(true)
+ .create_writer(&mut buffer);
+
+ let result = write_abstract_content_with_locale_code(
+ "<p><p>Nested paragraph.</p></p>",
+ "long",
+ "EN",
+ &mut writer,
+ );
+
+ assert!(result.is_ok());
+ let output = String::from_utf8(buffer).unwrap();
+ assert!(output.contains(r#"<jats:p>Nested paragraph.</jats:p>"#));
+ assert!(!output.contains(r#""#));
+ assert!(!output.contains(r#""#));
+
+ // Break elements should be converted into sibling paragraphs.
+ let mut buffer = Vec::new();
+ let mut writer = xml::writer::EmitterConfig::new()
+ .perform_indent(true)
+ .create_writer(&mut buffer);
+
+ let result = write_abstract_content_with_locale_code(
+ "<p>First line<break/>Second line</p>",
+ "long",
+ "EN",
+ &mut writer,
+ );
+
+ assert!(result.is_ok());
+ let output = String::from_utf8(buffer).unwrap();
+ assert!(output.contains(r#"<jats:p>First line</jats:p>"#));
+ assert!(output.contains(r#"<jats:p>Second line</jats:p>"#));
+ assert!(!output.contains(r#"ItemTwo",
+ "long",
+ "EN",
+ &mut writer,
+ );
+
+ assert!(result.is_err());
+ let error = result.unwrap_err().to_string();
+ assert!(error.contains("Invalid Crossref abstract markup"));
}
#[test]