diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fb50e08..dc749e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixed + - [741](https://github.com/thoth-pub/thoth/pull/741) - Harden JATS rich-text handling by rejecting malformed or nested markup and abstract line breaks on write, and normalise Crossref abstract output to avoid invalid nested `jats:p` and `jats:break` elements ## [[1.0.2]](https://github.com/thoth-pub/thoth/releases/tag/v1.0.2) - 2026-04-03 ### Security diff --git a/Cargo.lock b/Cargo.lock index 77fc055a..35def495 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5010,6 +5010,7 @@ dependencies = [ "lazy_static", "log", "pulldown-cmark", + "quick-xml", "rand 0.9.2", "regex", "scraper", diff --git a/thoth-api/Cargo.toml b/thoth-api/Cargo.toml index 8c59df80..f932a28a 100644 --- a/thoth-api/Cargo.toml +++ b/thoth-api/Cargo.toml @@ -46,6 +46,7 @@ jsonwebtoken = { version = "10.3.0", optional = true } juniper = { version = "0.16.1", features = ["chrono", "schema-language", "uuid"] } lazy_static = "1.5.0" pulldown-cmark = "0.13.0" +quick-xml = "0.36" rand = { version = "0.9.0", optional = true } regex = "1.11.1" scraper = "0.20.0" @@ -64,4 +65,4 @@ log = "0.4.26" [dev-dependencies] fs2 = "0.4.3" -tokio = { version = "1.44", features = ["macros", "rt"] } +tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/thoth-api/src/graphql/tests.rs b/thoth-api/src/graphql/tests.rs index 572b2080..7179322b 100644 --- a/thoth-api/src/graphql/tests.rs +++ b/thoth-api/src/graphql/tests.rs @@ -1195,11 +1195,19 @@ fn patch_title(title: &Title) -> PatchTitle { } } +fn append_to_jats_paragraph_content(content: &str, suffix: &str) -> String { + if let Some((head, tail)) = content.rsplit_once("

") { + format!("{head}{suffix}

{tail}") + } else { + format!("{content}{suffix}") + } +} + fn patch_abstract(abstract_item: &Abstract) -> PatchAbstract { PatchAbstract { abstract_id: abstract_item.abstract_id, work_id: abstract_item.work_id, - content: format!("{} Updated", abstract_item.content), + content: append_to_jats_paragraph_content(&abstract_item.content, " Updated"), locale_code: abstract_item.locale_code, abstract_type: abstract_item.abstract_type, canonical: abstract_item.canonical, @@ -1210,7 +1218,7 @@ fn patch_biography(biography: &Biography) -> PatchBiography { PatchBiography { biography_id: biography.biography_id, contribution_id: biography.contribution_id, - content: format!("{} Updated", biography.content), + content: append_to_jats_paragraph_content(&biography.content, " Updated"), canonical: biography.canonical, locale_code: biography.locale_code, } @@ -2551,7 +2559,7 @@ query LinkedRelations($reviewId: Uuid!, $endorsementId: Uuid!) { } #[test] -fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() { +fn graphql_markup_mutations_accept_valid_jatsxml_but_reject_breaks_and_markup_like_plain_text() { let (_guard, pool) = test_db::setup_test_db(); let schema = create_schema(); let superuser = test_db::test_superuser("user-jats-xml-mutations"); @@ -2597,12 +2605,17 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() { ); let abstract_item = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap(); - update_with_data_and_markup( - &schema, - &context, - "updateAbstract", - "PatchAbstract", - "abstractId", + let abstract_query = r#" + mutation UpdateAbstract($data: PatchAbstract!, $markup: MarkupFormat!) { + updateAbstract(data: $data, markupFormat: $markup) { + abstractId + } + } + "#; + let mut abstract_vars = Variables::new(); + insert_var( + &mut abstract_vars, + "data", PatchAbstract { abstract_id: abstract_item.abstract_id, work_id: abstract_item.work_id, @@ -2613,22 +2626,28 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() { abstract_type: abstract_item.abstract_type, canonical: abstract_item.canonical, }, - MarkupFormat::PlainText, ); - - let stored_abstract = Abstract::from_id(pool.as_ref(), &seed.abstract_short_id).unwrap(); - assert_eq!( - stored_abstract.content, - "

First lineSecond line with E=mc^2 and user@example.org and https://example.org

" + insert_var(&mut abstract_vars, "markup", MarkupFormat::PlainText); + let (_, abstract_errors) = + juniper::execute_sync(abstract_query, None, &schema, &abstract_vars, &context) + .expect("GraphQL execution should succeed with validation errors"); + assert!( + !abstract_errors.is_empty(), + "Expected abstract validation error" ); let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap(); - update_with_data_and_markup( - &schema, - &context, - "updateBiography", - "PatchBiography", - "biographyId", + let biography_query = r#" + mutation UpdateBiography($data: PatchBiography!, $markup: MarkupFormat!) { + updateBiography(data: $data, markupFormat: $markup) { + biographyId + } + } + "#; + let mut biography_vars = Variables::new(); + insert_var( + &mut biography_vars, + "data", PatchBiography { biography_id: biography.biography_id, contribution_id: biography.contribution_id, @@ -2636,13 +2655,14 @@ fn graphql_markup_mutations_accept_plain_text_when_markup_is_jats_xml() { canonical: biography.canonical, locale_code: biography.locale_code, }, - MarkupFormat::JatsXml, ); - - let stored_biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap(); - assert_eq!( - stored_biography.content, - "

Bio linex^2 bio@example.org https://bio.example.org

" + insert_var(&mut biography_vars, "markup", MarkupFormat::JatsXml); + let (_, biography_errors) = + juniper::execute_sync(biography_query, None, &schema, &biography_vars, &context) + .expect("GraphQL execution should succeed with validation errors"); + assert!( + !biography_errors.is_empty(), + "Expected biography validation error" ); } @@ -3111,7 +3131,7 @@ fn graphql_mutations_cover_all() { "PatchAbstract", "abstractId", patch_abstract(&abstract_item), - MarkupFormat::PlainText, + MarkupFormat::JatsXml, ); let biography = Biography::from_id(pool.as_ref(), &seed.biography_id).unwrap(); @@ -3122,7 +3142,7 @@ fn graphql_mutations_cover_all() { "PatchBiography", "biographyId", patch_biography(&biography), - MarkupFormat::PlainText, + MarkupFormat::JatsXml, ); let work = Work::from_id(pool.as_ref(), &seed.book_work_id).unwrap(); diff --git a/thoth-api/src/markup/ast.rs b/thoth-api/src/markup/ast.rs index 4870a173..8376c4a6 100644 --- a/thoth-api/src/markup/ast.rs +++ b/thoth-api/src/markup/ast.rs @@ -61,7 +61,7 @@ fn push_node_to_top(stack: &mut [Node], node: Node) { } } -fn normalize_text_segments(text: &str) -> Vec { +fn normalise_text_segments(text: &str) -> Vec { let pattern = regex::Regex::new( r"(?x) (?P\$[^$\n]+\$) @@ -114,22 +114,22 @@ fn normalize_text_segments(text: &str) -> Vec { result } -fn normalize_node(node: Node) -> Node { +fn normalise_node(node: Node) -> Node { match node { - Node::Document(children) => Node::Document(normalize_children(children)), - Node::Paragraph(children) => Node::Paragraph(normalize_children(children)), - Node::Bold(children) => Node::Bold(normalize_children(children)), - Node::Italic(children) => Node::Italic(normalize_children(children)), - Node::Underline(children) => Node::Underline(normalize_children(children)), - Node::Strikethrough(children) => Node::Strikethrough(normalize_children(children)), - Node::Code(children) => Node::Code(normalize_children(children)), - Node::Superscript(children) => Node::Superscript(normalize_children(children)), - Node::Subscript(children) => Node::Subscript(normalize_children(children)), - Node::SmallCaps(children) => Node::SmallCaps(normalize_children(children)), - Node::List(children) => Node::List(normalize_children(children)), - Node::ListItem(children) => Node::ListItem(normalize_children(children)), + Node::Document(children) => Node::Document(normalise_children(children)), + Node::Paragraph(children) => Node::Paragraph(normalise_children(children)), + Node::Bold(children) => Node::Bold(normalise_children(children)), + Node::Italic(children) => Node::Italic(normalise_children(children)), + Node::Underline(children) => Node::Underline(normalise_children(children)), + Node::Strikethrough(children) => Node::Strikethrough(normalise_children(children)), + Node::Code(children) => Node::Code(normalise_children(children)), + Node::Superscript(children) => Node::Superscript(normalise_children(children)), + Node::Subscript(children) => Node::Subscript(normalise_children(children)), + Node::SmallCaps(children) => Node::SmallCaps(normalise_children(children)), + Node::List(children) => Node::List(normalise_children(children)), + Node::ListItem(children) => Node::ListItem(normalise_children(children)), Node::Link { url, text } => { - let text = normalize_children(text); + let text = normalise_children(text); let plain = inline_text_to_plain_text(&text); if url.starts_with("mailto:") { let email = url.trim_start_matches("mailto:"); @@ -147,7 +147,7 @@ fn normalize_node(node: Node) -> Node { } } Node::Text(text) => { - let segments = normalize_text_segments(&text); + let segments = normalise_text_segments(&text); if segments.len() == 1 { segments.into_iter().next().unwrap() } else { @@ -158,21 +158,21 @@ fn normalize_node(node: Node) -> Node { } } -fn normalize_children(children: Vec) -> Vec { - let mut normalized = Vec::new(); +fn normalise_children(children: Vec) -> Vec { + let mut normalised = Vec::new(); for child in children { - match normalize_node(child) { - Node::Document(grandchildren) => normalized.extend(grandchildren), - node => normalized.push(node), + match normalise_node(child) { + Node::Document(grandchildren) => normalised.extend(grandchildren), + node => normalised.push(node), } } - normalized + normalised } -fn normalize_inline_root(result: Node) -> Node { - match normalize_node(result) { +fn normalise_inline_root(result: Node) -> Node { + match normalise_node(result) { Node::Document(children) => { if children.len() > 1 { let all_inline = children.iter().all(is_inline_node); @@ -206,6 +206,296 @@ fn normalize_inline_root(result: Node) -> Node { } } +fn has_visible_text(text: &str) -> bool { + !text.trim().is_empty() +} + +fn has_visible_content(node: &Node) -> bool { + match node { + Node::Document(children) + | Node::Paragraph(children) + | Node::Bold(children) + | Node::Italic(children) + | Node::Underline(children) + | Node::Strikethrough(children) + | Node::Code(children) + | Node::Superscript(children) + | Node::Subscript(children) + | Node::SmallCaps(children) + | Node::List(children) + | Node::ListItem(children) => children.iter().any(has_visible_content), + Node::Link { text, .. } => text.iter().any(has_visible_content), + Node::Text(text) => has_visible_text(text), + Node::InlineFormula(tex) => !tex.trim().is_empty(), + Node::Email(email) => !email.trim().is_empty(), + Node::Uri(uri) => !uri.trim().is_empty(), + Node::Break => false, + } +} + +fn flush_crossref_inline_buffer( + inline_buffer: &mut Vec, + normalised: &mut Vec, +) -> ThothResult<()> { + if inline_buffer.iter().any(has_visible_content) { + let children = std::mem::take(inline_buffer); + normalised.extend(normalise_crossref_paragraph_children(children)?); + } else { + inline_buffer.clear(); + } + Ok(()) +} + +fn normalise_crossref_inline_node(node: Node) -> ThothResult> { + match node { + Node::Document(children) => { + let mut normalised = Vec::new(); + for child in children { + normalised.extend(normalise_crossref_inline_node(child)?); + } + Ok(normalised) + } + Node::Paragraph(_) | Node::List(_) | Node::ListItem(_) => Err(ThothError::RequestError( + "Crossref abstracts cannot contain nested block elements inside paragraphs." + .to_string(), + )), + Node::Break => Err(ThothError::RequestError( + "Crossref abstracts cannot contain line breaks; use separate paragraphs instead." + .to_string(), + )), + Node::Bold(children) => Ok(vec![Node::Bold(normalise_crossref_inline_children( + children, + )?)]), + Node::Italic(children) => Ok(vec![Node::Italic(normalise_crossref_inline_children( + children, + )?)]), + Node::Underline(children) => Ok(vec![Node::Underline(normalise_crossref_inline_children( + children, + )?)]), + Node::Strikethrough(children) => Ok(vec![Node::Strikethrough( + normalise_crossref_inline_children(children)?, + )]), + Node::Code(children) => Ok(vec![Node::Code(normalise_crossref_inline_children( + children, + )?)]), + Node::Superscript(children) => Ok(vec![Node::Superscript( + normalise_crossref_inline_children(children)?, + )]), + Node::Subscript(children) => Ok(vec![Node::Subscript(normalise_crossref_inline_children( + children, + )?)]), + Node::SmallCaps(children) => Ok(vec![Node::SmallCaps(normalise_crossref_inline_children( + children, + )?)]), + Node::Link { url, text } => Ok(vec![Node::Link { + url, + text: normalise_crossref_inline_children(text)?, + }]), + Node::InlineFormula(tex) => Ok(vec![Node::InlineFormula(tex)]), + Node::Email(email) => Ok(vec![Node::Email(email)]), + Node::Uri(uri) => Ok(vec![Node::Uri(uri)]), + Node::Text(text) => Ok(vec![Node::Text(text)]), + } +} + +fn normalise_crossref_inline_children(children: Vec) -> ThothResult> { + let mut normalised = Vec::new(); + for child in children { + normalised.extend(normalise_crossref_inline_node(child)?); + } + Ok(normalised) +} + +fn normalise_crossref_paragraph_children(children: Vec) -> ThothResult> { + let mut normalised = Vec::new(); + let mut segment = Vec::new(); + + for child in children { + match child { + Node::Break => { + if segment.iter().any(has_visible_content) { + normalised.push(Node::Paragraph(std::mem::take(&mut segment))); + } + } + other => { + segment.extend(normalise_crossref_inline_node(other)?); + } + } + } + + if segment.iter().any(has_visible_content) { + normalised.push(Node::Paragraph(segment)); + } + + Ok(normalised) +} + +fn normalise_crossref_list_item(children: Vec) -> ThothResult { + let mut normalised = Vec::new(); + let mut inline_buffer = Vec::new(); + + for child in children { + match child { + Node::Document(grandchildren) => { + for grandchild in grandchildren { + match grandchild { + Node::Paragraph(paragraph_children) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + normalised.extend(normalise_crossref_paragraph_children( + paragraph_children, + )?); + } + inline if is_inline_node(&inline) => inline_buffer.push(inline), + Node::Break => { + return Err(ThothError::RequestError( + "Crossref abstracts cannot contain line breaks; use separate paragraphs instead." + .to_string(), + )) + } + _ => { + return Err(ThothError::RequestError( + "Crossref abstract lists can only contain inline content or paragraphs." + .to_string(), + )) + } + } + } + } + Node::Paragraph(paragraph_children) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + normalised.extend(normalise_crossref_paragraph_children(paragraph_children)?); + } + inline if is_inline_node(&inline) => inline_buffer.push(inline), + Node::Break => return Err(ThothError::RequestError( + "Crossref abstracts cannot contain line breaks; use separate paragraphs instead." + .to_string(), + )), + _ => { + return Err(ThothError::RequestError( + "Crossref abstract lists can only contain inline content or paragraphs." + .to_string(), + )); + } + } + } + + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + Ok(Node::ListItem(normalised)) +} + +/// normalise stored abstract markup into the subset we safely emit to Crossref. +pub fn normalise_crossref_abstract_ast(node: Node) -> ThothResult { + let mut normalised = Vec::new(); + let mut inline_buffer = Vec::new(); + + let top_level_children = match node { + Node::Document(children) => children, + other => vec![other], + }; + + for child in top_level_children { + match child { + Node::Document(grandchildren) => { + for grandchild in grandchildren { + match grandchild { + Node::Document(_) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + let Node::Document(children) = + normalise_crossref_abstract_ast(grandchild)? + else { + unreachable!(); + }; + normalised.extend(children); + } + Node::Paragraph(paragraph_children) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + normalised.extend(normalise_crossref_paragraph_children( + paragraph_children, + )?); + } + Node::List(items) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + let normalised_items = items + .into_iter() + .map(|item| match item { + Node::ListItem(children) => normalise_crossref_list_item(children), + _ => Err(ThothError::RequestError( + "Crossref abstract lists must contain list-item elements." + .to_string(), + )), + }) + .collect::>>()?; + normalised.push(Node::List(normalised_items)); + } + Node::Break => { + return Err(ThothError::RequestError( + "Crossref abstracts cannot contain line breaks; use separate paragraphs instead." + .to_string(), + )) + } + Node::ListItem(_) => { + return Err(ThothError::RequestError( + "Crossref abstract lists must contain list-item elements." + .to_string(), + )) + } + other => { + if is_inline_node(&other) { + inline_buffer.push(other); + } else { + return Err(ThothError::RequestError( + "Crossref abstracts contain unsupported block structure." + .to_string(), + )); + } + } + } + } + } + Node::Paragraph(paragraph_children) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + normalised.extend(normalise_crossref_paragraph_children(paragraph_children)?); + } + Node::List(items) => { + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + let normalised_items = items + .into_iter() + .map(|item| match item { + Node::ListItem(children) => normalise_crossref_list_item(children), + _ => Err(ThothError::RequestError( + "Crossref abstract lists must contain list-item elements.".to_string(), + )), + }) + .collect::>>()?; + normalised.push(Node::List(normalised_items)); + } + Node::Break => return Err(ThothError::RequestError( + "Crossref abstracts cannot contain line breaks; use separate paragraphs instead." + .to_string(), + )), + Node::ListItem(_) => { + return Err(ThothError::RequestError( + "Crossref abstract lists must contain list-item elements.".to_string(), + )); + } + other => { + if is_inline_node(&other) { + inline_buffer.push(other); + } else { + return Err(ThothError::RequestError( + "Crossref abstracts contain unsupported block structure.".to_string(), + )); + } + } + } + } + + flush_crossref_inline_buffer(&mut inline_buffer, &mut normalised)?; + normalised.retain(has_visible_content); + + Ok(Node::Document(normalised)) +} + // Convert Markdown string to AST pub fn markdown_to_ast(markdown: &str) -> Node { let parser = Parser::new(markdown); @@ -247,7 +537,7 @@ pub fn markdown_to_ast(markdown: &str) -> Node { } let result = stack.pop().unwrap_or_else(|| Node::Document(vec![])); - normalize_inline_root(result) + normalise_inline_root(result) } // Convert HTML string to AST @@ -322,7 +612,7 @@ pub fn html_to_ast(html: &str) -> Node { // If there's a body tag, parse its contents, otherwise parse the whole document if let Some(body_element) = document.select(&body_selector).next() { - normalize_inline_root(parse_element_to_node(body_element)) + normalise_inline_root(parse_element_to_node(body_element)) } else { // If no body tag, create a document node with all top-level elements let mut children = Vec::new(); @@ -333,7 +623,7 @@ pub fn html_to_ast(html: &str) -> Node { } let result = Node::Document(children); - normalize_inline_root(result) + normalise_inline_root(result) } } @@ -353,7 +643,7 @@ pub fn plain_text_to_ast(text: &str) -> Node { if paragraphs.len() == 1 { let lines: Vec<&str> = paragraphs[0].split('\n').collect(); if lines.len() == 1 { - let parsed_nodes = normalize_text_segments(lines[0]); + let parsed_nodes = normalise_text_segments(lines[0]); if parsed_nodes.len() == 1 { parsed_nodes[0].clone() } else { @@ -362,7 +652,7 @@ pub fn plain_text_to_ast(text: &str) -> Node { } else { let mut children = Vec::new(); for (index, line) in lines.iter().enumerate() { - children.extend(normalize_text_segments(line)); + children.extend(normalise_text_segments(line)); if index + 1 < lines.len() { children.push(Node::Break); } @@ -375,7 +665,7 @@ pub fn plain_text_to_ast(text: &str) -> Node { let lines: Vec<&str> = paragraph.split('\n').collect(); let mut children = Vec::new(); for (index, line) in lines.iter().enumerate() { - children.extend(normalize_text_segments(line)); + children.extend(normalise_text_segments(line)); if index + 1 < lines.len() { children.push(Node::Break); } @@ -511,7 +801,7 @@ pub fn jats_to_ast(jats: &str) -> Node { .unwrap(); let email_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap(); let uri_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap(); - let normalized_jats = uri_pattern + let normalised_jats = uri_pattern .replace_all( &email_pattern.replace_all( &inline_formula_pattern.replace_all( @@ -608,12 +898,12 @@ pub fn jats_to_ast(jats: &str) -> Node { } } - let document = Html::parse_document(&normalized_jats); + let document = Html::parse_document(&normalised_jats); let body_selector = Selector::parse("body").unwrap(); // If there's a body tag, parse its contents, otherwise parse the whole document if let Some(body_element) = document.select(&body_selector).next() { - normalize_inline_root(parse_jats_element_to_node(body_element)) + normalise_inline_root(parse_jats_element_to_node(body_element)) } else { // If no body tag, create a document node with all top-level elements let mut children = Vec::new(); @@ -628,10 +918,10 @@ pub fn jats_to_ast(jats: &str) -> Node { // Otherwise, wrap in document match &children[0] { Node::Text(_) => children.into_iter().next().unwrap(), - _ => normalize_inline_root(Node::Document(children)), + _ => normalise_inline_root(Node::Document(children)), } } else { - normalize_inline_root(Node::Document(children)) + normalise_inline_root(Node::Document(children)) } } } @@ -1191,7 +1481,11 @@ fn validate_title_content(node: &Node) -> ThothResult<()> { } } Node::Paragraph(children) => { - // Paragraphs are allowed in titles, but only for grouping inline elements + if children.iter().any(|child| !is_inline_node(child)) { + return Err(ThothError::RequestError( + "Titles cannot contain nested block elements inside paragraphs.".to_string(), + )); + } for child in children { validate_title_content(child)?; } @@ -1242,8 +1536,18 @@ fn validate_abstract_content(node: &Node) -> ThothResult<()> { validate_abstract_content(child)?; } } - Node::Paragraph(children) - | Node::Bold(children) + Node::Paragraph(children) => { + if children.iter().any(|child| !is_inline_node(child)) { + return Err(ThothError::RequestError( + "Abstracts and biographies cannot contain nested block elements inside paragraphs." + .to_string(), + )); + } + for child in children { + validate_abstract_content(child)?; + } + } + Node::Bold(children) | Node::Italic(children) | Node::Underline(children) | Node::Strikethrough(children) @@ -1255,7 +1559,13 @@ fn validate_abstract_content(node: &Node) -> ThothResult<()> { validate_abstract_content(child)?; } } - Node::Break | Node::InlineFormula(_) | Node::Email(_) | Node::Uri(_) => {} + Node::Break => { + return Err(ThothError::RequestError( + "Line breaks are not allowed in abstracts or biographies. Use separate paragraphs instead." + .to_string(), + )) + } + Node::InlineFormula(_) | Node::Email(_) | Node::Uri(_) => {} Node::List(children) | Node::ListItem(children) => { for child in children { validate_abstract_content(child)?; @@ -2062,6 +2372,24 @@ mod tests { assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_ok()); } + #[test] + fn test_validate_abstract_content_disallows_breaks() { + let ast = Node::Document(vec![Node::Paragraph(vec![ + Node::Text("First line".to_string()), + Node::Break, + Node::Text("Second line".to_string()), + ])]); + assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_err()); + } + + #[test] + fn test_validate_abstract_content_disallows_nested_block_elements_inside_paragraphs() { + let ast = Node::Document(vec![Node::Paragraph(vec![Node::List(vec![ + Node::ListItem(vec![Node::Text("Item".to_string())]), + ])])]); + assert!(validate_ast_content(&ast, ConversionLimit::Abstract).is_err()); + } + #[test] fn test_contains_disallowed_title_elements_html() { let content = "

Title with

  • list

"; diff --git a/thoth-api/src/markup/mod.rs b/thoth-api/src/markup/mod.rs index 42d5275a..62b1867c 100644 --- a/thoth-api/src/markup/mod.rs +++ b/thoth-api/src/markup/mod.rs @@ -6,7 +6,7 @@ pub mod ast; use ast::{ ast_to_html, ast_to_jats, ast_to_markdown, ast_to_plain_text, html_to_ast, jats_to_ast, - markdown_to_ast, plain_text_ast_to_jats, plain_text_to_ast, + markdown_to_ast, normalise_crossref_abstract_ast, plain_text_ast_to_jats, plain_text_to_ast, strip_structural_elements_from_ast_for_conversion, validate_ast_content, }; @@ -53,40 +53,187 @@ fn looks_like_markup(content: &str) -> bool { .is_match(content) } -fn validate_jats_subset(content: &str) -> ThothResult<()> { - let allowed_tags = [ - "p", - "break", - "bold", - "italic", - "underline", - "strike", - "monospace", - "sup", - "sub", - "sc", - "list", - "list-item", - "ext-link", - "inline-formula", - "tex-math", - "email", - "uri", - "article", - "body", - "sec", - "div", - ]; - - let tag_pattern = regex::Regex::new(r"", - tag_name - ))); +fn validate_jats_subset(content: &str, conversion_limit: ConversionLimit) -> ThothResult<()> { + use quick_xml::events::Event; + use quick_xml::Reader; + + fn local_tag_name(raw_name: &str) -> &str { + raw_name.rsplit(':').next().unwrap_or(raw_name) + } + + let allowed_tags: &[&str] = match conversion_limit { + ConversionLimit::Title => &[ + "bold", + "italic", + "underline", + "strike", + "monospace", + "sup", + "sub", + "sc", + "ext-link", + "inline-formula", + "tex-math", + "email", + "uri", + ], + ConversionLimit::Abstract | ConversionLimit::Biography => &[ + "p", + "bold", + "italic", + "underline", + "strike", + "monospace", + "sup", + "sub", + "sc", + "list", + "list-item", + "ext-link", + "inline-formula", + "tex-math", + "email", + "uri", + ], + }; + + let wrapped = format!( + r#"{}"#, + content + ); + let mut reader = Reader::from_str(&wrapped); + let mut buf = Vec::new(); + let mut open_tags: Vec = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(e)) => { + let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string(); + let tag_name = local_tag_name(&raw_name).to_string(); + if raw_name != "root" { + if !allowed_tags.contains(&tag_name.as_str()) { + return Err(ThothError::RequestError(format!( + "Unsupported JATS element: <{}>", + tag_name + ))); + } + + if matches!( + conversion_limit, + ConversionLimit::Abstract | ConversionLimit::Biography + ) && matches!(open_tags.last().map(String::as_str), Some("p")) + && matches!(tag_name.as_str(), "p" | "list" | "list-item") + { + return Err(ThothError::RequestError( + "Abstracts and biographies cannot contain nested block elements inside paragraphs." + .to_string(), + )); + } + } + + for attr in e.attributes() { + let attr = attr.map_err(|err| { + ThothError::RequestError(format!("Invalid JATS attribute: {}", err)) + })?; + let key = String::from_utf8_lossy(attr.key.as_ref()).to_string(); + if raw_name == "root" { + if key != "xmlns:xlink" { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute: {}", + key + ))); + } + continue; + } + + if raw_name.ends_with("ext-link") { + if key != "xlink:href" { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute on ext-link: {}", + key + ))); + } + } else { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute on {}: {}", + raw_name, key + ))); + } + } + if raw_name != "root" { + open_tags.push(tag_name); + } + } + Ok(Event::Empty(e)) => { + let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string(); + let tag_name = local_tag_name(&raw_name).to_string(); + if raw_name != "root" { + if !allowed_tags.contains(&tag_name.as_str()) { + return Err(ThothError::RequestError(format!( + "Unsupported JATS element: <{}>", + tag_name + ))); + } + + if matches!( + conversion_limit, + ConversionLimit::Abstract | ConversionLimit::Biography + ) && matches!(open_tags.last().map(String::as_str), Some("p")) + && matches!(tag_name.as_str(), "p" | "list" | "list-item") + { + return Err(ThothError::RequestError( + "Abstracts and biographies cannot contain nested block elements inside paragraphs." + .to_string(), + )); + } + } + + for attr in e.attributes() { + let attr = attr.map_err(|err| { + ThothError::RequestError(format!("Invalid JATS attribute: {}", err)) + })?; + let key = String::from_utf8_lossy(attr.key.as_ref()).to_string(); + if raw_name == "root" { + if key != "xmlns:xlink" { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute: {}", + key + ))); + } + continue; + } + + if raw_name.ends_with("ext-link") { + if key != "xlink:href" { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute on ext-link: {}", + key + ))); + } + } else { + return Err(ThothError::RequestError(format!( + "Unsupported JATS attribute on {}: {}", + raw_name, key + ))); + } + } + } + Ok(Event::End(e)) => { + let raw_name = String::from_utf8_lossy(e.name().as_ref()).to_string(); + if raw_name != "root" { + open_tags.pop(); + } + } + Ok(Event::Eof) => break, + Err(err) => { + return Err(ThothError::RequestError(format!( + "Invalid JATS XML: {}", + err + ))) + } + _ => {} } + buf.clear(); } let email_pattern = regex::Regex::new(r"(?s)(.*?)").unwrap(); @@ -150,7 +297,14 @@ pub fn validate_format(content: &str, format: &MarkupFormat) -> ThothResult<()> return Err(ThothError::UnsupportedFileFormatError); } } - MarkupFormat::PlainText => {} + MarkupFormat::PlainText => { + if looks_like_markup(content) { + return Err(ThothError::RequestError( + "Plain text input cannot contain markup tags. Use HTML or JATS XML instead." + .to_string(), + )); + } + } } Ok(()) } @@ -164,7 +318,7 @@ pub fn convert_to_jats( if format == MarkupFormat::JatsXml { let content_looks_like_jats = looks_like_markup(&content); let ast = if content_looks_like_jats { - validate_jats_subset(&content)?; + validate_jats_subset(&content, conversion_limit)?; jats_to_ast(&content) } else { plain_text_to_ast(&content) @@ -244,6 +398,17 @@ pub fn convert_to_jats( } } +/// normalise stored abstract-like markup into the subset we safely emit to Crossref. +pub fn normalise_crossref_abstract_jats(content: &str) -> ThothResult { + let ast = if looks_like_markup(content) { + jats_to_ast(content) + } else { + plain_text_to_ast(content) + }; + let normalised = normalise_crossref_abstract_ast(ast)?; + Ok(ast_to_jats(&normalised)) +} + /// Convert from JATS XML to specified format using a specific tag name pub fn convert_from_jats( jats_xml: &str, @@ -277,7 +442,7 @@ pub fn convert_from_jats( }); } - // Read paths need to tolerate legacy stored markup and normalize it on the fly. + // Read paths need to tolerate legacy stored markup and normalise it on the fly. let ast = jats_to_ast(jats_xml); // For title conversion, strip structural elements before validation @@ -479,33 +644,25 @@ mod tests { } #[test] - fn test_markdown_formula_email_uri_and_break_conversion() { + fn test_markdown_formula_email_uri_and_break_conversion_is_rejected_for_abstracts() { let input = "Formula $E=mc^2$ \n "; - let output = convert_to_jats( + assert!(convert_to_jats( input.to_string(), MarkupFormat::Markdown, ConversionLimit::Abstract, ) - .unwrap(); - assert!(output.contains("E=mc^2")); - assert!(output.contains("user@example.org")); - assert!(output.contains("https://example.org")); - assert!(output.contains("")); + .is_err()); } #[test] - fn test_html_break_formula_email_and_uri_conversion() { + fn test_html_break_formula_email_and_uri_conversion_is_rejected_for_biographies() { let input = r#"

Line
E=mc^2 user@example.org https://example.org

"#; - let output = convert_to_jats( + assert!(convert_to_jats( input.to_string(), MarkupFormat::Html, ConversionLimit::Biography, ) - .unwrap(); - assert_eq!( - output, - "

LineE=mc^2 user@example.org https://example.org

" - ); + .is_err()); } #[test] @@ -551,6 +708,53 @@ mod tests { ) .is_err()); } + + #[test] + fn test_plain_text_rejects_markup_like_input() { + let input = "

Hello

"; + assert!(convert_to_jats( + input.to_string(), + MarkupFormat::PlainText, + ConversionLimit::Abstract, + ) + .is_err()); + } + + #[test] + fn test_jatsxml_rejects_legacy_html_tags() { + let input = "Hello"; + assert!(convert_to_jats( + input.to_string(), + MarkupFormat::JatsXml, + ConversionLimit::Abstract, + ) + .is_err()); + } + + #[test] + fn test_jatsxml_rejects_nested_paragraphs() { + let input = "

Hello

"; + assert!(convert_to_jats( + input.to_string(), + MarkupFormat::JatsXml, + ConversionLimit::Abstract, + ) + .is_err()); + } + + #[test] + fn test_normalise_crossref_abstract_jats_splits_breaks_and_removes_empty_paragraphs() { + let input = "

First lineSecond line

"; + let output = normalise_crossref_abstract_jats(input).unwrap(); + assert_eq!(output, "

First line

Second line

"); + } + + #[test] + fn test_normalise_crossref_abstract_jats_flattens_inline_only_content() { + let input = "This has bold text."; + let output = normalise_crossref_abstract_jats(input).unwrap(); + assert_eq!(output, "

This has bold text.

"); + } // --- convert_to_jats tests end --- // --- convert_from_jats tests start --- diff --git a/thoth-errors/Cargo.toml b/thoth-errors/Cargo.toml index f00b62e3..125fee7f 100644 --- a/thoth-errors/Cargo.toml +++ b/thoth-errors/Cargo.toml @@ -14,7 +14,7 @@ chrono = "0.4.40" csv = "1.3.1" deadpool-redis = "0.20.0" dialoguer = { version = "0.11.0", features = ["password"] } -diesel = "2.2.8" +diesel = { version = "2.2.8", features = ["postgres", "r2d2"] } juniper = "0.16.1" marc = { version = "3.1.1", features = ["xml"] } phf = { version = "0.11", features = ["macros"] } diff --git a/thoth-export-server/Cargo.toml b/thoth-export-server/Cargo.toml index 5048cff7..ca65c35d 100644 --- a/thoth-export-server/Cargo.toml +++ b/thoth-export-server/Cargo.toml @@ -10,7 +10,7 @@ readme = "README.md" build = "build.rs" [dependencies] -thoth-api = { version = "=1.0.2", path = "../thoth-api" } +thoth-api = { version = "=1.0.2", path = "../thoth-api", features = ["backend"] } thoth-errors = { version = "=1.0.2", path = "../thoth-errors" } thoth-client = { version = "=1.0.2", path = "../thoth-client" } actix-web = "4.10" diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index 2a0d26ff..f0bcb04f 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -1,6 +1,7 @@ use chrono::Utc; use regex::Regex; use std::io::Write; +use thoth_api::markup::normalise_crossref_abstract_jats; use thoth_api::model::IdentifierWithDomain; use thoth_client::{ AbstractType, ContributionType, Funding, PublicationType, Reference, RelationType, Work, @@ -392,12 +393,18 @@ fn write_abstract_content( abstract_type: &str, w: &mut EventWriter, ) -> ThothResult<()> { + let normalised_content = normalise_crossref_abstract_jats(abstract_content).map_err(|err| { + ThothError::IncompleteMetadataRecord( + DEPOSIT_ERROR.to_string(), + format!("Invalid Crossref abstract markup: {}", err), + ) + })?; write_full_element_block( "jats:abstract", Some(vec![("abstract-type", abstract_type)]), w, |w| { - write_jats_content(abstract_content, w)?; + write_jats_content(&normalised_content, w)?; Ok(()) }, ) @@ -409,6 +416,12 @@ fn write_abstract_content_with_locale_code( locale_code: &str, w: &mut EventWriter, ) -> ThothResult<()> { + let normalised_content = normalise_crossref_abstract_jats(abstract_content).map_err(|err| { + ThothError::IncompleteMetadataRecord( + DEPOSIT_ERROR.to_string(), + format!("Invalid Crossref abstract markup: {}", err), + ) + })?; write_full_element_block( "jats:abstract", Some(vec![ @@ -416,7 +429,7 @@ fn write_abstract_content_with_locale_code( ("xml:lang", locale_code), ]), w, - |w| write_jats_content(abstract_content, w), + |w| write_jats_content(&normalised_content, w), ) } @@ -2449,6 +2462,63 @@ mod tests { assert!(output.contains(r#""#)); // Should not contain any paragraph elements assert!(!output.contains(r#""#)); + + // Nested paragraph wrappers should be flattened before writing. + let mut buffer = Vec::new(); + let mut writer = xml::writer::EmitterConfig::new() + .perform_indent(true) + .create_writer(&mut buffer); + + let result = write_abstract_content_with_locale_code( + "

Nested paragraph.

", + "long", + "EN", + &mut writer, + ); + + assert!(result.is_ok()); + let output = String::from_utf8(buffer).unwrap(); + assert!(output.contains(r#"Nested paragraph."#)); + assert!(!output.contains(r#""#)); + assert!(!output.contains(r#""#)); + + // Break elements should be converted into sibling paragraphs. + let mut buffer = Vec::new(); + let mut writer = xml::writer::EmitterConfig::new() + .perform_indent(true) + .create_writer(&mut buffer); + + let result = write_abstract_content_with_locale_code( + "

First lineSecond line

", + "long", + "EN", + &mut writer, + ); + + assert!(result.is_ok()); + let output = String::from_utf8(buffer).unwrap(); + assert!(output.contains(r#"First line"#)); + assert!(output.contains(r#"Second line"#)); + assert!(!output.contains(r#"ItemTwo", + "long", + "EN", + &mut writer, + ); + + assert!(result.is_err()); + let error = result.unwrap_err().to_string(); + assert!(error.contains("Invalid Crossref abstract markup")); } #[test]