"""
String utility functions for handling encoding issues and sanitization.
"""


def _replace_surrogates(text):
    """Return *text* with every UTF-16 surrogate code point mapped to U+FFFD.

    NOTE: ``text.encode("utf-8", errors="replace")`` is deliberately NOT used
    here: the *encode* "replace" error handler substitutes ``?`` (U+003F),
    not the Unicode replacement character U+FFFD that this module promises.
    """
    # Fast path: most strings carry no surrogates; return them untouched.
    if all(not 0xD800 <= ord(ch) <= 0xDFFF for ch in text):
        return text
    return "".join(
        "\ufffd" if 0xD800 <= ord(ch) <= 0xDFFF else ch for ch in text
    )


def sanitize_unicode_surrogates(obj):
    """
    Recursively sanitize Unicode surrogate characters from strings in nested
    data structures.

    Unicode surrogates (U+D800 to U+DFFF) are invalid in JSON and PostgreSQL.
    They can appear when reading filenames with encoding issues using the
    'surrogateescape' error handler.

    Surrogates are replaced with the Unicode replacement character
    (U+FFFD '�'). Containers (dict / list / tuple / set) are rebuilt
    recursively — dict keys are sanitized as well as values — and the
    container type is preserved. Any other type (int, float, bool, None,
    etc.) is returned unchanged.

    Args:
        obj: Any object (str, dict, list, tuple, set, or other types)

    Returns:
        The sanitized object with surrogates replaced

    Examples:
        >>> sanitize_unicode_surrogates("Sum\\udce1rio.pdf")
        'Sum�rio.pdf'
        >>> sanitize_unicode_surrogates({"file": "Sum\\udce1rio.pdf", "count": 5})
        {'file': 'Sum�rio.pdf', 'count': 5}
    """
    if isinstance(obj, str):
        return _replace_surrogates(obj)
    if isinstance(obj, dict):
        return {
            sanitize_unicode_surrogates(k): sanitize_unicode_surrogates(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        items = [sanitize_unicode_surrogates(item) for item in obj]
        if hasattr(obj, "_fields"):
            # namedtuple subclasses take their field values as positional
            # arguments, not a single iterable.
            return type(obj)(*items)
        return type(obj)(items)
    if isinstance(obj, set):
        return {sanitize_unicode_surrogates(item) for item in obj}
    # Non-string scalars (int, float, bool, None, ...) cannot carry
    # surrogates; pass them through as-is.
    return obj
"""
Tests for string utility functions.
"""

from core.utils.string_utils import sanitize_unicode_surrogates


class TestSanitizeUnicodeSurrogates:
    """Behavioural tests for sanitize_unicode_surrogates."""

    def test_sanitize_simple_string_with_surrogates(self):
        """A lone low surrogate in a filename is removed from the result."""
        # "\udce1" is what 'Sumário' looks like when read with surrogateescape.
        raw = "Sum\udce1rio.pdf"
        cleaned = sanitize_unicode_surrogates(raw)

        assert "\udce1" not in cleaned
        assert "Sum" in cleaned
        assert "rio.pdf" in cleaned
        # Whatever substitution is made, the result is never emptied out.
        assert cleaned

    def test_sanitize_string_without_surrogates(self):
        """A plain ASCII filename is returned verbatim."""
        assert sanitize_unicode_surrogates("normal_file.pdf") == "normal_file.pdf"

    def test_sanitize_dict_with_surrogate_in_value(self):
        """Surrogates in dict values are stripped; other values survive."""
        payload = {"file": "Sum\udce1rio.pdf", "count": 5, "status": "ok"}
        cleaned = sanitize_unicode_surrogates(payload)

        assert isinstance(cleaned, dict)
        assert "\udce1" not in cleaned["file"]
        assert cleaned["count"] == 5
        assert cleaned["status"] == "ok"

    def test_sanitize_dict_with_surrogate_in_key(self):
        """Surrogates in dict keys are stripped too."""
        cleaned = sanitize_unicode_surrogates({"Sum\udce1rio": "value"})

        assert isinstance(cleaned, dict)
        assert all("\udce1" not in key for key in cleaned)

    def test_sanitize_list_with_surrogates(self):
        """Lists are rebuilt element by element, recursing into dicts."""
        cleaned = sanitize_unicode_surrogates(
            ["normal.pdf", "Sum\udce1rio.pdf", {"file": "test\udce1.txt"}]
        )

        assert isinstance(cleaned, list)
        assert len(cleaned) == 3
        assert cleaned[0] == "normal.pdf"
        assert "\udce1" not in cleaned[1]
        assert "\udce1" not in cleaned[2]["file"]

    def test_sanitize_nested_structure(self):
        """Sanitization reaches arbitrarily deep nesting."""
        data = {
            "exceptions": [
                {
                    "file": "Sum\udce1rio.pdf",
                    "error": "Some error",
                    "nested": {"path": "/path/to/\udce1file"},
                }
            ],
            "count": 1,
        }
        cleaned = sanitize_unicode_surrogates(data)

        assert isinstance(cleaned, dict)
        first = cleaned["exceptions"][0]
        assert "\udce1" not in first["file"]
        assert "\udce1" not in first["nested"]["path"]
        assert cleaned["count"] == 1

    def test_sanitize_tuple(self):
        """Tuples stay tuples after sanitization."""
        cleaned = sanitize_unicode_surrogates(("normal", "test\udce1"))

        assert isinstance(cleaned, tuple)
        assert len(cleaned) == 2
        assert cleaned[0] == "normal"
        assert "\udce1" not in cleaned[1]

    def test_sanitize_set(self):
        """Sets stay sets, and no member keeps a surrogate."""
        cleaned = sanitize_unicode_surrogates({"normal", "test\udce1"})

        assert isinstance(cleaned, set)
        assert "normal" in cleaned
        assert all("\udce1" not in member for member in cleaned)

    def test_sanitize_non_string_types(self):
        """Scalars that are not strings pass through untouched."""
        assert sanitize_unicode_surrogates(42) == 42
        assert sanitize_unicode_surrogates(3.14) == 3.14
        assert sanitize_unicode_surrogates(True) is True
        assert sanitize_unicode_surrogates(None) is None

    def test_sanitize_empty_structures(self):
        """Empty containers and the empty string come back unchanged."""
        assert sanitize_unicode_surrogates({}) == {}
        assert sanitize_unicode_surrogates([]) == []
        assert sanitize_unicode_surrogates("") == ""

    def test_sanitize_real_world_failure_case(self):
        """Reproduce the `detail` payload that originally broke persistence."""
        failures = [
            {
                "file": "/scielo_www/pepsic/bases/pdf/vinculo/v9n2/Sum\udce1rio.pdf",
                "error": "Some error message",
            }
        ]
        cleaned = sanitize_unicode_surrogates({"migrated": 10, "failures": failures})

        assert "\udce1" not in str(cleaned)
        assert cleaned["migrated"] == 10
        assert isinstance(cleaned["failures"], list)
        assert len(cleaned["failures"]) == 1
        assert "\udce1" not in cleaned["failures"][0]["file"]
tracker.models import UnexpectedEvent, format_traceback -class NoDocumentRecordsToMigrateError(Exception): - ... +class NoDocumentRecordsToMigrateError(Exception): ... + class Operation(CommonControlField): @@ -166,6 +162,10 @@ def finish( if message: detail["message"] = message + # Sanitize Unicode surrogates before saving to database + # This prevents PostgreSQL from rejecting JSON with invalid Unicode + detail = sanitize_unicode_surrogates(detail) + try: json.dumps(detail) except Exception as exc_detail: @@ -424,7 +424,9 @@ def get_or_create(cls, user, collection, pid, **kwargs): except cls.DoesNotExist: return cls.create(user, collection, pid) except cls.MultipleObjectsReturned: - items = cls.objects.filter(collection=collection, pid=pid).order_by("-created") + items = cls.objects.filter(collection=collection, pid=pid).order_by( + "-created" + ) for item in items[1:]: item.delete() return items[0] @@ -1001,12 +1003,12 @@ def __unicode__(self): if self.journal_proc: return f"{self.journal_proc.acron} {self.issue_folder} ({self.collection})" return f"{self.pid} ({self.collection})" - + def __str__(self): if self.journal_proc: return f"{self.journal_proc.acron} {self.issue_folder} ({self.collection})" return f"{self.pid} ({self.collection})" - + journal_proc = models.ForeignKey( JournalProc, on_delete=models.SET_NULL, null=True, blank=True ) @@ -1213,7 +1215,10 @@ def get_files_from_classic_website( ): try: operation = self.start(user, "get_files_from_classic_website") - if self.files_status == tracker_choices.PROGRESS_STATUS_DONE and not force_update: + if ( + self.files_status == tracker_choices.PROGRESS_STATUS_DONE + and not force_update + ): operation.finish( user, completed=True, @@ -1298,7 +1303,7 @@ def docs_to_migrate(cls, collection, journal_acron, publication_year, force_upda migration_status=tracker_choices.PROGRESS_STATUS_DONE, **params, ) - + def find_asset(self, basename, name=None): if not name: name, ext = os.path.splitext(basename) @@ -1312,11 
+1317,13 @@ def find_asset(self, basename, name=None): return MigratedFile.find( collection=self.collection, journal_acron=self.journal_proc.acron, - name=name, + name=name, ) @classmethod - def get_id_and_pid_list_to_process(cls, journal_proc, issue_folder, publication_year, issue_pids, status, events): + def get_id_and_pid_list_to_process( + cls, journal_proc, issue_folder, publication_year, issue_pids, status, events + ): events.append("Identify filter: status") q = Q(docs_status__in=status) | Q(files_status__in=status) @@ -1326,14 +1333,13 @@ def get_id_and_pid_list_to_process(cls, journal_proc, issue_folder, publication_ issue_filter["issue_folder"] = issue_folder if publication_year: issue_filter["issue__publication_year"] = publication_year - + events.append("Select journal issues to process") if issue_filter: if issue_pids: issue_filter["pid__in"] = issue_pids return cls.objects.filter( - journal_proc=journal_proc, - **issue_filter + journal_proc=journal_proc, **issue_filter ).values_list("id", "pid") if issue_pids: @@ -1363,7 +1369,10 @@ def migrate_document_records(self, user, force_update=None): ).count() detail["total_document_records"] = total_document_records - force_update = ArticleProc.objects.filter(issue_proc=self).count() < total_document_records + force_update = ( + ArticleProc.objects.filter(issue_proc=self).count() + < total_document_records + ) id_file_records = IdFileRecord.document_records_to_migrate( collection=self.collection, @@ -1392,17 +1401,23 @@ def migrate_document_records(self, user, force_update=None): ) total += 1 if not article_proc: - raise ValueError(f"Unable to create ArticleProc for PID {record.item_pid}") + raise ValueError( + f"Unable to create ArticleProc for PID {record.item_pid}" + ) except Exception as e: exceptions[record.item_pid] = traceback.format_exc() detail["exceptions"] = exceptions detail["total failed"] = len(exceptions) - detail["total done"] = detail["total_document_records_to_migrate"] - detail["total 
failed"] - id_file_records.exclude(item_pid__in=list(exceptions.keys())).update(todo=False) + detail["total done"] = ( + detail["total_document_records_to_migrate"] - detail["total failed"] + ) + id_file_records.exclude(item_pid__in=list(exceptions.keys())).update( + todo=False + ) new_status = self.get_new_docs_status(total_document_records) - + except NoDocumentRecordsToMigrateError as e: exc_type, exc_value, exc_traceback = sys.exc_info() new_status = self.get_new_docs_status(total_document_records) @@ -1420,11 +1435,13 @@ def migrate_document_records(self, user, force_update=None): completed=self.docs_status == tracker_choices.PROGRESS_STATUS_DONE, exc_traceback=exc_traceback, exception=exception, - detail=detail + detail=detail, ) return total - def get_new_docs_status(self, total_document_records=None, total_migrated_articles=None): + def get_new_docs_status( + self, total_document_records=None, total_migrated_articles=None + ): if total_document_records is None: total_document_records = IdFileRecord.document_records_to_migrate( collection=self.collection, @@ -1432,7 +1449,9 @@ def get_new_docs_status(self, total_document_records=None, total_migrated_articl force_update=True, # todos os registros encontrados em acron.id no momento ).count() if total_migrated_articles is None: - total_migrated_articles = ArticleProc.objects.filter(issue_proc=self).count() + total_migrated_articles = ArticleProc.objects.filter( + issue_proc=self + ).count() if total_document_records == 0: return tracker_choices.PROGRESS_STATUS_BLOCKED if total_migrated_articles == 0: @@ -1484,7 +1503,6 @@ def bundle_id(self): return "-".join([self.journal_proc.pid, self.issue.bundle_id_suffix]) - class ArticleEventCreateError(Exception): ... 
@@ -1675,7 +1693,9 @@ def get_xml(self, user): migrated_data = self.migrated_data document = migrated_data.document - migrated_document_publication_day = document.get_complete_article_publication_date() + migrated_document_publication_day = ( + document.get_complete_article_publication_date() + ) if not migrated_data.file_type: migrated_data.file_type = document.file_type @@ -1721,7 +1741,9 @@ def get_xml_from_native(self, detail): ) try: # correção de fig/inline-graphic para fig/graphic - detail["fix_inline_graphic_in_caption"] = fix_inline_graphic_in_caption(xml_with_pre.xmltree) + detail["fix_inline_graphic_in_caption"] = fix_inline_graphic_in_caption( + xml_with_pre.xmltree + ) except Exception as e: logging.exception(e) raise @@ -1731,16 +1753,20 @@ def get_xml_from_native(self, detail): processing_date = self.migrated_data.document.processing_date if processing_date: # adota a data de processamento do documento como data de publicação - article_publication_date = datetime.strptime(processing_date, "%Y%m%d").strftime("%Y-%m-%d") + article_publication_date = datetime.strptime( + processing_date, "%Y%m%d" + ).strftime("%Y-%m-%d") else: # completa a data de publicação com média de dia e/ou mes ausentes - article_publication_date = xml_with_pre.get_complete_publication_date() + article_publication_date = ( + xml_with_pre.get_complete_publication_date() + ) xml_with_pre.article_publication_date = article_publication_date except Exception as e: logging.exception(e) raise return xml_with_pre - + def get_xml_from_html(self, user, detail): migrated_data = self.migrated_data classic_ws_doc = migrated_data.document @@ -1754,7 +1780,7 @@ def get_xml_from_html(self, user, detail): self.xml_status = htmlxml.html2xml_status if os.path.isfile(htmlxml.file.path): return htmlxml.file.path - + @property def xml_with_pre(self): try: @@ -1763,12 +1789,14 @@ def xml_with_pre(self): raise XMLVersionXmlWithPreError( _("Unable to get xml_with_pre for {}: {}").format(self, e) ) - - def 
save_processed_xml(self, xml_with_pre, xml_file_path, detail, migrated_document_publication_day): + + def save_processed_xml( + self, xml_with_pre, xml_file_path, detail, migrated_document_publication_day + ): try: if not xml_with_pre and xml_file_path: xml_with_pre = list(XMLWithPre.create(path=xml_file_path))[0] - + if not xml_with_pre: raise ValueError("No XML with pre to process") @@ -1790,7 +1818,8 @@ def save_processed_xml(self, xml_with_pre, xml_file_path, detail, migrated_docum article_date = None if not article_date: xml_with_pre.article_publication_date = ( - migrated_document_publication_day or xml_with_pre.get_complete_publication_date() + migrated_document_publication_day + or xml_with_pre.get_complete_publication_date() ) detail.update(xml_with_pre.data) @@ -1941,7 +1970,9 @@ def migrated_xml(self): continue return item raise MigratedFile.DoesNotExist( - _("No migrated XML file found for {} ({})").format(self.pkg_name, self.issue_proc) + _("No migrated XML file found for {} ({})").format( + self.pkg_name, self.issue_proc + ) ) @property