python · vstinner · May 27, 2024 · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -341,6 +341,72 @@ APIs:
    .. versionadded:: 3.3
 
 
+.. c:function:: const void* PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_ssize_t *nbytes, uint32_t *format)
+
+   Export the contents of the *unicode* string in one of the requested format
+   *requested_formats*.
+
+   * On success, set *\*nbytes* and *\*format*, and return the contents.
+   * On error, set an exception and return ``NULL``.
+
+   The contents is valid as long as *unicode* is valid.
+
+   The export must be released by :c:func:`PyUnicode_ReleaseExport`.
+   The contents of the buffer are valid until they are released.
+
+   The buffer must not be modified.
+
+   *unicode*, *nbytes* and *format* must not be NULL.
+
+   Available formats:
+
+   .. c:namespace:: NULL
+
+   ===================================  ========  ===========================
+   Constant Identifier                    Value  Description
+   ===================================  ========  ===========================
+   .. c:macro:: PyUnicode_FORMAT_ASCII  ``0x01``  ASCII string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS1   ``0x02``  UCS-1 string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS2   ``0x04``  UCS-2 string (``Py_UCS2*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS4   ``0x08``  UCS-4 string (``Py_UCS4*``)
+   .. c:macro:: PyUnicode_FORMAT_UTF8   ``0x10``  UTF-8 string (``char*``)
+   ===================================  ========  ===========================
+
+   *requested_formats* can be a single format or a bitwise combination of the
+   formats in the table above.
+   On success, *\*format* will be set to a single one of the requested flags.
+
+   Note that future versions of Python may introduce additional formats.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: void PyUnicode_ReleaseExport(PyObject *unicode, const void* data, uint32_t format)
+
+   Release an export created by :c:func:`PyUnicode_Export`.
+
+   Each argument must match the corresponding argument or result of
+   a single earlier call to :c:func:`PyUnicode_Export`.
+   In particular, this means that you must hold a reference to *unicode*
+   while an export is valid.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
+
+   Create a string object from a buffer in an “export format”.
+
+   * Return a reference to a new string object on success.
+   * Set an exception and return ``NULL`` on error.
+
+   *data* must not be NULL. *nbytes* must be positive or zero.
+
+   See :c:func:`PyUnicode_Export` for the available formats.
+
+   .. versionadded:: 3.14
+
+
 .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
                                                     Py_ssize_t size)
 

@@ -140,6 +140,7 @@
     ('c:type', 'size_t'),
     ('c:type', 'ssize_t'),
     ('c:type', 'time_t'),
+    ('c:type', 'uint32_t'),
     ('c:type', 'uint64_t'),
     ('c:type', 'uintmax_t'),
     ('c:type', 'uintptr_t'),

diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat
@@ -300,6 +300,11 @@ New Features
 
   (Contributed by Victor Stinner in :gh:`119182`.)
 
+* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+  export and import strings.
+  (Contributed by Victor Stinner in :gh:`119609`.)
+
+
 Porting to Python 3.14
 ----------------------
 

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -248,6 +248,37 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     const char *u              /* UTF-8 encoded string */
     );
 
+#define PyUnicode_FORMAT_ASCII 0x01
+#define PyUnicode_FORMAT_UCS1 0x02
+#define PyUnicode_FORMAT_UCS2 0x04
+#define PyUnicode_FORMAT_UCS4 0x08
+#define PyUnicode_FORMAT_UTF8 0x10
+
+// Get the content of a string in the requested format:
+// - Return the content, set '*nbytes' and '*format' on success.
+// - Set an exception and return NULL on error.
+//
+// The export must be released by PyUnicode_ReleaseExport().
+PyAPI_FUNC(const void*) PyUnicode_Export(
+    PyObject *unicode,
+    uint32_t requested_formats,
+    Py_ssize_t *nbytes,
+    uint32_t *format);
+
+// Release an export created by PyUnicode_Export().
+PyAPI_FUNC(void) PyUnicode_ReleaseExport(
+    PyObject *unicode,
+    const void* data,
+    uint32_t format);
+
+// Create a string object from a string in the format 'format'.
+// - Return a reference to a new string object on success.
+// - Set an exception and return NULL on error.
+PyAPI_FUNC(PyObject*) PyUnicode_Import(
+    const void *data,
+    Py_ssize_t nbytes,
+    uint32_t format);
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -28,6 +28,14 @@ class Str(str):
     pass
 
 
+PyUnicode_FORMAT_ASCII = 0x01
+PyUnicode_FORMAT_UCS1 = 0x02
+PyUnicode_FORMAT_UCS2 = 0x04
+PyUnicode_FORMAT_UCS4 = 0x08
+PyUnicode_FORMAT_UTF8 = 0x10
+# Invalid native format
+PyUnicode_FORMAT_INVALID = 0x20
+
 class CAPITest(unittest.TestCase):
 
     @support.cpython_only
@@ -1679,6 +1687,162 @@ def test_pep393_utf8_caching_bug(self):
                 # Check that the second call returns the same result
                 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 
+    def test_unicode_export(self):
+        # Test PyUnicode_Export() and PyUnicode_FreeExport()
+        unicode_export = _testlimitedcapi.unicode_export
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        # export to the native format
+        formats = (PyUnicode_FORMAT_ASCII
+                   | PyUnicode_FORMAT_UCS1
+                   | PyUnicode_FORMAT_UCS2
+                   | PyUnicode_FORMAT_UCS4)
+        self.assertEqual(unicode_export("abc", formats),
+                         (b'abc', PyUnicode_FORMAT_ASCII))
+        self.assertEqual(unicode_export("latin1:\xe9", formats),
+                         (b'latin1:\xe9', PyUnicode_FORMAT_UCS1))
+        self.assertEqual(unicode_export('ucs2:\u20ac', formats),
+                         ('ucs2:\u20ac'.encode(ucs2_enc),
+                          PyUnicode_FORMAT_UCS2))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+
+        # export ASCII as UCS1
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
+                         (b'abc', PyUnicode_FORMAT_UCS1))
+
+        # export ASCII and UCS1 to UCS2
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2),
+                         ('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2),
+                         ('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))
+
+        # always export to UCS4
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
+                         ('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
+                         ('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
+                         ('ucs2:\u20ac'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4))
+
+        # always export to UTF8
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
+                         ('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
+                         ('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
+                         ('ucs2:\u20ac'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
+                         ('ucs4:\U0010ffff'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8))
+
+        # No supported format or invalid format
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', 0)
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', PyUnicode_FORMAT_INVALID)
+
+    def test_unicode_import(self):
+        # Test PyUnicode_Import()
+        unicode_import = _testlimitedcapi.unicode_import
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
+                         "abc")
+        self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
+                         "latin1:\xe9")
+
+        self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
+                                          PyUnicode_FORMAT_UCS2),
+                         'ucs2:\u20ac')
+
+        self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
+                                          PyUnicode_FORMAT_UCS4),
+                         'ucs4:\U0010ffff')
+
+        text = "abc\xe9\U0010ffff"
+        self.assertEqual(unicode_import(text.encode('utf8'),
+                                          PyUnicode_FORMAT_UTF8),
+                         text)
+
+        # Empty string
+        for native_format in (
+            PyUnicode_FORMAT_ASCII,
+            PyUnicode_FORMAT_UCS1,
+            PyUnicode_FORMAT_UCS2,
+            PyUnicode_FORMAT_UCS4,
+            PyUnicode_FORMAT_UTF8,
+        ):
+            with self.subTest(native_format=native_format):
+                self.assertEqual(unicode_import(b'', native_format),
+                                 '')
+
+        # Invalid format
+        with self.assertRaises(ValueError):
+            unicode_import(b'', PyUnicode_FORMAT_INVALID)
+
+        # Invalid size
+        ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
+        ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
+
+    def test_unicode_export_import_roundtrip(self):
+        unicode_export = _testlimitedcapi.unicode_export
+        unicode_import = _testlimitedcapi.unicode_import
+
+        ASCII = PyUnicode_FORMAT_ASCII
+        UCS1 = PyUnicode_FORMAT_UCS1
+        UCS2 = PyUnicode_FORMAT_UCS2
+        UCS4 = PyUnicode_FORMAT_UCS4
+        UTF8 = PyUnicode_FORMAT_UTF8
+        ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)
+
+        for string, allowed_formats in (
+            ('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
+            ('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
+            ('ucs4:\U0001f638', {UCS4, UTF8}),
+        ):
+            for format in ASCII, UCS1, UCS2, UCS4, UTF8:
+                with self.subTest(string=string, format=format):
+                    if format not in allowed_formats:
+                        with self.assertRaises(ValueError):
+                            unicode_export(string, format)
+                    else:
+                        buf, buf_fmt = unicode_export(string, format)
+                        restored = unicode_import(buf, buf_fmt)
+                        self.assertEqual(restored, string)
+
+            buf, buf_fmt = unicode_export(string, ALL)
+            restored = unicode_import(buf, buf_fmt)
+            self.assertEqual(restored, string)
+
 
 class PyUnicodeWriterTest(unittest.TestCase):
     def create_writer(self, size):

diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py
diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst
@@ -0,0 +1,2 @@
+Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+export and import strings. Patch by Victor Stinner.
@@ -2510,3 +2510,19 @@
 
 [function.Py_TYPE]
     added = '3.14'
+[function.PyUnicode_Export]
+    added = '3.14'
+[function.PyUnicode_ReleaseExport]
+    added = '3.14'
+[function.PyUnicode_Import]
+    added = '3.14'
+[const.PyUnicode_FORMAT_ASCII]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS1]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS2]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS4]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UTF8]
+    added = '3.14'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
		export and import strings. Patch by Victor Stinner.