GH-48241: [Python] Scalar inferencing doesn't infer UUID (#48727)

tadeja · rok · pitrou · web-flow · commit c18733333655 · 2026-03-03T20:39:14.000+01:00
### Rationale for this change This closes #48241, #44224 and #43855. Currently uuid.UUID objects are not inferred/converted automatically in PyArrow, requiring users to explicitly specify the type. ### What changes are included in this PR? Adding support for Python's uuid.UUID objects in PyArrow's type inference and conversion. ### Are these changes tested? Yes, added test_uuid_scalar_from_python() and test_uuid_array_from_python() in `test_extension_type.py`. ### Are there any user-facing changes? Users can now pass Python uuid.UUID objects directly to PyArrow functions like pa.scalar() and pa.array() without specifying the type: ```python import uuid import pyarrow as pa pa.scalar(uuid.uuid4()) ``` <pyarrow.UuidScalar: UUID('958174b9-3a5c-4cdd-8fc5-d51a2fc55784')> ```python pa.array([uuid.uuid4()]) ``` <pyarrow.lib.UuidArray object at 0x1217725f0> [ 73611FD81F764A209C8B9CDBADDA1F53 ] * GitHub Issue: #48241 Lead-authored-by: Tadeja Kadunc <tadeja.kadunc@gmail.com> Co-authored-by: tadeja <tadeja@users.noreply.github.com> Co-authored-by: Rok Mihevc <rok@mihevc.org> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Rok Mihevc <rok@mihevc.org>
diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst
@@ -476,8 +476,8 @@ You can find the official list of canonical extension types in the
 :ref:`format_canonical_extensions` section. Here we add examples on how to
 use them in PyArrow.
 
-Fixed size tensor
-"""""""""""""""""
+Fixed shape tensor
+""""""""""""""""""
 
 To create an array of tensors with equal shape (fixed shape tensor array) we
 first need to define a fixed shape tensor extension type with value type
@@ -487,7 +487,7 @@ and shape:
 
    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
 
-Then we need the storage array with :func:`pyarrow.list_` type where ``value_type```
+Then we need the storage array with :func:`pyarrow.list_` type where ``value_type``
 is the fixed shape tensor value type and list size is a product of ``tensor_type``
 shape elements. Then we can create an array of tensors with
 ``pa.ExtensionArray.from_storage()`` method:
@@ -629,3 +629,41 @@ for ``NCHW`` format where:
 * C: number of channels of the image
 * H: height of the image
 * W: width of the image
+
+UUID
+""""
+
+The UUID extension type (``arrow.uuid``) represents universally unique
+identifiers as 16-byte fixed-size binary values. PyArrow provides integration
+with Python's built-in :mod:`uuid` module, including automatic type inference.
+
+Creating UUID scalars and arrays
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PyArrow infers the UUID type from Python's ``uuid.UUID`` objects,
+so you can pass them directly to :func:`pyarrow.scalar` and :func:`pyarrow.array`:
+
+.. code-block:: python
+
+   >>> import uuid
+   >>> import pyarrow as pa
+
+   >>> pa.scalar(uuid.uuid4())
+   <pyarrow.UuidScalar: UUID('...')>
+
+   >>> uuids = [uuid.uuid4() for _ in range(3)]
+   >>> arr = pa.array(uuids)
+   >>> arr.type
+   UuidType(extension<arrow.uuid>)
+
+You can also explicitly specify the UUID type using :func:`pyarrow.uuid`:
+
+.. code-block:: python
+
+   >>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid())
+   <pyarrow.lib.UuidArray object at ...>
+   [
+     ...,
+     ...
+   ]
+
diff --git a/python/pyarrow/src/arrow/python/common.h b/python/pyarrow/src/arrow/python/common.h
@@ -419,6 +419,20 @@ struct PyBytesView {
     return Status::OK();
   }
 
+  // View the `bytes` attribute of a uuid.UUID; `ref` owns it to keep the data alive
+  Status ParseUuid(PyObject* obj) {
+    ref.reset(PyObject_GetAttrString(obj, "bytes"));  // new reference, now owned by ref
+    RETURN_IF_PYERROR();
+    if (!PyBytes_Check(ref.obj())) {  // PyBytes_AS_STRING/GET_SIZE below are unchecked
+      return Status::TypeError("Expected uuid.UUID.bytes to return bytes, got '",
+                               Py_TYPE(ref.obj())->tp_name, "' object");
+    }
+    bytes = PyBytes_AS_STRING(ref.obj());  // points into the object held by ref
+    size = PyBytes_GET_SIZE(ref.obj());
+    is_utf8 = false;  // raw binary payload, not text
+    return Status::OK();
+  }
+  }
+
  protected:
   OwnedRef ref;
 };
diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc
@@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) {
 
 namespace {
 
-// This needs a conditional, because using std::once_flag could introduce
-// a deadlock when the GIL is enabled. See
-// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
-// more info.
+// Thread-safe one-time Python module import + attribute lookup, used for the
+// pandas and uuid modules. Uses std::call_once when the GIL is disabled, or a
+// simple boolean flag when the GIL is enabled to avoid deadlocks. See ARROW-10519 and
+// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
+struct ModuleOnceRunner {  // one instance per Python module to import lazily
+  std::string module_name;  // name handed to ImportModule() in RunOnce
 #ifdef Py_GIL_DISABLED
-static std::once_flag pandas_static_initialized;
+  std::once_flag initialized;  // consumed by std::call_once in RunOnce
 #else
-static bool pandas_static_initialized = false;
+  bool initialized = false;  // set to true after the first RunOnce attempt
 #endif
 
+  explicit ModuleOnceRunner(const std::string& module_name) : module_name(module_name) {}
+
+  template <typename Func>
+  void RunOnce(Func&& func) {  // func(OwnedRef& module) is called only if the import works
+    auto do_init = [&]() {
+      OwnedRef module;
+      if (ImportModule(module_name, &module).ok()) {  // a failed import is ignored
+#ifndef Py_GIL_DISABLED
+        // Since ImportModule can release the GIL, another thread could have
+        // already initialized the static data.
+        if (initialized) {
+          return;
+        }
+#endif  // !Py_GIL_DISABLED
+        func(module);
+      }
+    };
+#ifdef Py_GIL_DISABLED
+    std::call_once(initialized, do_init);
+#else
+    if (!initialized) {
+      do_init();
+      initialized = true;  // set even when the import failed, so it is not retried
+    }
+#endif  // Py_GIL_DISABLED
+  }
+};
+
+static PyObject* uuid_UUID = nullptr;
+static ModuleOnceRunner uuid_runner("uuid");
+
+}  // namespace
+
+bool IsPyUuid(PyObject* obj) {  // false (not an error) when uuid.UUID is unavailable
+  uuid_runner.RunOnce([](OwnedRef& module) {  // look up uuid.UUID on first use
+    OwnedRef ref;
+    if (ImportFromModule(module.obj(), "UUID", &ref).ok()) {
+      uuid_UUID = ref.obj();  // borrowed: outlives ref, presumably kept alive by uuid
+    }
+  });
+  if (!uuid_UUID) return false;  // module import or UUID attribute lookup failed
+  int result = PyObject_IsInstance(obj, uuid_UUID);
+  if (result < 0) {
+    PyErr_Clear();  // a failing isinstance check is reported as a non-match
+    return false;
+  }
+  return result != 0;
+}
+
+namespace {
+
 // Once initialized, these variables hold borrowed references to Pandas static data.
 // We should not use OwnedRef here because Python destructors would be
 // called on a finalized interpreter.
@@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr;
 static PyObject* pandas_Timestamp = nullptr;
 static PyTypeObject* pandas_NaTType = nullptr;
 static PyObject* pandas_DateOffset = nullptr;
+static ModuleOnceRunner pandas_runner("pandas");
 
-void GetPandasStaticSymbols() {
-  OwnedRef pandas;
-
-  // Import pandas
-  Status s = ImportModule("pandas", &pandas);
-  if (!s.ok()) {
-    return;
-  }
-
-#ifndef Py_GIL_DISABLED
-  // Since ImportModule can release the GIL, another thread could have
-  // already initialized the static data.
-  if (pandas_static_initialized) {
-    return;
-  }
-#endif
-
-  OwnedRef ref;
-
-  // set NaT sentinel and its type
-  if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
-    pandas_NaT = ref.obj();
-    // PyObject_Type returns a new reference but we trust that pandas.NaT will
-    // outlive our use of this PyObject*
-    pandas_NaTType = Py_TYPE(ref.obj());
-  }
-
-  // retain a reference to Timedelta
-  if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
-    pandas_Timedelta = ref.obj();
-  }
+}  // namespace
 
-  // retain a reference to Timestamp
-  if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
-    pandas_Timestamp = ref.obj();
-  }
+void InitPandasStaticData() {
+  pandas_runner.RunOnce([](OwnedRef& module) {  // skipped if pandas cannot be imported
+    OwnedRef ref;
+
+    // set NaT sentinel and its type
+    if (ImportFromModule(module.obj(), "NaT", &ref).ok()) {
+      pandas_NaT = ref.obj();
+      // Py_TYPE returns a borrowed reference; we trust that pandas.NaT will
+      // outlive our use of this PyTypeObject*
+      pandas_NaTType = Py_TYPE(ref.obj());
+    }
 
-  // if pandas.NA exists, retain a reference to it
-  if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
-    pandas_NA = ref.obj();
-  }
+    // keep a borrowed pointer to Timedelta
+    if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) {
+      pandas_Timedelta = ref.obj();
+    }
 
-  // Import DateOffset type
-  if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
-    pandas_DateOffset = ref.obj();
-  }
-}
+    // keep a borrowed pointer to Timestamp
+    if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) {
+      pandas_Timestamp = ref.obj();
+    }
 
-}  // namespace
+    // if pandas.NA exists, keep a borrowed pointer to it
+    if (ImportFromModule(module.obj(), "NA", &ref).ok()) {
+      pandas_NA = ref.obj();
+    }
 
-#ifdef Py_GIL_DISABLED
-void InitPandasStaticData() {
-  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
-}
-#else
-void InitPandasStaticData() {
-  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
-  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
-  if (pandas_static_initialized) {
-    return;
-  }
-  GetPandasStaticSymbols();
-  pandas_static_initialized = true;
+    // keep a borrowed pointer to the DateOffset type
+    if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) {
+      pandas_DateOffset = ref.obj();
+    }
+  });
 }
-#endif
 
 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {
diff --git a/python/pyarrow/src/arrow/python/helpers.h b/python/pyarrow/src/arrow/python/helpers.h
@@ -92,6 +92,10 @@ PyObject* BorrowPandasDataOffsetType();
 ARROW_PYTHON_EXPORT
 bool PyFloat_IsNaN(PyObject* obj);
 
+// \brief Check whether obj is a uuid.UUID instance
+ARROW_PYTHON_EXPORT
+bool IsPyUuid(PyObject* obj);
+
 inline bool IsPyBinary(PyObject* obj) {
   return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
 }
diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc
@@ -27,6 +27,7 @@
 #include <utility>
 #include <vector>
 
+#include "arrow/extension/uuid.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
 #include "arrow/util/decimal.h"
@@ -407,6 +408,7 @@ class TypeInferrer {
         arrow_scalar_count_(0),
         numpy_dtype_count_(0),
         interval_count_(0),
+        uuid_count_(0),
         max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
                               std::numeric_limits<int32_t>::min()),
         decimal_type_() {
@@ -475,6 +477,9 @@ class TypeInferrer {
       ++decimal_count_;
     } else if (PyObject_IsInstance(obj, interval_types_.obj())) {
       ++interval_count_;
+    } else if (internal::IsPyUuid(obj)) {
+      ++uuid_count_;
+      *keep_going = make_unions_;
     } else {
       return internal::InvalidValue(obj,
                                     "did not recognize Python value type when inferring "
@@ -604,6 +609,8 @@ class TypeInferrer {
       *out = utf8();
     } else if (interval_count_) {
       *out = month_day_nano_interval();
+    } else if (uuid_count_) {
+      *out = extension::uuid();
     } else if (arrow_scalar_count_) {
       *out = scalar_type_;
     } else {
@@ -766,6 +773,7 @@ class TypeInferrer {
   int64_t arrow_scalar_count_;
   int64_t numpy_dtype_count_;
   int64_t interval_count_;
+  int64_t uuid_count_;
   std::unique_ptr<TypeInferrer> list_inferrer_;
   std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
   std::unordered_map<std::string, size_t> struct_field_index_;
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -36,6 +36,7 @@
 #include "arrow/array/builder_primitive.h"
 #include "arrow/array/builder_time.h"
 #include "arrow/chunked_array.h"
+#include "arrow/extension_type.h"
 #include "arrow/result.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
@@ -512,7 +513,12 @@ class PyValue {
 
   static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
                         PyBytesView& view) {
-    ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    // Check if obj is a uuid.UUID instance
+    if (internal::IsPyUuid(obj)) {
+      ARROW_RETURN_NOT_OK(view.ParseUuid(obj));
+    } else {
+      ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    }
     if (view.size != type->byte_width()) {
       std::stringstream ss;
       ss << "expected to be length " << type->byte_width() << " was " << view.size;
@@ -1268,16 +1274,24 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
   // In some cases, type inference may be "loose", like strings. If the user
   // passed pa.string(), then we will error if we encounter any non-UTF8
   // value. If not, then we will allow the result to be a BinaryArray
+  std::shared_ptr<DataType> extension_type;
   if (options.type == nullptr) {
     ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
     options.strict = false;
+    // If type inference returned an extension type, convert using
+    // the storage type and then wrap the result as an extension array
+    if (options.type->id() == Type::EXTENSION) {
+      extension_type = options.type;
+      options.type = checked_cast<const ExtensionType&>(*options.type).storage_type();
+    }
   } else {
     options.strict = true;
   }
   ARROW_DCHECK_GE(size, 0);
 
   ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
                                             options.type, options, pool)));
+  std::shared_ptr<ChunkedArray> result;
   if (converter->may_overflow()) {
     // The converter hierarchy contains binary- or list-like builders which can overflow
     // depending on the input values. Wrap the converter with a chunker which detects
@@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
     } else {
       RETURN_NOT_OK(chunked_converter->Extend(seq, size));
     }
-    return chunked_converter->ToChunkedArray();
+    ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray());
   } else {
     // If the converter can't overflow spare the capacity error checking on the hot-path,
     // this improves the performance roughly by ~10% for primitive types.
@@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
     } else {
       RETURN_NOT_OK(converter->Extend(seq, size));
     }
-    return converter->ToChunkedArray();
+    ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray());
+  }
+  // If we inferred an extension type, wrap as an extension array
+  if (extension_type != nullptr) {
+    return ExtensionType::WrapArray(extension_type, result);
   }
+  return result;
 }
 
 }  // namespace py
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py