Skip to content

Commit 1264dce

Browse files
committed
0.6.3 RC 1
1 parent 3402b00 commit 1264dce

File tree

3 files changed

+8
-175
lines changed

3 files changed

+8
-175
lines changed

Diff for: README.rst

+7-1
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,16 @@ ArrayKit requires the following:
3737
What is New in ArrayKit
3838
-------------------------
3939

40+
0.6.3
41+
............
42+
43+
Optimized memory allocation strategy for ``nonzero_1d()``.
44+
45+
4046
0.6.2
4147
............
4248

43-
Extended ``nonzero_1d()`` to support non-contiguous arrays,
49+
Extended ``nonzero_1d()`` to support non-contiguous arrays.
4450

4551
Optimizations to ``TriMap`` when mapping to object and flexible dtypes.
4652

Diff for: setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from setuptools import setup
66
from pathlib import Path
77

8-
AK_VERSION = '0.6.2'
8+
AK_VERSION = '0.6.3'
99

1010
def get_long_description() -> str:
1111
return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.

Diff for: src/_arraykit.c

-173
Original file line numberDiff line numberDiff line change
@@ -3535,179 +3535,6 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35353535
//------------------------------------------------------------------------------
35363536
// general utility
35373537

3538-
3539-
// #define NONZERO_APPEND_INDEX_RELATIVE { \
3540-
// if (AK_UNLIKELY(count == capacity)) { \
3541-
// capacity <<= 1; \
3542-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
3543-
// if (indices == NULL) { \
3544-
// return NULL; \
3545-
// } \
3546-
// } \
3547-
// indices[count++] = p - p_start; \
3548-
// } \
3549-
3550-
// #define NONZERO_APPEND_INDEX_ABSOLUTE { \
3551-
// if (AK_UNLIKELY(count == capacity)) { \
3552-
// capacity <<= 1; \
3553-
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
3554-
// if (indices == NULL) { \
3555-
// return NULL; \
3556-
// } \
3557-
// } \
3558-
// indices[count++] = i; \
3559-
// } \
3560-
3561-
3562-
3563-
// // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing fewer optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
3564-
// static inline PyObject*
3565-
// AK_nonzero_1d(PyArrayObject* array) {
3566-
// // the maximum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
3567-
// PyObject* final;
3568-
// npy_intp count_max = PyArray_SIZE(array);
3569-
3570-
// if (count_max == 0) { // return empty array
3571-
// npy_intp dims = {count_max};
3572-
// final = PyArray_SimpleNew(1, &dims, NPY_INT64);
3573-
// PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
3574-
// return final;
3575-
// }
3576-
// lldiv_t size_div = lldiv((long long)count_max, 8); // quot, rem
3577-
3578-
// Py_ssize_t count = 0;
3579-
// // the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large size, we set it to half the size
3580-
// // Py_ssize_t capacity = count_max < 1024 ? count_max : (Py_ssize_t)AK_next_power((npy_uint32)(count_max / 8));
3581-
// Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8;
3582-
// npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);
3583-
3584-
// NPY_BEGIN_THREADS_DEF;
3585-
// NPY_BEGIN_THREADS;
3586-
3587-
// if (PyArray_IS_C_CONTIGUOUS(array)) {
3588-
// npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
3589-
// npy_bool* p = p_start;
3590-
// npy_bool* p_end = p + count_max;
3591-
// npy_bool* p_end_roll = p_end - size_div.rem;
3592-
3593-
// while (p < p_end_roll) {
3594-
// if (*(npy_uint64*)p == 0) {
3595-
// p += 8; // no true within this 8 byte roll region
3596-
// continue;
3597-
// }
3598-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3599-
// p++;
3600-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3601-
// p++;
3602-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3603-
// p++;
3604-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3605-
// p++;
3606-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3607-
// p++;
3608-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3609-
// p++;
3610-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3611-
// p++;
3612-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3613-
// p++;
3614-
// }
3615-
// while (p < p_end) {
3616-
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
3617-
// p++;
3618-
// }
3619-
// }
3620-
// // else {
3621-
// // NpyIter *iter = NpyIter_New(
3622-
// // array, // array
3623-
// // NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP, // iter flags
3624-
// // NPY_KEEPORDER, // order
3625-
// // NPY_NO_CASTING, // casting
3626-
// // NULL // dtype
3627-
// // );
3628-
// // if (iter == NULL) {
3629-
// // free(indices);
3630-
// // return NULL;
3631-
// // }
3632-
// // NpyIter_IterNextFunc *iter_next = NpyIter_GetIterNext(iter, NULL);
3633-
// // if (iter_next == NULL) {
3634-
// // free(indices);
3635-
// // NpyIter_Deallocate(iter);
3636-
// // return NULL;
3637-
// // }
3638-
// // char **data_ptr = NpyIter_GetDataPtrArray(iter);
3639-
// // char* data;
3640-
// // npy_intp *stride_ptr = NpyIter_GetInnerStrideArray(iter);
3641-
// // npy_intp stride;
3642-
// // npy_intp *inner_size_ptr = NpyIter_GetInnerLoopSizePtr(iter);
3643-
// // npy_intp inner_size;
3644-
// // npy_int64 i = 0;
3645-
// // do {
3646-
// // data = *data_ptr;
3647-
// // stride = *stride_ptr;
3648-
// // inner_size = *inner_size_ptr;
3649-
// // while (inner_size--) {
3650-
// // if (*(npy_bool*)data) {
3651-
// // if (AK_UNLIKELY(count == capacity)) {
3652-
// // capacity <<= 1;
3653-
// // indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
3654-
// // if (indices == NULL) {
3655-
// // NpyIter_Deallocate(iter);
3656-
// // return NULL;
3657-
// // }
3658-
// // }
3659-
// // indices[count++] = i;
3660-
// // }
3661-
// // i++;
3662-
// // data += stride;
3663-
// // }
3664-
// // } while(iter_next(iter));
3665-
// // NpyIter_Deallocate(iter);
3666-
// // }
3667-
// else {
3668-
// npy_intp i = 0; // position within Boolean array
3669-
// npy_intp i_end = count_max;
3670-
// npy_intp i_end_roll = count_max - size_div.rem;
3671-
// while (i < i_end_roll) {
3672-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3673-
// i++;
3674-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3675-
// i++;
3676-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3677-
// i++;
3678-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3679-
// i++;
3680-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3681-
// i++;
3682-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3683-
// i++;
3684-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3685-
// i++;
3686-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3687-
// i++;
3688-
// }
3689-
// while (i < i_end) {
3690-
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
3691-
// i++;
3692-
// }
3693-
// }
3694-
// NPY_END_THREADS;
3695-
3696-
// npy_intp dims = {count};
3697-
// final = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
3698-
// if (!final) {
3699-
// free(indices);
3700-
// return NULL;
3701-
// }
3702-
// // This ensures that the array frees the indices array; this has been tested by calling free(indices) and observing segfault
3703-
// PyArray_ENABLEFLAGS((PyArrayObject*)final, NPY_ARRAY_OWNDATA);
3704-
// PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
3705-
// return final;
3706-
// }
3707-
// #undef NONZERO_APPEND_INDEX_RELATIVE
3708-
// #undef NONZERO_APPEND_INDEX_ABSOLUTE
3709-
3710-
37113538
// Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory provides the best performance at all scales. Using NpyIter, or using bit masks, does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number of indices.
37123539
static inline PyObject*
37133540
AK_nonzero_1d(PyArrayObject* array) {

0 commit comments

Comments
 (0)