Skip to content

Commit d838f6a

Browse files
authored
Merge pull request #325 from abergeron/switch
Switch gs and ls
2 parents dc5508f + 7c1b198 commit d838f6a

17 files changed

+54
-54
lines changed

pygpu/gpuarray.pxd

+5-5
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h":
127127
unsigned int argcount, const int *types, int flags, char **err_str)
128128
void GpuKernel_clear(_GpuKernel *k)
129129
gpucontext *GpuKernel_context(_GpuKernel *k)
130-
int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
130+
int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
131131
int GpuKernel_call(_GpuKernel *k, unsigned int n,
132-
const size_t *ls, const size_t *gs,
132+
const size_t *gs, const size_t *ls,
133133
size_t shared, void **args)
134134
int GpuKernel_binary(_GpuKernel *, size_t *, void **)
135135

@@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
265265
int flags) except -1
266266
cdef int kernel_clear(GpuKernel k) except -1
267267
cdef gpucontext *kernel_context(GpuKernel k) except NULL
268-
cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
268+
cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
269269
cdef int kernel_call(GpuKernel k, unsigned int n,
270-
const size_t *ls, const size_t *gs,
270+
const size_t *gs, const size_t *ls,
271271
size_t shared, void **args) except -1
272272
cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
273273
cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
@@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
346346
cdef void **callbuf
347347
cdef object __weakref__
348348

349-
cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
349+
cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
350350
cdef _setarg(self, unsigned int index, int typecode, object o)

pygpu/gpuarray.pyx

+12-12
Original file line numberDiff line numberDiff line change
@@ -446,16 +446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
446446
raise GpuArrayException, "Invalid kernel or destroyed context"
447447
return res
448448

449-
cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
449+
cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
450450
cdef int err
451-
err = GpuKernel_sched(&k.k, n, ls, gs)
451+
err = GpuKernel_sched(&k.k, n, gs, ls)
452452
if err != GA_NO_ERROR:
453453
raise get_exc(err), kernel_error(k, err)
454454

455-
cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
456-
const size_t *gs, size_t shared, void **args) except -1:
455+
cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
456+
const size_t *ls, size_t shared, void **args) except -1:
457457
cdef int err
458-
err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
458+
err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
459459
if err != GA_NO_ERROR:
460460
raise get_exc(err), kernel_error(k, err)
461461

@@ -2113,10 +2113,10 @@ cdef class GpuKernel:
21132113
sure to test against the size of your data.
21142114
21152115
If you want more control over thread allocation you can use the
2116-
`ls` and `gs` parameters like so::
2116+
`gs` and `ls` parameters like so::
21172117
21182118
k = GpuKernel(...)
2119-
k(param1, param2, ls=ls, gs=gs)
2119+
k(param1, param2, gs=gs, ls=ls)
21202120
21212121
If you choose to use this interface, make sure to stay within the
21222122
limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
@@ -2200,12 +2200,12 @@ cdef class GpuKernel:
22002200
finally:
22012201
free(_types)
22022202

2203-
def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
2203+
def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
22042204
if n == None and (ls == None or gs == None):
22052205
raise ValueError, "Must specify size (n) or both gs and ls"
2206-
self.do_call(n, ls, gs, args, shared)
2206+
self.do_call(n, gs, ls, args, shared)
22072207

2208-
cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
2208+
cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
22092209
cdef size_t n
22102210
cdef size_t gs[3]
22112211
cdef size_t ls[3]
@@ -2272,8 +2272,8 @@ cdef class GpuKernel:
22722272
if nd != 1:
22732273
raise ValueError, "n is specified and nd != 1"
22742274
n = py_n
2275-
kernel_sched(self, n, &ls[0], &gs[0])
2276-
kernel_call(self, nd, ls, gs, shared, self.callbuf)
2275+
kernel_sched(self, n, &gs[0], &ls[0])
2276+
kernel_call(self, nd, gs, ls, shared, self.callbuf)
22772277

22782278
cdef _setarg(self, unsigned int index, int typecode, object o):
22792279
if typecode == GA_BUFFER:

pygpu/reduction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ def __call__(self, *args, **kwargs):
282282
kargs.append(offsets[i])
283283
kargs.extend(strs[i])
284284

285-
k(*kargs, ls=ls, gs=gs)
285+
k(*kargs, gs=gs, ls=ls)
286286

287287
return out
288288

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
MAJOR = 0
77
MINOR = 6
88
PATCH = 0
9-
SUFFIX = 'rc1'
9+
SUFFIX = 'rc2'
1010
FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)
1111

1212
try:

src/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
8888
INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
8989
MACOSX_RPATH OFF
9090
# This is the shared library version
91-
VERSION 1.0
91+
VERSION 2.0
9292
)
9393

9494
add_library(gpuarray-static STATIC ${GPUARRAY_SRC})

src/gpuarray/buffer.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
482482
*
483483
* \param k kernel
484484
* \param n number of dimensions of grid/block
485-
* \param bs block sizes for this call (also known as local size)
486485
* \param gs grid sizes for this call (also known as global size)
486+
* \param ls block sizes for this call (also known as local size)
487487
* \param shared amount of dynamic shared memory to reserve
488488
* \param args table of pointers to each argument (optional).
489489
*
490490
* \returns GA_NO_ERROR or an error code if an error occurred.
491491
*/
492492
GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
493-
const size_t *ls, const size_t *gs,
493+
const size_t *gs, const size_t *ls,
494494
size_t shared, void **args);
495495

496496
/**

src/gpuarray/config.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
/* The following included file should have been generated by CMake. */
55
#include <gpuarray/abi_version.h>
6-
#define GPUARRAY_API_VERSION 0
6+
#define GPUARRAY_API_VERSION 1
77

88
#ifdef GPUARRAY_SHARED
99
#ifdef _WIN32

src/gpuarray/kernel.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
8787
*
8888
* \param k the kernel to schedule for
8989
* \param n number of elements to handle
90-
* \param ls local size (in/out)
9190
* \param gs grid size (in/out)
91+
* \param ls local size (in/out)
9292
*/
9393
GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
94-
size_t *ls, size_t *gs);
94+
size_t *gs, size_t *ls);
9595

9696
/**
9797
* Launch the execution of a kernel.
9898
*
9999
* \param k the kernel to launch
100100
* \param n dimensionality of the grid/blocks
101-
* \param ls sizes of launch blocks
102101
* \param gs sizes of launch grid
102+
* \param ls sizes of launch blocks
103103
* \param shared amount of dynamic shared memory to allocate
104104
* \param args table of pointers to arguments
105105
*/
106106
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
107-
const size_t *ls, const size_t *gs,
107+
const size_t *gs, const size_t *ls,
108108
size_t shared, void **args);
109109

110110
GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,

src/gpuarray_array.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
487487
if (err != GA_NO_ERROR)
488488
return err;
489489

490-
err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
490+
err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
491491
if (err != GA_NO_ERROR)
492492
goto out;
493493

@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
521521
GpuKernel_setarg(&k, argp++, &n[1]);
522522
GpuKernel_setarg(&k, argp++, errbuf);
523523

524-
err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
524+
err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
525525
if (check_error && err == GA_NO_ERROR) {
526526
err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
527527
if (err == GA_NO_ERROR && kerr != 0) {

src/gpuarray_blas_cuda_cublas.c

+6-6
Original file line numberDiff line numberDiff line change
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
10991099
args[8] = &N;
11001100

11011101
if (transA == cb_no_trans) {
1102-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
1102+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
11031103
} else {
1104-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
1104+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
11051105
}
11061106

11071107
cuda_ops.buffer_release(Aa);
@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
12231223
args[8] = &N;
12241224

12251225
if (transA == cb_no_trans) {
1226-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
1226+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
12271227
} else {
1228-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
1228+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
12291229
}
12301230

12311231
cuda_ops.buffer_release(Aa);
@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
14861486
args[8] = &M;
14871487
args[9] = &N;
14881488

1489-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
1489+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
14901490

14911491
cuda_ops.buffer_release(Aa);
14921492
cuda_ops.buffer_release(xa);
@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
16181618
args[8] = &M;
16191619
args[9] = &N;
16201620

1621-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
1621+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
16221622

16231623
cuda_ops.buffer_release(Aa);
16241624
cuda_ops.buffer_release(xa);

src/gpuarray_buffer.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
180180
return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
181181
}
182182

183-
int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
184-
const size_t *gs, size_t shared, void **args) {
185-
return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
183+
int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
184+
const size_t *ls, size_t shared, void **args) {
185+
return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
186186
shared, args);
187187
}
188188

src/gpuarray_buffer_cuda.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
12371237
}
12381238

12391239
static int cuda_callkernel(gpukernel *k, unsigned int n,
1240-
const size_t *bs, const size_t *gs,
1240+
const size_t *gs, const size_t *ls,
12411241
size_t shared, void **args) {
12421242
cuda_context *ctx = k->ctx;
12431243
unsigned int i;
@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
12581258

12591259
switch (n) {
12601260
case 1:
1261-
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
1261+
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
12621262
ctx->s, args, NULL);
12631263
break;
12641264
case 2:
1265-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
1265+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
12661266
ctx->s, args, NULL);
12671267
break;
12681268
case 3:
1269-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
1269+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
12701270
shared, ctx->s, args, NULL);
12711271
break;
12721272
default:

src/gpuarray_buffer_opencl.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
274274

275275
static void cl_releasekernel(gpukernel *k);
276276
static int cl_callkernel(gpukernel *k, unsigned int n,
277-
const size_t *bs, const size_t *gs,
277+
const size_t *gs, const size_t *ls,
278278
size_t shared, void **args);
279279

280280
static const char CL_PREAMBLE[] =
@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
748748
if (res != GA_NO_ERROR) goto fail;
749749
gs = ((n-1) / ls) + 1;
750750
args[0] = dst;
751-
res = cl_callkernel(m, 1, &ls, &gs, 0, args);
751+
res = cl_callkernel(m, 1, &gs, &ls, 0, args);
752752

753753
fail:
754754
cl_releasekernel(m);
@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
998998
}
999999

10001000
static int cl_callkernel(gpukernel *k, unsigned int n,
1001-
const size_t *ls, const size_t *gs,
1001+
const size_t *gs, const size_t *ls,
10021002
size_t shared, void **args) {
10031003
cl_ctx *ctx = k->ctx;
10041004
size_t _gs[3];

src/gpuarray_elemwise.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
414414
}
415415
}
416416

417-
err = GpuKernel_sched(k, n, &ls, &gs);
417+
err = GpuKernel_sched(k, n, &gs, &ls);
418418
if (err != GA_NO_ERROR) goto error;
419419

420-
err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
420+
err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
421421
error:
422422
return err;
423423
}
@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
572572
if (err != GA_NO_ERROR) return err;
573573
}
574574
}
575-
err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
575+
err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
576576
if (err != GA_NO_ERROR) return err;
577-
return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
577+
return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
578578
}
579579

580580
GpuElemwise *GpuElemwise_new(gpucontext *ctx,

src/gpuarray_kernel.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
3232
return gpukernel_context(k->k);
3333
}
3434

35-
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
35+
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
3636
size_t min_l;
3737
size_t max_l;
3838
size_t target_l;
@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
9090
}
9191

9292
int GpuKernel_call(GpuKernel *k, unsigned int n,
93-
const size_t *bs, const size_t *gs,
93+
const size_t *gs, const size_t *ls,
9494
size_t shared, void **args) {
95-
return gpukernel_call(k->k, n, bs, gs, shared, args);
95+
return gpukernel_call(k->k, n, gs, ls, shared, args);
9696
}
9797

9898
int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {

src/gpuarray_reduction.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
815815
ctx->dstArgmaxStepsGD){
816816
ctx->ret = GpuKernel_call(&ctx->kernel,
817817
ctx->ndh>0 ? ctx->ndh : 1,
818-
ctx->blockSize,
819818
ctx->gridSize,
819+
ctx->blockSize,
820820
0,
821821
args);
822822
}else{

src/private.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
9797
void (*kernel_release)(gpukernel *k);
9898
int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
9999
int (*kernel_call)(gpukernel *k, unsigned int n,
100-
const size_t *bs, const size_t *gs,
100+
const size_t *gs, const size_t *ls,
101101
size_t shared, void **args);
102102

103103
int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);

0 commit comments

Comments
 (0)