Skip to content

Commit d838f6a

Browse files
authored
Merge pull request #325 from abergeron/switch
Switch gs and ls
2 parents dc5508f + 7c1b198 commit d838f6a

17 files changed

+54
-54
lines changed

pygpu/gpuarray.pxd

+5-5
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h":
127127
unsigned int argcount, const int *types, int flags, char **err_str)
128128
void GpuKernel_clear(_GpuKernel *k)
129129
gpucontext *GpuKernel_context(_GpuKernel *k)
130-
int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
130+
int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
131131
int GpuKernel_call(_GpuKernel *k, unsigned int n,
132-
const size_t *ls, const size_t *gs,
132+
const size_t *gs, const size_t *ls,
133133
size_t shared, void **args)
134134
int GpuKernel_binary(_GpuKernel *, size_t *, void **)
135135

@@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
265265
int flags) except -1
266266
cdef int kernel_clear(GpuKernel k) except -1
267267
cdef gpucontext *kernel_context(GpuKernel k) except NULL
268-
cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
268+
cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
269269
cdef int kernel_call(GpuKernel k, unsigned int n,
270-
const size_t *ls, const size_t *gs,
270+
const size_t *gs, const size_t *ls,
271271
size_t shared, void **args) except -1
272272
cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
273273
cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
@@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
346346
cdef void **callbuf
347347
cdef object __weakref__
348348

349-
cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
349+
cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
350350
cdef _setarg(self, unsigned int index, int typecode, object o)

pygpu/gpuarray.pyx

+12-12
Original file line numberDiff line numberDiff line change
@@ -446,16 +446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
446446
raise GpuArrayException, "Invalid kernel or destroyed context"
447447
return res
448448

449-
cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
449+
cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
450450
cdef int err
451-
err = GpuKernel_sched(&k.k, n, ls, gs)
451+
err = GpuKernel_sched(&k.k, n, gs, ls)
452452
if err != GA_NO_ERROR:
453453
raise get_exc(err), kernel_error(k, err)
454454

455-
cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
456-
const size_t *gs, size_t shared, void **args) except -1:
455+
cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
456+
const size_t *ls, size_t shared, void **args) except -1:
457457
cdef int err
458-
err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
458+
err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
459459
if err != GA_NO_ERROR:
460460
raise get_exc(err), kernel_error(k, err)
461461

@@ -2113,10 +2113,10 @@ cdef class GpuKernel:
21132113
sure to test against the size of your data.
21142114
21152115
If you want more control over thread allocation you can use the
2116-
`ls` and `gs` parameters like so::
2116+
`gs` and `ls` parameters like so::
21172117
21182118
k = GpuKernel(...)
2119-
k(param1, param2, ls=ls, gs=gs)
2119+
k(param1, param2, gs=gs, ls=ls)
21202120
21212121
If you choose to use this interface, make sure to stay within the
21222122
limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
@@ -2200,12 +2200,12 @@ cdef class GpuKernel:
22002200
finally:
22012201
free(_types)
22022202

2203-
def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
2203+
def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
22042204
if n == None and (ls == None or gs == None):
22052205
raise ValueError, "Must specify size (n) or both gs and ls"
2206-
self.do_call(n, ls, gs, args, shared)
2206+
self.do_call(n, gs, ls, args, shared)
22072207

2208-
cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
2208+
cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
22092209
cdef size_t n
22102210
cdef size_t gs[3]
22112211
cdef size_t ls[3]
@@ -2272,8 +2272,8 @@ cdef class GpuKernel:
22722272
if nd != 1:
22732273
raise ValueError, "n is specified and nd != 1"
22742274
n = py_n
2275-
kernel_sched(self, n, &ls[0], &gs[0])
2276-
kernel_call(self, nd, ls, gs, shared, self.callbuf)
2275+
kernel_sched(self, n, &gs[0], &ls[0])
2276+
kernel_call(self, nd, gs, ls, shared, self.callbuf)
22772277

22782278
cdef _setarg(self, unsigned int index, int typecode, object o):
22792279
if typecode == GA_BUFFER:

pygpu/reduction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ def __call__(self, *args, **kwargs):
282282
kargs.append(offsets[i])
283283
kargs.extend(strs[i])
284284

285-
k(*kargs, ls=ls, gs=gs)
285+
k(*kargs, gs=gs, ls=ls)
286286

287287
return out
288288

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
MAJOR = 0
77
MINOR = 6
88
PATCH = 0
9-
SUFFIX = 'rc1'
9+
SUFFIX = 'rc2'
1010
FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)
1111

1212
try:

src/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
8888
INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
8989
MACOSX_RPATH OFF
9090
# This is the shared library version
91-
VERSION 1.0
91+
VERSION 2.0
9292
)
9393

9494
add_library(gpuarray-static STATIC ${GPUARRAY_SRC})

src/gpuarray/buffer.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
482482
*
483483
* \param k kernel
484484
* \param n number of dimensions of grid/block
485-
* \param bs block sizes for this call (also known as local size)
486485
* \param gs grid sizes for this call (also known as global size)
486+
* \param ls block sizes for this call (also known as local size)
487487
* \param shared amount of dynamic shared memory to reserve
488488
* \param args table of pointers to each argument (optional).
489489
*
490490
* \returns GA_NO_ERROR or an error code if an error occurred.
491491
*/
492492
GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
493-
const size_t *ls, const size_t *gs,
493+
const size_t *gs, const size_t *ls,
494494
size_t shared, void **args);
495495

496496
/**

src/gpuarray/config.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
/* The following included file should have been generated by CMake. */
55
#include <gpuarray/abi_version.h>
6-
#define GPUARRAY_API_VERSION 0
6+
#define GPUARRAY_API_VERSION 1
77

88
#ifdef GPUARRAY_SHARED
99
#ifdef _WIN32

src/gpuarray/kernel.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
8787
*
8888
* \param k the kernel to schedule for
8989
* \param n number of elements to handle
90-
* \param ls local size (in/out)
9190
* \param gs grid size (in/out)
91+
* \param ls local size (in/out)
9292
*/
9393
GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
94-
size_t *ls, size_t *gs);
94+
size_t *gs, size_t *ls);
9595

9696
/**
9797
* Launch the execution of a kernel.
9898
*
9999
* \param k the kernel to launch
100100
* \param n dimensionality of the grid/blocks
101-
* \param ls sizes of launch blocks
102101
* \param gs sizes of launch grid
102+
* \param ls sizes of launch blocks
103103
* \param shared amount of dynamic shared memory to allocate
104104
* \param args table of pointers to arguments
105105
*/
106106
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
107-
const size_t *ls, const size_t *gs,
107+
const size_t *gs, const size_t *ls,
108108
size_t shared, void **args);
109109

110110
GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,

src/gpuarray_array.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
487487
if (err != GA_NO_ERROR)
488488
return err;
489489

490-
err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
490+
err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
491491
if (err != GA_NO_ERROR)
492492
goto out;
493493

@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
521521
GpuKernel_setarg(&k, argp++, &n[1]);
522522
GpuKernel_setarg(&k, argp++, errbuf);
523523

524-
err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
524+
err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
525525
if (check_error && err == GA_NO_ERROR) {
526526
err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
527527
if (err == GA_NO_ERROR && kerr != 0) {

src/gpuarray_blas_cuda_cublas.c

+6-6
Original file line numberDiff line numberDiff line change
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
10991099
args[8] = &N;
11001100

11011101
if (transA == cb_no_trans) {
1102-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
1102+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
11031103
} else {
1104-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
1104+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
11051105
}
11061106

11071107
cuda_ops.buffer_release(Aa);
@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
12231223
args[8] = &N;
12241224

12251225
if (transA == cb_no_trans) {
1226-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
1226+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
12271227
} else {
1228-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
1228+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
12291229
}
12301230

12311231
cuda_ops.buffer_release(Aa);
@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
14861486
args[8] = &M;
14871487
args[9] = &N;
14881488

1489-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
1489+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
14901490

14911491
cuda_ops.buffer_release(Aa);
14921492
cuda_ops.buffer_release(xa);
@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
16181618
args[8] = &M;
16191619
args[9] = &N;
16201620

1621-
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
1621+
err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
16221622

16231623
cuda_ops.buffer_release(Aa);
16241624
cuda_ops.buffer_release(xa);

src/gpuarray_buffer.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
180180
return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
181181
}
182182

183-
int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
184-
const size_t *gs, size_t shared, void **args) {
185-
return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
183+
int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
184+
const size_t *ls, size_t shared, void **args) {
185+
return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
186186
shared, args);
187187
}
188188

src/gpuarray_buffer_cuda.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
12371237
}
12381238

12391239
static int cuda_callkernel(gpukernel *k, unsigned int n,
1240-
const size_t *bs, const size_t *gs,
1240+
const size_t *gs, const size_t *ls,
12411241
size_t shared, void **args) {
12421242
cuda_context *ctx = k->ctx;
12431243
unsigned int i;
@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
12581258

12591259
switch (n) {
12601260
case 1:
1261-
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
1261+
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
12621262
ctx->s, args, NULL);
12631263
break;
12641264
case 2:
1265-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
1265+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
12661266
ctx->s, args, NULL);
12671267
break;
12681268
case 3:
1269-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
1269+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
12701270
shared, ctx->s, args, NULL);
12711271
break;
12721272
default:

src/gpuarray_buffer_opencl.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
274274

275275
static void cl_releasekernel(gpukernel *k);
276276
static int cl_callkernel(gpukernel *k, unsigned int n,
277-
const size_t *bs, const size_t *gs,
277+
const size_t *gs, const size_t *ls,
278278
size_t shared, void **args);
279279

280280
static const char CL_PREAMBLE[] =
@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
748748
if (res != GA_NO_ERROR) goto fail;
749749
gs = ((n-1) / ls) + 1;
750750
args[0] = dst;
751-
res = cl_callkernel(m, 1, &ls, &gs, 0, args);
751+
res = cl_callkernel(m, 1, &gs, &ls, 0, args);
752752

753753
fail:
754754
cl_releasekernel(m);
@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
998998
}
999999

10001000
static int cl_callkernel(gpukernel *k, unsigned int n,
1001-
const size_t *ls, const size_t *gs,
1001+
const size_t *gs, const size_t *ls,
10021002
size_t shared, void **args) {
10031003
cl_ctx *ctx = k->ctx;
10041004
size_t _gs[3];

src/gpuarray_elemwise.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
414414
}
415415
}
416416

417-
err = GpuKernel_sched(k, n, &ls, &gs);
417+
err = GpuKernel_sched(k, n, &gs, &ls);
418418
if (err != GA_NO_ERROR) goto error;
419419

420-
err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
420+
err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
421421
error:
422422
return err;
423423
}
@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
572572
if (err != GA_NO_ERROR) return err;
573573
}
574574
}
575-
err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
575+
err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
576576
if (err != GA_NO_ERROR) return err;
577-
return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
577+
return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
578578
}
579579

580580
GpuElemwise *GpuElemwise_new(gpucontext *ctx,

src/gpuarray_kernel.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
3232
return gpukernel_context(k->k);
3333
}
3434

35-
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
35+
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
3636
size_t min_l;
3737
size_t max_l;
3838
size_t target_l;
@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
9090
}
9191

9292
int GpuKernel_call(GpuKernel *k, unsigned int n,
93-
const size_t *bs, const size_t *gs,
93+
const size_t *gs, const size_t *ls,
9494
size_t shared, void **args) {
95-
return gpukernel_call(k->k, n, bs, gs, shared, args);
95+
return gpukernel_call(k->k, n, gs, ls, shared, args);
9696
}
9797

9898
int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {

src/gpuarray_reduction.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
815815
ctx->dstArgmaxStepsGD){
816816
ctx->ret = GpuKernel_call(&ctx->kernel,
817817
ctx->ndh>0 ? ctx->ndh : 1,
818-
ctx->blockSize,
819818
ctx->gridSize,
819+
ctx->blockSize,
820820
0,
821821
args);
822822
}else{

src/private.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
9797
void (*kernel_release)(gpukernel *k);
9898
int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
9999
int (*kernel_call)(gpukernel *k, unsigned int n,
100-
const size_t *bs, const size_t *gs,
100+
const size_t *gs, const size_t *ls,
101101
size_t shared, void **args);
102102

103103
int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);

0 commit comments

Comments
 (0)