Skip to content

Commit ad9173e

Browse files
committed
Fix all the compile problems for the cuda stuff.
1 parent e7a7094 commit ad9173e

17 files changed

+198
-170
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ debug: install-debugc py
1111

1212
.PHONY: install-debugc py debug install-relc rel config
1313

14-
Debug/Makefile: Debug Makefile.conf
14+
Debug/Makefile: Makefile.conf
1515
mkdir -p Debug
1616
ifndef INSTALL_PREFIX
1717
(cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Debug)

src/cache.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ cache *cache_twoq(size_t hot_size, size_t warm_size,
8989

9090
cache *cache_disk(const char *dirpath, cache *mem,
9191
kwrite_fn kwrite, vwrite_fn vwrite,
92-
kread_fn kread, vread_fn vread);
92+
kread_fn kread, vread_fn vread,
93+
error *e);
9394

9495
/* API functions */
9596
static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) {

src/cache/disk.c

+15-5
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ static void disk_destroy(cache *_c) {
397397

398398
cache *cache_disk(const char *dirpath, cache *mem,
399399
kwrite_fn kwrite, vwrite_fn vwrite,
400-
kread_fn kread, vread_fn vread) {
400+
kread_fn kread, vread_fn vread, error *e) {
401401
struct stat st;
402402
disk_cache *res;
403403
char *dirp;
@@ -414,7 +414,10 @@ cache *cache_disk(const char *dirpath, cache *mem,
414414

415415
dirp = malloc(dirl + 1); /* With the NUL */
416416

417-
if (dirp == NULL) return NULL;
417+
if (dirp == NULL) {
418+
error_sys(e, "malloc");
419+
return NULL;
420+
}
418421

419422
strlcpy(dirp, dirpath, dirl + 1);
420423

@@ -425,6 +428,7 @@ cache *cache_disk(const char *dirpath, cache *mem,
425428

426429
if (ensurep(NULL, dirp) != 0) {
427430
free(dirp);
431+
error_sys(e, "ensurep");
428432
return NULL;
429433
}
430434

@@ -433,18 +437,24 @@ cache *cache_disk(const char *dirpath, cache *mem,
433437

434438
mkdir(dirp, 0777); /* This may fail, but it's ok */
435439

436-
if (lstat(dirp, &st) != 0)
440+
if (lstat(dirp, &st) != 0) {
441+
error_sys(e, "lstat");
437442
return NULL;
443+
}
438444

439445
/* Restore the good path at the end */
440446
dirp[dirl - 1] = sep;
441447

442-
if (!(st.st_mode & S_IFDIR))
448+
if (!(st.st_mode & S_IFDIR)) {
449+
error_set(e, GA_SYS_ERROR, "Cache path exists but is not a directory");
443450
return NULL;
451+
}
444452

445453
res = calloc(sizeof(*res), 1);
446-
if (res == NULL)
454+
if (res == NULL) {
455+
error_sys(e, "calloc");
447456
return NULL;
457+
}
448458

449459
res->dirp = dirp;
450460
res->mem = mem;

src/cache/twoq.c

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include <assert.h>
22
#include <stdlib.h>
33

4+
#include <gpuarray/error.h>
5+
46
#include "cache.h"
57
#include "private_config.h"
68

src/gpuarray/buffer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
500500
*
501501
* This can be used to cache kernel binaries after compilation for a
502502
* specific device. The kernel can be recreated by calling
503-
* kernel_alloc with the binary and size and passing `GA_USE_BINARY`
503+
* gpukernel_alloc with the binary and size and passing `GA_USE_BINARY`
504504
* as the use flags.
505505
*
506506
* The returned pointer is allocated and must be freed by the caller.

src/gpuarray_array.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
7171
if (ctx->extcopy_cache == NULL)
7272
ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash,
7373
extcopy_free,
74-
(cache_freev_fn)GpuElemwise_free);
74+
(cache_freev_fn)GpuElemwise_free,
75+
ctx->err);
7576
if (ctx->extcopy_cache == NULL)
7677
return GA_MISC_ERROR;
7778
if (cache_add(ctx->extcopy_cache, aa, k) != 0)

src/gpuarray_blas_cuda_cublas.c

+11-9
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ static inline cublasOperation_t convT(cb_transpose trans) {
2424
}
2525
}
2626

27-
static const char *error(cublasStatus_t err) {
27+
static const char *estr(cublasStatus_t err) {
2828
switch (err) {
2929
case CUBLAS_STATUS_SUCCESS:
3030
return "(cublas) Operation completed successfully.";
@@ -53,12 +53,12 @@ static const char *error(cublasStatus_t err) {
5353

5454
static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) {
5555
return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? GA_DEVSUP_ERROR : GA_BLAS_ERROR,
56-
"%s: %s", msg, error(err));
56+
"%s: %s", msg, estr(err));
5757
}
5858

5959
#define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \
6060
cublasStatus_t err = (cmd); \
61-
if (err != CUBLAS_SUCCESS) { \
61+
if (err != CUBLAS_STATUS_SUCCESS) { \
6262
cuda_exit(ctx); \
6363
return error_cublas((ctx)->err, #cmd, err); \
6464
} \
@@ -525,13 +525,14 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
525525
const size_t threshold = 650;
526526
cb_transpose transT;
527527

528+
ASSERT_BUF(A[0]);
529+
ctx = A[0]->ctx;
530+
528531
if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
529532
LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
530533
LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
531534
return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
532535

533-
ASSERT_BUF(A[0]);
534-
ctx = A[0]->ctx;
535536
h = (blas_handle *)ctx->blas_handle;
536537
cuda_enter(ctx);
537538

@@ -623,7 +624,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
623624
gpudata_release(Ta);
624625
if (err != CUBLAS_STATUS_SUCCESS) {
625626
cuda_exit(ctx);
626-
return error_cublas(ctx, "cublasSgemmBatched", err);
627+
return error_cublas(ctx->err, "cublasSgemmBatched", err);
627628
}
628629

629630
for (i = 0; i < batchCount; i++) {
@@ -651,13 +652,14 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
651652
const size_t threshold = 650;
652653
cb_transpose transT;
653654

655+
ASSERT_BUF(A[0]);
656+
ctx = A[0]->ctx;
657+
654658
if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
655659
LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
656660
LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
657661
return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
658662

659-
ASSERT_BUF(A[0]);
660-
ctx = A[0]->ctx;
661663
h = (blas_handle *)ctx->blas_handle;
662664
cuda_enter(ctx);
663665

@@ -697,7 +699,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
697699
(double*)A[i]->ptr + offA[i], lda,
698700
(double*)B[i]->ptr + offB[i], ldb,
699701
&beta,
700-
(double*)C[i]->ptr + offC[i], ldc);
702+
(double*)C[i]->ptr + offC[i], ldc));
701703

702704
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
703705
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));

src/gpuarray_buffer.c

+6-2
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,12 @@ gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count,
163163
const char *fname, unsigned int numargs,
164164
const int *typecodes, int flags, int *ret,
165165
char **err_str) {
166-
return ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs,
167-
typecodes, flags, ret, err_str);
166+
gpukernel *res;
167+
res = ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs,
168+
typecodes, flags, err_str);
169+
if (res == NULL && ret)
170+
*ret = ctx->err->code;
171+
return res;
168172
}
169173

170174
void gpukernel_retain(gpukernel *k) {

src/gpuarray_buffer_collectives.c

+1-4
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,7 @@ void gpucomm_free(gpucomm* comm) {
2222
}
2323

2424
const char* gpucomm_error(gpucontext* ctx) {
25-
if (ctx->comm_ops != NULL)
26-
return ctx->error->msg;
27-
return "No collective ops available, API error. Is a collectives library "
28-
"installed?";
25+
return ctx->err->msg;
2926
}
3027

3128
gpucontext* gpucomm_context(gpucomm* comm) {

0 commit comments

Comments
 (0)