-
Notifications
You must be signed in to change notification settings - Fork 11.8k
[SYCL] Overcoming workaround for mmap() allocation on Windows and remove useless wait #13482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -364,9 +364,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, | |
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); | ||
|
||
if (padded_size > original_size && tensor->view_src == nullptr) { | ||
SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset( | ||
(char *)tensor->data + original_size, 0, | ||
padded_size - original_size).wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR( | ||
ctx->stream->memset((char *) tensor->data + original_size, 0, padded_size - original_size))); | ||
} | ||
} | ||
return GGML_STATUS_SUCCESS; | ||
|
@@ -385,16 +384,16 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, | |
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; | ||
ggml_sycl_set_device(ctx->device); | ||
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); | ||
SYCL_CHECK( | ||
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); | ||
#ifndef _WIN32 | ||
// Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU. | ||
// This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here. | ||
char* host_buf = (char*)malloc(size); | ||
char * host_buf = (char *) malloc(size); | ||
memcpy(host_buf, data, size); | ||
SYCL_CHECK( | ||
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size) | ||
.wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait())); | ||
free(host_buf); | ||
#else | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait())); | ||
#endif | ||
} | ||
catch (sycl::exception const &exc) { | ||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ | ||
|
@@ -498,9 +497,7 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, | |
SYCL_CHECK( | ||
CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); | ||
|
||
SYCL_CHECK(CHECK_TRY_ERROR((*stream) | ||
.memset(ctx->dev_ptr, value, buffer->size) | ||
.wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(ctx->dev_ptr, value, buffer->size))); | ||
} | ||
catch (sycl::exception const &exc) { | ||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ | ||
|
@@ -522,7 +519,6 @@ static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, | |
} | ||
void * target_ptr = static_cast<char *>(tensor->data) + offset; | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(target_ptr, value, size))); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).wait())); | ||
} | ||
|
||
static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) { | ||
|
@@ -841,10 +837,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, | |
the error codes. The original code was commented out and a warning | ||
string was inserted. You need to rewrite this code. | ||
*/ | ||
SYCL_CHECK(CHECK_TRY_ERROR( | ||
(*stream) | ||
.memset(buf + original_size, 0, size - original_size) | ||
.wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(buf + original_size, 0, size - original_size))); | ||
} | ||
|
||
extra->data_device[i] = buf; | ||
|
@@ -909,10 +902,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, | |
*/ | ||
ggml_sycl_set_device(i); | ||
const queue_ptr stream = ctx->streams[i]; | ||
SYCL_CHECK(CHECK_TRY_ERROR( | ||
(*stream) | ||
.memcpy(extra->data_device[i], buf_host, original_size) | ||
.wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(extra->data_device[i], buf_host, original_size))); | ||
} | ||
} | ||
catch (sycl::exception const &exc) { | ||
|
@@ -962,10 +952,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, | |
*/ | ||
ggml_sycl_set_device(i); | ||
const queue_ptr stream = ctx->streams[i]; | ||
SYCL_CHECK(CHECK_TRY_ERROR( | ||
(*stream) | ||
.memcpy(buf_host, extra->data_device[i], original_size) | ||
.wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(buf_host, extra->data_device[i], original_size))); | ||
} | ||
} | ||
catch (sycl::exception const &exc) { | ||
|
@@ -2502,10 +2489,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten | |
if (i != ctx.device) { | ||
if (convert_src1_to_q8_1) { | ||
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; | ||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( | ||
src1_ddq_i, src1_ddq_i_source, | ||
src1_ncols * src1_padded_col_size * q8_1_ts / | ||
q8_1_bs).wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( | ||
src1_ddq_i, src1_ddq_i_source, src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs))); | ||
Comment on lines
+2492
to
+2493
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. From what I understand, this copy happens when 2 different devices are used. We don't test that, but given that events are not used to synchronize, I think the wait should stay for now. We could add a comment to explain this and note that it could be improved by using events. |
||
} else { | ||
|
||
float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device]; | ||
|
@@ -2570,9 +2555,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten | |
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); | ||
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); | ||
dhf_dst_i += src1_col_0*ne0; | ||
SYCL_CHECK(CHECK_TRY_ERROR( | ||
stream->memcpy(dhf_dst_i, dst_dd_i, | ||
src1_ncols * ne0 * sizeof(float)).wait())); | ||
SYCL_CHECK( | ||
CHECK_TRY_ERROR(stream->memcpy(dhf_dst_i, dst_dd_i, src1_ncols * ne0 * sizeof(float)))); | ||
Comment on lines
+2558
to
+2559
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From the comment above: |
||
} | ||
} | ||
|
||
|
@@ -3740,8 +3724,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, | |
|
||
GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); | ||
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); | ||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( | ||
data, (const char *)tensor->data + offset, size).wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(data, (const char *) tensor->data + offset, size))); | ||
} | ||
catch (sycl::exception const &exc) { | ||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ | ||
|
@@ -3760,8 +3743,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, | |
was inserted. You need to rewrite this code. | ||
*/ | ||
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); | ||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( | ||
dst->data, src->data, ggml_nbytes(dst)).wait())); | ||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(dst->data, src->data, ggml_nbytes(dst)))); | ||
return true; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,10 +80,6 @@ Using the `-d <n>` option, each test can be run at a specified context depth, pr | |
|
||
For a description of the other options, see the [main example](../main/README.md). | ||
|
||
Note: | ||
|
||
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`. | ||
|
||
Comment on lines
-83
to
-86
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Doesn't this still exist on Linux? And hence we still have the workaround for Linux. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you @AD2605 for your comment. The issue is still present for Battlemage on Linux, but the workaround already in place in master prevents the hanging mentioned even with |
||
## Examples | ||
|
||
### Text generation with different models | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
I think `ggml_backend_sycl_split_buffer_set_tensor` and `ggml_backend_sycl_split_buffer_get_tensor` are expected to be blocking, so these should keep the wait. Even if we have in-order queues, the host could free the data or try reading it before the operation completes. We have to expect the host would use the equivalent `async` functions when possible.