Skip to content

[SYCL] Remove the workaround for mmap() allocation on Windows and remove useless waits #13482

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 18 additions & 36 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

if (padded_size > original_size && tensor->view_src == nullptr) {
SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset(
(char *)tensor->data + original_size, 0,
padded_size - original_size).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(
ctx->stream->memset((char *) tensor->data + original_size, 0, padded_size - original_size)));
}
}
return GGML_STATUS_SUCCESS;
Expand All @@ -385,16 +384,16 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
ggml_sycl_set_device(ctx->device);
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
SYCL_CHECK(
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
#ifndef _WIN32
// Note: Use a host buffer to stage the data from mmap(), then copy it to the device. This is a workaround for an mmap() issue on PVC GPUs.
// This function is called while loading the model from disk. Replacing the dynamic allocation with a reusable memory buffer would not save time and would introduce a potential memory-leak risk here.
char* host_buf = (char*)malloc(size);
char * host_buf = (char *) malloc(size);
memcpy(host_buf, data, size);
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
.wait()));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
free(host_buf);
#else
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
#endif
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
Expand Down Expand Up @@ -498,9 +497,7 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
SYCL_CHECK(
CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));

SYCL_CHECK(CHECK_TRY_ERROR((*stream)
.memset(ctx->dev_ptr, value, buffer->size)
.wait()));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(ctx->dev_ptr, value, buffer->size)));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
Expand All @@ -522,7 +519,6 @@ static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer,
}
void * target_ptr = static_cast<char *>(tensor->data) + offset;
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(target_ptr, value, size)));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).wait()));
}

static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) {
Expand Down Expand Up @@ -841,10 +837,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
the error codes. The original code was commented out and a warning
string was inserted. You need to rewrite this code.
*/
SYCL_CHECK(CHECK_TRY_ERROR(
(*stream)
.memset(buf + original_size, 0, size - original_size)
.wait()));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(buf + original_size, 0, size - original_size)));
}

extra->data_device[i] = buf;
Expand Down Expand Up @@ -909,10 +902,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
*/
ggml_sycl_set_device(i);
const queue_ptr stream = ctx->streams[i];
SYCL_CHECK(CHECK_TRY_ERROR(
(*stream)
.memcpy(extra->data_device[i], buf_host, original_size)
.wait()));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(extra->data_device[i], buf_host, original_size)));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ggml_backend_sycl_split_buffer_set_tensor, ggml_backend_sycl_split_buffer_get_tensor are expected to be blocking so these should keep the wait. Even if we have in order queues the host could free the data or try reading it before the operation completes.
We have to expect the host would use the equivalent async functions when possible.

}
}
catch (sycl::exception const &exc) {
Expand Down Expand Up @@ -962,10 +952,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
*/
ggml_sycl_set_device(i);
const queue_ptr stream = ctx->streams[i];
SYCL_CHECK(CHECK_TRY_ERROR(
(*stream)
.memcpy(buf_host, extra->data_device[i], original_size)
.wait()));
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(buf_host, extra->data_device[i], original_size)));
}
}
catch (sycl::exception const &exc) {
Expand Down Expand Up @@ -2502,10 +2489,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
if (i != ctx.device) {
if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
src1_ddq_i, src1_ddq_i_source,
src1_ncols * src1_padded_col_size * q8_1_ts /
q8_1_bs).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
src1_ddq_i, src1_ddq_i_source, src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs)));
Comment on lines +2492 to +2493
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understand, this copy happens when two different devices are used. We don't test that, but given that events are not used to synchronize, I think the wait should stay for now. We could add a comment to explain this and note that it could be improved by using events.

} else {

float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
Expand Down Expand Up @@ -2570,9 +2555,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0;
SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(dhf_dst_i, dst_dd_i,
src1_ncols * ne0 * sizeof(float)).wait()));
SYCL_CHECK(
CHECK_TRY_ERROR(stream->memcpy(dhf_dst_i, dst_dd_i, src1_ncols * ne0 * sizeof(float))));
Comment on lines +2558 to +2559
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the comment above: "copy dst to host or other device if necessary". Similarly, I think we need to keep this wait. If dst is on the host we'll probably always want to keep it; if it's on the device, it could maybe be improved with events.

}
}

Expand Down Expand Up @@ -3740,8 +3724,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,

GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
data, (const char *)tensor->data + offset, size).wait()));
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(data, (const char *) tensor->data + offset, size)));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
Expand All @@ -3760,8 +3743,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
was inserted. You need to rewrite this code.
*/
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
dst->data, src->data, ggml_nbytes(dst)).wait()));
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(dst->data, src->data, ggml_nbytes(dst))));
return true;
}

Expand Down
4 changes: 0 additions & 4 deletions tools/llama-bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,6 @@ Using the `-d <n>` option, each test can be run at a specified context depth, pr

For a description of the other options, see the [main example](../main/README.md).

Note:

- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.

Comment on lines -83 to -86
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this still exist on Linux? In that case we still have the workaround for Linux.
Maybe we should mention that Linux still has it at the moment and Windows does not?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @AD2605 for your comment. The issue is still present for Battlemage on Linux, but the workaround already in place in master prevents the hang mentioned, even with `--mmp 1`. So it is not necessary to force the usage of `--mmp 0`.

## Examples

### Text generation with different models
Expand Down
Loading