diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 625542e0..d500a08b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -188,6 +188,11 @@ __STATIC_INLINE__ ggml_fp16_t ggml_ext_tensor_get_f16(const ggml_tensor* tensor,
     return *(ggml_fp16_t*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
 }
 
+__STATIC_INLINE__ ggml_bf16_t ggml_ext_tensor_get_bf16(const ggml_tensor* tensor, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
+    GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+    return *(ggml_bf16_t*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
+}
+
 __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) {
     float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic);
     if (scale) {
@@ -231,6 +236,8 @@ __STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_
                     printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_ext_tensor_get_f32(tensor, i0, i1, i2, i3));
                 } else if (tensor->type == GGML_TYPE_F16) {
                     printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_fp16_to_fp32(ggml_ext_tensor_get_f16(tensor, i0, i1, i2, i3)));
+                } else if (tensor->type == GGML_TYPE_BF16) {
+                    printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_bf16_to_fp32(ggml_ext_tensor_get_bf16(tensor, i0, i1, i2, i3)));
                 } else if (tensor->type == GGML_TYPE_I32) {
                     printf(" [%d, %d, %d, %d] = %i3\n", i3, i2, i1, i0, ggml_ext_tensor_get_i32(tensor, i0, i1, i2, i3));
                 }
@@ -1344,7 +1351,7 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }
 
 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16 || tensor->type == GGML_TYPE_I32);
     float value;
     if (tensor->type == GGML_TYPE_F32) {
         ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
@@ -1352,6 +1359,10 @@ __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
         ggml_fp16_t f16_value;
         ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
         value = ggml_fp16_to_fp32(f16_value);
+    } else if (tensor->type == GGML_TYPE_BF16) {
+        ggml_bf16_t bf16_value;
+        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
+        value = ggml_bf16_to_fp32(bf16_value);
     } else { // GGML_TYPE_I32
         int int32_value;
         ggml_backend_tensor_get(tensor, &int32_value, 0, sizeof(int32_value));
@@ -2011,11 +2022,19 @@ class Linear : public UnaryBlock {
 };
 
 __STATIC_INLINE__ bool support_get_rows(ggml_type wtype) {
-    std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
-    if (allow_types.find(wtype) != allow_types.end()) {
-        return true;
+    switch (wtype) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
     }
-    return false;
 }
 
 class Embedding : public UnaryBlock {
diff --git a/model.cpp b/model.cpp
index da77afed..ba97f0af 100644
--- a/model.cpp
+++ b/model.cpp
@@ -111,8 +111,12 @@ const char* unused_tensors[] = {
     "embedding_manager",
     "denoiser.sigmas",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
+    "text_encoders.t5xxl.logit_scale", // only used during training
+    "text_encoders.t5xxl.transformer.scaled_fp8",
     "text_encoders.qwen2vl.output.weight",
     "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.qwen2vl.logit_scale", // only used during training
+    "text_encoders.qwen2vl.transformer.scaled_fp8",
 };
 
 bool is_unused_tensor(std::string name) {
@@ -742,6 +746,10 @@ std::string convert_tensor_name(std::string name) {
 
 void add_preprocess_tensor_storage_types(String2GGMLType& tensor_storages_types, std::string name, enum ggml_type type) {
     std::string new_name = convert_tensor_name(name);
+    // TODO: enable bf16 once it is supported by the relevant ggml ops
+    if (type == GGML_TYPE_BF16) {
+        type = GGML_TYPE_F32;
+    }
 
     if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) {
         size_t prefix_size = new_name.find("attn.in_proj_weight");
@@ -814,126 +822,47 @@ void preprocess_tensor(TensorStorage tensor_storage,
     }
 }
 
-float bf16_to_f32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
-uint16_t f8_e4m3_to_f16(uint8_t f8) {
-    // do we need to support uz?
-
-    const uint32_t exponent_bias = 7;
-    if (f8 == 0xff) {
-        return ggml_fp32_to_fp16(-NAN);
-    } else if (f8 == 0x7f) {
-        return ggml_fp32_to_fp16(NAN);
-    }
-
-    uint32_t sign = f8 & 0x80;
-    uint32_t exponent = (f8 & 0x78) >> 3;
-    uint32_t mantissa = f8 & 0x07;
-    uint32_t result = sign << 24;
-    if (exponent == 0) {
-        if (mantissa > 0) {
-            exponent = 0x7f - exponent_bias;
-
-            // yes, 2 times
-            if ((mantissa & 0x04) == 0) {
-                mantissa &= 0x03;
-                mantissa <<= 1;
-                exponent -= 1;
-            }
-            if ((mantissa & 0x04) == 0) {
-                mantissa &= 0x03;
-                mantissa <<= 1;
-                exponent -= 1;
-            }
-
-            result |= (mantissa & 0x03) << 21;
-            result |= exponent << 23;
-        }
-    } else {
-        result |= mantissa << 20;
-        exponent += 0x7f - exponent_bias;
-        result |= exponent << 23;
-    }
-
-    return ggml_fp32_to_fp16(*reinterpret_cast<float*>(&result));
+float f8_e4m3fn_to_f32(uint8_t f8) {
+    switch (f8) {
+        case 0:
+            return 0.0f;
+        case 127:
+            return NAN;
+        case 128:
+            return -0.0f;
+        case 255:
+            return -NAN;
+    }
+
+    const uint32_t exponent_bias_delta = 127 - 7;
+    uint32_t exponent = ((f8 >> 3) & 15) + exponent_bias_delta;
+    uint32_t mantissa = f8 & 7;
+
+    // subnormal
+    if (exponent == exponent_bias_delta) {
+        // yes, 2 times
+        if ((mantissa & 4) == 0) {
+            mantissa &= 3;
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+        if ((mantissa & 4) == 0) {
+            mantissa &= 3;
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+        // (mantisa & 4) == 4
+        mantissa &= 3;
+        mantissa <<= 1;
+    }
+
+    const uint32_t sign = f8 >> 7;
+    const uint32_t result = (sign << 31) | (exponent << 23) | (mantissa << 20);
+    return *reinterpret_cast<const float*>(&result);
 }
 
 uint16_t f8_e5m2_to_f16(uint8_t fp8) {
-    uint8_t sign = (fp8 >> 7) & 0x1;
-    uint8_t exponent = (fp8 >> 2) & 0x1F;
-    uint8_t mantissa = fp8 & 0x3;
-
-    uint16_t fp16_sign = sign << 15;
-    uint16_t fp16_exponent;
-    uint16_t fp16_mantissa;
-
-    if (exponent == 0 && mantissa == 0) { // zero
-        return fp16_sign;
-    }
-
-    if (exponent == 0x1F) { // NAN and INF
-        fp16_exponent = 0x1F;
-        fp16_mantissa = mantissa ? (mantissa << 8) : 0;
-        return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
-    }
-
-    if (exponent == 0) { // subnormal numbers
-        fp16_mantissa = (mantissa << 8);
-        return fp16_sign | fp16_mantissa;
-    }
-
-    // normal numbers
-    int16_t true_exponent = (int16_t)exponent - 15 + 15;
-    if (true_exponent <= 0) {
-        fp16_exponent = 0;
-        fp16_mantissa = (mantissa << 8);
-    } else if (true_exponent >= 0x1F) {
-        fp16_exponent = 0x1F;
-        fp16_mantissa = 0;
-    } else {
-        fp16_exponent = (uint16_t)true_exponent;
-        fp16_mantissa = mantissa << 8;
-    }
-
-    return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
-}
-
-void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = bf16_to_f32(src[i]);
-    }
-}
-
-void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = f8_e4m3_to_f16(src[i]);
-    }
-}
-
-void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = f8_e5m2_to_f16(src[i]);
-    }
-}
-
-void f64_to_f32_vec(double* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = 0; i < n; i++) {
-        dst[i] = (float)src[i];
-    }
-}
-
-void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = 0; i < n; i++) {
-        dst[i] = (int32_t)src[i];
-    }
+    return static_cast<uint16_t>(fp8) << 8;
 }
 
 void convert_tensor(void* src,
@@ -942,49 +871,38 @@ void convert_tensor(void* src,
                     ggml_type dst_type,
                     int nrows,
                     int n_per_row) {
+    GGML_ASSERT(src != dst);
     int n = nrows * n_per_row;
     if (src_type == dst_type) {
         size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type);
-        memcpy(((char*)dst), ((char*)src), nbytes);
-    } else if (src_type == GGML_TYPE_F32) {
-        if (dst_type == GGML_TYPE_F16) {
-            ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
-        } else {
-            std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
-            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
-        }
-    } else if (dst_type == GGML_TYPE_F32) {
-        if (src_type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n);
-        } else {
-            auto qtype = ggml_get_type_traits(src_type);
-            if (qtype->to_float == nullptr) {
-                throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
-                                                ggml_type_name(src_type)));
-            }
-            qtype->to_float(src, (float*)dst, n);
-        }
-    } else {
-        // src_type == GGML_TYPE_F16 => dst_type is quantized
-        // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized
+        memcpy(dst, src, nbytes);
+        return;
+    }
+
+    std::vector<float> buffer;
+    float* ptr = static_cast<float*>(src);
+
+    // convert src_type to f32. allocate a buffer when dequantizing if necessary.
+    if (src_type != GGML_TYPE_F32) {
         auto qtype = ggml_get_type_traits(src_type);
         if (qtype->to_float == nullptr) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
                                            ggml_type_name(src_type)));
        }
-        std::vector<char> buf;
-        buf.resize(sizeof(float) * n);
-        char* src_data_f32 = buf.data();
-        qtype->to_float(src, (float*)src_data_f32, n);
-        if (dst_type == GGML_TYPE_F16) {
-            ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
-        } else {
-            std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
-            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
+        if (dst_type == GGML_TYPE_F32) {
+            // no need to re-quant - write to dest directly
+            qtype->to_float(src, static_cast<float*>(dst), n);
+            return;
        }
+        buffer.resize(n);
+        ptr = buffer.data();
+        qtype->to_float(src, ptr, n);
    }
+
+    // convert f32 to dst_type
+    std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
+    const float* im = imatrix.data();
+    ggml_quantize_chunk(dst_type, ptr, dst, 0, nrows, n_per_row, im);
 }
 
 /*================================================= ModelLoader ==================================================*/
@@ -1194,22 +1112,15 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
 
 /*================================================= SafeTensorsModelLoader ==================================================*/
 
+// most ggml operations only support f16, bf16, and f32 tensors.
 ggml_type str_to_ggml_type(const std::string& dtype) {
     ggml_type ttype = GGML_TYPE_COUNT;
     if (dtype == "F16") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "BF16") {
+        ttype = GGML_TYPE_BF16;
+    } else if (dtype == "F32" || dtype == "F64" || dtype == "F8_E4M3" || dtype == "F8_E5M2" || dtype == "I64") {
         ttype = GGML_TYPE_F32;
-    } else if (dtype == "F32") {
-        ttype = GGML_TYPE_F32;
-    } else if (dtype == "F64") {
-        ttype = GGML_TYPE_F32;
-    } else if (dtype == "F8_E4M3") {
-        ttype = GGML_TYPE_F16;
-    } else if (dtype == "F8_E5M2") {
-        ttype = GGML_TYPE_F16;
-    } else if (dtype == "I64") {
-        ttype = GGML_TYPE_I32;
     }
     return ttype;
 }
@@ -1327,24 +1238,21 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 
         size_t tensor_data_size = end - begin;
 
-        if (dtype == "BF16") {
-            tensor_storage.is_bf16 = true;
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
-        } else if (dtype == "F8_E4M3") {
-            tensor_storage.is_f8_e4m3 = true;
-            // f8 -> f16
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        if (dtype == "F8_E4M3") {
+            tensor_storage.is_f8_e4m3fn = true;
+            // f8 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 4);
         } else if (dtype == "F8_E5M2") {
             tensor_storage.is_f8_e5m2 = true;
-            // f8 -> f16
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+            // f8 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 4);
         } else if (dtype == "F64") {
             tensor_storage.is_f64 = true;
             // f64 -> f32
             GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
         } else if (dtype == "I64") {
             tensor_storage.is_i64 = true;
-            // i64 -> i32
+            // i64 -> f32
            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
        } else {
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
@@ -2036,6 +1944,33 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
+bool TensorStorage::read_data(void* buf, std::ifstream& file) const {
+    file.seekg(this->offset);
+    file.read(static_cast<char*>(buf), this->nbytes_to_read());
+    return !file.fail();
+}
+
+bool TensorStorage::read_data(void* buf, struct zip_t* zip, std::atomic<int64_t>& memcpy_time_ms) const {
+    size_t n = this->nbytes_to_read();
+    if (zip_entry_openbyindex(zip, this->index_in_zip)) {
+        return false;
+    }
+    bool failed = false;
+    size_t entry_size = zip_entry_size(zip);
+    if (entry_size != n) {
+        int64_t t_memcpy_start;
+        std::vector<char> read_buffer(entry_size);
+        failed = zip_entry_noallocread(zip, read_buffer.data(), entry_size) != entry_size;
+        t_memcpy_start = ggml_time_ms();
+        memcpy(buf, read_buffer.data() + this->offset, n);
+        memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
+    } else {
+        failed = zip_entry_noallocread(zip, buf, n) != n;
+    }
+    zip_entry_close(zip);
+    return !failed;
+}
+
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
     int64_t process_time_ms = 0;
     std::atomic<int64_t> read_time_ms(0);
@@ -2157,6 +2092,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<bool> failed(false);
     std::vector<std::thread> workers;
 
+    std::unordered_map<std::string, int> scale_idx;
+    std::unordered_map<std::string, int> scale_count;
+    for (int i = 0; i < file_tensors.size(); i++) {
+        const TensorStorage* tensor = file_tensors[i];
+        if (ends_with(tensor->name, ".scale_weight")) {
+            std::string new_name = tensor->name.substr(0, tensor->name.size() - strlen(".scale_weight")) + ".weight";
+            GGML_ASSERT(tensor->nelements() == 1 && tensor->type == GGML_TYPE_F32 && tensor->nbytes_to_read() == 4);
+            scale_idx[new_name] = i;
+            scale_count[new_name]++;
+        } else if (ends_with(tensor->name, ".weight") && (tensor->is_f8_e4m3fn || tensor->is_f8_e5m2)) {
+            scale_count[tensor->name]--;
+        }
+    }
+    for (auto& x : scale_count) {
+        if (x.second > 0) {
+            LOG_ERROR("f8 weight not found for scale_weight: '%s'", x.first.c_str());
+            return false;
+        }
+    }
+
     for (int i = 0; i < n_threads; ++i) {
         workers.emplace_back([&, file_path, is_zip]() {
             std::ifstream file;
@@ -2180,17 +2135,73 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             std::vector<uint8_t> read_buffer;
             std::vector<uint8_t> convert_buffer;
 
+            auto read_data = [&](int64_t t0, const TensorStorage& tensor_storage, void* buf) {
+                bool fail = false;
+                if (is_zip) {
+                    fail |= !tensor_storage.read_data(buf, zip, memcpy_time_ms);
+                } else {
+                    fail |= !tensor_storage.read_data(buf, file);
+                }
+                float scale = 1;
+                if (scale_idx.count(tensor_storage.name)) {
+                    const TensorStorage* tensor = file_tensors[scale_idx[tensor_storage.name]];
+                    if (is_zip) {
+                        fail |= !tensor->read_data(&scale, zip, memcpy_time_ms);
+                    } else {
+                        fail |= !tensor->read_data(&scale, file);
+                    }
+                }
+                if (fail) {
+                    failed = true;
+                    LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
+                }
+                int64_t t1 = ggml_time_ms();
+                read_time_ms.fetch_add(t1 - t0);
+                t0 = t1;
+                int count = tensor_storage.nelements();
+                if (tensor_storage.is_f8_e4m3fn) {
+                    for (int64_t i = count - 1; i >= 0; i--) {
+                        static_cast<float*>(buf)[i] = f8_e4m3fn_to_f32(static_cast<uint8_t*>(buf)[i]);
+                        if (scale != 1) {
+                            static_cast<float*>(buf)[i] *= scale;
+                        }
+                    }
+                } else if (tensor_storage.is_f8_e5m2) {
+                    for (int64_t i = count - 1; i >= 0; i--) {
+                        static_cast<float*>(buf)[i] =
+                            ggml_fp16_to_fp32(f8_e5m2_to_f16(static_cast<uint8_t*>(buf)[i]));
+                        if (scale != 1) {
+                            static_cast<float*>(buf)[i] *= scale;
+                        }
+                    }
+                } else if (tensor_storage.is_f64) {
+                    for (int64_t i = 0; i < count; i++) {
+                        static_cast<float*>(buf)[i] = static_cast<double*>(buf)[i];
+                    }
+                } else if (tensor_storage.is_i64) {
+                    for (int64_t i = 0; i < count; i++) {
+                        static_cast<float*>(buf)[i] = static_cast<int64_t*>(buf)[i];
+                    }
+                }
+                t1 = ggml_time_ms();
+                convert_time_ms.fetch_add(t1 - t0);
+                return t1;
+            };
+
+            int64_t t0 = ggml_time_ms();
             while (true) {
-                int64_t t0, t1;
+                int64_t t1;
                 size_t idx = tensor_idx.fetch_add(1);
                 if (idx >= file_tensors.size() || failed) {
                     break;
                 }
 
                 const TensorStorage& tensor_storage = *file_tensors[idx];
-                ggml_tensor* dst_tensor = nullptr;
+                if (ends_with(tensor_storage.name, ".scale_weight")) {
+                    continue;
+                }
 
-                t0 = ggml_time_ms();
+                ggml_tensor* dst_tensor = nullptr;
 
                 if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
                     LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
@@ -2201,136 +2212,41 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                 if (dst_tensor == nullptr) {
                     t1 = ggml_time_ms();
                     read_time_ms.fetch_add(t1 - t0);
+                    t0 = t1;
                     continue;
                 }
 
                 size_t nbytes_to_read = tensor_storage.nbytes_to_read();
 
-                auto read_data = [&](char* buf, size_t n) {
-                    if (zip != nullptr) {
-                        zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
-                        size_t entry_size = zip_entry_size(zip);
-                        if (entry_size != n) {
-                            int64_t t_memcpy_start;
-                            read_buffer.resize(entry_size);
-                            zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
-                            t_memcpy_start = ggml_time_ms();
-                            memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
-                            memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
-                        } else {
-                            zip_entry_noallocread(zip, (void*)buf, n);
-                        }
-                        zip_entry_close(zip);
-                    } else {
-                        file.seekg(tensor_storage.offset);
-                        file.read(buf, n);
-                        if (!file) {
-                            LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
-                            failed = true;
-                        }
-                    }
-                };
-
                 if (dst_tensor->buffer == nullptr || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
                     if (tensor_storage.type == dst_tensor->type) {
                         GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                        if (tensor_storage.is_f64 || tensor_storage.is_i64) {
-                            read_buffer.resize(tensor_storage.nbytes_to_read());
-                            read_data((char*)read_buffer.data(), nbytes_to_read);
-                        } else {
-                            read_data((char*)dst_tensor->data, nbytes_to_read);
-                        }
-                        t1 = ggml_time_ms();
-                        read_time_ms.fetch_add(t1 - t0);
-
-                        t0 = ggml_time_ms();
-                        if (tensor_storage.is_bf16) {
-                            // inplace op
-                            bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e4m3) {
-                            // inplace op
-                            f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e5m2) {
-                            // inplace op
-                            f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f64) {
-                            f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_i64) {
-                            i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
-                        }
-                        t1 = ggml_time_ms();
-                        convert_time_ms.fetch_add(t1 - t0);
+                        t0 = read_data(t0, tensor_storage, dst_tensor->data);
                     } else {
                         read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                        read_data((char*)read_buffer.data(), nbytes_to_read);
-                        t1 = ggml_time_ms();
-                        read_time_ms.fetch_add(t1 - t0);
-
-                        t0 = ggml_time_ms();
-                        if (tensor_storage.is_bf16) {
-                            // inplace op
-                            bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e4m3) {
-                            // inplace op
-                            f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e5m2) {
-                            // inplace op
-                            f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f64) {
-                            // inplace op
-                            f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_i64) {
-                            // inplace op
-                            i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                        }
-                        convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                        t0 = read_data(t0, tensor_storage, read_buffer.data());
+                        convert_tensor(read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                         t1 = ggml_time_ms();
                         convert_time_ms.fetch_add(t1 - t0);
+                        t0 = t1;
                     }
                 } else {
                     read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                    read_data((char*)read_buffer.data(), nbytes_to_read);
-                    t1 = ggml_time_ms();
-                    read_time_ms.fetch_add(t1 - t0);
-
-                    t0 = ggml_time_ms();
-                    if (tensor_storage.is_bf16) {
-                        // inplace op
-                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e4m3) {
-                        // inplace op
-                        f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e5m2) {
-                        // inplace op
-                        f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f64) {
-                        // inplace op
-                        f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_i64) {
-                        // inplace op
-                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                    }
-
-                    if (tensor_storage.type == dst_tensor->type) {
-                        // copy to device memory
-                        t1 = ggml_time_ms();
-                        convert_time_ms.fetch_add(t1 - t0);
-                        t0 = ggml_time_ms();
-                        ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                        t1 = ggml_time_ms();
-                        copy_to_backend_time_ms.fetch_add(t1 - t0);
-                    } else {
-                        // convert first, then copy to device memory
-
+                    uint8_t* ptr = read_buffer.data();
+                    t0 = read_data(t0, tensor_storage, ptr);
+                    if (tensor_storage.type != dst_tensor->type) {
                         convert_buffer.resize(ggml_nbytes(dst_tensor));
-                        convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                        ptr = convert_buffer.data();
+                        convert_tensor(read_buffer.data(), tensor_storage.type, ptr, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                         t1 = ggml_time_ms();
                         convert_time_ms.fetch_add(t1 - t0);
-                        t0 = ggml_time_ms();
-                        ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                        t1 = ggml_time_ms();
-                        copy_to_backend_time_ms.fetch_add(t1 - t0);
+                        t0 = t1;
                     }
+                    // copy to device memory
+                    ggml_backend_tensor_set(dst_tensor, ptr, 0, ggml_nbytes(dst_tensor));
+                    t1 = ggml_time_ms();
+                    copy_to_backend_time_ms.fetch_add(t1 - t0);
+                    t0 = t1;
                 }
             }
             if (zip != nullptr) {
diff --git a/model.h b/model.h
index f1711e67..ccde293e 100644
--- a/model.h
+++ b/model.h
@@ -134,8 +134,7 @@ enum PMVersion {
 struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
-    bool is_bf16 = false;
-    bool is_f8_e4m3 = false;
+    bool is_f8_e4m3fn = false;
     bool is_f8_e5m2 = false;
     bool is_f64 = false;
     bool is_i64 = false;
@@ -168,8 +167,8 @@ struct TensorStorage {
     }
 
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
-            return nbytes() / 2;
+        if (is_f8_e4m3fn || is_f8_e5m2) {
+            return nbytes() / 4;
         } else if (is_f64 || is_i64) {
            return nbytes() * 2;
        } else {
@@ -215,17 +214,17 @@ struct TensorStorage {
 
     std::string to_string() const {
         std::stringstream ss;
-        const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
-            type_name = "f8_e4m3";
+        const char* type_name;
+        if (is_f8_e4m3fn) {
+            type_name = "f8_e4m3fn";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
         } else if (is_f64) {
             type_name = "f64";
         } else if (is_i64) {
             type_name = "i64";
+        } else {
+            type_name = ggml_type_name(type);
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";
@@ -238,6 +237,9 @@ struct TensorStorage {
         ss << "]";
         return ss.str();
     }
+
+    bool read_data(void* buf, std::ifstream& file) const;
+    bool read_data(void* buf, struct zip_t* zip, std::atomic<int64_t>& memcpy_time_ms) const;
 };
 
 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
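Note (not part of the patch): the replacement f8_e4m3fn_to_f32 decoder in model.cpp treats the byte as 1 sign bit, 4 exponent bits with bias 7, and 3 mantissa bits, with 0x7f/0xff reserved for NaN and no infinities (the "FN" variant), and maps subnormals to 2^-6 * m/8. The standalone sketch below is only an illustration of that layout for sanity-checking; the helper name f8_e4m3fn_to_f32_ref and the sample bytes are illustrative and duplicate the decoding rule rather than linking against model.cpp.

// Standalone sanity check of the E4M3FN decoding rule (illustrative only).
#include <cmath>
#include <cstdint>
#include <cstdio>

static float f8_e4m3fn_to_f32_ref(uint8_t f8) {
    // Reference decode: sign * 2^(e-7) * (1 + m/8) for normals,
    // sign * 2^-6 * (m/8) for subnormals; 0x7f/0xff are NaN, no infinities.
    uint32_t sign = f8 >> 7, exp = (f8 >> 3) & 15, man = f8 & 7;
    if (exp == 15 && man == 7) {
        return sign ? -NAN : NAN;
    }
    float mag = (exp == 0) ? std::ldexp((float)man / 8.0f, -6)
                           : std::ldexp(1.0f + (float)man / 8.0f, (int)exp - 7);
    return sign ? -mag : mag;
}

int main() {
    // Representative encodings: +0, smallest subnormal (2^-9), 1.0, largest finite (448).
    const uint8_t samples[] = {0x00, 0x01, 0x38, 0x7e};
    for (uint8_t s : samples) {
        printf("0x%02x -> %g\n", s, f8_e4m3fn_to_f32_ref(s));
    }
    return 0;
}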