diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 625542e0..d500a08b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -188,6 +188,11 @@ __STATIC_INLINE__ ggml_fp16_t ggml_ext_tensor_get_f16(const ggml_tensor* tensor,
     return *(ggml_fp16_t*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
 }
 
+__STATIC_INLINE__ ggml_bf16_t ggml_ext_tensor_get_bf16(const ggml_tensor* tensor, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
+    GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+    return *(ggml_bf16_t*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
+}
+
 __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) {
     float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic);
     if (scale) {
@@ -231,6 +236,8 @@ __STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_
                     printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_ext_tensor_get_f32(tensor, i0, i1, i2, i3));
                 } else if (tensor->type == GGML_TYPE_F16) {
                     printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_fp16_to_fp32(ggml_ext_tensor_get_f16(tensor, i0, i1, i2, i3)));
+                } else if (tensor->type == GGML_TYPE_BF16) {
+                    printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_bf16_to_fp32(ggml_ext_tensor_get_bf16(tensor, i0, i1, i2, i3)));
                 } else if (tensor->type == GGML_TYPE_I32) {
                     printf(" [%d, %d, %d, %d] = %i3\n", i3, i2, i1, i0, ggml_ext_tensor_get_i32(tensor, i0, i1, i2, i3));
                 }
@@ -1344,7 +1351,7 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }
 
 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16 || tensor->type == GGML_TYPE_I32);
     float value;
     if (tensor->type == GGML_TYPE_F32) {
         ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
@@ -1352,6 +1359,10 @@ __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
         ggml_fp16_t f16_value;
         ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
         value = ggml_fp16_to_fp32(f16_value);
+    } else if (tensor->type == GGML_TYPE_BF16) {
+        ggml_bf16_t bf16_value;
+        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
+        value = ggml_bf16_to_fp32(bf16_value);
     } else { // GGML_TYPE_I32
         int int32_value;
         ggml_backend_tensor_get(tensor, &int32_value, 0, sizeof(int32_value));
@@ -2011,11 +2022,19 @@ class Linear : public UnaryBlock {
 };
 
 __STATIC_INLINE__ bool support_get_rows(ggml_type wtype) {
-    std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
-    if (allow_types.find(wtype) != allow_types.end()) {
-        return true;
+    switch (wtype) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
     }
-    return false;
 }
 
 class Embedding : public UnaryBlock {
diff --git a/model.cpp b/model.cpp
index da77afed..ba97f0af 100644
--- a/model.cpp
+++ b/model.cpp
@@ -111,8 +111,12 @@ const char* unused_tensors[] = {
     "embedding_manager",
     "denoiser.sigmas",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
+    "text_encoders.t5xxl.logit_scale", // only used during training
+    "text_encoders.t5xxl.transformer.scaled_fp8",
     "text_encoders.qwen2vl.output.weight",
     "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.qwen2vl.logit_scale", // only used during training
+    "text_encoders.qwen2vl.transformer.scaled_fp8",
 };
 
 bool is_unused_tensor(std::string name) {
@@ -742,6 +746,10 @@ std::string convert_tensor_name(std::string name) {
 
 void add_preprocess_tensor_storage_types(String2GGMLType& tensor_storages_types, std::string name, enum ggml_type type) {
     std::string new_name = convert_tensor_name(name);
+    // TODO: enable bf16 once it is supported by the relevant ggml ops
+    if (type == GGML_TYPE_BF16) {
+        type = GGML_TYPE_F32;
+    }
 
     if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) {
         size_t prefix_size = new_name.find("attn.in_proj_weight");
@@ -814,126 +822,47 @@ void preprocess_tensor(TensorStorage tensor_storage,
     }
 }
 
-float bf16_to_f32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
-uint16_t f8_e4m3_to_f16(uint8_t f8) {
-    // do we need to support uz?
-
-    const uint32_t exponent_bias = 7;
-    if (f8 == 0xff) {
-        return ggml_fp32_to_fp16(-NAN);
-    } else if (f8 == 0x7f) {
-        return ggml_fp32_to_fp16(NAN);
-    }
-
-    uint32_t sign = f8 & 0x80;
-    uint32_t exponent = (f8 & 0x78) >> 3;
-    uint32_t mantissa = f8 & 0x07;
-    uint32_t result = sign << 24;
-    if (exponent == 0) {
-        if (mantissa > 0) {
-            exponent = 0x7f - exponent_bias;
-
-            // yes, 2 times
-            if ((mantissa & 0x04) == 0) {
-                mantissa &= 0x03;
-                mantissa <<= 1;
-                exponent -= 1;
-            }
-            if ((mantissa & 0x04) == 0) {
-                mantissa &= 0x03;
-                mantissa <<= 1;
-                exponent -= 1;
-            }
-
-            result |= (mantissa & 0x03) << 21;
-            result |= exponent << 23;
-        }
-    } else {
-        result |= mantissa << 20;
-        exponent += 0x7f - exponent_bias;
-        result |= exponent << 23;
-    }
-
-    return ggml_fp32_to_fp16(*reinterpret_cast<float*>(&result));
+float f8_e4m3fn_to_f32(uint8_t f8) {
+    switch (f8) {
+        case 0:
+            return 0.0f;
+        case 127:
+            return NAN;
+        case 128:
+            return -0.0f;
+        case 255:
+            return -NAN;
+    }
+
+    const uint32_t exponent_bias_delta = 127 - 7;
+    uint32_t exponent = ((f8 >> 3) & 15) + exponent_bias_delta;
+    uint32_t mantissa = f8 & 7;
+
+    // subnormal
+    if (exponent == exponent_bias_delta) {
+        // yes, 2 times
+        if ((mantissa & 4) == 0) {
+            mantissa &= 3;
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+        if ((mantissa & 4) == 0) {
+            mantissa &= 3;
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+        // (mantisa & 4) == 4
+        mantissa &= 3;
+        mantissa <<= 1;
+    }
+
+    const uint32_t sign = f8 >> 7;
+    const uint32_t result = (sign << 31) | (exponent << 23) | (mantissa << 20);
+    return *reinterpret_cast<const float*>(&result);
 }
 
 uint16_t f8_e5m2_to_f16(uint8_t fp8) {
-    uint8_t sign = (fp8 >> 7) & 0x1;
-    uint8_t exponent = (fp8 >> 2) & 0x1F;
-    uint8_t mantissa = fp8 & 0x3;
-
-    uint16_t fp16_sign = sign << 15;
-    uint16_t fp16_exponent;
-    uint16_t fp16_mantissa;
-
-    if (exponent == 0 && mantissa == 0) { // zero
-        return fp16_sign;
-    }
-
-    if (exponent == 0x1F) { // NAN and INF
-        fp16_exponent = 0x1F;
-        fp16_mantissa = mantissa ? (mantissa << 8) : 0;
-        return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
-    }
-
-    if (exponent == 0) { // subnormal numbers
-        fp16_mantissa = (mantissa << 8);
-        return fp16_sign | fp16_mantissa;
-    }
-
-    // normal numbers
-    int16_t true_exponent = (int16_t)exponent - 15 + 15;
-    if (true_exponent <= 0) {
-        fp16_exponent = 0;
-        fp16_mantissa = (mantissa << 8);
-    } else if (true_exponent >= 0x1F) {
-        fp16_exponent = 0x1F;
-        fp16_mantissa = 0;
-    } else {
-        fp16_exponent = (uint16_t)true_exponent;
-        fp16_mantissa = mantissa << 8;
-    }
-
-    return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
-}
-
-void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = bf16_to_f32(src[i]);
-    }
-}
-
-void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = f8_e4m3_to_f16(src[i]);
-    }
-}
-
-void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = f8_e5m2_to_f16(src[i]);
-    }
-}
-
-void f64_to_f32_vec(double* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = 0; i < n; i++) {
-        dst[i] = (float)src[i];
-    }
-}
-
-void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = 0; i < n; i++) {
-        dst[i] = (int32_t)src[i];
-    }
+    return static_cast<uint16_t>(fp8) << 8;
 }
 
 void convert_tensor(void* src,
@@ -942,49 +871,38 @@ void convert_tensor(void* src,
                     ggml_type dst_type,
                     int nrows,
                     int n_per_row) {
+    GGML_ASSERT(src != dst);
     int n = nrows * n_per_row;
     if (src_type == dst_type) {
         size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type);
-        memcpy(((char*)dst), ((char*)src), nbytes);
-    } else if (src_type == GGML_TYPE_F32) {
-        if (dst_type == GGML_TYPE_F16) {
-            ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
-        } else {
-            std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
-            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
-        }
-    } else if (dst_type == GGML_TYPE_F32) {
-        if (src_type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n);
-        } else {
-            auto qtype = ggml_get_type_traits(src_type);
-            if (qtype->to_float == nullptr) {
-                throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
-                                                ggml_type_name(src_type)));
-            }
-            qtype->to_float(src, (float*)dst, n);
-        }
-    } else {
-        // src_type == GGML_TYPE_F16 => dst_type is quantized
-        // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized
+        memcpy(dst, src, nbytes);
+        return;
+    }
+
+    std::vector<float> buffer;
+    float* ptr = static_cast<float*>(src);
+
+    // convert src_type to f32. allocate a buffer when dequantizing if necessary.
+    if (src_type != GGML_TYPE_F32) {
         auto qtype = ggml_get_type_traits(src_type);
         if (qtype->to_float == nullptr) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
                                            ggml_type_name(src_type)));
        }
-        std::vector<char> buf;
-        buf.resize(sizeof(float) * n);
-        char* src_data_f32 = buf.data();
-        qtype->to_float(src, (float*)src_data_f32, n);
-        if (dst_type == GGML_TYPE_F16) {
-            ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
-        } else {
-            std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
-            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
+        if (dst_type == GGML_TYPE_F32) {
+            // no need to re-quant - write to dest directly
+            qtype->to_float(src, static_cast<float*>(dst), n);
+            return;
        }
+        buffer.resize(n);
+        ptr = buffer.data();
+        qtype->to_float(src, ptr, n);
    }
+
+    // convert f32 to dst_type
+    std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
+    const float* im = imatrix.data();
+    ggml_quantize_chunk(dst_type, ptr, dst, 0, nrows, n_per_row, im);
 }
 
 /*================================================= ModelLoader ==================================================*/
@@ -1194,22 +1112,15 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
 
 /*================================================= SafeTensorsModelLoader ==================================================*/
 
+// most ggml operations only support f16, bf16, and f32 tensors.
 ggml_type str_to_ggml_type(const std::string& dtype) {
     ggml_type ttype = GGML_TYPE_COUNT;
     if (dtype == "F16") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "BF16") {
+        ttype = GGML_TYPE_BF16;
+    } else if (dtype == "F32" || dtype == "F64" || dtype == "F8_E4M3" || dtype == "F8_E5M2" || dtype == "I64") {
         ttype = GGML_TYPE_F32;
-    } else if (dtype == "F32") {
-        ttype = GGML_TYPE_F32;
-    } else if (dtype == "F64") {
-        ttype = GGML_TYPE_F32;
-    } else if (dtype == "F8_E4M3") {
-        ttype = GGML_TYPE_F16;
-    } else if (dtype == "F8_E5M2") {
-        ttype = GGML_TYPE_F16;
-    } else if (dtype == "I64") {
-        ttype = GGML_TYPE_I32;
     }
     return ttype;
 }
@@ -1327,24 +1238,21 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 
         size_t tensor_data_size = end - begin;
 
-        if (dtype == "BF16") {
-            tensor_storage.is_bf16 = true;
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
-        } else if (dtype == "F8_E4M3") {
-            tensor_storage.is_f8_e4m3 = true;
-            // f8 -> f16
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        if (dtype == "F8_E4M3") {
+            tensor_storage.is_f8_e4m3fn = true;
+            // f8 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 4);
         } else if (dtype == "F8_E5M2") {
             tensor_storage.is_f8_e5m2 = true;
-            // f8 -> f16
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+            // f8 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 4);
         } else if (dtype == "F64") {
             tensor_storage.is_f64 = true;
             // f64 -> f32
             GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
         } else if (dtype == "I64") {
             tensor_storage.is_i64 = true;
-            // i64 -> i32
+            // i64 -> f32
            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
        } else {
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
@@ -2036,6 +1944,33 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
+bool TensorStorage::read_data(void* buf, std::ifstream& file) const {
+    file.seekg(this->offset);
+    file.read(static_cast<char*>(buf), this->nbytes_to_read());
+    return !file.fail();
+}
+
+bool TensorStorage::read_data(void* buf, struct zip_t* zip, std::atomic<int64_t>& memcpy_time_ms) const {
+    size_t n = this->nbytes_to_read();
+    if (zip_entry_openbyindex(zip, this->index_in_zip)) {
+        return false;
+    }
+    bool failed = false;
+    size_t entry_size = zip_entry_size(zip);
+    if (entry_size != n) {
+        int64_t t_memcpy_start;
+        std::vector<char> read_buffer(entry_size);
+        failed = zip_entry_noallocread(zip, read_buffer.data(), entry_size) != entry_size;
+        t_memcpy_start = ggml_time_ms();
+        memcpy(buf, read_buffer.data() + this->offset, n);
+        memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
+    } else {
+        failed = zip_entry_noallocread(zip, buf, n) != n;
+    }
+    zip_entry_close(zip);
+    return !failed;
+}
+
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
     int64_t process_time_ms = 0;
     std::atomic<int64_t> read_time_ms(0);
@@ -2157,6 +2092,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<bool> failed(false);
     std::vector<std::thread> workers;
 
+    std::unordered_map<std::string, int> scale_idx;
+    std::unordered_map<std::string, int> scale_count;
+    for (int i = 0; i < file_tensors.size(); i++) {
+        const TensorStorage* tensor = file_tensors[i];
+        if (ends_with(tensor->name, ".scale_weight")) {
+            std::string new_name = tensor->name.substr(0, tensor->name.size() - strlen(".scale_weight")) + ".weight";
+            GGML_ASSERT(tensor->nelements() == 1 && tensor->type == GGML_TYPE_F32 && tensor->nbytes_to_read() == 4);
+            scale_idx[new_name] = i;
+            scale_count[new_name]++;
+        } else if (ends_with(tensor->name, ".weight") && (tensor->is_f8_e4m3fn || tensor->is_f8_e5m2)) {
+            scale_count[tensor->name]--;
+        }
+    }
+    for (auto& x : scale_count) {
+        if (x.second > 0) {
+            LOG_ERROR("f8 weight not found for scale_weight: '%s'", x.first.c_str());
+            return false;
+        }
+    }
+
     for (int i = 0; i < n_threads; ++i) {
         workers.emplace_back([&, file_path, is_zip]() {
             std::ifstream file;
@@ -2180,17 +2135,73 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             std::vector<uint8_t> read_buffer;
             std::vector<uint8_t> convert_buffer;
 
+            auto read_data = [&](int64_t t0, const TensorStorage& tensor_storage, void* buf) {
+                bool fail = false;
+                if (is_zip) {
+                    fail |= !tensor_storage.read_data(buf, zip, memcpy_time_ms);
+                } else {
+                    fail |= !tensor_storage.read_data(buf, file);
+                }
+                float scale = 1;
+                if (scale_idx.count(tensor_storage.name)) {
+                    const TensorStorage* tensor = file_tensors[scale_idx[tensor_storage.name]];
+                    if (is_zip) {
+                        fail |= !tensor->read_data(&scale, zip, memcpy_time_ms);
+                    } else {
+                        fail |= !tensor->read_data(&scale, file);
+                    }
+                }
+                if (fail) {
+                    failed = true;
+                    LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
+                }
+                int64_t t1 = ggml_time_ms();
+                read_time_ms.fetch_add(t1 - t0);
+                t0 = t1;
+                int count = tensor_storage.nelements();
+                if (tensor_storage.is_f8_e4m3fn) {
+                    for (int64_t i = count - 1; i >= 0; i--) {
+                        static_cast<float*>(buf)[i] = f8_e4m3fn_to_f32(static_cast<uint8_t*>(buf)[i]);
+                        if (scale != 1) {
+                            static_cast<float*>(buf)[i] *= scale;
+                        }
+                    }
+                } else if (tensor_storage.is_f8_e5m2) {
+                    for (int64_t i = count - 1; i >= 0; i--) {
+                        static_cast<float*>(buf)[i] =
+                            ggml_fp16_to_fp32(f8_e5m2_to_f16(static_cast<uint8_t*>(buf)[i]));
+                        if (scale != 1) {
+                            static_cast<float*>(buf)[i] *= scale;
+                        }
+                    }
+                } else if (tensor_storage.is_f64) {
+                    for (int64_t i = 0; i < count; i++) {
+                        static_cast<float*>(buf)[i] = static_cast<double*>(buf)[i];
+                    }
+                } else if (tensor_storage.is_i64) {
+                    for (int64_t i = 0; i < count; i++) {
+                        static_cast<float*>(buf)[i] = static_cast<int64_t*>(buf)[i];
+                    }
+                }
+                t1 = ggml_time_ms();
+                convert_time_ms.fetch_add(t1 - t0);
+                return t1;
+            };
+
+            int64_t t0 = ggml_time_ms();
             while (true) {
-                int64_t t0, t1;
+                int64_t t1;
                 size_t idx = tensor_idx.fetch_add(1);
                 if (idx >= file_tensors.size() || failed) {
                     break;
                 }
 
                 const TensorStorage& tensor_storage = *file_tensors[idx];
-                ggml_tensor* dst_tensor = nullptr;
+                if (ends_with(tensor_storage.name, ".scale_weight")) {
+                    continue;
+                }
 
-                t0 = ggml_time_ms();
+                ggml_tensor* dst_tensor = nullptr;
 
                 if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
                     LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
@@ -2201,136 +2212,41 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                 if (dst_tensor == nullptr) {
                     t1 = ggml_time_ms();
                     read_time_ms.fetch_add(t1 - t0);
+                    t0 = t1;
                     continue;
                 }
 
                 size_t nbytes_to_read = tensor_storage.nbytes_to_read();
 
-                auto read_data = [&](char* buf, size_t n) {
-                    if (zip != nullptr) {
-                        zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
-                        size_t entry_size = zip_entry_size(zip);
-                        if (entry_size != n) {
-                            int64_t t_memcpy_start;
-                            read_buffer.resize(entry_size);
-                            zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
-                            t_memcpy_start = ggml_time_ms();
-                            memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
-                            memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
-                        } else {
-                            zip_entry_noallocread(zip, (void*)buf, n);
-                        }
-                        zip_entry_close(zip);
-                    } else {
-                        file.seekg(tensor_storage.offset);
-                        file.read(buf, n);
-                        if (!file) {
-                            LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
-                            failed = true;
-                        }
-                    }
-                };
-
                 if (dst_tensor->buffer == nullptr || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
                     if (tensor_storage.type == dst_tensor->type) {
                         GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                        if (tensor_storage.is_f64 || tensor_storage.is_i64) {
-                            read_buffer.resize(tensor_storage.nbytes_to_read());
-                            read_data((char*)read_buffer.data(), nbytes_to_read);
-                        } else {
-                            read_data((char*)dst_tensor->data, nbytes_to_read);
-                        }
-                        t1 = ggml_time_ms();
-                        read_time_ms.fetch_add(t1 - t0);
-
-                        t0 = ggml_time_ms();
-                        if (tensor_storage.is_bf16) {
-                            // inplace op
-                            bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e4m3) {
-                            // inplace op
-                            f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e5m2) {
-                            // inplace op
-                            f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_f64) {
-                            f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
-                        } else if (tensor_storage.is_i64) {
-                            i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
-                        }
-                        t1 = ggml_time_ms();
-                        convert_time_ms.fetch_add(t1 - t0);
+                        t0 = read_data(t0, tensor_storage, dst_tensor->data);
                     } else {
                         read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                        read_data((char*)read_buffer.data(), nbytes_to_read);
-                        t1 = ggml_time_ms();
-                        read_time_ms.fetch_add(t1 - t0);
-
-                        t0 = ggml_time_ms();
-                        if (tensor_storage.is_bf16) {
-                            // inplace op
-                            bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e4m3) {
-                            // inplace op
-                            f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f8_e5m2) {
-                            // inplace op
-                            f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_f64) {
-                            // inplace op
-                            f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                        } else if (tensor_storage.is_i64) {
-                            // inplace op
-                            i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                        }
-                        convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                        t0 = read_data(t0, tensor_storage, read_buffer.data());
+                        convert_tensor(read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                         t1 = ggml_time_ms();
                         convert_time_ms.fetch_add(t1 - t0);
+                        t0 = t1;
                     }
                 } else {
                     read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                    read_data((char*)read_buffer.data(), nbytes_to_read);
-                    t1 = ggml_time_ms();
-                    read_time_ms.fetch_add(t1 - t0);
-
-                    t0 = ggml_time_ms();
-                    if (tensor_storage.is_bf16) {
-                        // inplace op
-                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e4m3) {
-                        // inplace op
-                        f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e5m2) {
-                        // inplace op
-                        f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f64) {
-                        // inplace op
-                        f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_i64) {
-                        // inplace op
-                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                    }
-
-                    if (tensor_storage.type == dst_tensor->type) {
-                        // copy to device memory
-                        t1 = ggml_time_ms();
-                        convert_time_ms.fetch_add(t1 - t0);
-                        t0 = ggml_time_ms();
-                        ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                        t1 = ggml_time_ms();
-                        copy_to_backend_time_ms.fetch_add(t1 - t0);
-                    } else {
-                        // convert first, then copy to device memory
-
+                    uint8_t* ptr = read_buffer.data();
+                    t0 = read_data(t0, tensor_storage, ptr);
+                    if (tensor_storage.type != dst_tensor->type) {
                         convert_buffer.resize(ggml_nbytes(dst_tensor));
-                        convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                        ptr = convert_buffer.data();
+                        convert_tensor(read_buffer.data(), tensor_storage.type, ptr, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                         t1 = ggml_time_ms();
                         convert_time_ms.fetch_add(t1 - t0);
-                        t0 = ggml_time_ms();
-                        ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                        t1 = ggml_time_ms();
-                        copy_to_backend_time_ms.fetch_add(t1 - t0);
+                        t0 = t1;
                     }
+                    // copy to device memory
+                    ggml_backend_tensor_set(dst_tensor, ptr, 0, ggml_nbytes(dst_tensor));
+                    t1 = ggml_time_ms();
+                    copy_to_backend_time_ms.fetch_add(t1 - t0);
+                    t0 = t1;
                 }
             }
             if (zip != nullptr) {
diff --git a/model.h b/model.h
index f1711e67..ccde293e 100644
--- a/model.h
+++ b/model.h
@@ -134,8 +134,7 @@ enum PMVersion {
 struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
-    bool is_bf16 = false;
-    bool is_f8_e4m3 = false;
+    bool is_f8_e4m3fn = false;
     bool is_f8_e5m2 = false;
     bool is_f64 = false;
     bool is_i64 = false;
@@ -168,8 +167,8 @@ struct TensorStorage {
     }
 
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
-            return nbytes() / 2;
+        if (is_f8_e4m3fn || is_f8_e5m2) {
+            return nbytes() / 4;
         } else if (is_f64 || is_i64) {
            return nbytes() * 2;
        } else {
@@ -215,17 +214,17 @@ struct TensorStorage {
 
     std::string to_string() const {
         std::stringstream ss;
-        const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
-            type_name = "f8_e4m3";
+        const char* type_name;
+        if (is_f8_e4m3fn) {
+            type_name = "f8_e4m3fn";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
         } else if (is_f64) {
             type_name = "f64";
         } else if (is_i64) {
             type_name = "i64";
+        } else {
+            type_name = ggml_type_name(type);
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";
@@ -238,6 +237,9 @@ struct TensorStorage {
         ss << "]";
         return ss.str();
     }
+
+    bool read_data(void* buf, std::ifstream& file) const;
+    bool read_data(void* buf, struct zip_t* zip, std::atomic<int64_t>& memcpy_time_ms) const;
 };
 
 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
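Note (not part of the patch): the replacement f8_e4m3fn_to_f32 decoder in model.cpp treats the byte as 1 sign bit, 4 exponent bits with bias 7, and 3 mantissa bits, with 0x7f/0xff reserved for NaN and no infinities (the "FN" variant), and maps subnormals to 2^-6 * m/8. The standalone sketch below is only an illustration of that layout for sanity-checking; the helper name f8_e4m3fn_to_f32_ref and the sample bytes are illustrative and duplicate the decoding rule rather than linking against model.cpp.

// Standalone sanity check of the E4M3FN decoding rule (illustrative only).
#include <cmath>
#include <cstdint>
#include <cstdio>

static float f8_e4m3fn_to_f32_ref(uint8_t f8) {
    // Reference decode: sign * 2^(e-7) * (1 + m/8) for normals,
    // sign * 2^-6 * (m/8) for subnormals; 0x7f/0xff are NaN, no infinities.
    uint32_t sign = f8 >> 7, exp = (f8 >> 3) & 15, man = f8 & 7;
    if (exp == 15 && man == 7) {
        return sign ? -NAN : NAN;
    }
    float mag = (exp == 0) ? std::ldexp((float)man / 8.0f, -6)
                           : std::ldexp(1.0f + (float)man / 8.0f, (int)exp - 7);
    return sign ? -mag : mag;
}

int main() {
    // Representative encodings: +0, smallest subnormal (2^-9), 1.0, largest finite (448).
    const uint8_t samples[] = {0x00, 0x01, 0x38, 0x7e};
    for (uint8_t s : samples) {
        printf("0x%02x -> %g\n", s, f8_e4m3fn_to_f32_ref(s));
    }
    return 0;
}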