From 4c61c275f6bef1048b61c2932fa39728c546b11c Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Mon, 7 Apr 2025 10:39:49 +0530
Subject: [PATCH 1/7] SYCL: Add fp16 support to some elementwise OP kernels

---
 ggml/src/ggml-sycl/element_wise.cpp | 903 ++++++++++++++++++++--------
 ggml/src/ggml-sycl/element_wise.hpp |  16 +
 ggml/src/ggml-sycl/ggml-sycl.cpp    |  51 +-
 3 files changed, 666 insertions(+), 304 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 0423305bb4016..0d0b97a06715e 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -1,4 +1,5 @@
 #include "common.hpp"
+#include "ggml.h"
 #include "element_wise.hpp"
 
 static void acc_f32(const float * x, const float * y, float * dst, const int ne,
@@ -20,10 +21,11 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }
 
-static void gelu_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void gelu(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const T GELU_COEF_A    = to_T<T>(0.044715f);
+    const T SQRT_2_OVER_PI = to_T<T>(0.79788456080286535587989211986876f);
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
 
@@ -32,12 +34,13 @@ static void gelu_f32(const float * x, float * dst, const int k,
     }
 
     float xi = x[i];
-    dst[i] = 0.5f * xi *
-             (1.0f +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+    dst[i] = to_T<T>(0.5f) * xi *
+             (to_T<T>(1.0f) +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (to_T<T>(1.0f) + GELU_COEF_A * xi * xi)));
 }
 
-static void silu_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void silu(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -45,10 +48,11 @@ static void silu_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+    dst[i] = x[i] / (to_T<T>(1.0f) + sycl::native::exp(-x[i]));
 }
 
-static void gelu_quick_f32(const float *x, float *dst, int k,
+template<typename T>
+static void gelu_quick(const T *x, T *dst, int k,
                            const sycl::nd_item<3> &item_ct1) {
     const float GELU_QUICK_COEF = -1.702f;
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -56,20 +60,22 @@ static void gelu_quick_f32(const float *x, float *dst, int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+    dst[i] = x[i] * (to_T<T>(1.0f) / (to_T<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
 }
 
-static void tanh_f32(const float *x, float *dst, int k,
+template<typename T>
+static void tanh(const T *x, T *dst, int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::tanh((float)(x[i]));
+    dst[i] = sycl::tanh((T)(x[i]));
 }
 
-static void relu_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void relu(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -77,10 +83,11 @@ static void relu_f32(const float * x, float * dst, const int k,
     if (i >= k) {
        return;
     }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+    dst[i] = sycl::fmax((T)(x[i]), (T)0);
 }
 
-static void sigmoid_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void sigmoid(const T * x, T * dst, const int k,
                         const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -88,10 +95,11 @@ static void sigmoid_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
+    dst[i] = 1.0f / (to_T<T>(1.0f) + sycl::native::exp(-x[i]));
 }
 
-static void sqrt_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void sqrt(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -102,7 +110,8 @@ static void sqrt_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::sqrt(x[i]);
 }
 
-static void sin_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void sin(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -113,7 +122,8 @@ static void sin_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::sin(x[i]);
 }
 
-static void cos_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void cos(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -124,7 +134,8 @@ static void cos_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::cos(x[i]);
 }
 
-static void hardsigmoid_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void hardsigmoid(const T * x, T * dst, const int k,
                             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -132,10 +143,11 @@ static void hardsigmoid_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+    dst[i] = sycl::fmin(to_T<T>(1.0f), sycl::fmax(to_T<T>(0.0f), (x[i] + to_T<T>(3.0f)) / to_T<T>(6.0f)));
 }
 
-static void hardswish_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void hardswish(const T * x, T * dst, const int k,
                           const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -143,10 +155,11 @@ static void hardswish_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+    dst[i] = x[i] * sycl::fmin(to_T<T>(1.0f), sycl::fmax(to_T<T>(0.0f), (x[i] + to_T<T>(3.0f)) / to_T<T>(6.0f)));
 }
 
-static void exp_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void exp(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -157,7 +170,8 @@ static void exp_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::exp(x[i]);
 }
 
-static void log_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void log(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -165,15 +179,16 @@ static void log_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    float xi = x[i];
+    T xi = x[i];
     if (xi <= 0) {
-        dst[i] = -INFINITY;
+        dst[i] = neg_infinity<T>();
     } else {
         dst[i] = sycl::log(xi);
     }
 }
 
-static void neg_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void neg(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -184,7 +199,8 @@ static void neg_f32(const float * x, float * dst, const int k,
     dst[i] = -x[i];
 }
 
-static void step_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void step(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -192,21 +208,23 @@ static void step_f32(const float * x, float * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] > 0.0f;
+    dst[i] = x[i] > (T)0.0f;
 }
 
-static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+template<typename T>
+static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
                            const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
-             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+    dst[i] = sycl::fmax((T)(x[i]), (T)0) +
+             sycl::fmin((T)(x[i]), (T)0.0f) * negative_slope;
 }
 
-static void sqr_f32(const float * x, float * dst, const int k,
+template<typename T>
+static void sqr(const T * x, T * dst, const int k,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -217,7 +235,8 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+template<typename T>
+static void upscale(const T *x, T *dst, const int nb00, const int nb01,
                         const int nb02, const int nb03, const int ne10, const int ne11,
                         const int ne12, const int ne13, const float sf0, const float sf1,
                         const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
@@ -237,10 +256,11 @@ static void upscale_f32(const float *x, float *dst, const int nb00, const int n
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
-static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+template<typename T>
+static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
                     const sycl::nd_item<3> &item_ct1) {
     int nidx = item_ct1.get_local_id(2) +
                item_ct1.get_group(2) * item_ct1.get_local_range(2);
@@ -256,11 +276,23 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
                             item_ct1.get_group(0) * ne00 * ne01;
         dst[offset_dst] = x[offset_src];
     } else {
-        dst[offset_dst] = 0.0f;
+        dst[offset_dst] = to_T<T>(0.0f);
     }
 }
 
+template<typename T>
+static void clamp(const T * x, T * dst, const float min, const float max, const int k,
+                  const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < (T)min ? (T)min : (x[i] > (T)max ? (T)max : x[i]);
+}
 
 static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int n_elements, const int ne10, const int ne11,
@@ -277,7 +309,8 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
         });
 }
 
-static void gelu_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void gelu_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -285,11 +318,12 @@ static void gelu_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            gelu_f32(x, dst, k, item_ct1);
+            gelu(x, dst, k, item_ct1);
         });
 }
 
-static void silu_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void silu_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
     stream->parallel_for(
@@ -297,11 +331,12 @@ static void silu_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            silu_f32(x, dst, k, item_ct1);
+            silu(x, dst, k, item_ct1);
         });
 }
 
-static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void gelu_quick_sycl(const T *x, T *dst, const int k,
                                 queue_ptr stream) {
     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -309,11 +344,12 @@ static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            gelu_quick_f32(x, dst, k, item_ct1);
+            gelu_quick(x, dst, k, item_ct1);
         });
 }
 
-static void tanh_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void tanh_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
     stream->parallel_for(
@@ -321,11 +357,12 @@ static void tanh_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            tanh_f32(x, dst, k, item_ct1);
+            tanh(x, dst, k, item_ct1);
         });
 }
 
-static void relu_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void relu_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -333,11 +370,12 @@ static void relu_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            relu_f32(x, dst, k, item_ct1);
+            relu(x, dst, k, item_ct1);
         });
 }
 
-static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void hardsigmoid_sycl(const T *x, T *dst, const int k,
                                  queue_ptr stream) {
     const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
     stream->parallel_for(
@@ -345,11 +383,12 @@ static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            hardsigmoid_f32(x, dst, k, item_ct1);
+            hardsigmoid(x, dst, k, item_ct1);
         });
 }
 
-static void hardswish_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void hardswish_sycl(const T *x, T *dst, const int k,
                                queue_ptr stream) {
     const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
     stream->parallel_for(
@@ -357,11 +396,12 @@ static void hardswish_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            hardswish_f32(x, dst, k, item_ct1);
+            hardswish(x, dst, k, item_ct1);
         });
 }
 
-static void exp_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void exp_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
     stream->parallel_for(
@@ -369,11 +409,12 @@ static void exp_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            exp_f32(x, dst, k, item_ct1);
+            exp(x, dst, k, item_ct1);
         });
 }
 
-static void log_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void log_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
     stream->parallel_for(
@@ -381,11 +422,12 @@ static void log_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            log_f32(x, dst, k, item_ct1);
+            log(x, dst, k, item_ct1);
         });
 }
 
-static void neg_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void neg_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
     stream->parallel_for(
@@ -393,11 +435,12 @@ static void neg_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            neg_f32(x, dst, k, item_ct1);
+            neg(x, dst, k, item_ct1);
         });
 }
 
-static void step_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void step_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
     stream->parallel_for(
@@ -405,11 +448,12 @@ static void step_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            step_f32(x, dst, k, item_ct1);
+            step(x, dst, k, item_ct1);
         });
 }
 
-static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void sigmoid_sycl(const T *x, T *dst, const int k,
                              queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
     stream->parallel_for(
@@ -417,11 +461,12 @@ static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            sigmoid_f32(x, dst, k, item_ct1);
+            sigmoid(x, dst, k, item_ct1);
        });
 }
 
-static void sqrt_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void sqrt_sycl(const T *x, T *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
     stream->parallel_for(
@@ -429,11 +474,12 @@ static void sqrt_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            sqrt_f32(x, dst, k, item_ct1);
+            sqrt(x, dst, k, item_ct1);
         });
 }
 
-static void sin_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void sin_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
     stream->parallel_for(
@@ -441,11 +487,12 @@ static void sin_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            sin_f32(x, dst, k, item_ct1);
+            sin(x, dst, k, item_ct1);
         });
 }
 
-static void cos_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void cos_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
     stream->parallel_for(
@@ -453,11 +500,12 @@ static void cos_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            cos_f32(x, dst, k, item_ct1);
+            cos(x, dst, k, item_ct1);
         });
 }
 
-static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void leaky_relu_sycl(const T *x, T *dst, const int k,
                                 const float negative_slope,
                                 queue_ptr stream) {
     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -466,11 +514,12 @@ static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+            leaky_relu(x, dst, k, negative_slope, item_ct1);
         });
 }
 
-static void sqr_f32_sycl(const float *x, float *dst, const int k,
+template<typename T>
+static void sqr_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
     stream->parallel_for(
@@ -478,11 +527,12 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
                               sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            sqr_f32(x, dst, k, item_ct1);
+            sqr(x, dst, k, item_ct1);
         });
 }
 
-static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+template<typename T>
+static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
                              const int nb02, const int nb03, const int ne10, const int ne11,
                              const int ne12, const int ne13, const float sf0, const float sf1,
                              const float sf2, const float sf3, queue_ptr stream) {
@@ -492,11 +542,12 @@ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const i
     stream->parallel_for(
         sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
         [=](sycl::nd_item<1> item_ct1) {
-            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
+            upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
         });
 }
 
-static void pad_f32_sycl(const float *x, float *dst, const int ne00,
+template<typename T>
+static void pad_sycl(const T *x, T *dst, const int ne00,
                          const int ne01, const int ne02, const int ne0,
                          const int ne1, const int ne2, queue_ptr stream) {
     int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
@@ -505,252 +556,586 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
         sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                           sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+            pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
         });
 }
 
-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+template<typename T>
+static void clamp_sycl(const T *x, T *dst, const float min,
+                       const float max, const int k,
+                       queue_ptr stream) {
+    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp(x, dst, min, max, k, item_ct1);
+        });
+}
 
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-
-    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                silu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                silu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                gelu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                gelu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
 inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                gelu_quick_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                gelu_quick_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                tanh_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                tanh_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void
-inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
 
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                hardsigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                hardsigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                hardswish_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                hardswish_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                exp_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                exp_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                log_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                log_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-
-    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                sigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                sigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
 
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-
-    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                sqrt_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                sqrt_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
    dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                sin_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                sin_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                cos_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                cos_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                step_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                step_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-
-inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                neg_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                neg_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    dpct::queue_ptr main_stream = ctx.stream();
-    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
+inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                leaky_relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                leaky_relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                sqr_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                sqr_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
 
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0];
-    const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1];
-    const float sf2 = (float)dst->ne[2]/dst->src[0]->ne[2];
-    const float sf3 = (float)dst->ne[3]/dst->src[0]->ne[3];
-    upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3],
-                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
-                     main_stream);
+    const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
+    const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
+    const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
+    const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                upscale_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
+                             dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                             main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                upscale_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
+                             dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                             main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                pad_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
+                         dst->ne[1], dst->ne[2], main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                pad_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
+                         dst->ne[1], dst->ne[2], main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
 
-    pad_f32_sycl(src0_dd, dst_dd,
-                 dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2],
-                 dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+                clamp_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        case GGML_TYPE_F32:
+            {
+                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+                float * dst_dd = static_cast<float *>(dst->data);
+                clamp_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
 }
 
 inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -773,6 +1158,7 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
     acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
 }
 
+
 inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
@@ -795,126 +1181,133 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
 
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_sqrt(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_sin(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_cos(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_acc(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_gelu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_silu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_gelu_quick(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_tanh(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_relu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_sigmoid(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_hardsigmoid(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_hardswish(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_exp(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_log(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_neg(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_step(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_leaky_relu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_sqr(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_upscale(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
     ggml_sycl_op_pad(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_clamp(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+
 void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index 46443264505cc..b688ee4b5ef45 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -2,6 +2,18 @@
 #define GGML_SYCL_ELEMENTWISE_HPP
 
 #include "common.hpp"
+#include <limits>
+
+template<typename T>
+T neg_infinity() {
+    return -std::numeric_limits<T>::infinity();
+}
+
+template<typename T>
+constexpr T to_T(float value) {
+    return static_cast<T>(value);
+}
+
 
 static __dpct_inline__ float op_repeat(const float a, const float b) {
     return b;
@@ -65,6 +77,10 @@ void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+// ---------
+
 void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index e6f1603d84e07..ba3f9e5c75a5f 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1617,17 +1617,6 @@ static void scale_f32(const float * x, float * dst, const float scale, const int
     dst[i] = scale * x[i];
 }
 
-static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
 
 template <typename Ti, typename To>
 static void pool2d_nchw_kernel(
@@ -1768,18 +1757,6 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale,
         });
 }
 
-static void clamp_f32_sycl(const float *x, float *dst, const float min,
-                           const float max, const int k,
-                           queue_ptr stream) {
-    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            clamp_f32(x, dst, min, max, k, item_ct1);
-        });
-}
 
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, queue_ptr stream) {
@@ -2258,26 +2235,6 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst
     SYCL_CHECK(0);
 }
 
-inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), ctx.stream());
-    /*
-    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-}
-
 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     static bool peer_access_enabled = false;
 
@@ -3218,10 +3175,6 @@ static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
     ggml_sycl_op_scale(ctx, dst);
 }
 
-static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_clamp(ctx, dst);
-}
-
 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_diag_mask_inf(ctx, dst);
 }
@@ -3900,7 +3853,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
-                return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32);
+                return ggml_is_contiguous(op->src[0]); // && (op->src[0]->type == GGML_TYPE_F32);
             default:
                 return false;
         }
@@ -4022,13 +3975,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
+            return (op->src[0]->type == GGML_TYPE_F32);
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_SIN:
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
        case GGML_OP_LOG:
-            return (op->src[0]->type == GGML_TYPE_F32);
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:
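Series note: the shape used throughout PATCH 1/7 is that each former `*_f32` kernel
becomes a template over the element type, and every `ggml_sycl_op_*` entry point
switches on `dst->type` to pick the instantiation. A minimal standalone sketch of that
pattern, assuming USM pointers and a SYCL 2020 compiler (the names `run_neg`,
`elem_type`, and `dispatch_neg` are illustrative only, not part of the patch):

#include <sycl/sycl.hpp>

// Templated kernel launch: one work-item per element, like the kernels above.
template <typename T>
static void run_neg(const T * x, T * dst, int k, sycl::queue & q) {
    q.parallel_for(sycl::range<1>(k), [=](sycl::id<1> i) {
        dst[i] = -x[i];
    }).wait();
}

enum class elem_type { f32, f16 };

// Runtime type tag -> template instantiation, mirroring the switch on dst->type.
static void dispatch_neg(elem_type t, const void * x, void * dst, int k, sycl::queue & q) {
    switch (t) {
        case elem_type::f16:
            run_neg(static_cast<const sycl::half *>(x), static_cast<sycl::half *>(dst), k, q);
            break;
        case elem_type::f32:
            run_neg(static_cast<const float *>(x), static_cast<float *>(dst), k, q);
            break;
    }
}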
-    */
-    SYCL_CHECK(0);
-}
-
 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     static bool peer_access_enabled = false;
 
@@ -3218,10 +3175,6 @@ static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
     ggml_sycl_op_scale(ctx, dst);
 }
 
-static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_clamp(ctx, dst);
-}
-
 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_diag_mask_inf(ctx, dst);
 }
@@ -3900,7 +3853,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
-                return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32);
+                return ggml_is_contiguous(op->src[0]); // && (op->src[0]->type == GGML_TYPE_F32);
             default:
                 return false;
         }
@@ -4022,13 +3975,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUB:
         case GGML_OP_MUL:
        case GGML_OP_DIV:
+            return (op->src[0]->type == GGML_TYPE_F32);
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_SIN:
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_LOG:
-            return (op->src[0]->type == GGML_TYPE_F32);
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:

From 8f89e132a25d4bf032bbc271fca5eafa736463c5 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Mon, 7 Apr 2025 11:15:32 +0530
Subject: [PATCH 2/7] remove comment

ggml-ci
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ba3f9e5c75a5f..c4fe94bdda2f8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3853,7 +3853,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
-                return ggml_is_contiguous(op->src[0]); // && (op->src[0]->type == GGML_TYPE_F32);
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
         }

From 0219e20d2fbc8259e9a560ecec1ad46155347f06 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Tue, 8 Apr 2025 18:48:25 +0530
Subject: [PATCH 3/7] Use static_cast directly
---
 ggml/src/ggml-sycl/element_wise.cpp | 22 +++++++++++-----------
 ggml/src/ggml-sycl/element_wise.hpp |  6 ------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 0d0b97a06715e..4143fecefd0c3 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -24,8 +24,8 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
 template <typename T>
 static void gelu(const T * x, T * dst, const int k,
                  const sycl::nd_item<3> &item_ct1) {
-    const T GELU_COEF_A = to_T<T>(0.044715f);
-    const T SQRT_2_OVER_PI = to_T<T>(0.79788456080286535587989211986876f);
+    const T GELU_COEF_A = static_cast<T>(0.044715f);
+    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -34,9 +34,9 @@ static void gelu(const T * x, T * dst, const int k,
     }
 
     float xi = x[i];
-    dst[i] = to_T<T>(0.5f) * xi *
-             (to_T<T>(1.0f) +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (to_T<T>(1.0f) + GELU_COEF_A * xi * xi)));
+    dst[i] = static_cast<T>(0.5f) * xi *
+             (static_cast<T>(1.0f) +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast<T>(1.0f) + GELU_COEF_A * xi * xi)));
 }
 
 template <typename T>
@@ -48,7 +48,7 @@ static void silu(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] / (to_T<T>(1.0f) + sycl::native::exp(-x[i]));
+    dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
 }
 
 template <typename T>
@@ -60,7 +60,7 @@ static void gelu_quick(const T *x, T *dst, int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] * (to_T<T>(1.0f) / (to_T<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+    dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
 }
 
 template <typename T>
@@ -95,7 +95,7 @@ static void sigmoid(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = 1.0f / (to_T<T>(1.0f) + sycl::native::exp(-x[i]));
+    dst[i] = 1.0f / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
 }
 
 template <typename T>
@@ -143,7 +143,7 @@ static void hardsigmoid(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::fmin(to_T<T>(1.0f), sycl::fmax(to_T<T>(0.0f), (x[i] + to_T<T>(3.0f)) / to_T<T>(6.0f)));
+    dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
 }
 
 template <typename T>
@@ -155,7 +155,7 @@ static void hardswish(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] * sycl::fmin(to_T<T>(1.0f), sycl::fmax(to_T<T>(0.0f), (x[i] + to_T<T>(3.0f)) / to_T<T>(6.0f)));
+    dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
 }
 
 template <typename T>
@@ -276,7 +276,7 @@ static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne
                             item_ct1.get_group(0) * ne00 * ne01;
         dst[offset_dst] = x[offset_src];
     } else {
-        dst[offset_dst] = to_T<T>(0.0f);
+        dst[offset_dst] = static_cast<T>(0.0f);
     }
 }
 
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index b688ee4b5ef45..47ab09408f7c7 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -9,12 +9,6 @@ T neg_infinity() {
     return -std::numeric_limits<T>::infinity();
 }
 
-template <typename T>
-constexpr T to_T(float value) {
-    return static_cast<T>(value);
-}
-
 static __dpct_inline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);

From ecc5eaf994451238cd4322757e51fd59f03a0e4b Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Tue, 8 Apr 2025 18:53:57 +0530
Subject: [PATCH 4/7] remove not needed cast from tanh
---
 ggml/src/ggml-sycl/element_wise.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 4143fecefd0c3..7a3e200c0b2d3 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -71,7 +71,7 @@ static void tanh(const T *x, T *dst, int k,
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::tanh((T)(x[i]));
+    dst[i] = sycl::tanh((x[i]));
 }
 
 template <typename T>

From 129146580026966d3fc33962ff5c938685902fe8 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Tue, 8 Apr 2025 19:15:33 +0530
Subject: [PATCH 5/7] Use static cast and remove unneeded castings
---
 ggml/src/ggml-sycl/element_wise.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 7a3e200c0b2d3..3f93c8c160fb8 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -83,7 +83,7 @@ static void relu(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::fmax((T)(x[i]), (T)0);
+    dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
 }
 
 template <typename T>
@@ -208,7 +208,7 @@ static void step(const T * x, T * dst, const int k,
     if (i >= k) {
         return;
     }
-    dst[i] = x[i] > (T)0.0f;
+    dst[i] = x[i] > static_cast<T>(0.0f);
 }
 
 template <typename T>
@@ -219,8 +219,8 @@ static void leaky_relu(const T *x, T *dst, const int k, const float negative_slo
     if (i >= k) {
         return;
     }
-    dst[i] = sycl::fmax((T)(x[i]), (T)0) +
-             sycl::fmin((T)(x[i]), (T)0.0f) * negative_slope;
+    dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
+             sycl::fmin((x[i]), static_cast<T>(0.0f)) * negative_slope;
 }
 
 template <typename T>
@@ -291,7 +291,7 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
         return;
     }
 
-    dst[i] = x[i] < (T)min ? (T)min : (x[i] > (T)max ? (T)max : x[i]);
+    dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
 }
 
 static void acc_f32_sycl(const float *x, const float *y, float *dst,

From f6a7c5f7ca04c52d0446c3d08ee0604c943eb116 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Wed, 9 Apr 2025 19:38:11 +0530
Subject: [PATCH 6/7] Adjust device_support_op for unary OPs
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index c4fe94bdda2f8..236b039e618e2 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3853,7 +3853,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
-                return ggml_is_contiguous(op->src[0]);
+#if defined (GGML_SYCL_F16)
+                return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
+#else
+                return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
+#endif
             default:
                 return false;
         }
@@ -3982,6 +3986,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_LOG:
+#if defined (GGML_SYCL_F16)
+            return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == op->src[0]->type));
+#else
+            return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
+#endif
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:

From 83980606c9b01e09aea7ad02e216d3c71806a6d5 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Thu, 10 Apr 2025 10:20:39 +0530
Subject: [PATCH 7/7] Use cast_data and typed_data struct to deduplicate
 casting code
---
 ggml/src/ggml-sycl/element_wise.cpp | 350 ++++++++++++++++++----------
 ggml/src/ggml-sycl/element_wise.hpp |  14 ++
 2 files changed, 240 insertions(+), 124 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 3f93c8c160fb8..b36aa7a9d21e4 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -575,24 +575,29 @@ static void clamp_sycl(const T *x, T *dst, const float min,
 }
 
 inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                silu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                silu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -602,24 +607,29 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                gelu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                gelu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -629,24 +639,29 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                gelu_quick_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                gelu_quick_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -656,24 +671,29 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
 }
 
 inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                tanh_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                tanh_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -683,25 +703,30 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -711,26 +736,31 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                hardsigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                hardsigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -740,24 +770,29 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
 }
 
 inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                hardswish_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                hardswish_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -767,24 +802,29 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
 }
 
 inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                exp_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                exp_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -794,24 +834,29 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                log_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                log_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -821,24 +866,29 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                sigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                sigmoid_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -848,25 +898,30 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
 }
 
 inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                sqrt_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
            }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                sqrt_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -876,24 +931,29 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                sin_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                sin_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -903,24 +963,29 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                cos_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                cos_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -930,24 +995,29 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                step_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                step_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -957,24 +1027,29 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
 }
 
 inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                neg_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                neg_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -984,26 +1059,32 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+
     GGML_ASSERT(dst->src[0]->type == dst->type);
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                leaky_relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                leaky_relu_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
+                auto data_pts = cast_data<float>(dst);
+                leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
                 break;
             }
         default:
@@ -1013,24 +1094,29 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
 }
 
 inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                sqr_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                sqr_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
@@ -1040,8 +1126,13 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
 
     dpct::queue_ptr main_stream = ctx.stream();
@@ -1052,20 +1143,20 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
     const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
     const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                upscale_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
+                auto data_pts = cast_data<sycl::half>(dst);
+                upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
                              dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
                              main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                upscale_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
+                auto data_pts = cast_data<float>(dst);
+                upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
                              dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
                              main_stream);
                 break;
@@ -1077,26 +1168,31 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
 }
 
 inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     switch (dst->type) {
+#if defined (GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                pad_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
+                auto data_pts = cast_data<sycl::half>(dst);
+                pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
                          dst->ne[1], dst->ne[2], main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                pad_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
+                auto data_pts = cast_data<float>(dst);
+                pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
                          dst->ne[1], dst->ne[2], main_stream);
                 break;
             }
@@ -1107,29 +1203,35 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined(GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+#else
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
     GGML_ASSERT(dst->src[0]->type == dst->type);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-        float min;
-        float max;
-        memcpy(&min, dst->op_params, sizeof(float));
-        memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
     switch (dst->type) {
+#if defined(GGML_SYCL_F16)
         case GGML_TYPE_F16:
             {
-                const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
-                sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
-                clamp_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<sycl::half>(dst);
+                clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
+#endif
         case GGML_TYPE_F32:
             {
-                const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-                float * dst_dd = static_cast<float *>(dst->data);
-                clamp_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream);
+                auto data_pts = cast_data<float>(dst);
+                clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
                 break;
             }
         default:
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index 47ab09408f7c7..76d91ba108547 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -2,6 +2,7 @@
 #define GGML_SYCL_ELEMENTWISE_HPP
 
 #include "common.hpp"
+#include "ggml.h"
 #include <limits>
 
 template <typename T>
@@ -30,6 +31,19 @@ static __dpct_inline__ float op_div(const float a, const float b) {
     return a / b;
 }
 
+template <typename T>
+struct typed_data {
+    const T * src;
+    T * dst;
+};
+
+template <typename T>
+typed_data<T> cast_data(ggml_tensor * dst) {
+    return {
+        /* .src = */ static_cast<const T *>(dst->src[0]->data),
+        /* .dst = */ static_cast<T *>(dst->data)
+    };
+}
 
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst);