
Commit 03473ae

slaren authored and ggerganov committed

llama : add thread safety test (ggml-org#14035)

* llama : add thread safety test
* llamafile : remove global state
* llama : better LLAMA_SPLIT_MODE_NONE logic: when main_gpu < 0, GPU devices are not used

Co-Authored-By: Georgi Gerganov <[email protected]>

1 parent 0dbcabd commit 03473ae

11 files changed: +311, -28 lines

.github/workflows/build.yml (1 addition, 0 deletions)

@@ -778,6 +778,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
             -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
 
       - name: Add libopenblas.dll
         id: add_libopenblas_dll

ci/run.sh (1 addition, 1 deletion)

@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"

common/common.cpp (12 additions, 4 deletions)

@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();
 
@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }
 
     return true;
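
Background on the drive-letter skip above: on some systems, CreateDirectoryW on a bare drive letter such as "C:" reports ERROR_ACCESS_DENIED rather than ERROR_ALREADY_EXISTS, which made the old loop treat an otherwise usable path as a failure. A minimal Windows-only sketch of that symptom (not part of the diff; error handling reduced to a printf):

#ifdef _WIN32
#include <windows.h>
#include <cstdio>

int main() {
    // Creating the drive root itself is the step the patched loop now skips.
    if (!CreateDirectoryW(L"C:", NULL)) {
        const DWORD error = GetLastError();
        // ERROR_ALREADY_EXISTS (183) is the benign case; ERROR_ACCESS_DENIED (5)
        // is the surprising result that used to abort fs_create_directory_with_parents.
        std::printf("CreateDirectoryW(L\"C:\") failed, GetLastError() = %lu\n", (unsigned long) error);
    }
    return 0;
}
#else
int main() { return 0; }
#endif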

ggml/src/ggml-cpu/ggml-cpu-impl.h (3 additions, 0 deletions)

@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif

ggml/src/ggml-cpu/ggml-cpu.c (8 additions, 0 deletions)

@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;

ggml/src/ggml-cpu/llamafile/sgemm.cpp (2 additions, 6 deletions)

@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
-#include <atomic>
 #include <array>
 #include <type_traits>
 
@@ -394,8 +393,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                 GGML_ASSERT(jj == jj2);
             }
 
-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);
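
The three ggml changes above are one refactor: the chunk counter that tinyBLAS used to keep in a function-local static std::atomic now lives in the threadpool (tp->current_chunk), exposed through ggml_threadpool_chunk_set() and ggml_threadpool_chunk_add(). That removes global state from llamafile's sgemm, so two threadpools running GEMMs at the same time no longer share, and corrupt, the same counter. Below is a self-contained sketch of the chunk-claiming pattern using a plain std::atomic owned by a small pool object in place of the internal ggml_threadpool; the names pool and worker are illustrative only.

#include <atomic>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

struct pool {
    std::atomic<int> current_chunk{0};   // counterpart of tp->current_chunk
};

static void worker(pool & p, int ith, int n_chunks) {
    // every thread starts on the chunk matching its own index
    int job = ith;
    while (job < n_chunks) {
        std::printf("thread %d processes chunk %d\n", ith, job);
        // claim the next unprocessed chunk (counterpart of ggml_threadpool_chunk_add)
        job = p.current_chunk.fetch_add(1, std::memory_order_relaxed);
    }
}

int main() {
    const int nth = 4, n_chunks = 16;
    pool p;
    // since every thread starts on its own index, the first unclaimed chunk is nth;
    // in sgemm.cpp thread 0 sets this via ggml_threadpool_chunk_set() and a
    // ggml_barrier() before the loop starts, here we set it before spawning threads
    p.current_chunk.store(nth, std::memory_order_relaxed);

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, std::ref(p), ith, n_chunks);
    }
    for (auto & t : threads) {
        t.join();
    }
    return 0;
}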

src/llama-chat.cpp (7 additions, 9 deletions)

@@ -7,9 +7,7 @@
 #include <algorithm>
 
 #if __cplusplus >= 202000L
-#define LU8(x) (const char*)(u8##x)
 #else
-#define LU8(x) u8##x
 #endif
 
 // trim whitespace from the beginning and end of a string
@@ -158,12 +156,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
         return LLM_CHAT_TEMPLATE_CHATGLM_3;
-    } else if (tmpl_contains(LU8("<用户>"))) {
+    } else if (tmpl_contains(("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
-    } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
+    } else if (tmpl_contains(("<|Assistant|>")) && tmpl_contains(("<|User|>")) && tmpl_contains(("<|end▁of▁sentence|>"))) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
     } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -473,7 +471,7 @@ int32_t llm_chat_apply_template(
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "user") {
-                ss << LU8("<用户>");
+                ss << ("<用户>");
                 ss << trim(message->content);
                 ss << "<AI>";
             } else {
@@ -489,7 +487,7 @@ int32_t llm_chat_apply_template(
             } else if (role == "user") {
                 ss << "User: " << message->content << "\n\n";
             } else if (role == "assistant") {
-                ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
+                ss << "Assistant: " << message->content << ("<|end▁of▁sentence|>");
             }
         }
         if (add_ass) {
@@ -502,13 +500,13 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << message->content << "\n\n";
             } else if (role == "user") {
-                ss << LU8("<|User|>") << message->content;
+                ss << ("<|User|>") << message->content;
             } else if (role == "assistant") {
-                ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
+                ss << ("<|Assistant|>") << message->content << ("<|end▁of▁sentence|>");
             }
         }
         if (add_ass) {
-            ss << LU8("<|Assistant|>");
+            ss << ("<|Assistant|>");
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
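
For context on the removed LU8() macro: it existed because C++20 changes the type of u8"..." literals from const char[] to const char8_t[], which no longer converts implicitly to const char*, so the macro cast it back. The replacement above simply uses ordinary narrow literals and relies on the source files being UTF-8 encoded. A small standalone illustration, not taken from the commit:

#include <string>

int main() {
    // Ordinary narrow literal: its bytes come from the (UTF-8) source encoding.
    std::string plain = "<用户>";

#if __cplusplus >= 202002L
    // In C++20, u8"..." is const char8_t[] and needs a cast to be used as const char*;
    // this is what LU8(x), defined as (const char*)(u8##x), used to provide.
    std::string u8cast = reinterpret_cast<const char *>(u8"<用户>");
#else
    // Before C++20, u8"..." is already const char[].
    std::string u8cast = u8"<用户>";
#endif

    return plain == u8cast ? 0 : 1;
}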

src/llama-context.h (112 additions, 1 deletion)

@@ -1,5 +1,7 @@
 #pragma once
-
+#include <chrono>
+#include <mutex>
+#include <atomic>
 #include "llama.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
@@ -11,6 +13,115 @@
 #include <map>
 #include <vector>
 
+
+namespace test {
+
+// from
+// https://stackoverflow.com/questions/16337610/how-to-know-if-a-type-is-a-specialization-of-stdvector
+template <typename, template <typename...> typename> constexpr bool is_specialization_v = false;
+
+template <template <typename...> typename value_type, typename... arg_types>
+constexpr bool is_specialization_v<value_type<arg_types...>, value_type> = true;
+
+template <typename value_type = std::chrono::nanoseconds> class stop_watch {
+  public:
+    using hr_clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
+        std::chrono::high_resolution_clock, std::chrono::steady_clock>;
+    static constexpr bool lock_free{ std::atomic<value_type>::is_always_lock_free };
+    using time_type = std::conditional_t<lock_free, value_type, uint64_t>;
+
+    stop_watch(uint64_t newTime) noexcept { total_time_units.store(time_type{ newTime }, std::memory_order_release); }
+
+    stop_watch & operator=(stop_watch && other) noexcept {
+        if (this != &other) {
+            total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+            start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+        }
+        return *this;
+    }
+
+    stop_watch(stop_watch && other) noexcept { *this = std::move(other); }
+
+    stop_watch & operator=(const stop_watch & other) noexcept {
+        if (this != &other) {
+            total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+            start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+        }
+        return *this;
+    }
+
+    stop_watch(const stop_watch & other) noexcept { *this = other; }
+
+    bool has_time_elapsed() noexcept {
+        return ((get_current_time() - start_time_units.load(std::memory_order_acquire)) >=
+                total_time_units.load(std::memory_order_acquire));
+    }
+
+    void add_time() noexcept {
+        //std::unique_lock lock{ mutex };
+        values.emplace_back(total_time_elapsed());
+        //lock.release();
+        reset();
+    }
+
+    uint64_t get_count() noexcept { return values.size(); }
+
+    uint64_t get_average(time_type newTimeValue = time_type{}) noexcept {
+        std::unique_lock lock{ mutex };
+        uint64_t total_time{};
+        for (auto & value : values) {
+            total_time += get_value_as_uint(value);
+        }
+        return total_time / ((values.size() > 0) ? values.size() : 1);
+    }
+
+    void reset(time_type newTimeValue = time_type{}) noexcept {
+        if (newTimeValue != time_type{}) {
+            total_time_units.store(newTimeValue, std::memory_order_release);
+        }
+        start_time_units.store(get_current_time(), std::memory_order_release);
+    }
+
+    uint64_t get_total_wait_time() const noexcept {
+        return get_value_as_uint(total_time_units.load(std::memory_order_acquire));
+    }
+
+    time_type total_time_elapsed() noexcept {
+        return get_current_time() - start_time_units.load(std::memory_order_acquire);
+    }
+
+    uint64_t total_time_elapsed_uint64() noexcept {
+        return get_value_as_uint(get_current_time()) -
+               get_value_as_uint(start_time_units.load(std::memory_order_acquire));
+    }
+
+  protected:
+    std::atomic<time_type> total_time_units{};
+    std::atomic<time_type> start_time_units{};
+    std::vector<time_type> values{};
+    std::mutex mutex{};
+
+    time_type get_current_time() {
+        if constexpr (lock_free) {
+            return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch());
+        } else {
+            return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch()).count();
+        }
+    }
+
+    uint64_t get_value_as_uint(time_type time) {
+        if constexpr (lock_free) {
+            return time.count();
+        } else {
+            return time;
+        }
+    }
+};
+} // namespace test
+
+inline test::stop_watch stop_watch_val{ 0 };
+
+
 struct llama_model;
 class llama_batch_allocr;
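
A hypothetical usage sketch for the stop_watch helper added above (none of this appears in the diff): time a repeated operation and report the average, assuming src/llama-context.h is on the include path so the stop_watch_val instance is visible.

#include "llama-context.h"   // brings in test::stop_watch and stop_watch_val, as added above

#include <chrono>
#include <cstdio>
#include <thread>

static void do_some_work() {
    std::this_thread::sleep_for(std::chrono::milliseconds(5));   // stand-in for the code being timed
}

int main() {
    for (int i = 0; i < 10; ++i) {
        stop_watch_val.reset();     // restart the stopwatch
        do_some_work();
        stop_watch_val.add_time();  // record total_time_elapsed() and reset for the next sample
    }
    std::printf("%llu samples, average %llu ns\n",
                (unsigned long long) stop_watch_val.get_count(),
                (unsigned long long) stop_watch_val.get_average());
    return 0;
}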

src/llama.cpp (11 additions, 7 deletions)

@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
 
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
-            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_model_free(model);
-            return nullptr;
+        if (params.main_gpu < 0) {
+            model->devices.clear();
+        } else {
+            if (params.main_gpu >= (int)model->devices.size()) {
+                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+                llama_model_free(model);
+                return nullptr;
+            }
+            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+            model->devices.clear();
+            model->devices.push_back(main_gpu);
         }
-        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
-        model->devices.clear();
-        model->devices.push_back(main_gpu);
     }
 
     for (auto * dev : model->devices) {
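
With the new logic, LLAMA_SPLIT_MODE_NONE combined with main_gpu < 0 clears the device list instead of failing, so a build with GPU backends can still load a model without touching any GPU device. A minimal sketch of that call pattern (the model path is hypothetical):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu   = -1;   // < 0: no GPU device is used, layers stay on the CPU

    llama_model * model = llama_model_load_from_file("models/stories15M-q4_0.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    // ... create contexts and run inference as usual ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}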

tests/CMakeLists.txt (2 additions, 0 deletions)

@@ -185,6 +185,8 @@ llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-regex-partial.cpp)
 
+llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)
+
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
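
The registered test-thread-safety.cpp itself is not shown on this page (presumably the remaining changed file). For orientation, here is a minimal sketch of the pattern such a test exercises, with a hypothetical model path: one shared llama_model, several threads that each own a private llama_context and decode concurrently.

#include "llama.h"

#include <thread>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("models/stories15M-q4_0.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    const int n_threads = 4;
    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads; ++i) {
        workers.emplace_back([model]() {
            llama_context_params cparams = llama_context_default_params();
            cparams.n_ctx = 256;
            llama_context * ctx = llama_init_from_model(model, cparams);   // one context per thread
            if (ctx == nullptr) {
                return;
            }

            llama_token tok = llama_vocab_bos(llama_model_get_vocab(model));
            llama_batch batch = llama_batch_get_one(&tok, 1);
            llama_decode(ctx, batch);   // concurrent contexts must not interfere with each other

            llama_free(ctx);
        });
    }
    for (auto & w : workers) {
        w.join();
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}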
