
Commit 03473ae

slaren authored and ggerganov committed

llama : add thread safety test (ggml-org#14035)

* llama : add thread safety test
* llamafile : remove global state
* llama : better LLAMA_SPLIT_MODE_NONE logic: when main_gpu < 0, GPU devices are not used

Co-Authored-By: Georgi Gerganov <[email protected]>

1 parent 0dbcabd commit 03473ae

11 files changed: +311, -28 lines

.github/workflows/build.yml (1 addition, 0 deletions)

@@ -778,6 +778,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
             -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
 
       - name: Add libopenblas.dll
         id: add_libopenblas_dll

ci/run.sh (1 addition, 1 deletion)

@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"

common/common.cpp (12 additions, 4 deletions)

@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();
 
@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }
 
     return true;
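
Background on the drive-letter skip above: on some systems, CreateDirectoryW on a bare drive letter such as "C:" reports ERROR_ACCESS_DENIED rather than ERROR_ALREADY_EXISTS, which made the old loop treat an otherwise usable path as a failure. A minimal Windows-only sketch of that symptom (not part of the diff; error handling reduced to a printf):

#ifdef _WIN32
#include <windows.h>
#include <cstdio>

int main() {
    // Creating the drive root itself is the step the patched loop now skips.
    if (!CreateDirectoryW(L"C:", NULL)) {
        const DWORD error = GetLastError();
        // ERROR_ALREADY_EXISTS (183) is the benign case; ERROR_ACCESS_DENIED (5)
        // is the surprising result that used to abort fs_create_directory_with_parents.
        std::printf("CreateDirectoryW(L\"C:\") failed, GetLastError() = %lu\n", (unsigned long) error);
    }
    return 0;
}
#else
int main() { return 0; }
#endif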

ggml/src/ggml-cpu/ggml-cpu-impl.h (3 additions, 0 deletions)

@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif

ggml/src/ggml-cpu/ggml-cpu.c (8 additions, 0 deletions)

@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;

ggml/src/ggml-cpu/llamafile/sgemm.cpp (2 additions, 6 deletions)

@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
-#include <atomic>
 #include <array>
 #include <type_traits>
 
@@ -394,8 +393,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                 GGML_ASSERT(jj == jj2);
             }
 
-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);
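
The three ggml changes above are one refactor: the chunk counter that tinyBLAS used to keep in a function-local static std::atomic now lives in the threadpool (tp->current_chunk), exposed through ggml_threadpool_chunk_set() and ggml_threadpool_chunk_add(). That removes global state from llamafile's sgemm, so two threadpools running GEMMs at the same time no longer share, and corrupt, the same counter. Below is a self-contained sketch of the chunk-claiming pattern using a plain std::atomic owned by a small pool object in place of the internal ggml_threadpool; the names pool and worker are illustrative only.

#include <atomic>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

struct pool {
    std::atomic<int> current_chunk{0};   // counterpart of tp->current_chunk
};

static void worker(pool & p, int ith, int n_chunks) {
    // every thread starts on the chunk matching its own index
    int job = ith;
    while (job < n_chunks) {
        std::printf("thread %d processes chunk %d\n", ith, job);
        // claim the next unprocessed chunk (counterpart of ggml_threadpool_chunk_add)
        job = p.current_chunk.fetch_add(1, std::memory_order_relaxed);
    }
}

int main() {
    const int nth = 4, n_chunks = 16;
    pool p;
    // since every thread starts on its own index, the first unclaimed chunk is nth;
    // in sgemm.cpp thread 0 sets this via ggml_threadpool_chunk_set() and a
    // ggml_barrier() before the loop starts, here we set it before spawning threads
    p.current_chunk.store(nth, std::memory_order_relaxed);

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, std::ref(p), ith, n_chunks);
    }
    for (auto & t : threads) {
        t.join();
    }
    return 0;
}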

src/llama-chat.cpp (7 additions, 9 deletions)

@@ -7,9 +7,7 @@
 #include <algorithm>
 
 #if __cplusplus >= 202000L
-#define LU8(x) (const char*)(u8##x)
 #else
-#define LU8(x) u8##x
 #endif
 
 // trim whitespace from the beginning and end of a string
@@ -158,12 +156,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
         return LLM_CHAT_TEMPLATE_CHATGLM_3;
-    } else if (tmpl_contains(LU8("<用户>"))) {
+    } else if (tmpl_contains(("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
-    } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
+    } else if (tmpl_contains(("<|Assistant|>")) && tmpl_contains(("<|User|>")) && tmpl_contains(("<|end▁of▁sentence|>"))) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
     } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -473,7 +471,7 @@ int32_t llm_chat_apply_template(
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "user") {
-                ss << LU8("<用户>");
+                ss << ("<用户>");
                 ss << trim(message->content);
                 ss << "<AI>";
             } else {
@@ -489,7 +487,7 @@ int32_t llm_chat_apply_template(
             } else if (role == "user") {
                 ss << "User: " << message->content << "\n\n";
             } else if (role == "assistant") {
-                ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
+                ss << "Assistant: " << message->content << ("<|end▁of▁sentence|>");
             }
         }
         if (add_ass) {
@@ -502,13 +500,13 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << message->content << "\n\n";
             } else if (role == "user") {
-                ss << LU8("<|User|>") << message->content;
+                ss << ("<|User|>") << message->content;
             } else if (role == "assistant") {
-                ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
+                ss << ("<|Assistant|>") << message->content << ("<|end▁of▁sentence|>");
             }
         }
         if (add_ass) {
-            ss << LU8("<|Assistant|>");
+            ss << ("<|Assistant|>");
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
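
For context on the removed LU8() macro: it existed because C++20 changes the type of u8"..." literals from const char[] to const char8_t[], which no longer converts implicitly to const char*, so the macro cast it back. The replacement above simply uses ordinary narrow literals and relies on the source files being UTF-8 encoded. A small standalone illustration, not taken from the commit:

#include <string>

int main() {
    // Ordinary narrow literal: its bytes come from the (UTF-8) source encoding.
    std::string plain = "<用户>";

#if __cplusplus >= 202002L
    // In C++20, u8"..." is const char8_t[] and needs a cast to be used as const char*;
    // this is what LU8(x), defined as (const char*)(u8##x), used to provide.
    std::string u8cast = reinterpret_cast<const char *>(u8"<用户>");
#else
    // Before C++20, u8"..." is already const char[].
    std::string u8cast = u8"<用户>";
#endif

    return plain == u8cast ? 0 : 1;
}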

src/llama-context.h (112 additions, 1 deletion)

@@ -1,5 +1,7 @@
 #pragma once
-
+#include <chrono>
+#include <mutex>
+#include <atomic>
 #include "llama.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
@@ -11,6 +13,115 @@
 #include <map>
 #include <vector>
 
+
+namespace test {
+
+// from
+// https://stackoverflow.com/questions/16337610/how-to-know-if-a-type-is-a-specialization-of-stdvector
+template <typename, template <typename...> typename> constexpr bool is_specialization_v = false;
+
+template <template <typename...> typename value_type, typename... arg_types>
+constexpr bool is_specialization_v<value_type<arg_types...>, value_type> = true;
+
+template <typename value_type = std::chrono::nanoseconds> class stop_watch {
+  public:
+    using hr_clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
+        std::chrono::high_resolution_clock, std::chrono::steady_clock>;
+    static constexpr bool lock_free{ std::atomic<value_type>::is_always_lock_free };
+    using time_type = std::conditional_t<lock_free, value_type, uint64_t>;
+
+    stop_watch(uint64_t newTime) noexcept { total_time_units.store(time_type{ newTime }, std::memory_order_release); }
+
+    stop_watch & operator=(stop_watch && other) noexcept {
+        if (this != &other) {
+            total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+            start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+        }
+        return *this;
+    }
+
+    stop_watch(stop_watch && other) noexcept { *this = std::move(other); }
+
+    stop_watch & operator=(const stop_watch & other) noexcept {
+        if (this != &other) {
+            total_time_units.store(other.total_time_units.load(std::memory_order_acquire), std::memory_order_release);
+            start_time_units.store(other.start_time_units.load(std::memory_order_acquire), std::memory_order_release);
+        }
+        return *this;
+    }
+
+    stop_watch(const stop_watch & other) noexcept { *this = other; }
+
+    bool has_time_elapsed() noexcept {
+        return ((get_current_time() - start_time_units.load(std::memory_order_acquire)) >=
+                total_time_units.load(std::memory_order_acquire));
+    }
+
+    void add_time() noexcept {
+        //std::unique_lock lock{ mutex };
+        values.emplace_back(total_time_elapsed());
+        //lock.release();
+        reset();
+    }
+
+    uint64_t get_count() noexcept { return values.size(); }
+
+    uint64_t get_average(time_type newTimeValue = time_type{}) noexcept {
+        std::unique_lock lock{ mutex };
+        uint64_t total_time{};
+        for (auto & value : values) {
+            total_time += get_value_as_uint(value);
+        }
+        return total_time / ((values.size() > 0) ? values.size() : 1);
+    }
+
+    void reset(time_type newTimeValue = time_type{}) noexcept {
+        if (newTimeValue != time_type{}) {
+            total_time_units.store(newTimeValue, std::memory_order_release);
+        }
+        start_time_units.store(get_current_time(), std::memory_order_release);
+    }
+
+    uint64_t get_total_wait_time() const noexcept {
+        return get_value_as_uint(total_time_units.load(std::memory_order_acquire));
+    }
+
+    time_type total_time_elapsed() noexcept {
+        return get_current_time() - start_time_units.load(std::memory_order_acquire);
+    }
+
+    uint64_t total_time_elapsed_uint64() noexcept {
+        return get_value_as_uint(get_current_time()) -
+               get_value_as_uint(start_time_units.load(std::memory_order_acquire));
+    }
+
+  protected:
+    std::atomic<time_type> total_time_units{};
+    std::atomic<time_type> start_time_units{};
+    std::vector<time_type> values{};
+    std::mutex mutex{};
+
+    time_type get_current_time() {
+        if constexpr (lock_free) {
+            return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch());
+        } else {
+            return std::chrono::duration_cast<value_type>(hr_clock::now().time_since_epoch()).count();
+        }
+    }
+
+    uint64_t get_value_as_uint(time_type time) {
+        if constexpr (lock_free) {
+            return time.count();
+        } else {
+            return time;
+        }
+    }
+};
+} // namespace test
+
+inline test::stop_watch stop_watch_val{ 0 };
+
+
 struct llama_model;
 class llama_batch_allocr;
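
A hypothetical usage sketch for the stop_watch helper added above (none of this appears in the diff): time a repeated operation and report the average, assuming src/llama-context.h is on the include path so the stop_watch_val instance is visible.

#include "llama-context.h"   // brings in test::stop_watch and stop_watch_val, as added above

#include <chrono>
#include <cstdio>
#include <thread>

static void do_some_work() {
    std::this_thread::sleep_for(std::chrono::milliseconds(5));   // stand-in for the code being timed
}

int main() {
    for (int i = 0; i < 10; ++i) {
        stop_watch_val.reset();     // restart the stopwatch
        do_some_work();
        stop_watch_val.add_time();  // record total_time_elapsed() and reset for the next sample
    }
    std::printf("%llu samples, average %llu ns\n",
                (unsigned long long) stop_watch_val.get_count(),
                (unsigned long long) stop_watch_val.get_average());
    return 0;
}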

src/llama.cpp (11 additions, 7 deletions)

@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
 
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
-            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_model_free(model);
-            return nullptr;
+        if (params.main_gpu < 0) {
+            model->devices.clear();
+        } else {
+            if (params.main_gpu >= (int)model->devices.size()) {
+                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+                llama_model_free(model);
+                return nullptr;
+            }
+            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+            model->devices.clear();
+            model->devices.push_back(main_gpu);
         }
-        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
-        model->devices.clear();
-        model->devices.push_back(main_gpu);
     }
 
     for (auto * dev : model->devices) {
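
With the new logic, LLAMA_SPLIT_MODE_NONE combined with main_gpu < 0 clears the device list instead of failing, so a build with GPU backends can still load a model without touching any GPU device. A minimal sketch of that call pattern (the model path is hypothetical):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu   = -1;   // < 0: no GPU device is used, layers stay on the CPU

    llama_model * model = llama_model_load_from_file("models/stories15M-q4_0.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    // ... create contexts and run inference as usual ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}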

tests/CMakeLists.txt (2 additions, 0 deletions)

@@ -185,6 +185,8 @@ llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-regex-partial.cpp)
 
+llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)
+
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
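
The registered test-thread-safety.cpp itself is not shown on this page (presumably the remaining changed file). For orientation, here is a minimal sketch of the pattern such a test exercises, with a hypothetical model path: one shared llama_model, several threads that each own a private llama_context and decode concurrently.

#include "llama.h"

#include <thread>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("models/stories15M-q4_0.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    const int n_threads = 4;
    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads; ++i) {
        workers.emplace_back([model]() {
            llama_context_params cparams = llama_context_default_params();
            cparams.n_ctx = 256;
            llama_context * ctx = llama_init_from_model(model, cparams);   // one context per thread
            if (ctx == nullptr) {
                return;
            }

            llama_token tok = llama_vocab_bos(llama_model_get_vocab(model));
            llama_batch batch = llama_batch_get_one(&tok, 1);
            llama_decode(ctx, batch);   // concurrent contexts must not interfere with each other

            llama_free(ctx);
        });
    }
    for (auto & w : workers) {
        w.join();
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}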
