
Commit 29ef746

Merge pull request #312 from janhq/update-dev-from-master-2025-11-03-00-37
Sync master with upstream release b6929
2 parents 5770254 + a2054e3, commit 29ef746

35 files changed: 708 additions & 145 deletions

.devops/s390x.Dockerfile

Lines changed: 4 additions & 1 deletion
@@ -24,8 +24,9 @@ RUN --mount=type=cache,target=/root/.ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_BACKEND_DL=OFF \
         -DGGML_NATIVE=OFF \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
         -DGGML_BLAS=ON \
         -DGGML_BLAS_VENDOR=OpenBLAS && \
     cmake --build build --config Release -j $(nproc) && \

@@ -103,6 +104,7 @@ FROM base AS light
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -116,6 +118,7 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080
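The switch to GGML_BACKEND_DL=ON with GGML_CPU_ALL_VARIANTS=ON builds the CPU backend as per-variant shared objects that are discovered and scored at runtime, which is why the light and server stages now also copy the *.so files next to the binaries. A minimal, illustrative C++ sketch of the runtime side, using only the public ggml-backend API (not code from this commit; the shipped llama-cli/llama-server do the equivalent internally):

// Illustrative only: list the devices that become available once the backend
// shared objects shipped in the image have been loaded.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Scan for backend shared objects (e.g. the per-variant CPU backends copied
    // next to the binaries) and register them; for CPU variants the loader is
    // expected to keep the best-scoring one for the host.
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s (%s)\n", i,
                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return 0;
}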

.github/workflows/build-linux-cross.yml

Lines changed: 37 additions & 37 deletions
@@ -4,49 +4,49 @@ on:
   workflow_call:

 jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-riscv64-cpu-cross:
+  #   runs-on: ubuntu-24.04

-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo dpkg --add-architecture riscv64

-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF

-          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true ;# Prevent failure due to missing URLs.

-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu

-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

-          cmake --build build --config Release -j $(nproc)
+  #       cmake --build build --config Release -j $(nproc)

   # ubuntu-24-riscv64-vulkan-cross:
   #   runs-on: ubuntu-24.04

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions
@@ -134,8 +134,8 @@ jobs:
         include:
           - build: 'x64'
             os: ubuntu-22.04
-          - build: 's390x-z15' # z15 because our CI runners are on z15
-            os: ubuntu-22.04-s390x
+          - build: 's390x'
+            os: ubuntu-24.04-s390x
         # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
         # - build: 'arm64'
         #   os: ubuntu-22.04-arm

common/chat.cpp

Lines changed: 17 additions & 2 deletions
@@ -313,7 +313,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
-            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;

@@ -1810,7 +1809,23 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {

 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
+
+    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["thinking"] = msg.at("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+
+    auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);

     // Check if we need to replace the return token with end token during
     // inference and without generation prompt. For more details see:
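Taken together, the two hunks move the gpt-oss "thinking" handling out of the generic message serializer and into the gpt-oss template setup, and narrow it to assistant messages that carry both reasoning_content and tool_calls; the adjusted message list is then passed to apply() as a messages override. A standalone, illustrative sketch of that per-message transformation (assumes nlohmann/json, which chat.cpp already uses; not code from this commit):

// Illustrative only: mirror "reasoning_content" into "thinking" when, and only
// when, the message also carries tool calls, as the new loop does per message.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    json msg = {
        {"role", "assistant"},
        {"reasoning_content", "Need the weather, so call the tool."},
        {"tool_calls", json::array()},  // non-empty in a real tool-calling turn
    };

    const bool has_reasoning  = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
    const bool has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();

    if (has_reasoning && has_tool_calls) {
        msg["thinking"] = msg.at("reasoning_content"); // field consumed by the gpt-oss chat template
    }

    std::cout << msg.dump(2) << std::endl;
    return 0;
}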

convert_hf_to_gguf.py

Lines changed: 107 additions & 0 deletions
@@ -9802,6 +9802,113 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         return [(self.map_tensor_name(name), data_torch)]

+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision, aligner, and generation tensors
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        if "intermediate_size" not in self.hparams_vision:
+            mlp_ratio = self.hparams_vision.get("mlp_ratio")
+            hidden_size = self.hparams_vision.get("hidden_size")
+            if mlp_ratio is not None and hidden_size is not None:
+                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+        suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+        if name.startswith("model.aligner."):
+            local_name = name[len("model.aligner."):]
+        elif name.startswith("aligner."):
+            local_name = name[len("aligner."):]
+        else:
+            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+        if local_name.startswith("fc1."):
+            mm_index = 0
+        elif local_name.startswith("hidden_layers."):
+            parts = local_name.split(".", 2)
+            if len(parts) < 3:
+                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+            mm_index = int(parts[1]) + 1
+        else:
+            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+        return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
+
 ###### CONVERSION LOGIC ######

docs/docker.md

Lines changed: 3 additions & 3 deletions
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 3 deletions
@@ -308,6 +308,10 @@ function(ggml_add_cpu_backend_variant tag_name)
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat VXE2 NNPA)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()

@@ -377,9 +381,8 @@ if (GGML_CPU_ALL_VARIANTS)
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
-            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
-            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+            ggml_add_cpu_backend_variant(z15 Z15 VXE2)
+            ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
         else()
             message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
         endif()

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 10 additions & 3 deletions
@@ -504,11 +504,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            endforeach()
        endif()

-        if (GGML_VXE OR GGML_INTERNAL_VXE)
-            message(STATUS "VX/VXE/VXE2 enabled")
+        if (GGML_VXE OR GGML_INTERNAL_VXE2)
+            message(STATUS "VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
-            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+            list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
         endif()
+
+        if (GGML_INTERNAL_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+        endif()
+
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
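Because the z15 variant is declared with VXE2 only and the z16 variant with VXE2 plus NNPA (see the ggml/src/CMakeLists.txt hunk above), each s390x variant is compiled with a different subset of the new GGML_USE_VXE2 / GGML_USE_NNPA definitions, so shared sources can branch per variant at compile time. A trivial, illustrative sketch of that pattern (not code from this commit):

// Illustrative only: shared source compiled once per variant selects its code
// path from the defines added to ARCH_DEFINITIONS above.
#include <cstdio>

int main() {
#if defined(GGML_USE_NNPA)
    std::puts("z16-class variant: VXE2 and NNPA paths compiled in");
#elif defined(GGML_USE_VXE2)
    std::puts("z15-class variant: VXE2 paths compiled in");
#else
    std::puts("baseline s390x variant");
#endif
    return 0;
}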

(new file; path not shown in this view)

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features {
+    bool has_vxe2 = false;
+    bool has_nnpa = false;
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() {
+    int score = 1;
+    s390x_features sf;
+
+// IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; }
+    score += 1 << 1;
+#endif
+
+// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; }
+    score += 1 << 2;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif // __s390x__
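This score function is what lets the dynamic loader rank the z15 and z16 CPU variants on a given host: a variant returns 0 when the machine lacks a feature it was compiled for, and otherwise 1 plus a power-of-two weight per feature, so the most capable variant the host supports scores highest. A worked, illustrative sketch of the arithmetic, assuming the z15 variant is built with GGML_USE_VXE2 only and the z16 variant with both defines, as in the CMake changes above (not code from this commit):

// Worked example (illustrative only): reproduces the scoring arithmetic of
// ggml_backend_cpu_s390x_score() for the two variants on two hosts.
#include <cstdio>

// Score of a variant compiled with the given features when run on a host
// exposing the given HWCAP bits; mirrors the #ifdef structure above.
static int variant_score(bool built_vxe2, bool built_nnpa, bool host_vxe2, bool host_nnpa) {
    int score = 1;
    if (built_vxe2) { if (!host_vxe2) return 0; score += 1 << 1; }  // +2 for VXE2
    if (built_nnpa) { if (!host_nnpa) return 0; score += 1 << 2; }  // +4 for NNPA
    return score;
}

int main() {
    // z15 host (VXE2, no NNPA): z15 variant scores 3, z16 variant scores 0 -> z15 is loaded.
    std::printf("z15 host: z15 variant = %d, z16 variant = %d\n",
                variant_score(true, false, true, false),
                variant_score(true, true,  true, false));

    // z16/z17 host (VXE2 + NNPA): z15 variant scores 3, z16 variant scores 7 -> z16 is loaded.
    std::printf("z16 host: z15 variant = %d, z16 variant = %d\n",
                variant_score(true, false, true, true),
                variant_score(true, true,  true, true));
    return 0;
}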
