...
[set_tensor] saved to '/var/cache/llama/rpc/081ba1bf3fcff680'
[set_tensor] buffer: 0x2ce224f0, data: 0xffffd55971dc0000, offset: 0, size: 198180864
[set_tensor] saved to '/var/cache/llama/rpc/00803106cce709fe'
[alloc_buffer] device: 0, size: 2113929216 -> remote_ptr: 2ce20e00, remote_size: 2113929216
[buffer_get_base] remote_ptr: 2ce20e00
[buffer_clear] remote_ptr: 2ce20e00, value: 0
[get_device_memory] device: 0, free_mem: 20051484672, total_mem: 34242297856
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[alloc_buffer] device: 0, size: 478242816 -> remote_ptr: 2ce177f0, remote_size: 478242816
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[buffer_get_base] remote_ptr: 2ce177f0
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[get_alloc_size] device: 0, buffer: (nil), data: (nil)
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fbc00000, offset: 0, size: 98304
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd400000, offset: 0, size: 8
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd400800, offset: 0, size: 65536
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd410800, offset: 0, size: 16384
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd414800, offset: 0, size: 16
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd415800, offset: 0, size: 16
[set_tensor] buffer: 0x2ce177f0, data: 0xffffd559fd416800, offset: 0, size: 2048
[graph_compute] device: 0, n_nodes: 603, n_tensors: 764
No kernel named _ZTSZZL17rms_norm_f32_syclPKfPfiiiilllfPN4sycl3_V15queueEiENKUlRNS3_7handlerEE0_clES7_EUlNS3_7nd_itemILi3EEEE_ was foundException caught at file:/opt/llama/ggml/src/ggml-sycl/ggml-sycl.cpp, line:4269
Error OP RMS_NORM
Name and Version
Using `rpc-server` with container built using `--target rpc-server`, so no `llama-cli`. Container was built cleanly against commit 82764d8f405ff7928c061d8c100b50e9f77939f6.
Operating systems
Linux
GGML backends
RPC server for Intel cards built with SYCL, different container build with ROCm for the AMD GPUs. No apparent errors with the AMD instances.
Hardware
Host 1 has Intel Arc Pro B70 x2, AMD Radeon AI Pro 9700 XT x1
Host 2 has AMD Radeon AI Pro 9700 XT x3
Host 3 has AMD Radeon 7900XTX x1
Models
Was attempted with Unsloth MiniMax 2.7 and Unsloth Devstral 2, both at Q4_K_XL.
Problem description & steps to reproduce
Running in RPC mode with a custom container. Using separate RPC server instances for each card on the host with Intel GPUs due to #21747; the other hosts have one RPC server for all cards.
As soon as `llama-server` finishes loading the models, the first Intel RPC server crashes, which kills `llama-server`.
I am able to run on these cards with Vulkan, but that is suspiciously slow. I am hoping to get SYCL to work for better performance.
It crashes with the following message:
Dockerfile for Intel RPC server
First Bad Commit
Unknown. These Intel cards are new to me, and I have yet to get them working with SYCL.
Relevant log output
Logs
Output from RPC server for first Intel card (also first card in RPC list):
Output from `llama-server`: