Just like the title says, there is no need to write bindings anymore: just make a dummy C file with a single #include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.
Tested with dmd v2.107 on Windows.
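For context, this works thanks to dmd's ImportC feature: a .c file passed to the compiler is compiled as its own module (named after the file) and can then be imported from D like any other module. Here is a minimal sketch of the pattern with made-up file names, just to show the mechanism:
clib.c:
#include "clib.h" // the "dummy" file; its only job is to pull in the C header
app.d:
import clib; // ImportC compiles clib.c and exposes the header's declarations as module clib
void main()
{
    // call the C functions declared in clib.h directly, no hand-written bindings needed
}
Compile both files together, e.g. dmd app.d clib.c, and dmd handles the rest.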
Code
llamad.c:
#include "llama.h"
A D port of the simple example from llama.cpp.
llama-d.d:
module llama_d;
import std.string;
import std.stdio;
import llamad; // imports llamad.c
// pragma(msg, __traits(allMembers, llamad));
void main(string[] args)
{
    if (args.length < 3) {
        writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
        return;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    auto mparams = llama_model_default_params();
    // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

    auto ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 2048;
    import std.parallelism;
    ctx_params.n_threads = totalCPUs - 1;
    ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

    llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
    llama_context* ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    const bool allow_special = false;

    string prompt = args[2];
    if (!prompt.length)
        return;
    // convert the prompt to tokens
    llama_token[] embd_inp;
    embd_inp.length = prompt.length;
    writeln("tokenizing...");
    auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length, embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);
    if (n_of_tok <= 0) {
        writeln("no tokens generated, something went wrong");
        return;
    }
    embd_inp.length = n_of_tok;

    writeln("input has ", n_of_tok, " tokens");
    foreach (id; embd_inp) {
        write(llama_token_to_piece(ctx, id));
    }
    writeln();
    // total length of the sequence including the prompt
    const int n_len = 128;
    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));
    if (n_kv_req > n_ctx) {
        writeln("error: prompt is too long");
        return;
    }

    writeln("building batch");
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);
    // evaluate the initial prompt
    for (size_t i = 0; i < embd_inp.length; i++) {
        // note that seq_ids = [0] is required, as every token must belong to at least one sequence
        llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
    }
    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    writeln("decoding batch");
    if (llama_decode(ctx, batch) != 0) {
        writeln("llama_decode() failed");
        return;
    }
    // main loop
    int n_cur = batch.n_tokens;
    int n_decode = 0;
    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len) {
        // sample the next token
        {
            auto n_vocab = llama_n_vocab(model);
            auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            llama_token_data[] candidates;
            candidates.reserve(n_vocab);
            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
            }
            llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                writeln();
                break;
            }

            writef("%s", llama_token_to_piece(ctx, new_token_id));

            // prepare the next batch
            llama_batch_clear(batch);
            // push this new token for the next evaluation
            llama_batch_add(batch, new_token_id, n_cur, [0], true);

            n_decode += 1;
        }

        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
            writefln("%s : failed to eval, return code %d", __FUNCTION__, 1);
            return;
        }
    }
    const auto t_main_end = ggml_time_us();
    llama_print_timings(ctx);
    writeln();

    // cleanup
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
}
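// The helpers below are D ports of the llama_batch_add/llama_batch_clear convenience functions
// and the string-returning llama_token_to_piece wrapper from llama.cpp's common code;
// they are not part of the llama.h C API, so they are reimplemented here.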
void llama_batch_add(
    ref llama_batch batch,
    llama_token id,
    llama_pos pos,
    const llama_seq_id[] seq_ids,
    bool logits) {
    batch.token   [batch.n_tokens] = id;
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
    for (size_t i = 0; i < seq_ids.length; ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits  [batch.n_tokens] = logits;

    batch.n_tokens++;
}
string llama_token_to_piece(llama_context* ctx, llama_token token) {
    char[] result;
    result.length = 8;
    const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
    if (n_tokens < 0) {
        result.length = -n_tokens;
        int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
        assert(check == -n_tokens);
    } else {
        result.length = n_tokens;
    }
    return cast(string) result;
}

void llama_batch_clear(ref llama_batch batch) {
    batch.n_tokens = 0;
}
Build
Build inside the llama.cpp folder with this command (I've been building with CUDA, but it also works without it):
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
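For a CPU-only build, the CUDA-specific parts can simply be dropped; something along these lines should work, assuming llama.cpp itself was also built without CUDA (the exact library names depend on how you built it):
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib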
Run
And run it:
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"