This one simple trick allows D programmers to use llama.cpp, Rust programmers hate him!
evilrat
March 21

Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.

Tested with dmd v2.107 on Windows.

Code

llamad.c:

#include "llama.h"
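
That single include is all ImportC needs: when the .c file is passed to dmd, it is compiled by the built-in C compiler and becomes a regular D module named after the file. A quick compile-time check of what came through from llama.h (this is just the inspection pragma that is commented out in the full example below):

import llamad; // the dummy llamad.c above, compiled via ImportC

// dump every symbol pulled in from llama.h at compile time
pragma(msg, __traits(allMembers, llamad));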

A ported D version of the simple example from llama.cpp:

llama-d.d:

module llama_d;

import std.string;
import std.stdio;

import llamad; // imports llamad.c

// pragma(msg, __traits(allMembers, llamad));

void main(string[] args)
{
    if (args.length < 3) {
        writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
        return;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    auto mparams = llama_model_default_params();
    // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

    auto ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 2048;

    import std.parallelism;
    ctx_params.n_threads = totalCPUs-1;
    ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

    llama_model*  model = llama_load_model_from_file(toStringz(args[1]), mparams);
    llama_context*	ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    const bool allow_special = false;

    string prompt = args[2];

    if (!prompt.length)
        return;

    // tokenize the prompt
    llama_token[] embd_inp;
    embd_inp.length = prompt.length + (add_bos ? 1 : 0);

    writeln("tokenizing...");

    auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length, embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);

    if (n_of_tok <= 0) {
        writeln("no tokens generated, something went wrong");
        return;
    }

    // keep only the tokens actually produced
    embd_inp.length = n_of_tok;

    writeln("input has ", n_of_tok, " tokens");

    foreach (id; embd_inp) {
        write(llama_token_to_piece(ctx, id));
    }
    writeln();

    // total length of the sequence including the prompt
    const int n_len = 128;

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));

    if (n_kv_req > n_ctx) {
        writeln("error: prompt is too long");
        return;
    }

    writeln("building batch");

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < embd_inp.length; i++) {
        // note that seq_ids = [0] is required: every token must belong to at least one sequence
        llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
    }

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    writeln("decoding batch");

    if (llama_decode(ctx, batch) != 0) {
        writeln("llama_decode() failed");
        return;
    }

    // main loop

    int n_cur    = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len) {
        // sample the next token
        {
            auto   n_vocab = llama_n_vocab(model);
            auto   logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            llama_token_data[] candidates;
            candidates.reserve(n_vocab); // reserve (not resize!) so the appends below don't add duplicate entries

            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
            }

            llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                writeln();

                break;
            }

            writef("%s", llama_token_to_piece(ctx, new_token_id));
            stdout.flush(); // show streamed tokens immediately instead of waiting for a newline

            // prepare the next batch
            llama_batch_clear(batch);

            // push this new token for next evaluation
            llama_batch_add(batch, new_token_id, n_cur, [0], true);

            n_decode += 1;
        }

        n_cur += 1;

        // evaluate the current batch with the transformer model
        const ret = llama_decode(ctx, batch);
        if (ret != 0) {
            writefln("%s : failed to eval, return code %d", __FUNCTION__, ret);
            return;
        }
    }

    const auto t_main_end = ggml_time_us();
    writefln("decoded %d tokens in %.2f s", n_decode, (t_main_end - t_main_start) / 1_000_000.0);

    llama_print_timings(ctx);
    writeln();

    // cleanup
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
}


// D port of the llama_batch_add helper from llama.cpp's common.h (it is not part of the C API):
// appends one token with its position and sequence ids to the batch.
void llama_batch_add(
                    ref llama_batch batch,
                    llama_token id,
                    llama_pos pos,
                    const llama_seq_id[] seq_ids,
                    bool logits) {
    batch.token   [batch.n_tokens] = id;
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
    for (size_t i = 0; i < seq_ids.length; ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits  [batch.n_tokens] = logits;

    batch.n_tokens++;
}

// D port of the llama_token_to_piece helper from common.h: converts a token id into its text piece,
// retrying with a larger buffer when the first call reports the required size as a negative value.
string llama_token_to_piece(llama_context* ctx, llama_token token) {
    char[] result;
    result.length = 8;
    const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
    if (n_tokens < 0) {
        result.length = -n_tokens;
        int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
        assert(check == -n_tokens);
    } else {
        result.length = n_tokens;
    }

    return cast(string) result;
}

// Resets the batch so it can be refilled for the next decode step.
void llama_batch_clear(ref llama_batch batch) {
    batch.n_tokens = 0;
}

Build

Build inside the llama.cpp folder with this command (I've been building with CUDA, but it is possible to build without it):

dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
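
For a CPU-only build on Linux, something along these lines should work (untested as written here; adjust the archive names and paths to whatever your cmake build actually produced):

dmd llama-d.d llamad.c -m64 build/libllama.a build/libggml_static.a -L-lstdc++ -L-lpthread -L-lm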

Run

And run

llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"
March 21

On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:

> Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.

Wow! :)

March 21

On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:

> Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.
>
> Tested with dmd v2.107 on Windows.

Tested on Linux, it works too!

<dummy00022>, why is sky blue?

The sky is blue because of the way that the atmosphere scatters sunlight. Blue light has more energy than the other colors in the visible spectrum, so it's more likely to penetrate the atmosphere and reach our eyes. This causes the blue light to be scattered in all directions, making the sky appear blue.

Additionally, the color of the sky can change depending on the weather and the time of day. For example, the sky may appear more orange or red during sunrise or sunset when the sun is low

Andrea

March 21

On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:

> Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.

Nice catch. Thanks for sharing!
Another option is using a pure D implementation based on the llama2.c code from Karpathy:

https://github.com/cyrusmsk/llama2.d

March 22

On Thursday, 21 March 2024 at 22:42:31 UTC, Serg Gini wrote:

> On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:
>
> > Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.
>
> Nice catch. Thanks for sharing!
> Another option is using a pure D implementation based on the llama2.c code from Karpathy:
>
> https://github.com/cyrusmsk/llama2.d

This is really awesome!

Only issue I have with it is the title. It should have ended with either:

.. and that's a good thing!

or

.. let's get into it.

You are now a blogger, a journalist, a "youtuber" lol

March 26

On Thursday, 21 March 2024 at 16:45:09 UTC, evilrat wrote:

> Just like the title says: no need to write bindings anymore. Just make a dummy C file with a single include and start building your D app powered by llama.cpp. That's it, no extra work needed beyond that one dummy file.
>
> Tested with dmd v2.107 on Windows.

Can confirm with the NAIF SPICE library. Tested with dmd v2.103 on Linux.

Nice!
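
For anyone curious, the same dummy-file trick with CSPICE might look roughly like this. This is only a sketch, not from the post above: the file names, the kernel path, and the particular furnsh_c/str2et_c calls are just illustrative, and you still link against cspice.a the same way the llama libraries are linked above.

spice.c:

#include "SpiceUsr.h"

spice_demo.d:

module spice_demo;

import std.string : toStringz;
import std.stdio;

import spice; // the dummy spice.c, compiled via ImportC

void main()
{
    // load a leapseconds kernel, then convert a UTC string to ephemeris time
    furnsh_c(toStringz("naif0012.tls"));

    double et;
    str2et_c(toStringz("2024 MAR 21 12:00:00"), &et);

    writeln("ephemeris time: ", et);
}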