Remove AGPLv3 code

2024-09-28 14:54:30 +01:00 · 2023-07-10 12:21:05 +03:00 · 2023-07-10 12:21:05 +03:00 · 43e55bebfe
commit 43e55bebfe
parent 85ed8afec9
14 changed files with 1495 additions and 3330 deletions
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@ -22,9 +22,9 @@ LATIN_IME_JNI_SRC_FILES := \

 LATIN_IME_CORE_SRC_FILES := \
    ggml/ggml.c \
-    ggml/utils.cpp \
-    ggml/model_adapter.cpp \
-    ggml/neox_v3.cpp \
+    ggml/common.cpp \
+    ggml/context.cpp \
+    ggml/gpt_neox.cpp \
    $(addprefix dictionary/header/, \
        header_policy.cpp \
        header_read_write_utils.cpp) \
--- a/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp
+++ b/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp
@ -38,7 +38,9 @@
 #include "utils/profiler.h"
 #include "utils/time_keeper.h"

-#include "ggml/otherarch.h"
+#include "ggml/gpt_neox.h"
+#include "ggml/context.h"
+#include "ggml/common.h"

 #include <android/log.h>

@ -81,13 +83,12 @@ class ProximityInfo;
 struct GGMLDictionaryState {
    int n_threads = 3;

-    std::vector<int> smartcontext;
-    std::vector<gpt_vocab::id> current_context_tokens;
+    transformer_context t_context;
+
    std::vector<float> logits;
    std::vector<gpt_vocab::id> bad_logits;

    size_t mem_per_token = 0;
-    bool use_scratch = true;

    gpt_neox_model model;
    gpt_vocab vocab;
@ -109,12 +110,10 @@ static jlong latinime_GGMLDictionary_open(JNIEnv *env, jclass clazz, jstring sou
    GGMLDictionaryState *state = new GGMLDictionaryState();

    std::string fname(sourceDirChars);
-    FileFormat format = check_file_format(fname);
-    assert(format == 405);

-    ModelLoadResult result = gpt_neox_model_load(fname, state->model, state->vocab, format, 0);
+    bool result = gpt_neox_model_load(fname, state->model, state->vocab);

-    if(result != ModelLoadResult::SUCCESS) {
+    if(!result) {
        AKLOGE("GGMLDict: Could not load model");
        free(state);
        return 0;
@ -171,33 +170,28 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
        env->ReleaseStringUTFChars(partialWord, pwstr);
    }

-    auto embd_inp = gpt_tokenize(state->vocab, contextString);
+    token_sequence next_context = gpt_tokenize(state->vocab, contextString);

    //truncate to front of the prompt if its too long
    int32_t nctx = state->model.hparams.n_ctx;

-    if (embd_inp.size() + 2 > nctx) {
-        int offset = embd_inp.size() - nctx + 2;
-        embd_inp = std::vector<int>(embd_inp.begin() + offset, embd_inp.end());
+    if (next_context.size() + 2 > nctx) {
+        int offset = next_context.size() - nctx + 2;
+        next_context = std::vector<int>(next_context.begin() + offset, next_context.end());
    }

-    size_t size = env->GetArrayLength(outPredictions);

-    int n_past = 0;
+    auto fastforward_info = transformer_context_fastforward(state->t_context, next_context);

-    bool useSmartContext = true;
-    ContextFastForward(state->current_context_tokens, embd_inp, n_past, nctx, state->smartcontext, useSmartContext, false);
+    token_sequence &embd_inp = fastforward_info.first;
+    int n_past = fastforward_info.second;

    if(embd_inp.empty()) return;

-    state->current_context_tokens.resize(n_past);
-
    AKLOGI("npast = %d, size(embd) = %d\n", n_past, (int)embd_inp.size());
-    gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token, state->use_scratch);
+    gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token);

-    for(auto token : embd_inp) {
-        state->current_context_tokens.emplace_back(token);
-    }
+    transformer_context_apply(state->t_context, fastforward_info);

    int topid = std::min_element(state->logits.begin(),state->logits.end())-state->logits.begin();
    float zeroValue = (state->logits[topid] < 0 ? state->logits[topid] : 0);
@ -249,6 +243,8 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
    }


+    size_t size = env->GetArrayLength(outPredictions);
+
    // Get the array elements
    jint *probsArray = env->GetIntArrayElements(outProbabilities, nullptr);

--- a/native/jni/src/ggml/common.cpp
+++ b/native/jni/src/ggml/common.cpp
@ -0,0 +1,143 @@
+#include "common.h"
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <locale>
+#include <codecvt>
+#include <sstream>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// Returns a copy of `s` with leading and trailing whitespace removed.
+std::string trim(const std::string & s) {
+    // A single alternation handles both ends: a whitespace run anchored at
+    // the start of the string, or one anchored at its end.
+    const std::regex edge_whitespace("^\\s+|\\s+$");
+    const std::string nothing;
+    return std::regex_replace(s, edge_whitespace, nothing);
+}
+
+// Returns a copy of `s` in which every non-overlapping occurrence of `from`
+// is replaced by `to`, scanning from left to right.
+//
+// An empty `from` matches at every position, which would keep the scan loop
+// from ever finishing; in that case `s` is returned unchanged.
+std::string replace(const std::string & s, const std::string & from, const std::string & to) {
+    std::string result = s;
+    if (from.empty()) {
+        return result;
+    }
+
+    size_t pos = 0;
+    while ((pos = result.find(from, pos)) != std::string::npos) {
+        result.replace(pos, from.length(), to);
+        // Resume after the inserted text so `to` itself is never rescanned
+        // (otherwise a `to` that contains `from` would be expanded again).
+        pos += to.length();
+    }
+    return result;
+}
+
+// Appends `token` to this vocabulary's `special_tokens` list.
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.emplace_back(token);
+}
+
+// Encodes the wide string `input` as a UTF-8 byte string.
+std::string convert_to_utf8(const std::wstring & input) {
+    using utf8_codec = std::codecvt_utf8<wchar_t>;
+    return std::wstring_convert<utf8_codec>().to_bytes(input);
+}
+
+
+// Decodes the UTF-8 byte string `input` into a wide string.
+std::wstring convert_to_wstring(const std::string & input) {
+    using utf8_codec = std::codecvt_utf8<wchar_t>;
+    return std::wstring_convert<utf8_codec>().from_bytes(input);
+}
+
+// Cuts `str` into pieces according to the pattern below and appends them to
+// `words` in order of appearance. Whatever `words` already holds is kept.
+void gpt_split_words(std::string str, std::vector<std::string>& words) {
+    const std::regex piece_re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");
+
+    // Visit the successive non-overlapping matches from left to right.
+    const std::sregex_iterator no_more;
+    for (std::sregex_iterator hit(str.cbegin(), str.cend(), piece_re); hit != no_more; ++hit) {
+        // The pattern has no capturing groups, so each match carries exactly
+        // one sub-match: the matched text itself.
+        for (const auto & part : *hit) {
+            words.push_back(part);
+        }
+    }
+}
+
+// Converts `text` into a sequence of vocabulary ids.
+//
+// The text is first cut into words: occurrences of the vocabulary's special
+// tokens become words of their own, and everything around them is split by
+// gpt_split_words(). Each word is then covered left to right with the longest
+// entries found in `vocab.token_to_id`. A character for which not even a
+// one-character entry exists is reported on stderr and skipped.
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+
+        // Generate the subpattern from the special_tokens vector if it's not empty
+        if (!vocab.special_tokens.empty()) {
+            // Matches the regex metacharacters listed in the class; each one found in a
+            // special token is prefixed with a backslash below so it is taken literally.
+            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
+            std::string special_tokens_subpattern;
+            for (const auto & token : vocab.special_tokens) {
+                if (!special_tokens_subpattern.empty()) {
+                    special_tokens_subpattern += "|";
+                }
+                // "$&" in the replacement format stands for the matched character.
+                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
+            }
+
+            std::regex re(special_tokens_subpattern);
+            std::smatch m;
+            // Split the text by special tokens.
+            while (std::regex_search(str, m, re)) {
+                // Split the substrings in-between special tokens into words.
+                gpt_split_words(m.prefix(), words);
+                // Add matched special tokens as words.
+                for (auto x : m) {
+                    words.push_back(x);
+                }
+                str = m.suffix();
+            }
+            // Remaining text without special tokens will be handled below.
+        }
+
+        gpt_split_words(str, words);
+    }
+
+    // find the longest token that forms each word in words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        // i is the start of the part of the word not yet covered by a token
+        for (int i = 0; i < (int) word.size(); ){
+            // try candidates word[i..j], longest first, shrinking from the right
+            for (int j = word.size() - 1; j >= i; j--){
+                auto cand = word.substr(i, j-i+1);
+                auto it = vocab.token_to_id.find(cand);
+                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
+                    tokens.push_back(it->second);
+                    i = j + 1;
+                    break;
+                }
+                else if (j == i){ // word.substr(i, 1) has no matching
+                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    // skip the character; the j-- that follows then ends this inner loop
+                    i++;
+                }
+            }
+        }
+    }
+
+    return tokens;
+}
+
+// Returns a similarity score for two strings, computed as
+//     1 - levenshtein_distance(s0, s1) / max(s0.size(), s1.size())
+// Identical strings score 1.0; strings that share nothing score 0.0.
+//
+// Two empty strings are identical and score 1.0. This case is handled up
+// front because the formula above would otherwise divide zero by zero.
+float similarity(const std::string & s0, const std::string & s1) {
+    const size_t longest = std::max(s0.size(), s1.size());
+    if (longest == 0) {
+        return 1.0f;
+    }
+
+    const size_t len0 = s0.size() + 1;
+    const size_t len1 = s1.size() + 1;
+
+    // Two-row Levenshtein: prevCol holds the distances for the previous
+    // prefix of s0, col is filled in for the current one.
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    // Distance from the empty prefix of s0 to each prefix of s1.
+    for (size_t j = 0; j < len1; j++) {
+        prevCol[j] = static_cast<int>(j);
+    }
+
+    for (size_t i = 1; i < len0; i++) {
+        col[0] = static_cast<int>(i);
+        for (size_t j = 1; j < len1; j++) {
+            const int insertion    = 1 + col[j - 1];
+            const int deletion     = 1 + prevCol[j];
+            const int substitution = prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1);
+            col[j] = std::min(std::min(insertion, deletion), substitution);
+        }
+        col.swap(prevCol);
+    }
+
+    // After the final swap the last completed row lives in prevCol.
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / longest);
+}
--- a/native/jni/src/ggml/common.h
+++ b/native/jni/src/ggml/common.h
@ -1,5 +1,3 @@
-// Various helper functions and utilities
-
 #pragma once

 #include <string>
@ -8,15 +6,6 @@
 #include <random>
 #include <thread>

-//
-// CLI argument parsing
-//
-
-
-//
-// Vocab utils
-//
-
 struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;
@ -28,16 +17,7 @@ struct gpt_vocab {
    void add_special_token(const std::string & token);
 };

-void utreplace(std::string & str, const std::string & needle, const std::string & replacement);
-
-// poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
-
-std::string convert_to_utf8(const std::wstring & input);
-
-std::wstring convert_to_wstring(const std::string & input);
-
-void gpt_split_words(std::string str, std::vector<std::string>& words);
+typedef std::vector<gpt_vocab::id> token_sequence;

 // split text into tokens
 //
@ -49,8 +29,5 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
 // Regex (C++):
 // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
 //
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+token_sequence gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

-
-
-bool should_transpose_layer(std::string name);
--- a/native/jni/src/ggml/context.cpp
+++ b/native/jni/src/ggml/context.cpp
@ -0,0 +1,29 @@
+#include "context.h"
+
+
+// Works out how much of `ctx.active_context` can be reused for `next_context`.
+//
+// Returns {tail, npast}: `npast` is the number of leading tokens of
+// `next_context` that are taken over from `ctx.active_context`, and `tail` is
+// `next_context` from index `npast` onwards, i.e. the tokens that still have
+// to be evaluated. `ctx` is not modified; pass the result to
+// transformer_context_apply() once the evaluation has been done.
+//
+// For a non-empty `next_context` the tail always holds at least one token,
+// even when the whole sequence is already present in the context. For an
+// empty `next_context` the result is {{}, 0}.
+std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context) {
+    if (next_context.empty()) {
+        return {token_sequence(), 0};
+    }
+
+    // Length of the common prefix of the two sequences.
+    const size_t comparable = std::min(ctx.active_context.size(), next_context.size());
+    size_t shared = 0;
+    while (shared < comparable && ctx.active_context[shared] == next_context[shared]) {
+        shared++;
+    }
+
+    // Reuse the whole common prefix, but hold back the final token of
+    // next_context so the returned tail is never empty.
+    const size_t npast = std::min(shared, next_context.size() - 1);
+
+    const auto tail_begin = next_context.begin() + static_cast<token_sequence::difference_type>(npast);
+    return {token_sequence(tail_begin, next_context.end()), static_cast<int>(npast)};
+}
+
+
+// Brings `ctx.active_context` in line with a fast-forward result: the context
+// is resized to `fastforward_info.second` entries and the tokens in
+// `fastforward_info.first` are appended after them.
+void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info) {
+    const token_sequence &appended = fastforward_info.first;
+    token_sequence &active = ctx.active_context;
+
+    active.resize(fastforward_info.second);
+    active.insert(active.end(), appended.begin(), appended.end());
+}
--- a/native/jni/src/ggml/context.h
+++ b/native/jni/src/ggml/context.h
@ -0,0 +1,12 @@
+#pragma once
+
+#include <vector>
+
+#include "common.h"
+
+// Token history carried from one evaluation to the next.
+struct transformer_context {
+    // Tokens recorded by the last transformer_context_apply() call;
+    // transformer_context_fastforward() compares the next request against them.
+    token_sequence active_context;
+};
+
+// Compares next_context with ctx.active_context and returns {tail, npast}:
+// npast is an index into next_context and tail is next_context from that
+// index onwards. ctx is not modified.
+std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context);
+// Resizes ctx.active_context to fastforward_info.second entries, then appends
+// the tokens in fastforward_info.first.
+void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info);
--- a/native/jni/src/ggml/ggml.c
+++ b/native/jni/src/ggml/ggml.c
--- a/native/jni/src/ggml/ggml.h
+++ b/native/jni/src/ggml/ggml.h
@ -65,7 +65,7 @@
 //       ggml_set_f32(a, 3.0f);
 //       ggml_set_f32(b, 4.0f);
 //
-//       ggml_graph_compute(ctx0, &gf);
+//       ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 //       printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@ -201,6 +201,8 @@
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4

+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
@ -209,6 +211,30 @@
        } \
    } while (0)

+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// GGML_TENSOR_LOCALS_<N> declares the const locals <prefix>0 .. <prefix>(N-1), each
+// initialized from (pointer)->array[i]; every variant expands the previous one and adds
+// one more local. GGML_TENSOR_LOCALS is the four-element variant. Each local is also
+// passed to GGML_UNUSED so that locals which end up unused do not produce warnings.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@ -224,8 +250,8 @@ extern "C" {
    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);

    struct ggml_object;
    struct ggml_context;
@ -295,12 +321,15 @@ extern "C" {
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
        GGML_OP_ABS,
        GGML_OP_SGN,
        GGML_OP_NEG,
        GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_GELU_QUICK,
@ -332,9 +361,8 @@ extern "C" {
        GGML_OP_ROPE_BACK,
        GGML_OP_ALIBI,
        GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@ -390,9 +418,6 @@ extern "C" {
        struct ggml_tensor * src1;
        struct ggml_tensor * opt[GGML_MAX_OPT];

-        // thread scheduling
-        int n_tasks;
-
        // performance
        int     perf_runs;
        int64_t perf_cycles;
@ -404,19 +429,27 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[4];
+        char padding[8];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        // NOTE(review): presumably the n_threads value given to `ggml_graph_plan()` - confirm
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+    };
+
    // computation graph
    struct ggml_cgraph {
        int n_nodes;
        int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;

        struct ggml_tensor * nodes[GGML_MAX_NODES];
        struct ggml_tensor * grads[GGML_MAX_NODES];
@ -504,8 +537,6 @@ extern "C" {
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);

-    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
-
    // main

    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@ -692,6 +723,11 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // if a is the same shape as b, and a is not parameter, return a
    // otherwise, return a new tensor: repeat(a) to fit in b
    GGML_API struct ggml_tensor * ggml_repeat(
@ -736,6 +772,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    GGML_API struct ggml_tensor * ggml_relu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@ -1086,58 +1138,33 @@ extern "C" {
            float                 min,
            float                 max);

-    // TODO: implement general-purpose convolutions
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3   80  768    1
-    // b:   3000   80    1    1
-    // res: 3000  768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation

-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);

-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is equal to kernel size
-    // padding is zero
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);

    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
@ -1268,15 +1295,22 @@ extern "C" {

    GGML_API void ggml_set_param(
            struct ggml_context * ctx,
-            struct ggml_tensor * tensor);
+            struct ggml_tensor  * tensor);

    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API              void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API              void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

@ -1493,25 +1527,24 @@ extern "C" {
    //

 #ifdef  __cplusplus
-    // restrict not standard in C++
+// restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;

-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);

 #ifdef  __cplusplus
 }
--- a/native/jni/src/ggml/gpt_neox.cpp
+++ b/native/jni/src/ggml/gpt_neox.cpp
@ -1,49 +1,42 @@
-#include "ggml.h"
-#include "otherarch.h"
-
-#include "utils.h"
-#include "defines.h"
+#include "ggml/ggml.h"
+#include "gpt_neox.h"
+#include "common.h"

 #include <cassert>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <cinttypes>
 #include <fstream>
 #include <map>
 #include <string>
 #include <vector>
-#include <iostream>
-#include <algorithm>

-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+
 // load the model's weights from a file
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
-    AKLOGI("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
+    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
-        AKLOGE("%s: failed to open '%s'\n", __func__, fname.c_str());
-        return ModelLoadResult::FAIL;
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+        return false;
    }

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            AKLOGE("%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return ModelLoadResult::FAIL;
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
        }
    }

-    int32_t origmaxctx = model.hparams.n_ctx;
-
    // load hparams
    {
        auto & hparams = model.hparams;
@ -59,17 +52,15 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

-        AKLOGI("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        AKLOGI("%s: n_ctx   = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
-        AKLOGI("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        AKLOGI("%s: n_head  = %d\n", __func__, hparams.n_head);
-        AKLOGI("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        AKLOGI("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        AKLOGI("%s: par_res = %d\n", __func__, hparams.par_res);
-        AKLOGI("%s: ftype   = %d\n", __func__, hparams.ftype);
-        AKLOGI("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx);
+        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        printf("%s: par_res = %d\n", __func__, hparams.par_res);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);

        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
    }
@ -94,14 +85,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
        }
    }

-
    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
    if (wtype == GGML_TYPE_COUNT) {
-        AKLOGE("%s: invalid model file '%s' (bad ftype value %d)\n",
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                __func__, fname.c_str(), model.hparams.ftype);
-        return ModelLoadResult::FAIL;
+        return false;
    }

    auto & ctx = model.ctx;
@ -142,25 +132,26 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
-        ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 16*n_layer)*1024; // object overhead

-        AKLOGI("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
-            AKLOGE("%s: ggml_init() failed\n", __func__);
-            return ModelLoadResult::FAIL;
+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            return false;
        }
    }

@ -241,7 +232,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
        const int n_layer = hparams.n_layer;
        const int n_ctx   = hparams.n_ctx;

-        const int64_t n_mem      = n_layer*std::max(origmaxctx,n_ctx);
+        const int64_t n_mem      = n_layer*n_ctx;
        const int64_t n_elements = n_embd*n_mem;

        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
@ -249,7 +240,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

-        AKLOGI("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
    }

    // load weights
@ -257,7 +248,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
        int n_tensors = 0;
        size_t total_size = 0;

-        AKLOGI("%s: ", __func__);
+        printf("%s: ", __func__);

        while (true) {
            int32_t n_dims;
@ -283,83 +274,52 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
            fin.read(&name[0], length);

            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                AKLOGE("%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return ModelLoadResult::FAIL;
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                return false;
            }

            auto tensor = model.tensors[name.data()];
            if (ggml_nelements(tensor) != nelements) {
-                AKLOGE("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return ModelLoadResult::FAIL;
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
            }

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                AKLOGE("%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return ModelLoadResult::FAIL;
+                return false;
            }

            // for debugging
            if (0) {
-                AKLOGI("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
            }

            const size_t bpe = ggml_type_size(ggml_type(ttype));

            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                AKLOGE("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                 ggml_free(ctx);
-                 return ModelLoadResult::RETRY_LOAD;
+                return false;
            }

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

            total_size += ggml_nbytes(tensor);
            if (++n_tensors % 8 == 0) {
-                AKLOGI(".");
+                printf(".");
                fflush(stdout);
            }
        }

-        AKLOGI(" done\n");
+        printf(" done\n");

-        AKLOGI("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
    }

    fin.close();

-    //gpu offload
-    #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    if(gpulayers>0)
-    {
-        const auto & hparams = model.hparams;
-        size_t vram_total = 0;
-        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-        AKLOGE("%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
-            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
-            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
-            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
-            #if defined(GGML_USE_CLBLAST)
-            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-            #else
-            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-            #endif
-        }
-        AKLOGE("%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
-    #endif
-
-    return ModelLoadResult::SUCCESS;
+    return true;
 }


@ -408,13 +368,12 @@ ggml_tensor * gpt_neox_ff(
 //   - embd_w:    the predicted logits for the next token
 //
 bool gpt_neox_eval(
-        const gpt_neox_model & model,
+        gpt_neox_model & model,
        const int n_threads,
        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token,
-              bool use_scratch) {
+        const token_sequence     & embd_inp,
+              std::vector<float> & embd_w,
+              size_t             & mem_per_token) {
    const int N = embd_inp.size();

    const auto & hparams = model.hparams;
@ -426,43 +385,40 @@ bool gpt_neox_eval(
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_rot;

+    // TODO: All of this allocates over 800 megabytes of memory, way more than the size of the model!
+
    static size_t buf_size = 256u*1024*1024;
    static void * buf = malloc(buf_size);

    // use 2 scratch buffers
    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
-    static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
-
+    static size_t scr0_size = 256u*1024*1024;
    static void * scr0 = malloc(scr0_size);
+
+    static size_t scr1_size = 256u*1024*1024;
    static void * scr1 = malloc(scr1_size);

-    if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
-        const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //AKLOGI("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
-        if (buf_size_new > buf_size)
-        {
-            buf_size = buf_size_new;
-            buf = realloc(buf, buf_size);
-            if (buf == nullptr)
-            {
-                AKLOGE("%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
-                return false;
-            }
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
-    params.no_alloc   = false;
-
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@ -473,9 +429,7 @@ bool gpt_neox_eval(
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * cur;

-        if(use_scratch){
        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-        }

        // self-attention
        {
@ -580,9 +534,7 @@ bool gpt_neox_eval(
            }
        }

-        if(use_scratch){
        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-        }

        if (hparams.par_res == 0) {
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
@ -606,9 +558,7 @@ bool gpt_neox_eval(
        }
    }

-    if(use_scratch){
    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-    }

    // norm
    {
@ -622,9 +572,7 @@ bool gpt_neox_eval(
                ggml_repeat(ctx0, model.ln_f_b, inpL));
    }

-    if(use_scratch){
    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-    }

    // lm_head
    {
@ -640,7 +588,18 @@ bool gpt_neox_eval(

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+
+    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
+
+    if (plan.work_size > 0) {
+        if(model.work_buf.size() < plan.work_size) {
+            model.work_buf.resize(plan.work_size);
+        }
+
+        plan.work_data = model.work_buf.data();
+    }
+
+    ggml_graph_compute(&gf, &plan);

    //if (n_past%100 == 0) {
    //    ggml_graph_print   (&gf);
@ -657,7 +616,7 @@ bool gpt_neox_eval(
    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
-    //AKLOGI("used_mem = %zu\n", ggml_used_mem(ctx0));
+    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));

    ggml_free(ctx0);

--- a/native/jni/src/ggml/gpt_neox.h
+++ b/native/jni/src/ggml/gpt_neox.h
@ -0,0 +1,86 @@
+#pragma once
+
+#include "ggml/ggml.h"
+#include "common.h"
+
+// default hparams (StableLM 3B); presumably overwritten with the values stored in the model file by gpt_neox_model_load - TODO confirm
+struct gpt_neox_hparams {
+    int32_t n_vocab = 50257;
+    int32_t n_ctx   = 4096; // the loader sizes the key/value memory as n_embd * n_layer * n_ctx elements
+    int32_t n_embd  = 4096;
+    int32_t n_head  = 32;
+    int32_t n_layer = 16; // gpt_neox_eval loops over this many layers
+    int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
+    int32_t par_res = 1; // presumably "parallel residual": 1 = true, 0 = false; gpt_neox_eval branches on par_res == 0
+    int32_t ftype   = 1;
+};
+
+struct gpt_neox_layer { // tensors of one layer; gpt_neox_model::layers holds a vector of these
+    // pre normalization (gain ln_1_g, bias ln_1_b)
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    // attention (c_attn_attn_*: presumably the fused attention input projection - TODO confirm; c_attn_proj_*: output projection)
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // post normalization (gain ln_2_g, bias ln_2_b)
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // feed-forward (weights and biases consumed by gpt_neox_ff)
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gpt_neox_model {
+    gpt_neox_hparams hparams;
+
+    // final normalization, applied after the layer loop in gpt_neox_eval
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte; // NOTE(review): was labelled "position embedding", but the name suggests the token embedding table - confirm against its use in gpt_neox_eval
+
+    struct ggml_tensor * lmh_g; // language model head
+    //struct ggml_tensor * lmh_b; // language model bias
+
+    std::vector<gpt_neox_layer> layers;
+
+    // key + value memory; memory_k is created by the loader as an F16 tensor of n_embd * n_layer * n_ctx elements
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    // ggml context created by ggml_init() in gpt_neox_model_load, and the name -> tensor map the loader uses to match tensors read from the model file
+    struct ggml_context * ctx;
+    std::map<std::string, struct ggml_tensor *> tensors;
+
+    std::vector<uint8_t> work_buf; // backing storage for ggml_cplan::work_data; gpt_neox_eval grows it to plan.work_size when needed
+};
+
+
+bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab); // returns true on success; returns false (after printing to stderr) e.g. when ggml_init() fails or a tensor in the file is unknown, mis-sized or mis-shaped
+
+
+// evaluate the transformer; returns false if the eval buffer cannot be reallocated
+//
+//   - model:     the model (non-const because model.work_buf may be resized for the compute plan)
+//   - n_threads: number of threads to use (passed to ggml_graph_plan)
+//   - n_past:    the context size so far
+//   - embd_inp:  the token ids to evaluate (copied into an I32 tensor; not embedding vectors)
+//   - logits:    the predicted logits for the next token
+//   - mem_per_token: in/out; when 0 on entry it is set to ggml_used_mem(ctx0)/N, later calls use it to decide whether the eval buffer must grow
+bool gpt_neox_eval(
+        gpt_neox_model & model,
+        const int n_threads,
+        const int n_past,
+        const token_sequence & embd_inp,
+        std::vector<float>   & logits,
+        size_t               & mem_per_token
+);
--- a/native/jni/src/ggml/model_adapter.cpp
+++ b/native/jni/src/ggml/model_adapter.cpp
@ -1,466 +0,0 @@
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <regex>
-#include <iostream>
-#include <iterator>
-#include <queue>
-#include <string>
-#include <math.h>
-#include <vector>
-
-#include "model_adapter.h"
-
-#include <chrono>
-
-static auto bench_timer = std::chrono::high_resolution_clock().now();
-
-void timer_start()
-{
-    bench_timer = std::chrono::high_resolution_clock().now();
-}
-double timer_check()
-{
-    auto endtime = std::chrono::high_resolution_clock().now();
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endtime - bench_timer);
-    double time_taken = duration.count()/1000.0;
-    return time_taken;
-}
-
-void print_vec(std::vector<std::string> &embd)
-{
-    std::cout << "[";
-    bool first = true;
-    for (auto i : embd)
-    {
-        if (!first)
-        {
-            std::cout << ',';
-        }
-        first = false;
-        std::cout << i;
-    }
-    std::cout << "]\n";
-}
-void print_tok_vec(std::vector<int> &embd)
-{
-    std::cout << "[";
-    bool first = true;
-    for (auto i : embd)
-    {
-        if (!first)
-        {
-            std::cout << ',';
-        }
-        first = false;
-        std::cout << i;
-    }
-    std::cout << "]\n";
-}
-void print_tok_vec(std::vector<float> &embd)
-{
-    std::cout << "[";
-    bool first = true;
-    int n = 0;
-    for (auto i : embd)
-    {
-        if (!first)
-        {
-            std::cout << ',';
-        }
-        first = false;
-        std::cout << i;
-        if(++n>20)
-        {
-            break;
-        }
-    }
-    std::cout << "]\n";
-}
-
-//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
- FileFormat check_file_format(const std::string & fname)
- {
-    std::vector<char> f_buf(1024*1024);
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return FileFormat::BADFORMAT;
-    }
-
-    FileFormat fileformat = FileFormat::BADFORMAT;
-    uint32_t magic;
-    fin.read((char *) &magic, sizeof(magic));
-    if (magic == 0x67676d6c) {  //v1 format ggml, alpaca, old gptj and gpt2 models
-       fileformat = FileFormat::GGML;
-       //we need to read more to determine
-       int32_t vocabsiz = 0;
-       fin.read((char *) &vocabsiz, sizeof(int32_t));
-       if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
-       {
-           fileformat = FileFormat::MPT_1;
-       }
-       else if(vocabsiz==50400) //know GPT-J vocab size
-       {
-           fileformat = FileFormat::GPTJ_1;
-           uint32_t temp;
-           fin.read((char *)&temp, sizeof(temp)); //ctx
-           fin.read((char *)&temp, sizeof(temp)); //n_embd
-           fin.read((char *)&temp, sizeof(temp)); //n_head
-           fin.read((char *)&temp, sizeof(temp)); //n_layer
-           fin.read((char *)&temp, sizeof(temp)); //n_rot
-           fin.read((char *)&temp, sizeof(temp)); //f16
-           const int32_t qntvr = temp / 1000;
-           temp %= 1000;
-           if (qntvr != 0)
-           {
-               if (qntvr == 1)
-               {
-                   fileformat = FileFormat::GPTJ_4;
-               }
-               else
-               {
-                   fileformat = FileFormat::GPTJ_5;
-               }
-           }
-           else if (temp != 0 && temp != 1)
-           {
-               fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
-           }
-       }
-       else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
-       {
-           fileformat = FileFormat::GPT2_1;
-           uint32_t temp;
-           fin.read((char *)&temp, sizeof(temp)); //ctx
-           fin.read((char *)&temp, sizeof(temp)); //n_embd
-           fin.read((char *)&temp, sizeof(temp)); //n_head
-           fin.read((char *)&temp, sizeof(temp)); //n_layer
-           fin.read((char *)&temp, sizeof(temp)); //f16
-           const int32_t qntvr = temp / 1000;
-           temp %= 1000;
-           if (qntvr != 0)
-           {
-               if (qntvr == 1)
-               {
-                   fileformat = FileFormat::GPT2_3;
-               }
-               else
-               {
-                   fileformat = FileFormat::GPT2_4;
-               }
-           }
-           else if (temp != 0 && temp != 1)
-           {
-               fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
-           }
-       }
-       else if(vocabsiz < 31998 || vocabsiz > 33000)
-       {
-           //anything outside the llama v1 range is assumed to be NeoX
-           fileformat = FileFormat::NEOX_6;
-           uint32_t temp,temp2;
-           fin.read((char *)&temp, sizeof(temp)); //ctx
-           fin.read((char *)&temp, sizeof(temp)); //n_embd
-           fin.read((char *)&temp, sizeof(temp)); //n_head
-           fin.read((char *)&temp, sizeof(temp)); //n_layer
-           fin.read((char *)&temp, sizeof(temp)); //n_rot
-           fin.read((char *)&temp, sizeof(temp)); //either par_res or ftype (for older ver)
-
-           if(temp!=0 && temp!=1){
-               //must be ftype, means its an older model. par_res will be undefined
-               fileformat = FileFormat::NEOX_2;
-           }
-           else
-           {
-                //it could be a newer model, or an old f16/f32 model
-                fin.read((char *)&temp2, sizeof(temp2)); //if previous was par_res, this is ftype. else unknown
-
-                //if it is new ftype, then it must have these properties: > 1000, low multiple of 1k and small remaineder
-                bool isNewFtype = (temp2>=1000 && temp2<=9000 && temp2%1000<20);
-
-                if(!isNewFtype)
-                {
-                    fileformat = FileFormat::NEOX_2;
-                    if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
-                    {
-                        //its a f16/f32 model in the new format
-                        fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
-                    }
-                }
-                else
-                {
-                    const int32_t qntvr = temp2 / 1000; //for future use
-                    //then temp was par_res, use_parallel_residual is false in RedPajama
-                    if(qntvr==1)
-                    {
-                        fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
-                    }
-                    else
-                    {
-                        fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
-                    }
-                }
-           }
-
-       }
-    }
-    else if(magic == 0x67676d66) //v2 format ggmf
-    {
-        fileformat = FileFormat::GGHF;
-        uint32_t temp;
-        fin.read((char *)&temp, sizeof(temp)); //file version
-        if(temp==100)
-        {
-            fileformat = FileFormat::RWKV_1;
-        }
-        else if(temp==101)
-        {
-            fileformat = FileFormat::RWKV_2;
-        }
-    }
-    else if(magic == 0x67676a74) //v3 format ggjt
-    {
-        fileformat = FileFormat::GGJT_3; //ggjt by default
-        uint32_t ver, temp, ftype;
-        fin.read((char *)&ver, sizeof(ver)); //file version
-        fin.read((char *)&temp, sizeof(temp));//vocab
-        fin.read((char *)&temp, sizeof(temp)); //embd
-        fin.read((char *)&temp, sizeof(temp)); //mult
-        fin.read((char *)&temp, sizeof(temp));//head
-        fin.read((char *)&temp, sizeof(temp));//layer
-        fin.read((char *)&temp, sizeof(temp));//rot
-        fin.read((char *)&ftype, sizeof(ftype));//filetype
-
-        if(ver==1)
-        {
-            fileformat = FileFormat::GGJT;
-        }
-        else if(ver==2)
-        {
-            fileformat = FileFormat::GGJT_2;
-        }
-    }
-    fin.close();
-
-    return fileformat;
- }
-
- bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
-     int ss = searchSeq.size();
-     if(targetArray.size()<ss)
-     {
-         return false;
-     }
-     for(int i=0;i<ss;++i)
-     {
-         if(targetArray[i]!=searchSeq[i])
-         {
-             return false;
-         }
-     }
-     return true;
- }
-
- int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
-     int ss = searchSeq.size();
-     int tas = targetArray.size();
-     if(tas<ss)
-     {
-         return -1;
-     }
-     for(int i=0;i<tas;++i)
-     {
-         int srch = 0;
-         bool fail = false;
-         for(int srch=0;srch<ss;++srch)
-         {
-             if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
-             {
-                 fail = true;
-                 break;
-             }
-         }
-         if(!fail)
-         {
-             return i;
-         }
-     }
-     return -1;
- }
-
- std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y)
- {
-     int m = x.size(), n = y.size();
-
-     //int LCSuff[m+1][n+1];
-     std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
-
-     for (int j = 0; j <= n; j++)
-         LCSuff[0][j] = 0;
-     for (int i = 0; i <= m; i++)
-         LCSuff[i][0] = 0;
-
-     for (int i = 1; i <= m; i++)
-     {
-         for (int j = 1; j <= n; j++)
-         {
-             if (x[i - 1] == y[j - 1])
-                 LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
-             else
-                 LCSuff[i][j] = 0;
-         }
-     }
-
-     std::vector<int> longest;
-     for (int i = 1; i <= m; i++)
-     {
-         for (int j = 1; j <= n; j++)
-         {
-             if (LCSuff[i][j] > longest.size())
-             {
-                 auto off1 = ((i - LCSuff[i][j] + 1) - 1);
-                 auto off2 = off1 + LCSuff[i][j];
-                 longest.clear();
-                //  std::vector<int>().swap(longest);
-                 longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
-                // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
-             }
-         }
-     }
-     return longest;
- }
-
- void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
- int &n_past, const int nctx, std::vector<int> &smartcontext,
- bool useSmartContext, const bool requireFullSubset)
- {
-     const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
-     const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
-     const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
-     const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
-     const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
-
-
-    //fast forward the past based on identical tokens, stop once a divergence is noted
-    int embd_inp_len = embd_inp.size();
-    bool fastforwardok = true;
-
-    for (int i = 0; i < current_context_tokens.size(); ++i)
-    {
-        if (current_context_tokens[i] == embd_inp[i])
-        {
-            n_past += 1;
-        }
-        else
-        {
-            if(requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
-            {
-                n_past = 0;
-                fastforwardok = false;
-            }
-            break;
-        }
-
-        if (requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
-        {
-            if (i >= embd_inp_len)
-            {
-                n_past = 0;
-                fastforwardok = false;
-                break;
-            }
-        }
-        else
-        {
-            if ((i + 2) >= embd_inp_len)
-            {
-                break;
-            }
-        }
-    }
-
-    if(fastforwardok)
-    {
-        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
-        embd_inp_len = embd_inp.size();
-    }
-
-    //smart context mode, detect if we have a shifted context at max length
-    //requirement: previous context was at least nctx/2 longer than current,
-    //mode is on, and current context already maxed.
-
-    if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
-    {
-        //see if smartcontext is still usable
-        auto shared = LongestCommonSubseq(smartcontext, embd_inp);
-        if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least 32 tokens in common
-        {
-            int found = ArrFindIndexOf(embd_inp,shared);
-            if(found>=0)
-            {
-                auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
-                embd_inp = trimmed;
-                embd_inp_len = embd_inp.size();
-                printf("\n[Reusing Smart Context: %d allowance remaining]", found);
-
-                int old_n_past = n_past;
-                int offset_fix = old_n_past;
-                if (current_context_tokens[n_past] != embd_inp[0])
-                {
-                    offset_fix = 0;
-                }
-
-                for (int i = n_past; i < current_context_tokens.size(); ++i)
-                {
-                    if (current_context_tokens[i] == embd_inp[i-offset_fix])
-                    {
-                        n_past += 1;
-                    }
-                    else
-                    {
-                        break;
-                    }
-                    if ((i + 2 - offset_fix) >= embd_inp_len)
-                    {
-                        break;
-                    }
-                }
-
-                embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));
-
-            }else{
-                smartcontext.clear();
-            }
-        }
-        else
-        {
-            smartcontext.clear();
-        }
-    }
-    else
-    {
-        smartcontext.clear();
-    }
-
-    if(fastforwardok && useSmartContext
-    && smartcontext.size()==0 && current_context_tokens.size() >= SCCtxLenThreshold
-    && embd_inp_len >= SCInpLenThreshold
-    && current_context_tokens.size() - n_past > SCPastLenThreshold)
-    {
-        //determine longest common substring after removing start part
-        int shiftamt = embd_inp.size() * SCTruncationRatio;
-        smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
-         printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt);
-
-        embd_inp = smartcontext;
-        //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes LCS marker.
-        //when a future prompt comes in, find the LCS again. If LCS > a length and LCS starts with memorized LCS
-        //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift
-        //if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B
-    }
- }
--- a/native/jni/src/ggml/model_adapter.h
+++ b/native/jni/src/ggml/model_adapter.h
@ -1,67 +0,0 @@
-#pragma once
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <regex>
-#include <iostream>
-#include <iterator>
-#include <queue>
-#include <string>
-#include <math.h>
-#include <vector>
-
-enum FileFormat
-{
-    BADFORMAT=0, //unknown, uninit, or failed to load
-    GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
-    GGHF=2, // 2=(llama ggmf)
-    GGJT=3, // 3=(llama ggjt)
-    GGJT_2=4, //newer llama format unshuffled
-    GGJT_3=5, //using 16bit scalar
-
-    GPTJ_1=100, //the very first super old GPTJ format
-    GPTJ_2=101, //pygmalion, uses old ggml lib
-    GPTJ_3=102, //uses new ggml lib
-    GPTJ_4=103, //unshuffled
-    GPTJ_5=104, //using 16bit scalar
-
-    GPT2_1=200,
-    GPT2_2=201,
-    GPT2_3=202, //unshuffled
-    GPT2_4=203, //using 16bit scalar
-
-    RWKV_1=300,
-    RWKV_2=301,
-
-    NEOX_1=400,
-    NEOX_2=401,
-    NEOX_3=402, //redpajama
-    NEOX_4=403, //unshuffled
-    NEOX_5=404, //unshuffled redpajama
-    NEOX_6=405, //using 16bit scalar
-    NEOX_7=406, //using 16bit scalar redpajama
-
-    MPT_1=500, //first supported mpt version
-};
-
-enum ModelLoadResult
-{
-    FAIL = 0,
-    SUCCESS = 1,
-    RETRY_LOAD = 2, //used if it's suspected that the model is an older format
-};
-
-void timer_start();
-double timer_check();
-void print_tok_vec(std::vector<int> &embd);
-void print_tok_vec(std::vector<float> &embd);
-void print_vec(std::vector<std::string> &embd);
-std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y);
-bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-
-FileFormat check_file_format(const std::string & fname);
-void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
- int &n_past, const int nctx, std::vector<int> &smartcontext,
- const bool useSmartContext, const bool requireFullSubset);
--- a/native/jni/src/ggml/otherarch.h
+++ b/native/jni/src/ggml/otherarch.h
@ -1,464 +0,0 @@
-#pragma once
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "utils.h"
-#include "model_adapter.h"
-
-
-// Hyper-parameters of a GPT-J model. The in-class defaults describe GPT-J 6B.
-// Field meanings follow the usual naming (vocabulary size, context length,
-// embedding width, attention heads, layer count, rotary dimensions, weight
-// file type) — presumably overwritten from the model file; confirm in the loader.
-struct gptj_hparams {
-    int32_t n_vocab = 50400;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 16;
-    int32_t n_layer = 28;
-    int32_t n_rot   = 64;
-    int32_t ftype   = 1;
-};
-
-// Tensor handles for one GPT-J layer (current ggml tensor type).
-// The pointers carry no default initialiser, so they are indeterminate until
-// assigned.
-struct gptj_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_q_proj_w;
-    struct ggml_tensor * c_attn_k_proj_w;
-    struct ggml_tensor * c_attn_v_proj_w;
-
-    struct ggml_tensor * c_attn_proj_w;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-// Tensor handles for one GPT-J layer, expressed with the ggml_v2 tensor type.
-// Same members as gptj_layer plus an extra transposed projection weight kept
-// for backwards compatibility.
-struct gptj_layer_v2 {
-    // normalization
-    struct ggml_v2_tensor * ln_1_g;
-    struct ggml_v2_tensor * ln_1_b;
-
-    // attention
-    struct ggml_v2_tensor * c_attn_q_proj_w;
-    struct ggml_v2_tensor * c_attn_k_proj_w;
-    struct ggml_v2_tensor * c_attn_v_proj_w;
-
-    struct ggml_v2_tensor * c_attn_proj_w;
-
-    // ff
-    struct ggml_v2_tensor * c_mlp_fc_w;
-    struct ggml_v2_tensor * c_mlp_fc_b;
-
-    struct ggml_v2_tensor * c_mlp_proj_w;
-    struct ggml_v2_tensor * c_mlp_proj_w_trans; //for backwards compatibility
-    struct ggml_v2_tensor * c_mlp_proj_b;
-};
-// Tensor handles for one GPT-J layer, expressed with the ggml_v1 tensor type.
-// Member-for-member the same shape as gptj_layer_v2.
-struct gptj_layer_v1 {
-    // normalization
-    struct ggml_v1_tensor * ln_1_g;
-    struct ggml_v1_tensor * ln_1_b;
-
-    // attention
-    struct ggml_v1_tensor * c_attn_q_proj_w;
-    struct ggml_v1_tensor * c_attn_k_proj_w;
-    struct ggml_v1_tensor * c_attn_v_proj_w;
-
-    struct ggml_v1_tensor * c_attn_proj_w;
-
-    // ff
-    struct ggml_v1_tensor * c_mlp_fc_w;
-    struct ggml_v1_tensor * c_mlp_fc_b;
-
-    struct ggml_v1_tensor * c_mlp_proj_w;
-    struct ggml_v1_tensor * c_mlp_proj_w_trans; //for backwards compatibility
-    struct ggml_v1_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-J model held in ggml_v1 tensors: hyper-parameters, the
-// non-layer weights, the per-layer weights, the key/value memory and the ggml
-// context together with a name -> tensor lookup table.
-struct gptj_v1_model {
-    gptj_hparams hparams;
-
-    // normalization
-    struct ggml_v1_tensor * ln_f_g;
-    struct ggml_v1_tensor * ln_f_b;
-
-    // NOTE(review): this was labelled "position embedding"; by the usual
-    // naming convention wte is the token-embedding table — confirm in the loader.
-    struct ggml_v1_tensor * wte; // token embedding
-
-    struct ggml_v1_tensor * lmh_g; // language model head
-    struct ggml_v1_tensor * lmh_b; // language model bias
-
-    std::vector<gptj_layer_v1> layers;
-
-    // key + value memory
-    struct ggml_v1_tensor * memory_k;
-    struct ggml_v1_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_v1_context * ctx;
-    std::map<std::string, struct ggml_v1_tensor *> tensors;
-};
-
-// A complete GPT-J model held in ggml_v2 tensors: hyper-parameters, the
-// non-layer weights, the per-layer weights, the key/value memory and the ggml
-// context together with a name -> tensor lookup table.
-struct gptj_v2_model {
-    gptj_hparams hparams;
-
-    // normalization
-    struct ggml_v2_tensor * ln_f_g;
-    struct ggml_v2_tensor * ln_f_b;
-
-    // NOTE(review): this was labelled "position embedding"; by the usual
-    // naming convention wte is the token-embedding table — confirm in the loader.
-    struct ggml_v2_tensor * wte; // token embedding
-
-    struct ggml_v2_tensor * lmh_g; // language model head
-    struct ggml_v2_tensor * lmh_b; // language model bias
-
-    std::vector<gptj_layer_v2> layers;
-
-    // key + value memory
-    struct ggml_v2_tensor * memory_k;
-    struct ggml_v2_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_v2_context * ctx;
-    std::map<std::string, struct ggml_v2_tensor *> tensors;
-};
-
-// A complete GPT-J model held in current ggml tensors: hyper-parameters, the
-// non-layer weights, the per-layer weights, the key/value memory and the ggml
-// context together with a name -> tensor lookup table.
-struct gptj_model {
-    gptj_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    // NOTE(review): this was labelled "position embedding"; by the usual
-    // naming convention wte is the token-embedding table — confirm in the loader.
-    struct ggml_tensor * wte; // token embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gptj_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// Hyper-parameters of a GPT-2 model. The in-class defaults describe the
-// GPT-2 117M configuration.
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype     = 1;
-};
-
-// Tensor handles for one GPT-2 layer, expressed with the ggml_v1 tensor type.
-// Unlike the later variants, this one stores only the transposed MLP
-// projection weight.
-struct gpt2_v1_layer {
-    // normalization
-    struct ggml_v1_tensor * ln_1_g;
-    struct ggml_v1_tensor * ln_1_b;
-
-    struct ggml_v1_tensor * ln_2_g;
-    struct ggml_v1_tensor * ln_2_b;
-
-    // attention
-    struct ggml_v1_tensor * c_attn_attn_w;
-    struct ggml_v1_tensor * c_attn_attn_b;
-
-    struct ggml_v1_tensor * c_attn_proj_w;
-    struct ggml_v1_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_v1_tensor * c_mlp_fc_w;
-    struct ggml_v1_tensor * c_mlp_fc_b;
-
-    struct ggml_v1_tensor * c_mlp_proj_w_trans; // transposed for efficiency
-    struct ggml_v1_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-2 model held in ggml_v1 tensors. This variant has no
-// separate lm_head member.
-struct gpt2_v1_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_v1_tensor * ln_f_g;
-    struct ggml_v1_tensor * ln_f_b;
-
-    // NOTE(review): the two labels below were swapped in the original
-    // comments; by the usual GPT-2 naming wte is the token table and wpe the
-    // position table — confirm in the loader.
-    struct ggml_v1_tensor * wte; // token embedding
-    struct ggml_v1_tensor * wpe; // position embedding
-
-    std::vector<gpt2_v1_layer> layers;
-
-    // key + value memory
-    struct ggml_v1_tensor * memory_k;
-    struct ggml_v1_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_v1_context * ctx;
-    std::map<std::string, struct ggml_v1_tensor *> tensors;
-};
-
-// Tensor handles for one GPT-2 layer, expressed with the ggml_v2 tensor type.
-struct gpt2_layer_v2 {
-    // normalization
-    struct ggml_v2_tensor * ln_1_g;
-    struct ggml_v2_tensor * ln_1_b;
-
-    struct ggml_v2_tensor * ln_2_g;
-    struct ggml_v2_tensor * ln_2_b;
-
-    // attention
-    struct ggml_v2_tensor * c_attn_attn_w;
-    struct ggml_v2_tensor * c_attn_attn_b;
-
-    struct ggml_v2_tensor * c_attn_proj_w;
-    struct ggml_v2_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_v2_tensor * c_mlp_fc_w;
-    struct ggml_v2_tensor * c_mlp_fc_b;
-
-    struct ggml_v2_tensor * c_mlp_proj_w;
-    struct ggml_v2_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-2 model held in ggml_v2 tensors, including a dedicated
-// language-model head tensor.
-struct gpt2_v2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_v2_tensor * ln_f_g;
-    struct ggml_v2_tensor * ln_f_b;
-
-    // NOTE(review): the wte/wpe labels were swapped in the original comments;
-    // by the usual GPT-2 naming wte is the token table and wpe the position
-    // table — confirm in the loader.
-    struct ggml_v2_tensor * wte;     // token embedding
-    struct ggml_v2_tensor * wpe;     // position embedding
-    struct ggml_v2_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer_v2> layers;
-
-    // key + value memory
-    struct ggml_v2_tensor * memory_k;
-    struct ggml_v2_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_v2_context * ctx;
-    std::map<std::string, struct ggml_v2_tensor *> tensors;
-};
-
-// Tensor handles for one GPT-2 layer (current ggml tensor type).
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-2 model held in current ggml tensors, including a dedicated
-// language-model head tensor.
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    // NOTE(review): the wte/wpe labels were swapped in the original comments;
-    // by the usual GPT-2 naming wte is the token table and wpe the position
-    // table — confirm in the loader.
-    struct ggml_tensor * wte;     // token embedding
-    struct ggml_tensor * wpe;     // position embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// Hyper-parameters of a GPT-NeoX style model. The in-class defaults describe
-// StableLM 3B. par_res is an integer flag (1 = true, 0 = false) — presumably
-// selecting the parallel-residual layer layout; confirm in the eval code.
-struct gpt_neox_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 4096;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 32;
-    int32_t n_layer = 16;
-    int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = 1;
-};
-
-// Tensor handles for one GPT-NeoX layer, expressed with the ggml_v2 tensor
-// type: two layer norms, the attention weights and the feed-forward weights.
-struct gpt_neox_layer_v2 {
-    // pre normalization
-    struct ggml_v2_tensor * ln_1_g;
-    struct ggml_v2_tensor * ln_1_b;
-
-    // attention
-    struct ggml_v2_tensor * c_attn_attn_w;
-    struct ggml_v2_tensor * c_attn_attn_b;
-
-    struct ggml_v2_tensor * c_attn_proj_w;
-    struct ggml_v2_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_v2_tensor * ln_2_g;
-    struct ggml_v2_tensor * ln_2_b;
-
-    // ff
-    struct ggml_v2_tensor * c_mlp_fc_w;
-    struct ggml_v2_tensor * c_mlp_fc_b;
-
-    struct ggml_v2_tensor * c_mlp_proj_w;
-    struct ggml_v2_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-NeoX model held in ggml_v2 tensors. The language-model head
-// has a weight tensor only; the bias member is commented out.
-struct gpt_neox_v2_model {
-    gpt_neox_hparams hparams;
-
-    // normalization
-    struct ggml_v2_tensor * ln_f_g;
-    struct ggml_v2_tensor * ln_f_b;
-
-    // NOTE(review): this was labelled "position embedding"; by the usual
-    // naming convention wte is the token-embedding table — confirm in the loader.
-    struct ggml_v2_tensor * wte; // token embedding
-
-    struct ggml_v2_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gpt_neox_layer_v2> layers;
-
-    // key + value memory
-    struct ggml_v2_tensor * memory_k;
-    struct ggml_v2_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_v2_context * ctx;
-    std::map<std::string, struct ggml_v2_tensor *> tensors;
-};
-
-// Tensor handles for one GPT-NeoX layer (current ggml tensor type): two layer
-// norms, the attention weights and the feed-forward weights.
-struct gpt_neox_layer {
-    // pre normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-// A complete GPT-NeoX model held in current ggml tensors. The language-model
-// head has a weight tensor only; the bias member is commented out.
-struct gpt_neox_model {
-    gpt_neox_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    // NOTE(review): this was labelled "position embedding"; by the usual
-    // naming convention wte is the token-embedding table — confirm in the loader.
-    struct ggml_tensor * wte; // token embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gpt_neox_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-
-// Hyper-parameters of an MPT model. There is no reference configuration: every
-// field starts at zero — presumably filled in from the model file; confirm in
-// the loader.
-struct mpt_hparams {
-    int32_t d_model      = 0;
-    int32_t max_seq_len  = 0;
-    int32_t n_heads      = 0;
-    int32_t n_layers     = 0;
-    int32_t n_vocab      = 0;
-    float alibi_bias_max = 0;
-    float clip_qkv       = 0;
-    int32_t ftype        = 0;
-    int32_t n_ctx        = 0;
-
-};
-
-// Tensor handles for one MPT layer: two normalization weights, the fused
-// attention weights and the feed-forward projections. No bias tensors are
-// declared.
-struct mpt_layer {
-    // pre normalization
-    struct ggml_tensor * norm_1_weight;
-
-    // attention
-    struct ggml_tensor * c_attn_wqkv_weight;
-    struct ggml_tensor * c_attn_out_proj_weight;
-
-    // post normalization
-    struct ggml_tensor * norm_2_weight;
-
-    // ff
-    struct ggml_tensor * ffn_up_proj;
-    struct ggml_tensor * ffn_down_proj;
-};
-
-// A complete MPT model held in current ggml tensors: hyper-parameters, the
-// non-layer weights, the per-layer weights, the key/value memory and the ggml
-// context together with a name -> tensor lookup table.
-struct mpt_model {
-    mpt_hparams hparams;
-
-    // NOTE(review): these two were labelled "position embedding" and
-    // "language model head"; judging by the member names they are the
-    // token-embedding table and the final normalization weight — confirm in
-    // the loader.
-    struct ggml_tensor * wte_weight;    // token embedding
-    struct ggml_tensor * norm_f_weight; // final normalization weight
-
-    std::vector<mpt_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    // ggml context and name -> tensor map
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-
-// Loads a GPT-NeoX model and its vocabulary from `fname` into `model` and
-// `vocab`; `file_format` tells the loader which FileFormat to expect and the
-// result reports success, failure or a retry request. The meaning of
-// `gpulayers` is not visible here — presumably a count of layers to offload;
-// confirm against the definition.
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers);
-// Runs the model over the tokens in `embd_inp`, given `n_past` tokens already
-// in the context, using `n_threads` threads. `embd_w` and `mem_per_token` are
-// non-const references, i.e. outputs (presumably the logits and a memory
-// estimate — confirm against the definition). Returns false on failure.
-bool gpt_neox_eval(
-        const gpt_neox_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-        std::vector<float>         & embd_w,
-        size_t                     & mem_per_token,
-        bool use_scratch);
--- a/native/jni/src/ggml/utils.cpp
+++ b/native/jni/src/ggml/utils.cpp
@ -1,224 +0,0 @@
-#include "utils.h"
-
-#include <cmath>
-#include <cstring>
-#include <fstream>
-#include <regex>
-#include <locale>
-#include <codecvt>
-#include <sstream>
-
-
-
-// Replaces every occurrence of `needle` in `str` with `replacement`, in place.
-//
-// Scanning resumes after each inserted replacement, so text introduced by a
-// replacement is never matched again (replacing "a" with "aa" terminates).
-// An empty `needle` is a no-op: std::string::find("") succeeds at every
-// position, which would otherwise insert `replacement` forever.
-void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
-    if (needle.empty()) {
-        return;
-    }
-
-    size_t pos = 0;
-    while ((pos = str.find(needle, pos)) != std::string::npos) {
-        str.replace(pos, needle.length(), replacement);
-        pos += replacement.length(); // skip over what was just inserted
-    }
-}
-
-// Reads the file `fname` and extracts a flat string -> int32 mapping from it
-// using a minimal hand-written scanner (not a general JSON parser).
-//
-// Behaviour visible in the code:
-//   * If the file cannot be opened, a message is printed to stderr and the
-//     whole process terminates via exit(1).
-//   * If the first character is not '{', an empty map is returned.
-//   * In keys, the escape sequences \u0120, \u010a and \" are rewritten to a
-//     space, a newline and a double quote respectively.
-//   * Values are converted with std::stoi; entries whose value does not
-//     convert are silently skipped.
-//
-// NOTE(review): the inner `while (json[i] ...)` loops and the `++i` steps after
-// a closing quote do not check `i` against `n`, so truncated or malformed
-// input can index past the end of the buffer.
-std::map<std::string, int32_t> json_parse(const std::string & fname) {
-    std::map<std::string, int32_t> result;
-
-    // read file into string
-    std::string json;
-    {
-        std::ifstream ifs(fname);
-        if (!ifs) {
-            fprintf(stderr, "Failed to open %s\n", fname.c_str());
-            exit(1);
-        }
-
-        json = std::string((std::istreambuf_iterator<char>(ifs)),
-                (std::istreambuf_iterator<char>()));
-    }
-
-    if (json[0] != '{') {
-        return result;
-    }
-
-    // parse json
-    {
-        // has_key:  a key has been read and a quoted value is being collected
-        // in_token: the scanner is currently inside a quoted string
-        bool has_key  = false;
-        bool in_token = false;
-
-        std::string str_key = "";
-        std::string str_val = "";
-
-        int n = json.size();
-        for (int i = 1; i < n; ++i) {
-            if (!in_token) {
-                // outside a string: skip blanks, an opening quote starts a token;
-                // any other character is ignored
-                if (json[i] == ' ') continue;
-                if (json[i] == '"') {
-                    in_token = true;
-                    continue;
-                }
-            } else {
-                if (json[i] == '\\' && i+1 < n) {
-                    // keep the backslash, then step to the escaped character,
-                    // which is appended by the common code at the loop bottom
-                    if (has_key == false) {
-                        str_key += json[i];
-                    } else {
-                        str_val += json[i];
-                    }
-                    ++i;
-                } else if (json[i] == '"') {
-                    if (has_key == false) {
-                        // closing quote of a key: step over blanks, the ':'
-                        // separator and more blanks to reach the value
-                        has_key = true;
-                        ++i;
-                        while (json[i] == ' ') ++i;
-                        ++i; // :
-                        while (json[i] == ' ') ++i;
-                        if (json[i] != '\"') {
-                            // unquoted value: collect up to ',' or '}'
-                            while (json[i] != ',' && json[i] != '}') {
-                                str_val += json[i++];
-                            }
-                            has_key = false;
-                        } else {
-                            // quoted value: keep scanning as a token
-                            in_token = true;
-                            continue;
-                        }
-                    } else {
-                        // closing quote of a quoted value
-                        has_key = false;
-                    }
-
-                    ::utreplace(str_key, "\\u0120", " " ); // \u0120 -> space
-                    ::utreplace(str_key, "\\u010a", "\n"); // \u010a -> new line
-                    ::utreplace(str_key, "\\\"",    "\""); // \\\"   -> "
-
-                    try {
-                        result[str_key] = std::stoi(str_val);
-                    } catch (...) {
-                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
-
-                    }
-                    // entry finished: reset the accumulators for the next pair
-                    str_key = "";
-                    str_val = "";
-                    in_token = false;
-                    continue;
-                }
-                // ordinary character inside a string
-                if (has_key == false) {
-                    str_key += json[i];
-                } else {
-                    str_val += json[i];
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-
-// Registers `token` as a special token by appending a copy of it to
-// `special_tokens`.
-void gpt_vocab::add_special_token(const std::string & token) {
-    special_tokens.emplace_back(token);
-}
-
-
-// Encodes a wide string as UTF-8 and returns the resulting byte string.
-// Conversion errors raised by std::wstring_convert propagate to the caller.
-std::string convert_to_utf8(const std::wstring & input) {
-    using utf8_codec = std::codecvt_utf8<wchar_t>;
-
-    std::wstring_convert<utf8_codec> wide_to_utf8;
-    std::string encoded = wide_to_utf8.to_bytes(input);
-    return encoded;
-}
-
-
-// Decodes a UTF-8 byte string into a wide string.
-// Never throws: if the conversion fails for any reason (for example invalid
-// UTF-8), an empty wide string is returned instead.
-std::wstring convert_to_wstring(const std::string & input) {
-    using utf8_codec = std::codecvt_utf8<wchar_t>;
-
-    try {
-        std::wstring_convert<utf8_codec> utf8_to_wide;
-        return utf8_to_wide.from_bytes(input);
-    } catch (...) {
-        // covers std::range_error from the converter as well as anything else
-        return std::wstring();
-    }
-}
-
-// Splits `str` into GPT-style "words" and appends them, in order, to `words`.
-// A word is an English contraction suffix ('s, 't, 're, ...), a run of letters
-// or of digits or of other non-space characters (each optionally preceded by a
-// single space), or a run of whitespace. Text that matches none of the
-// alternatives is dropped.
-void gpt_split_words(std::string str, std::vector<std::string>& words) {
-    const std::regex splitter(
-        R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");
-
-    const std::sregex_iterator no_more_hits;
-    for (std::sregex_iterator hit(str.begin(), str.end(), splitter); hit != no_more_hits; ++hit) {
-        // the pattern has no capture groups, so each hit has exactly one entry
-        for (const auto & piece : *hit) {
-            words.push_back(piece);
-        }
-    }
-}
-
-// Converts `text` into a sequence of vocabulary ids.
-//
-// Steps:
-//   1. If the vocabulary has special tokens, the text is first cut at every
-//      occurrence of one; the special tokens themselves become words and the
-//      text between them is split with gpt_split_words().
-//   2. The remaining text is split with gpt_split_words().
-//   3. Each word is consumed left to right, always taking the longest prefix
-//      of the unconsumed part that exists in `vocab.token_to_id`. A character
-//      at which no prefix matches is reported on stderr and skipped.
-//
-// The greedy match in step 3 iterates over candidate lengths rather than
-// counting an unsigned end index down with `j >= i`: with an unsigned index
-// that condition never becomes false at i == 0, so an unknown first character
-// wrapped the index around instead of leaving the loop.
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-
-        // Generate the subpattern from the special_tokens vector if it's not empty
-        if (!vocab.special_tokens.empty()) {
-            // escape regex metacharacters so special tokens match literally
-            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
-            std::string special_tokens_subpattern;
-            for (const auto & token : vocab.special_tokens) {
-                if (!special_tokens_subpattern.empty()) {
-                    special_tokens_subpattern += "|";
-                }
-                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
-            }
-
-            std::regex re(special_tokens_subpattern);
-            std::smatch m;
-            // Split the text by special tokens.
-            while (std::regex_search(str, m, re)) {
-                // Split the substrings in-between special tokens into words.
-                gpt_split_words(m.prefix(), words);
-                // Add matched special tokens as words.
-                for (const auto & x : m) {
-                    words.push_back(x);
-                }
-                str = m.suffix();
-            }
-            // Remaining text without special tokens will be handled below.
-        }
-
-        gpt_split_words(str, words);
-    }
-
-    // find the longest token that forms each word in words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        size_t i = 0;
-        while (i < word.size()) {
-            bool matched = false;
-
-            // try the longest remaining prefix first, then shorter ones
-            for (size_t len = word.size() - i; len > 0; --len) {
-                const auto it = vocab.token_to_id.find(word.substr(i, len));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i += len;
-                    matched = true;
-                    break;
-                }
-            }
-
-            if (!matched) { // not even word.substr(i, 1) is in the vocabulary
-                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).c_str());
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-// Returns true when the tensor name contains any of the weight-name markers
-// listed below (anywhere in the string), false otherwise.
-bool should_transpose_layer(std::string name)
-{
-    static const char * const kTransposeMarkers[] = {
-        ".mlp.fc_in.weight",
-        ".attn.out_proj.weight",
-        ".attn.q_proj.weight",
-        ".attn.k_proj.weight",
-        ".attn.v_proj.weight",
-        "/attn/c_attn/w",
-        "/attn/c_proj/w",
-        "/mlp/c_fc/w",
-        "/mlp/c_proj/w",
-    };
-
-    for (const char * marker : kTransposeMarkers) {
-        if (name.find(marker) != std::string::npos) {
-            return true;
-        }
-    }
-    return false;
-}