From 43e55bebfec2893bbdfd27b0a7310c107a0d65f1 Mon Sep 17 00:00:00 2001 From: abb128 <65567823+abb128@users.noreply.github.com> Date: Mon, 10 Jul 2023 12:21:05 +0300 Subject: [PATCH] Remove AGPLv3 code --- native/jni/NativeFileList.mk | 6 +- ..._futo_inputmethod_latin_GGMLDictionary.cpp | 40 +- native/jni/src/ggml/common.cpp | 143 + native/jni/src/ggml/{utils.h => common.h} | 27 +- native/jni/src/ggml/context.cpp | 29 + native/jni/src/ggml/context.h | 12 + native/jni/src/ggml/ggml.c | 2865 ++++++----------- native/jni/src/ggml/ggml.h | 183 +- .../src/ggml/{neox_v3.cpp => gpt_neox.cpp} | 213 +- native/jni/src/ggml/gpt_neox.h | 86 + native/jni/src/ggml/model_adapter.cpp | 466 --- native/jni/src/ggml/model_adapter.h | 67 - native/jni/src/ggml/otherarch.h | 464 --- native/jni/src/ggml/utils.cpp | 224 -- 14 files changed, 1495 insertions(+), 3330 deletions(-) create mode 100644 native/jni/src/ggml/common.cpp rename native/jni/src/ggml/{utils.h => common.h} (53%) create mode 100644 native/jni/src/ggml/context.cpp create mode 100644 native/jni/src/ggml/context.h rename native/jni/src/ggml/{neox_v3.cpp => gpt_neox.cpp} (75%) create mode 100644 native/jni/src/ggml/gpt_neox.h delete mode 100644 native/jni/src/ggml/model_adapter.cpp delete mode 100644 native/jni/src/ggml/model_adapter.h delete mode 100644 native/jni/src/ggml/otherarch.h delete mode 100644 native/jni/src/ggml/utils.cpp diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk index 687500474..18c6e7fd0 100755 --- a/native/jni/NativeFileList.mk +++ b/native/jni/NativeFileList.mk @@ -22,9 +22,9 @@ LATIN_IME_JNI_SRC_FILES := \ LATIN_IME_CORE_SRC_FILES := \ ggml/ggml.c \ - ggml/utils.cpp \ - ggml/model_adapter.cpp \ - ggml/neox_v3.cpp \ + ggml/common.cpp \ + ggml/context.cpp \ + ggml/gpt_neox.cpp \ $(addprefix dictionary/header/, \ header_policy.cpp \ header_read_write_utils.cpp) \ diff --git a/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp b/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp index 6abf219a5..3b32caddd 100644 --- a/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp +++ b/native/jni/org_futo_inputmethod_latin_GGMLDictionary.cpp @@ -38,7 +38,9 @@ #include "utils/profiler.h" #include "utils/time_keeper.h" -#include "ggml/otherarch.h" +#include "ggml/gpt_neox.h" +#include "ggml/context.h" +#include "ggml/common.h" #include @@ -81,13 +83,12 @@ class ProximityInfo; struct GGMLDictionaryState { int n_threads = 3; - std::vector smartcontext; - std::vector current_context_tokens; + transformer_context t_context; + std::vector logits; std::vector bad_logits; size_t mem_per_token = 0; - bool use_scratch = true; gpt_neox_model model; gpt_vocab vocab; @@ -109,12 +110,10 @@ static jlong latinime_GGMLDictionary_open(JNIEnv *env, jclass clazz, jstring sou GGMLDictionaryState *state = new GGMLDictionaryState(); std::string fname(sourceDirChars); - FileFormat format = check_file_format(fname); - assert(format == 405); - ModelLoadResult result = gpt_neox_model_load(fname, state->model, state->vocab, format, 0); + bool result = gpt_neox_model_load(fname, state->model, state->vocab); - if(result != ModelLoadResult::SUCCESS) { + if(!result) { AKLOGE("GGMLDict: Could not load model"); free(state); return 0; @@ -171,33 +170,28 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl env->ReleaseStringUTFChars(partialWord, pwstr); } - auto embd_inp = gpt_tokenize(state->vocab, contextString); + token_sequence next_context = gpt_tokenize(state->vocab, contextString); //truncate to 
front of the prompt if its too long int32_t nctx = state->model.hparams.n_ctx; - if (embd_inp.size() + 2 > nctx) { - int offset = embd_inp.size() - nctx + 2; - embd_inp = std::vector(embd_inp.begin() + offset, embd_inp.end()); + if (next_context.size() + 2 > nctx) { + int offset = next_context.size() - nctx + 2; + next_context = std::vector(next_context.begin() + offset, next_context.end()); } - size_t size = env->GetArrayLength(outPredictions); - int n_past = 0; + auto fastforward_info = transformer_context_fastforward(state->t_context, next_context); - bool useSmartContext = true; - ContextFastForward(state->current_context_tokens, embd_inp, n_past, nctx, state->smartcontext, useSmartContext, false); + token_sequence &embd_inp = fastforward_info.first; + int n_past = fastforward_info.second; if(embd_inp.empty()) return; - state->current_context_tokens.resize(n_past); - AKLOGI("npast = %d, size(embd) = %d\n", n_past, (int)embd_inp.size()); - gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token, state->use_scratch); + gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token); - for(auto token : embd_inp) { - state->current_context_tokens.emplace_back(token); - } + transformer_context_apply(state->t_context, fastforward_info); int topid = std::min_element(state->logits.begin(),state->logits.end())-state->logits.begin(); float zeroValue = (state->logits[topid] < 0 ? state->logits[topid] : 0); @@ -249,6 +243,8 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl } + size_t size = env->GetArrayLength(outPredictions); + // Get the array elements jint *probsArray = env->GetIntArrayElements(outProbabilities, nullptr); diff --git a/native/jni/src/ggml/common.cpp b/native/jni/src/ggml/common.cpp new file mode 100644 index 000000000..6ec69c194 --- /dev/null +++ b/native/jni/src/ggml/common.cpp @@ -0,0 +1,143 @@ +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +std::string replace(const std::string & s, const std::string & from, const std::string & to) { + std::string result = s; + size_t pos = 0; + while ((pos = result.find(from, pos)) != std::string::npos) { + result.replace(pos, from.length(), to); + pos += to.length(); + } + return result; +} + +void gpt_vocab::add_special_token(const std::string & token) { + special_tokens.push_back(token); +} + +std::string convert_to_utf8(const std::wstring & input) { + std::wstring_convert> converter; + return converter.to_bytes(input); +} + + +std::wstring convert_to_wstring(const std::string & input) { + std::wstring_convert> converter; + return converter.from_bytes(input); +} + +void gpt_split_words(std::string str, std::vector& words) { + const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; + const std::regex re(pattern); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } +} + +std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { + std::vector words; + + // first split the text into words + { + std::string str = text; + + // Generate the 
subpattern from the special_tokens vector if it's not empty + if (!vocab.special_tokens.empty()) { + const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])"); + std::string special_tokens_subpattern; + for (const auto & token : vocab.special_tokens) { + if (!special_tokens_subpattern.empty()) { + special_tokens_subpattern += "|"; + } + special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)"); + } + + std::regex re(special_tokens_subpattern); + std::smatch m; + // Split the text by special tokens. + while (std::regex_search(str, m, re)) { + // Split the substrings in-between special tokens into words. + gpt_split_words(m.prefix(), words); + // Add matched special tokens as words. + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } + // Remaining text without special tokens will be handled below. + } + + gpt_split_words(str, words); + } + + // find the longest token that forms each word in words: + std::vector tokens; + for (const auto & word : words) { + for (int i = 0; i < (int) word.size(); ){ + for (int j = word.size() - 1; j >= i; j--){ + auto cand = word.substr(i, j-i+1); + auto it = vocab.token_to_id.find(cand); + if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab + tokens.push_back(it->second); + i = j + 1; + break; + } + else if (j == i){ // word.substr(i, 1) has no matching + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + i++; + } + } + } + } + + return tokens; +} + +float similarity(const std::string & s0, const std::string & s1) { + const size_t len0 = s0.size() + 1; + const size_t len1 = s1.size() + 1; + + std::vector col(len1, 0); + std::vector prevCol(len1, 0); + + for (size_t i = 0; i < len1; i++) { + prevCol[i] = i; + } + + for (size_t i = 0; i < len0; i++) { + col[0] = i; + for (size_t j = 1; j < len1; j++) { + col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 
0 : 1)); + } + col.swap(prevCol); + } + + const float dist = prevCol[len1 - 1]; + + return 1.0f - (dist / std::max(s0.size(), s1.size())); +} diff --git a/native/jni/src/ggml/utils.h b/native/jni/src/ggml/common.h similarity index 53% rename from native/jni/src/ggml/utils.h rename to native/jni/src/ggml/common.h index 603c3583c..b3c03391f 100644 --- a/native/jni/src/ggml/utils.h +++ b/native/jni/src/ggml/common.h @@ -1,5 +1,3 @@ -// Various helper functions and utilities - #pragma once #include @@ -8,15 +6,6 @@ #include #include -// -// CLI argument parsing -// - - -// -// Vocab utils -// - struct gpt_vocab { using id = int32_t; using token = std::string; @@ -28,16 +17,7 @@ struct gpt_vocab { void add_special_token(const std::string & token); }; -void utreplace(std::string & str, const std::string & needle, const std::string & replacement); - -// poor-man's JSON parsing -std::map json_parse(const std::string & fname); - -std::string convert_to_utf8(const std::wstring & input); - -std::wstring convert_to_wstring(const std::string & input); - -void gpt_split_words(std::string str, std::vector& words); +typedef std::vector token_sequence; // split text into tokens // @@ -49,8 +29,5 @@ void gpt_split_words(std::string str, std::vector& words); // Regex (C++): // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); +token_sequence gpt_tokenize(const gpt_vocab & vocab, const std::string & text); - - -bool should_transpose_layer(std::string name); \ No newline at end of file diff --git a/native/jni/src/ggml/context.cpp b/native/jni/src/ggml/context.cpp new file mode 100644 index 000000000..abdcee4e5 --- /dev/null +++ b/native/jni/src/ggml/context.cpp @@ -0,0 +1,29 @@ +#include "context.h" + + +std::pair transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context) { + int npast = 0; + + // Compare the two sequences and find the first index at which they differ. 
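+    // Illustrative example (hypothetical token values): with
+    //   active_context = {10, 11, 12, 13} and next_context = {10, 11, 42, 43},
+    //   the scan below stops at index 2, so the caller receives npast = 2 and
+    //   only the suffix {42, 43} to evaluate; the first two tokens are reused
+    //   by passing n_past to gpt_neox_eval().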
+ int max_length = std::min(ctx.active_context.size(), next_context.size()); + for(int i=0; i &fastforward_info) { + ctx.active_context.resize(fastforward_info.second); + + for(auto i : fastforward_info.first) { + ctx.active_context.emplace_back(i); + } +} \ No newline at end of file diff --git a/native/jni/src/ggml/context.h b/native/jni/src/ggml/context.h new file mode 100644 index 000000000..8118364fe --- /dev/null +++ b/native/jni/src/ggml/context.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include "common.h" + +struct transformer_context { + token_sequence active_context; +}; + +std::pair transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context); +void transformer_context_apply(transformer_context &ctx, const std::pair &fastforward_info); \ No newline at end of file diff --git a/native/jni/src/ggml/ggml.c b/native/jni/src/ggml/ggml.c index e55decf7d..c10877a76 100644 --- a/native/jni/src/ggml/ggml.c +++ b/native/jni/src/ggml/ggml.c @@ -220,16 +220,38 @@ inline static void* ggml_aligned_malloc(size_t size) { #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif -#define UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) +// +// tensor access macros +// + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + #if defined(GGML_USE_ACCELERATE) #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -463,14 +485,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } -void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { - for (size_t i = 0; i < n; i++) { +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { - size_t i = 0; +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { __m256 x_vec = _mm256_loadu_ps(x + i); @@ -1609,109 +1631,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in } } +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, 
const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }, + [GGML_TYPE_F16] = { + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }, [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q4_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q5_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, - .quantize_row_q = quantize_row_q5_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q5_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, - .quantize_row_q = quantize_row_q5_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q5_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q8_0] = { - .dequantize_row_q = dequantize_row_q8_0, - .quantize_row_q = quantize_row_q8_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q8_0_q8_0, + .to_float = dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q8_1] = { - .dequantize_row_q = NULL, // TODO - .quantize_row_q = quantize_row_q8_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = NULL, // TODO + .from_float = quantize_row_q8_1, + 
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, #ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, - .quantize_row_q = quantize_row_q2_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q2_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, - .quantize_row_q = quantize_row_q3_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q3_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, - .quantize_row_q = quantize_row_q4_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q4_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, - .quantize_row_q = quantize_row_q5_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q5_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, - .quantize_row_q = quantize_row_q6_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q6_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_Q8_K] = { + .from_float = quantize_row_q8_K, + } #endif }; // For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) { GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; + return type_traits[i]; } @@ -2257,7 +2282,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { 
+static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -2294,7 +2319,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -3447,6 +3472,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } static const float GELU_COEF_A = 0.044715f; @@ -3598,6 +3625,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x *s = 1.f/(*s); } +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + // // data types // @@ -3707,12 +3744,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM", "SUM_ROWS", "MEAN", + "ARGMAX", "REPEAT", "REPEAT_BACK", "ABS", "SGN", "NEG", "STEP", + "TANH", + "ELU", "RELU", "GELU", "GELU_QUICK", @@ -3744,9 +3784,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D_S1_PH", - "CONV_1D_S2_PH", - "CONV_2D_SK_P0", + "CONV_1D", + "CONV_2D", "FLASH_ATTN", "FLASH_FF", @@ -3765,7 +3804,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3783,12 +3822,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx", "Σx_k", "Σx/n", + "argmax(x)", "repeat(x)", "repeat_back(x)", "abs(x)", "sgn(x)", "-x", "step(x)", + "tanh(x)", + "elu(x)", "relu(x)", "gelu(x)", "gelu_quick(x)", @@ -3820,9 +3862,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d_s1_ph(x)", - "conv_1d_s2_ph(x)", - "conv_2d_sk_p0(x)", + "conv_1d(x)", + "conv_2d(x)", "flash_attn(x)", "flash_ff(x)", @@ -3841,7 +3882,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); 
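// (These two asserts guard alignment: ggml packs objects and tensor data
//  back-to-back inside a single context buffer, so each struct size must stay
//  a multiple of GGML_MEM_ALIGN for every subsequent allocation to remain
//  aligned.)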
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3867,9 +3908,8 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_GET_ROWS_BACK ] = true; p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; - p[GGML_OP_CONV_1D_S1_PH ] = true; - p[GGML_OP_CONV_1D_S2_PH ] = true; - p[GGML_OP_CONV_2D_SK_P0 ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; } @@ -4243,22 +4283,6 @@ static inline int ggml_up(int n, int m) { #define ggml_assert_aligned(ptr) \ GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) -float get_theta_scale(int n_dims,int n_past,int n_ctx) -{ - if(n_ctx<=2048) //normie mode - { - return powf(10000.0, -2.0f/n_dims); - } - else - { - //using scaled NTK aware ctx - float a = (n_ctx<=4096?4.0:8.0); - float m = powf(a, n_dims / (n_dims - 2.0)); - float s = powf(10000.0 * m, -2.0f/n_dims); - return s; - } -} - //////////////////////////////////////////////////////////////////////////////// struct ggml_context * ggml_init(struct ggml_init_params params) { @@ -4563,14 +4587,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -5456,6 +5479,30 @@ struct ggml_tensor * ggml_mean( return result; } +// ggml_argmax + +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + // ggml_repeat struct ggml_tensor * ggml_repeat( @@ -5649,6 +5696,74 @@ struct ggml_tensor * ggml_step_inplace( return ggml_step_impl(ctx, a, true); } +// ggml_tanh + +struct ggml_tensor * ggml_tanh_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_TANH; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, true); +} + +// ggml_elu + +struct ggml_tensor * ggml_elu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ELU; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, true); +} + // ggml_relu struct ggml_tensor * ggml_relu_impl( @@ -6890,6 +7005,8 @@ struct ggml_tensor * ggml_rope_back( int n_dims, int mode) { GGML_ASSERT(n_past >= 0); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); + bool is_node = false; if (a->grad) { @@ -6990,15 +7107,21 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d_s1_ph +// ggml_conv_1d -struct ggml_tensor * ggml_conv_1d_s1_ph( +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + int s0, + int p0, + int d0) { GGML_ASSERT(ggml_is_matrix(b)); GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); bool is_node = false; if (a->grad || b->grad) { @@ -7006,54 +7129,43 @@ struct ggml_tensor * ggml_conv_1d_s1_ph( is_node = true; } - const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + a->ne[2], 1, 1, + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_S1_PH; + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = p0; + ((int32_t*)c->data)[2] = d0; + ggml_scratch_load(ctx); + + result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; + result->opt[0] = c; return result; } -// ggml_conv_1d_s2_ph +// ggml_conv_2d -struct ggml_tensor * ggml_conv_1d_s2_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; +struct ggml_tensor* ggml_conv_2d( + struct ggml_context* ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - - result->op = GGML_OP_CONV_1D_S2_PH; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_conv_2d_sk_p0 - -struct ggml_tensor * ggml_conv_2d_sk_p0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { GGML_ASSERT(b->ne[3] == 1); GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(b->ne[0] % a->ne[0] == 0); - GGML_ASSERT(b->ne[1] % a->ne[1] == 0); bool is_node = false; if (a->grad || b->grad) { @@ -7061,15 +7173,42 @@ struct ggml_tensor * ggml_conv_2d_sk_p0( is_node = true; } - const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), + a->ne[3], 1, + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_CONV_2D_SK_P0; + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = s1; + ((int32_t*)c->data)[2] = p0; + ((int32_t*)c->data)[3] = p1; + ((int32_t*)c->data)[4] = d0; + ((int32_t*)c->data)[5] = d1; + ggml_scratch_load(ctx); + + result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; + result->opt[0] = c; return result; + +} + +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } // ggml_flash_attn @@ -7619,25 +7758,7 @@ static void ggml_compute_forward_dup_f16( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7710,8 +7831,8 @@ static void ggml_compute_forward_dup_f16( id += ne00 * (ne01 - ir1); } } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; @@ -7908,25 +8029,7 @@ static void ggml_compute_forward_dup_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; 
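+    // (The macro, defined near the top of this file, declares the same
+    //  ne00..ne03 / nb00..nb03 locals for src0 and ne0..ne3 / nb0..nb3 locals
+    //  for dst that the deleted block above spelled out by hand.)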
const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7981,26 +8084,8 @@ static void ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); @@ -8224,24 +8309,8 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8310,28 +8379,12 @@ static void ggml_compute_forward_add_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8380,24 +8433,8 @@ static void ggml_compute_forward_add_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -8447,32 +8484,15 @@ static void ggml_compute_forward_add_q_f32( } const 
int nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8586,19 +8606,8 @@ static void ggml_compute_forward_add1_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8652,23 +8661,12 @@ static void ggml_compute_forward_add1_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8713,23 +8711,12 @@ static void ggml_compute_forward_add1_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8774,23 +8761,12 @@ static void ggml_compute_forward_add1_q_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - 
const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8918,15 +8894,8 @@ static void ggml_compute_forward_acc_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during acc const size_t nb0 = ggml_element_size(src0); @@ -9015,24 +8984,8 @@ static void ggml_compute_forward_sub_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9122,29 +9075,7 @@ static void ggml_compute_forward_mul_f32( const int64_t nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9232,24 +9163,8 @@ static void ggml_compute_forward_div_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + 
GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9456,14 +9371,8 @@ static void ggml_compute_forward_sum_f32( assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); ggml_float sum = 0; ggml_float row_sum = 0; @@ -9512,29 +9421,13 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne1 == ne01); GGML_ASSERT(ne2 == ne02); GGML_ASSERT(ne3 == ne03); - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { @@ -9578,19 +9471,7 @@ static void ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; assert(ne0 == 1); assert(ne1 == ne01); @@ -9602,10 +9483,6 @@ static void ggml_compute_forward_mean_f32( UNUSED(ne2); UNUSED(ne3); - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { @@ -9635,6 +9512,52 @@ static void ggml_compute_forward_mean( } } +// ggml_compute_forward_argmax + +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } 
break; + } +} + // ggml_compute_forward_repeat static void ggml_compute_forward_repeat_f32( @@ -9648,25 +9571,7 @@ static void ggml_compute_forward_repeat_f32( return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -9727,25 +9632,7 @@ static void ggml_compute_forward_repeat_back_f32( return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -9975,6 +9862,90 @@ static void ggml_compute_forward_step( } } +// ggml_compute_forward_tanh + +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + 
ggml_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_relu static void ggml_compute_forward_relu_f32( @@ -10276,18 +10247,7 @@ static void ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-5f; // TODO: make this a parameter @@ -10353,18 +10313,7 @@ static void ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10429,22 +10378,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10632,7 +10566,7 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -static void ggml_compute_forward_mul_mat_f32( +static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10640,406 +10574,7 @@ static void ggml_compute_forward_mul_mat_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif - const int64_t ne11 = src1->ne[1]; -#ifndef NDEBUG - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; -#endif - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - -#ifndef NDEBUG - const int nb10 = src1->nb[0]; -#endif - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - assert(ne02 == ne12); - assert(ne03 == ne13); - assert(ne2 == ne12); - assert(ne3 == ne13); - - // we don't support permuted src0 or src1 - 
assert(nb00 == sizeof(float)); - assert(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - assert(nb0 == sizeof(float)); - assert(nb0 <= nb1); - assert(nb1 <= nb2); - assert(nb2 <= nb3); - - assert(ne0 == ne01); - assert(ne1 == ne11); - assert(ne2 == ne02); - assert(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using ggml_vec_dot_f32 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - for (int64_t ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; - - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; - - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - - //int64_t t1 = ggml_perf_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - 
- const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - ggml_fp16_t * const wdata = params->wdata; - - size_t id = 0; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } - } - } - } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); - - // parallelize by src0 rows using ggml_vec_dot_f16 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // 
rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - for (int64_t ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } - } - - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -11050,12 +10585,13 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(ne3 == ne13); const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; - enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type; + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -11095,27 +10631,27 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = 
quantize_fns[type].dequantize_row_q; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i03*nb03 + i02*nb02; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); + to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } assert(id*sizeof(float) <= params->wsize); + x = wdata; } - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -11131,14 +10667,16 @@ static void ggml_compute_forward_mul_mat_q_f32( #endif if (params->type == GGML_TASK_INIT) { - char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } } } } @@ -11162,7 +10700,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - void * wdata = params->wdata; + void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; for (int ir = ir0; ir < ir1; ++ir) { @@ -11183,10 +10721,8 @@ static void ggml_compute_forward_mul_mat_q_f32( float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); + vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } } @@ -11203,40 +10739,6 @@ static void ggml_compute_forward_mul_mat_q_f32( //} } -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - { - ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_out_prod @@ -11249,35 +10751,7 @@ static void ggml_compute_forward_out_prod_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - //const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -11512,15 +10986,8 @@ static void ggml_compute_forward_set_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during set const size_t nb0 = ggml_element_size(src0); @@ -11661,7 +11128,7 @@ static void ggml_compute_forward_get_rows_q( const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); @@ -11911,29 +11378,14 @@ static void ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted 
matrices - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - const int ne03 = src0->ne[3]; - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; + GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne1); GGML_ASSERT(ne01 == 1); GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne3); - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float)); @@ -12510,20 +11962,7 @@ static void ggml_compute_forward_rope_f32( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12548,7 +11987,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); + const float theta_scale = powf(10000.0, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12588,7 +12027,6 @@ static void ggml_compute_forward_rope_f32( dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; } } else if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12651,20 +12089,7 @@ static void ggml_compute_forward_rope_f16( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12689,7 +12114,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); + const float theta_scale = powf(10000.0, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12814,25 +12239,10 @@ static void ggml_compute_forward_rope_back_f32( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const 
size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12854,7 +12264,7 @@ static void ggml_compute_forward_rope_back_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); + const float theta_scale = powf(10000.0, -2.0f/n_dims); const bool is_neox = mode & 2; @@ -12928,25 +12338,10 @@ static void ggml_compute_forward_rope_back_f16( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12968,7 +12363,7 @@ static void ggml_compute_forward_rope_back_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); + const float theta_scale = powf(10000.0, -2.0f/n_dims); const bool is_neox = mode & 2; @@ -13044,7 +12439,7 @@ static void ggml_compute_forward_rope_back( } } -// ggml_compute_forward_conv_1d_s1_ph +// ggml_compute_forward_conv_1d static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, @@ -13058,36 +12453,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13178,36 +12544,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const 
int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13307,8 +12644,6 @@ static void ggml_compute_forward_conv_1d_s1_ph( } } -// ggml_compute_forward_conv_1d_s2_ph - static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -13321,36 +12656,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13441,36 +12747,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13570,6 +12847,28 @@ static void ggml_compute_forward_conv_1d_s2_ph( } } +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t p0 = ((const int32_t*)(opt0->data))[1]; + const int32_t d0 = ((const int32_t*)(opt0->data))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == 
src0->ne[0]/2); // only half padding supported + if (s0 == 1) { + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); + } else if (s0 == 2) { + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + } else { + GGML_ASSERT(false); // only stride 1 and 2 supported + }; +} + // ggml_compute_forward_conv_2d_sk_p0 static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( @@ -13584,36 +12883,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - //const int ne03 = src0->ne[3]; - - const int ne10 = src1->ne[0]; - //const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - //const int ne13 = src1->ne[3]; - - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - //const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - //const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - //const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - //const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13706,6 +12976,34 @@ static void ggml_compute_forward_conv_2d_sk_p0( } } +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* src1, + const struct ggml_tensor* opt0, + struct ggml_tensor* dst) { + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t s1 = ((const int32_t*)(opt0->data))[1]; + const int32_t p0 = ((const int32_t*)(opt0->data))[2]; + const int32_t p1 = ((const int32_t*)(opt0->data))[3]; + const int32_t d0 = ((const int32_t*)(opt0->data))[4]; + const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(d1 == 1); + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(p1 == 0); + + if (s0 == src0->ne[0] && s1 == src0->ne[1]) { + ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); // only stride equal to kernel size is supported + }; +} + + // ggml_compute_forward_flash_attn static void ggml_compute_forward_flash_attn_f32( @@ -13718,45 +13016,14 @@ static void ggml_compute_forward_flash_attn_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; 
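[Note: the GGML_TENSOR_LOCALS / GGML_TENSOR_UNARY_OP_LOCALS / GGML_TENSOR_BINARY_OP_LOCALS macros that replace these hand-written declaration blocks are defined in the ggml.h half of this patch, which is not shown here. As a sketch, they should expand to roughly the following, with (void) casts standing in for whatever unused-variable suppression the header actually uses:

    // Declares prefix0..prefix3 from a tensor's ne[] or nb[] array, e.g.
    // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) yields ne10, ne11, ne12, ne13.
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array)          \
        const type prefix##0 = (pointer)->array[0]; (void) prefix##0; \
        const type prefix##1 = (pointer)->array[1]; (void) prefix##1; \
        const type prefix##2 = (pointer)->array[2]; (void) prefix##2; \
        const type prefix##3 = (pointer)->array[3]; (void) prefix##3;

    // Unary ops need sizes/strides for src0 and dst; binary ops add src1.
    #define GGML_TENSOR_UNARY_OP_LOCALS             \
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)  \
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)  \
        GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)  \
        GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

    #define GGML_TENSOR_BINARY_OP_LOCALS            \
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)  \
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)  \
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)  \
        GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)  \
        GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)  \
        GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

This is why the flash_attn hunks below can pull locals for q, k, v, and dst with one GGML_TENSOR_LOCALS line per tensor/array pair instead of dozens of declarations.]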
- - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13927,45 +13194,14 @@ static void ggml_compute_forward_flash_attn_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14199,65 +13435,18 @@ static void ggml_compute_forward_flash_ff_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t nea0 = a->ne[0]; - const int64_t nea1 = a->ne[1]; - const int64_t nea2 = a->ne[2]; - const int64_t nea3 = a->ne[3]; - - const int64_t neb00 = b0->ne[0]; - const int64_t neb01 = b0->ne[1]; - //const int64_t neb02 = b0->ne[2]; - //const int64_t neb03 = b0->ne[3]; - - const int64_t neb10 = b1->ne[0]; - const int64_t neb11 = b1->ne[1]; - //const int64_t neb12 = b1->ne[2]; - //const int64_t neb13 = b1->ne[3]; - - const int64_t nec00 = c0->ne[0]; - const int64_t nec01 = c0->ne[1]; - //const int64_t nec02 = c0->ne[2]; - //const int64_t nec03 = c0->ne[3]; - - const int64_t nec10 = c1->ne[0]; - const int64_t nec11 = c1->ne[1]; - //const int64_t nec12 = c1->ne[2]; - //const int64_t nec13 = c1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nba0 = a->nb[0]; - const int nba1 = a->nb[1]; - const int nba2 = a->nb[2]; - const int nba3 = a->nb[3]; - - const int nbb00 = b0->nb[0]; - const int nbb01 = b0->nb[1]; - const int nbb02 = b0->nb[2]; - const int nbb03 = b0->nb[3]; - - const int nbb10 = b1->nb[0]; - //const int nbb11 = b1->nb[1]; - //const int nbb12 = b1->nb[2]; - //const int nbb13 = b1->nb[3]; - - const int nbc00 = c0->nb[0]; - const int nbc01 = c0->nb[1]; - const int nbc02 = c0->nb[2]; - const int nbc03 = c0->nb[3]; - - 
const int nbc10 = c1->nb[0]; - //const int nbc11 = c1->nb[1]; - //const int nbc12 = c1->nb[2]; - //const int nbc13 = c1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, nea, a, ne); + GGML_TENSOR_LOCALS(size_t, nba, a, nb); + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14405,55 +13594,16 @@ static void ggml_compute_forward_flash_attn_back_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ned0 = d->ne[0]; - const int64_t ned1 = d->ne[1]; - //const int64_t ned2 = d->ne[2]; - //const int64_t ned3 = d->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nbd0 = d->nb[0]; - const int nbd1 = d->nb[1]; - const int nbd2 = d->nb[2]; - const int nbd3 = d->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ned, d, ne); + GGML_TENSOR_LOCALS(size_t, nbd, d, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14811,15 +13961,8 @@ static void ggml_compute_forward_win_part_f32( return; } - const int64_t ne00 = src0->ne[0]; UNUSED(ne00); - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; UNUSED(ne03); - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; UNUSED(ne3); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; @@ -14882,14 +14025,8 @@ static void ggml_compute_forward_win_unpart_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = 
src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t w = ((const int32_t *)(opt0->data))[0]; @@ -15487,6 +14624,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_mean(params, tensor->src0, tensor); } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor->src0, tensor); + } break; case GGML_OP_REPEAT: { ggml_compute_forward_repeat(params, tensor->src0, tensor); @@ -15511,6 +14652,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_step(params, tensor->src0, tensor); } break; + case GGML_OP_TANH: + { + ggml_compute_forward_tanh(params, tensor->src0, tensor); + } break; + case GGML_OP_ELU: + { + ggml_compute_forward_elu(params, tensor->src0, tensor); + } break; case GGML_OP_RELU: { ggml_compute_forward_relu(params, tensor->src0, tensor); @@ -15627,17 +14776,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_CONV_2D_SK_P0: - { - ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); } break; case GGML_OP_FLASH_ATTN: { @@ -15886,6 +15031,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; case GGML_OP_MEAN: + case GGML_OP_ARGMAX: { GGML_ASSERT(false); // TODO: implement } break; @@ -15939,6 +15085,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; + case GGML_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RELU: { if (src0->grad) { @@ -15958,14 +15112,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_ALIBI: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CLAMP: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_SILU: { // necessary for llama @@ -16282,7 +15428,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(ggml_nelements(src1) == 4); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16322,15 +15468,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_ALIBI: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_CLAMP: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_2D_SK_P0: 
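[Note: the hunks that follow move per-node task counts and the work buffer out of ggml_cgraph into a new ggml_cplan, splitting planning from execution: ggml_graph_plan() sizes the work buffer and fills n_tasks[], the caller supplies work_data, and ggml_graph_compute() runs the graph. ggml_graph_compute_with_ctx() further down is the context-backed convenience wrapper; an equivalent caller-side sketch using plain malloc, with a hypothetical helper name:

    #include <stdlib.h>

    // Plan, allocate the scratch buffer the plan asks for, then compute.
    // ggml_graph_compute() asserts that work_data is set whenever
    // work_size > 0, so the allocation is not optional in that case.
    static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        void * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);
            cplan.work_data = work;
        }

        ggml_graph_compute(gf, &cplan);

        free(work);
    }

Compared to the removed in-place scheduling, this lets callers reuse or pool the work buffer across graph evaluations instead of growing a tensor inside the graph.]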
+ case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -16623,9 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -16796,12 +15943,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -16825,9 +15973,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -16840,15 +15992,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16859,11 +16011,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16871,7 +16024,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; @@ -16893,7 +16046,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } @@ -16903,16 +16056,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? 
ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16920,11 +16074,364 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; + + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != 
GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else +#endif + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src1->ne[3] == 1); + + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N + + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C + + const int64_t nk = ne00*ne01; + + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); + + size_t cur = 0; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = 
sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } + } + } + + const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, @@ -16933,352 +16440,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // initialize tasks + work buffer - { - size_t work_size = 0; - - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = 
GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else -#endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - 
node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D_S1_PH: - case GGML_OP_CONV_1D_S2_PH: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D_SK_P0: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src1->ne[3] == 1); - - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N - - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C - - const int64_t nk = ne00*ne01; - - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); - - size_t cur = 0; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - 
if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } - - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } - - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); - - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); - } - } - // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -17340,6 +16501,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -17378,14 +16550,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17498,13 +16669,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // dump the data @@ -17538,13 +16702,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // output the op arguments @@ -17729,8 +16886,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context 
** tensor->op = (enum ggml_op) op; - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; tensor->data = (void *) ptr; @@ -17776,8 +16931,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** nb[j] = nb_cur; } - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used - const char * ptr_name = ptr; ptr += GGML_MAX_NAME; const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); @@ -18139,9 +17292,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -18188,7 +17338,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -18268,7 +17419,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -18390,7 +17542,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -18458,9 +17611,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -18512,7 +17662,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); @@ -19240,4 +18391,4 @@ int ggml_cpu_has_vsx(void) { #endif } -//////////////////////////////////////////////////////////////////////////////// \ No newline at end of file +//////////////////////////////////////////////////////////////////////////////// diff --git a/native/jni/src/ggml/ggml.h b/native/jni/src/ggml/ggml.h index d4d0330d1..ab84bef68 100644 --- a/native/jni/src/ggml/ggml.h +++ b/native/jni/src/ggml/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -201,6 +201,8 @@ #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_UNUSED(x) (void)(x) + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ @@ -209,6 +211,30 @@ } \ } while (0) +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. 
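+// (each local is declared const and passed through GGML_UNUSED so dimensions a kernel leaves unreferenced do not trigger unused-variable warnings)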
+// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + #ifdef __cplusplus extern "C" { #endif @@ -224,8 +250,8 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); - GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); - GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); struct ggml_object; struct ggml_context; @@ -295,12 +321,15 @@ extern "C" { GGML_OP_SUM, GGML_OP_SUM_ROWS, GGML_OP_MEAN, + GGML_OP_ARGMAX, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, GGML_OP_STEP, + GGML_OP_TANH, + GGML_OP_ELU, GGML_OP_RELU, GGML_OP_GELU, GGML_OP_GELU_QUICK, @@ -332,9 +361,8 @@ extern "C" { GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CLAMP, - GGML_OP_CONV_1D_S1_PH, - GGML_OP_CONV_1D_S2_PH, - GGML_OP_CONV_2D_SK_P0, + GGML_OP_CONV_1D, + GGML_OP_CONV_2D, GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, @@ -390,9 +418,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -404,19 +429,27 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by the caller before calling `ggml_graph_compute()` + + int n_threads; + + // per-node `n_tasks` values, mapped 1:1 to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -504,8 +537,6 @@ extern "C" { // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); - GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx); - // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); @@ -692,6 +723,11 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b GGML_API struct ggml_tensor * ggml_repeat( @@ -736,6 +772,22 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_relu( struct ggml_context * ctx, struct ggml_tensor * a); @@ -1086,58 +1138,33 @@ extern "C" { float min, float max); - // TODO: implement general-purpose convolutions - // GGML_API struct ggml_tensor * ggml_conv_1d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0, - // int p0, - // int d0); - // - // GGML_API struct ggml_tensor * ggml_conv_2d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0, - // int s1, - // int p0, - // int p1, - // int d0, - // int d1); - - // padding = half - // TODO: we don't support extra parameters for now - // that's why we are hard-coding the stride, padding, and dilation - // not great ..
- // example: - // a: 3 80 768 1 - // b: 3000 80 1 1 - // res: 3000 768 1 1 - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph( + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph( + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); - // kernel size is a->ne[0] x a->ne[1] - // stride is equal to kernel size - // padding is zero - // example: - // a: 16 16 3 768 - // b: 1024 1024 3 1 - // res: 64 64 768 1 - // used in sam - GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s, + int d); GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, @@ -1268,15 +1295,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); @@ -1493,25 +1527,24 @@ extern "C" { // #ifdef __cplusplus - // restrict not standard in C++ +// restrict not standard in C++ #define GGML_RESTRICT #else #define GGML_RESTRICT restrict #endif - typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); - typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { - 
dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; - quantize_row_q_t quantize_row_q_dot; - vec_dot_q_t vec_dot_q; - enum ggml_type vec_dot_type; - } quantize_fns_t; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; - quantize_fns_t ggml_internal_get_quantize_fn(size_t i); + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); #ifdef __cplusplus } diff --git a/native/jni/src/ggml/neox_v3.cpp b/native/jni/src/ggml/gpt_neox.cpp similarity index 75% rename from native/jni/src/ggml/neox_v3.cpp rename to native/jni/src/ggml/gpt_neox.cpp index d944a4dc5..50ef24abb 100644 --- a/native/jni/src/ggml/neox_v3.cpp +++ b/native/jni/src/ggml/gpt_neox.cpp @@ -1,49 +1,42 @@ -#include "ggml.h" -#include "otherarch.h" - -#include "utils.h" -#include "defines.h" +#include "ggml/ggml.h" +#include "gpt_neox.h" +#include "common.h" #include #include #include #include +#include #include #include #include #include -#include -#include -#ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -#endif -#if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data #endif + // load the model's weights from a file -ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) { - AKLOGI("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); +bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - AKLOGE("%s: failed to open '%s'\n", __func__, fname.c_str()); - return ModelLoadResult::FAIL; + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; } // verify magic { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 0x67676d6c) { - AKLOGE("%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return ModelLoadResult::FAIL; + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; } } - int32_t origmaxctx = model.hparams.n_ctx; - // load hparams { auto & hparams = model.hparams; @@ -59,17 +52,15 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - AKLOGI("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - AKLOGI("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); - AKLOGI("%s: n_embd = %d\n", __func__, hparams.n_embd); - AKLOGI("%s: n_head = %d\n", __func__, hparams.n_head); - AKLOGI("%s: n_layer = %d\n", __func__, hparams.n_layer); - AKLOGI("%s: n_rot = %d\n", __func__, hparams.n_rot); - AKLOGI("%s: par_res = %d\n", __func__, hparams.par_res); - AKLOGI("%s: ftype = %d\n", __func__, hparams.ftype); - AKLOGI("%s: qntvr = %d\n", __func__, qntvr); - - hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_rot 
= %d\n", __func__, hparams.n_rot); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); hparams.ftype %= GGML_QNT_VERSION_FACTOR; } @@ -94,14 +85,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } } - // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); if (wtype == GGML_TYPE_COUNT) { - AKLOGE("%s: invalid model file '%s' (bad ftype value %d)\n", + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); - return ModelLoadResult::FAIL; + return false; } auto & ctx = model.ctx; @@ -142,25 +132,26 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v ctx_size += (6 + 16*n_layer)*1024; // object overhead - AKLOGI("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { - struct ggml_init_params params; - params.mem_size = ctx_size; - params.mem_buffer = NULL; - params.no_alloc = false; + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; model.ctx = ggml_init(params); if (!model.ctx) { - AKLOGE("%s: ggml_init() failed\n", __func__); - return ModelLoadResult::FAIL; + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; } } @@ -241,7 +232,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx); + const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); @@ -249,7 +240,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - AKLOGI("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); } // load weights @@ -257,7 +248,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & int n_tensors = 0; size_t total_size = 0; - AKLOGI("%s: ", __func__); + printf("%s: ", __func__); while (true) { int32_t n_dims; @@ -283,83 +274,52 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & fin.read(&name[0], length); if (model.tensors.find(name.data()) == model.tensors.end()) { - AKLOGE("%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return ModelLoadResult::FAIL; + fprintf(stderr, "%s: unknown tensor 
'%s' in model file\n", __func__, name.data()); + return false; } auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { - AKLOGE("%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return ModelLoadResult::FAIL; + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - AKLOGE("%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return ModelLoadResult::FAIL; + return false; } // for debugging if (0) { - AKLOGI("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } const size_t bpe = ggml_type_size(ggml_type(ttype)); if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - AKLOGE("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - ggml_free(ctx); - return ModelLoadResult::RETRY_LOAD; + return false; } fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); total_size += ggml_nbytes(tensor); if (++n_tensors % 8 == 0) { - AKLOGI("."); + printf("."); fflush(stdout); } } - AKLOGI(" done\n"); + printf(" done\n"); - AKLOGI("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } fin.close(); - //gpu offload - #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS) - if(gpulayers>0) - { - const auto & hparams = model.hparams; - size_t vram_total = 0; - const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - AKLOGE("%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); - for (int i = 0; i < n_gpu; ++i) { - const auto & layer = model.layers[i]; - layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; - layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; - layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; - #if defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); - #else - ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); - ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); - ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); - 
ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); - #endif - } - AKLOGE("%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); - } - #endif - - return ModelLoadResult::SUCCESS; + return true; } @@ -408,13 +368,12 @@ ggml_tensor * gpt_neox_ff( // - embd_w: the predicted logits for the next token // bool gpt_neox_eval( - const gpt_neox_model & model, + gpt_neox_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token, - bool use_scratch) { + const token_sequence & embd_inp, + std::vector<float> & embd_w, + size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -426,43 +385,40 @@ bool gpt_neox_eval( const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; + // TODO: All of this allocates over 800 megabytes of memory, way more than the size of the model! + static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; - static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; - + static size_t scr0_size = 256u*1024*1024; static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { - const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead - //AKLOGI("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate - if (buf_size_new > buf_size) - { - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) - { - AKLOGE("%s: failed to allocate %zu bytes.
Try reducing batch size.\n", __func__, buf_size); - return false; - } + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; } } - struct ggml_init_params params; - params.mem_size = buf_size; - params.mem_buffer = buf; - params.no_alloc = false; - + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -473,9 +429,7 @@ bool gpt_neox_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; - if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - } // self-attention { @@ -580,9 +534,7 @@ } } - if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - } if (hparams.par_res == 0) { struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); @@ -606,9 +558,7 @@ } } - if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - } // norm { @@ -622,9 +572,7 @@ ggml_repeat(ctx0, model.ln_f_b, inpL)); } - if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - } // lm_head { @@ -640,7 +588,18 @@ // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + + struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads); + + if (plan.work_size > 0) { + if(model.work_buf.size() < plan.work_size) { + model.work_buf.resize(plan.work_size); + } + + plan.work_data = model.work_buf.data(); + } + + ggml_graph_compute(&gf, &plan); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -657,7 +616,7 @@ if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } - //AKLOGI("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); diff --git a/native/jni/src/ggml/gpt_neox.h b/native/jni/src/ggml/gpt_neox.h new file mode 100644 index 000000000..fe0f03bca --- /dev/null +++ b/native/jni/src/ggml/gpt_neox.h @@ -0,0 +1,86 @@ +#pragma once + +#include "ggml/ggml.h" +#include "common.h" + +// default hparams (StableLM 3B) +struct gpt_neox_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 4096; + int32_t n_embd = 4096; + int32_t n_head = 32; + int32_t n_layer = 16; + int32_t n_rot = 32; // rotary_pct * (n_embd / n_head) + int32_t par_res = 1; // 1 = true, 0 = false + int32_t ftype = 1; +}; + +struct gpt_neox_layer { + // pre normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // post normalization + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // ff + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt_neox_model { + gpt_neox_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // token embedding + + struct ggml_tensor * lmh_g; // language model head + //struct ggml_tensor * lmh_b; // language model bias + + std::vector<gpt_neox_layer> layers; + + // key + value memory
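+    // (allocated in gpt_neox_model_load() as flat GGML_TYPE_F16 tensors of n_embd*n_layer*n_ctx elements each)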
+ struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map<std::string, struct ggml_tensor *> tensors; + + std::vector<uint8_t> work_buf; +}; + + +bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab); + + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the token ids of the context +// - logits: the predicted logits for the next token +// +bool gpt_neox_eval( + gpt_neox_model & model, + const int n_threads, + const int n_past, + const token_sequence & embd_inp, + std::vector<float> & logits, + size_t & mem_per_token +); \ No newline at end of file diff --git a/native/jni/src/ggml/model_adapter.cpp b/native/jni/src/ggml/model_adapter.cpp deleted file mode 100644 index efe99cd38..000000000 --- a/native/jni/src/ggml/model_adapter.cpp +++ /dev/null @@ -1,466 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "model_adapter.h" - -#include - -static auto bench_timer = std::chrono::high_resolution_clock().now(); - -void timer_start() -{ - bench_timer = std::chrono::high_resolution_clock().now(); -} -double timer_check() -{ - auto endtime = std::chrono::high_resolution_clock().now(); - auto duration = std::chrono::duration_cast(endtime - bench_timer); - double time_taken = duration.count()/1000.0; - return time_taken; -} - -void print_vec(std::vector &embd) -{ - std::cout << "["; - bool first = true; - for (auto i : embd) - { - if (!first) - { - std::cout << ','; - } - first = false; - std::cout << i; - } - std::cout << "]\n"; -} -void print_tok_vec(std::vector &embd) -{ - std::cout << "["; - bool first = true; - for (auto i : embd) - { - if (!first) - { - std::cout << ','; - } - first = false; - std::cout << i; - } - std::cout << "]\n"; -} -void print_tok_vec(std::vector &embd) -{ - std::cout << "["; - bool first = true; - int n = 0; - for (auto i : embd) - { - if (!first) - { - std::cout << ','; - } - first = false; - std::cout << i; - if(++n>20) - { - break; - } - } - std::cout << "]\n"; -} - -//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) - FileFormat check_file_format(const std::string & fname) - { - std::vector f_buf(1024*1024); - - auto fin = std::ifstream(fname, std::ios::binary); - fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return FileFormat::BADFORMAT; - } - - FileFormat fileformat = FileFormat::BADFORMAT; - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic == 0x67676d6c) { //v1 format ggml, alpaca, old gptj and gpt2 models - fileformat = FileFormat::GGML; - //we need to read more to determine - int32_t vocabsiz = 0; - fin.read((char *) &vocabsiz, sizeof(int32_t)); - if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt - { - fileformat = FileFormat::MPT_1; - } - else if(vocabsiz==50400) //know GPT-J vocab size - { - fileformat = FileFormat::GPTJ_1; - uint32_t temp; - fin.read((char *)&temp, sizeof(temp)); //ctx - fin.read((char *)&temp, sizeof(temp)); //n_embd - fin.read((char *)&temp, sizeof(temp)); //n_head - fin.read((char *)&temp, sizeof(temp)); //n_layer - fin.read((char *)&temp, sizeof(temp)); //n_rot - fin.read((char *)&temp, sizeof(temp)); //f16 - const int32_t qntvr = temp / 1000; - temp %= 1000; - if (qntvr != 0) - { - if (qntvr == 1) - { - fileformat = FileFormat::GPTJ_4; - } - else - { -
fileformat = FileFormat::GPTJ_5; - } - } - else if (temp != 0 && temp != 1) - { - fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type - } - } - else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder - { - fileformat = FileFormat::GPT2_1; - uint32_t temp; - fin.read((char *)&temp, sizeof(temp)); //ctx - fin.read((char *)&temp, sizeof(temp)); //n_embd - fin.read((char *)&temp, sizeof(temp)); //n_head - fin.read((char *)&temp, sizeof(temp)); //n_layer - fin.read((char *)&temp, sizeof(temp)); //f16 - const int32_t qntvr = temp / 1000; - temp %= 1000; - if (qntvr != 0) - { - if (qntvr == 1) - { - fileformat = FileFormat::GPT2_3; - } - else - { - fileformat = FileFormat::GPT2_4; - } - } - else if (temp != 0 && temp != 1) - { - fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type - } - } - else if(vocabsiz < 31998 || vocabsiz > 33000) - { - //anything outside the llama v1 range is assumed to be NeoX - fileformat = FileFormat::NEOX_6; - uint32_t temp,temp2; - fin.read((char *)&temp, sizeof(temp)); //ctx - fin.read((char *)&temp, sizeof(temp)); //n_embd - fin.read((char *)&temp, sizeof(temp)); //n_head - fin.read((char *)&temp, sizeof(temp)); //n_layer - fin.read((char *)&temp, sizeof(temp)); //n_rot - fin.read((char *)&temp, sizeof(temp)); //either par_res or ftype (for older ver) - - if(temp!=0 && temp!=1){ - //must be ftype, means its an older model. par_res will be undefined - fileformat = FileFormat::NEOX_2; - } - else - { - //it could be a newer model, or an old f16/f32 model - fin.read((char *)&temp2, sizeof(temp2)); //if previous was par_res, this is ftype. else unknown - - //if it is new ftype, then it must have these properties: > 1000, low multiple of 1k and small remaineder - bool isNewFtype = (temp2>=1000 && temp2<=9000 && temp2%1000<20); - - if(!isNewFtype) - { - fileformat = FileFormat::NEOX_2; - if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0 - { - //its a f16/f32 model in the new format - fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6; - } - } - else - { - const int32_t qntvr = temp2 / 1000; //for future use - //then temp was par_res, use_parallel_residual is false in RedPajama - if(qntvr==1) - { - fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4); - } - else - { - fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6); - } - } - } - - } - } - else if(magic == 0x67676d66) //v2 format ggmf - { - fileformat = FileFormat::GGHF; - uint32_t temp; - fin.read((char *)&temp, sizeof(temp)); //file version - if(temp==100) - { - fileformat = FileFormat::RWKV_1; - } - else if(temp==101) - { - fileformat = FileFormat::RWKV_2; - } - } - else if(magic == 0x67676a74) //v3 format ggjt - { - fileformat = FileFormat::GGJT_3; //ggjt by default - uint32_t ver, temp, ftype; - fin.read((char *)&ver, sizeof(ver)); //file version - fin.read((char *)&temp, sizeof(temp));//vocab - fin.read((char *)&temp, sizeof(temp)); //embd - fin.read((char *)&temp, sizeof(temp)); //mult - fin.read((char *)&temp, sizeof(temp));//head - fin.read((char *)&temp, sizeof(temp));//layer - fin.read((char *)&temp, sizeof(temp));//rot - fin.read((char *)&ftype, sizeof(ftype));//filetype - - if(ver==1) - { - fileformat = FileFormat::GGJT; - } - else if(ver==2) - { - fileformat = FileFormat::GGJT_2; - } - } - fin.close(); - - return fileformat; - } - - bool ArrStartWith(const std::vector targetArray, const std::vector searchSeq) - { - int ss = searchSeq.size(); - if(targetArray.size() 
targetArray, const std::vector searchSeq) - { - int ss = searchSeq.size(); - int tas = targetArray.size(); - if(tas= tas || targetArray[i + srch] != searchSeq[srch]) - { - fail = true; - break; - } - } - if(!fail) - { - return i; - } - } - return -1; - } - - std::vector LongestCommonSubseq(const std::vector x, const std::vector y) - { - int m = x.size(), n = y.size(); - - //int LCSuff[m+1][n+1]; - std::vector> LCSuff(m+1, std::vector(n+1)); - - for (int j = 0; j <= n; j++) - LCSuff[0][j] = 0; - for (int i = 0; i <= m; i++) - LCSuff[i][0] = 0; - - for (int i = 1; i <= m; i++) - { - for (int j = 1; j <= n; j++) - { - if (x[i - 1] == y[j - 1]) - LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1; - else - LCSuff[i][j] = 0; - } - } - - std::vector longest; - for (int i = 1; i <= m; i++) - { - for (int j = 1; j <= n; j++) - { - if (LCSuff[i][j] > longest.size()) - { - auto off1 = ((i - LCSuff[i][j] + 1) - 1); - auto off2 = off1 + LCSuff[i][j]; - longest.clear(); - // std::vector().swap(longest); - longest = std::vector(x.begin() + off1, x.begin() + off2); - // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]); - } - } - } - return longest; - } - - void ContextFastForward(std::vector ¤t_context_tokens, std::vector &embd_inp, - int &n_past, const int nctx, std::vector &smartcontext, - bool useSmartContext, const bool requireFullSubset) - { - const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext - const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext - const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context - const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward - const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext - - - //fast forward the past based on identical tokens, stop once a divergence is noted - int embd_inp_len = embd_inp.size(); - bool fastforwardok = true; - - for (int i = 0; i < current_context_tokens.size(); ++i) - { - if (current_context_tokens[i] == embd_inp[i]) - { - n_past += 1; - } - else - { - if(requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context - { - n_past = 0; - fastforwardok = false; - } - break; - } - - if (requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context - { - if (i >= embd_inp_len) - { - n_past = 0; - fastforwardok = false; - break; - } - } - else - { - if ((i + 2) >= embd_inp_len) - { - break; - } - } - } - - if(fastforwardok) - { - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past); - embd_inp_len = embd_inp.size(); - } - - //smart context mode, detect if we have a shifted context at max length - //requirement: previous context was at least nctx/2 longer than current, - //mode is on, and current context already maxed. 
- - if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold) - { - //see if smartcontext is still usable - auto shared = LongestCommonSubseq(smartcontext, embd_inp); - if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least 32 tokens in common - { - int found = ArrFindIndexOf(embd_inp,shared); - if(found>=0) - { - auto trimmed = std::vector(embd_inp.begin() + found, embd_inp.end()); - embd_inp = trimmed; - embd_inp_len = embd_inp.size(); - printf("\n[Reusing Smart Context: %d allowance remaining]", found); - - int old_n_past = n_past; - int offset_fix = old_n_past; - if (current_context_tokens[n_past] != embd_inp[0]) - { - offset_fix = 0; - } - - for (int i = n_past; i < current_context_tokens.size(); ++i) - { - if (current_context_tokens[i] == embd_inp[i-offset_fix]) - { - n_past += 1; - } - else - { - break; - } - if ((i + 2 - offset_fix) >= embd_inp_len) - { - break; - } - } - - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past)); - - }else{ - smartcontext.clear(); - } - } - else - { - smartcontext.clear(); - } - } - else - { - smartcontext.clear(); - } - - if(fastforwardok && useSmartContext - && smartcontext.size()==0 && current_context_tokens.size() >= SCCtxLenThreshold - && embd_inp_len >= SCInpLenThreshold - && current_context_tokens.size() - n_past > SCPastLenThreshold) - { - //determine longest common substring after removing start part - int shiftamt = embd_inp.size() * SCTruncationRatio; - smartcontext = std::vector(embd_inp.begin() + shiftamt, embd_inp.end()); - printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt); - - embd_inp = smartcontext; - //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes LCS marker. - //when a future prompt comes in, find the LCS again. If LCS > a length and LCS starts with memorized LCS - //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift - //if LCS not found or mismatched, regenerate. 
chop new prompt and repeat from step B - } - } \ No newline at end of file diff --git a/native/jni/src/ggml/model_adapter.h b/native/jni/src/ggml/model_adapter.h deleted file mode 100644 index 98f8ed2cb..000000000 --- a/native/jni/src/ggml/model_adapter.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -enum FileFormat -{ - BADFORMAT=0, //unknown, uninit, or failed to load - GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header) - GGHF=2, // 2=(llama ggmf) - GGJT=3, // 3=(llama ggjt) - GGJT_2=4, //newer llama format unshuffled - GGJT_3=5, //using 16bit scalar - - GPTJ_1=100, //the very first super old GPTJ format - GPTJ_2=101, //pygmalion, uses old ggml lib - GPTJ_3=102, //uses new ggml lib - GPTJ_4=103, //unshuffled - GPTJ_5=104, //using 16bit scalar - - GPT2_1=200, - GPT2_2=201, - GPT2_3=202, //unshuffled - GPT2_4=203, //using 16bit scalar - - RWKV_1=300, - RWKV_2=301, - - NEOX_1=400, - NEOX_2=401, - NEOX_3=402, //redpajama - NEOX_4=403, //unshuffled - NEOX_5=404, //unshuffled redpajama - NEOX_6=405, //using 16bit scalar - NEOX_7=406, //using 16bit scalar redpajama - - MPT_1=500, //first supported mpt version -}; - -enum ModelLoadResult -{ - FAIL = 0, - SUCCESS = 1, - RETRY_LOAD = 2, //used if it's suspected that the model is an older format -}; - -void timer_start(); -double timer_check(); -void print_tok_vec(std::vector &embd); -void print_tok_vec(std::vector &embd); -void print_vec(std::vector &embd); -std::vector LongestCommonSubseq(const std::vector x, const std::vector y); -bool ArrStartWith(const std::vector targetArray, const std::vector searchSeq); -int ArrFindIndexOf(const std::vector targetArray, const std::vector searchSeq); - -FileFormat check_file_format(const std::string & fname); -void ContextFastForward(std::vector ¤t_context_tokens, std::vector &embd_inp, - int &n_past, const int nctx, std::vector &smartcontext, - const bool useSmartContext, const bool requireFullSubset); diff --git a/native/jni/src/ggml/otherarch.h b/native/jni/src/ggml/otherarch.h deleted file mode 100644 index e8bd8dc86..000000000 --- a/native/jni/src/ggml/otherarch.h +++ /dev/null @@ -1,464 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "utils.h" -#include "model_adapter.h" - - -// default hparams (GPT-J 6B) -struct gptj_hparams { - int32_t n_vocab = 50400; - int32_t n_ctx = 2048; - int32_t n_embd = 4096; - int32_t n_head = 16; - int32_t n_layer = 28; - int32_t n_rot = 64; - int32_t ftype = 1; -}; - -struct gptj_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_q_proj_w; - struct ggml_tensor * c_attn_k_proj_w; - struct ggml_tensor * c_attn_v_proj_w; - - struct ggml_tensor * c_attn_proj_w; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; -struct gptj_layer_v2 { - // normalization - struct ggml_v2_tensor * ln_1_g; - struct ggml_v2_tensor * ln_1_b; - - // attention - struct ggml_v2_tensor * c_attn_q_proj_w; - struct ggml_v2_tensor * c_attn_k_proj_w; - struct ggml_v2_tensor * c_attn_v_proj_w; - - struct ggml_v2_tensor * c_attn_proj_w; - - // ff - struct ggml_v2_tensor * c_mlp_fc_w; - struct ggml_v2_tensor * c_mlp_fc_b; - - struct ggml_v2_tensor * c_mlp_proj_w; - struct ggml_v2_tensor * c_mlp_proj_w_trans; //for 
backwards compatibility - struct ggml_v2_tensor * c_mlp_proj_b; -}; -struct gptj_layer_v1 { - // normalization - struct ggml_v1_tensor * ln_1_g; - struct ggml_v1_tensor * ln_1_b; - - // attention - struct ggml_v1_tensor * c_attn_q_proj_w; - struct ggml_v1_tensor * c_attn_k_proj_w; - struct ggml_v1_tensor * c_attn_v_proj_w; - - struct ggml_v1_tensor * c_attn_proj_w; - - // ff - struct ggml_v1_tensor * c_mlp_fc_w; - struct ggml_v1_tensor * c_mlp_fc_b; - - struct ggml_v1_tensor * c_mlp_proj_w; - struct ggml_v1_tensor * c_mlp_proj_w_trans; //for backwards compatibility - struct ggml_v1_tensor * c_mlp_proj_b; -}; - -struct gptj_v1_model { - gptj_hparams hparams; - - // normalization - struct ggml_v1_tensor * ln_f_g; - struct ggml_v1_tensor * ln_f_b; - - struct ggml_v1_tensor * wte; // position embedding - - struct ggml_v1_tensor * lmh_g; // language model head - struct ggml_v1_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_v1_tensor * memory_k; - struct ggml_v1_tensor * memory_v; - - // - struct ggml_v1_context * ctx; - std::map tensors; -}; - -struct gptj_v2_model { - gptj_hparams hparams; - - // normalization - struct ggml_v2_tensor * ln_f_g; - struct ggml_v2_tensor * ln_f_b; - - struct ggml_v2_tensor * wte; // position embedding - - struct ggml_v2_tensor * lmh_g; // language model head - struct ggml_v2_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_v2_tensor * memory_k; - struct ggml_v2_tensor * memory_v; - - // - struct ggml_v2_context * ctx; - std::map tensors; -}; - -struct gptj_model { - gptj_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - struct ggml_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; -}; - -struct gpt2_v1_layer { - // normalization - struct ggml_v1_tensor * ln_1_g; - struct ggml_v1_tensor * ln_1_b; - - struct ggml_v1_tensor * ln_2_g; - struct ggml_v1_tensor * ln_2_b; - - // attention - struct ggml_v1_tensor * c_attn_attn_w; - struct ggml_v1_tensor * c_attn_attn_b; - - struct ggml_v1_tensor * c_attn_proj_w; - struct ggml_v1_tensor * c_attn_proj_b; - - // mlp - struct ggml_v1_tensor * c_mlp_fc_w; - struct ggml_v1_tensor * c_mlp_fc_b; - - struct ggml_v1_tensor * c_mlp_proj_w_trans; // transposed for efficiency - struct ggml_v1_tensor * c_mlp_proj_b; -}; - -struct gpt2_v1_model { - gpt2_hparams hparams; - - // normalization - struct ggml_v1_tensor * ln_f_g; - struct ggml_v1_tensor * ln_f_b; - - struct ggml_v1_tensor * wte; // position embedding - struct ggml_v1_tensor * wpe; // token embedding - - std::vector layers; - - // key + value memory - struct ggml_v1_tensor * memory_k; - struct ggml_v1_tensor * memory_v; - - // - struct ggml_v1_context * ctx; - std::map tensors; -}; - -struct gpt2_layer_v2 { - // normalization - struct ggml_v2_tensor * ln_1_g; - struct ggml_v2_tensor * ln_1_b; - - struct ggml_v2_tensor * ln_2_g; - struct ggml_v2_tensor * ln_2_b; - - // attention - struct ggml_v2_tensor * c_attn_attn_w; - struct ggml_v2_tensor * 
c_attn_attn_b; - - struct ggml_v2_tensor * c_attn_proj_w; - struct ggml_v2_tensor * c_attn_proj_b; - - // mlp - struct ggml_v2_tensor * c_mlp_fc_w; - struct ggml_v2_tensor * c_mlp_fc_b; - - struct ggml_v2_tensor * c_mlp_proj_w; - struct ggml_v2_tensor * c_mlp_proj_b; -}; - -struct gpt2_v2_model { - gpt2_hparams hparams; - - // normalization - struct ggml_v2_tensor * ln_f_g; - struct ggml_v2_tensor * ln_f_b; - - struct ggml_v2_tensor * wte; // position embedding - struct ggml_v2_tensor * wpe; // token embedding - struct ggml_v2_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_v2_tensor * memory_k; - struct ggml_v2_tensor * memory_v; - - // - struct ggml_v2_context * ctx; - std::map tensors; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt2_model { - gpt2_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -// default hparams (StableLM 3B) -struct gpt_neox_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 4096; - int32_t n_embd = 4096; - int32_t n_head = 32; - int32_t n_layer = 16; - int32_t n_rot = 32; // rotary_pct * (n_embd / n_head) - int32_t par_res = 1; // 1 = true, 0 = false - int32_t ftype = 1; -}; - -struct gpt_neox_layer_v2 { - // pre normalization - struct ggml_v2_tensor * ln_1_g; - struct ggml_v2_tensor * ln_1_b; - - // attention - struct ggml_v2_tensor * c_attn_attn_w; - struct ggml_v2_tensor * c_attn_attn_b; - - struct ggml_v2_tensor * c_attn_proj_w; - struct ggml_v2_tensor * c_attn_proj_b; - - // post normalization - struct ggml_v2_tensor * ln_2_g; - struct ggml_v2_tensor * ln_2_b; - - // ff - struct ggml_v2_tensor * c_mlp_fc_w; - struct ggml_v2_tensor * c_mlp_fc_b; - - struct ggml_v2_tensor * c_mlp_proj_w; - struct ggml_v2_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_v2_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_v2_tensor * ln_f_g; - struct ggml_v2_tensor * ln_f_b; - - struct ggml_v2_tensor * wte; // position embedding - - struct ggml_v2_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_v2_tensor * memory_k; - struct ggml_v2_tensor * memory_v; - - // - struct ggml_v2_context * ctx; - std::map tensors; -}; - -struct gpt_neox_layer { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - 
- struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - - -// no defaults for now -struct mpt_hparams { - int32_t d_model = 0; - int32_t max_seq_len = 0; - int32_t n_heads = 0; - int32_t n_layers = 0; - int32_t n_vocab = 0; - float alibi_bias_max = 0; - float clip_qkv = 0; - int32_t ftype = 0; - int32_t n_ctx = 0; - -}; - -struct mpt_layer { - // pre normalization - struct ggml_tensor * norm_1_weight; - - // attention - struct ggml_tensor * c_attn_wqkv_weight; - struct ggml_tensor * c_attn_out_proj_weight; - - // post normalization - struct ggml_tensor * norm_2_weight; - - // ff - struct ggml_tensor * ffn_up_proj; - struct ggml_tensor * ffn_down_proj; -}; - -struct mpt_model { - mpt_hparams hparams; - - struct ggml_tensor * wte_weight; // position embedding - struct ggml_tensor * norm_f_weight; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - struct ggml_context * ctx; - std::map tensors; -}; - - -ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers); -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token, - bool use_scratch); diff --git a/native/jni/src/ggml/utils.cpp b/native/jni/src/ggml/utils.cpp deleted file mode 100644 index 73e27e72c..000000000 --- a/native/jni/src/ggml/utils.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "utils.h" - -#include -#include -#include -#include -#include -#include -#include - - - -void utreplace(std::string & str, const std::string & needle, const std::string & replacement) { - size_t pos = 0; - while ((pos = str.find(needle, pos)) != std::string::npos) { - str.replace(pos, needle.length(), replacement); - pos += replacement.length(); - } -} - -std::map json_parse(const std::string & fname) { - std::map result; - - // read file into string - std::string json; - { - std::ifstream ifs(fname); - if (!ifs) { - fprintf(stderr, "Failed to open %s\n", fname.c_str()); - exit(1); - } - - json = std::string((std::istreambuf_iterator(ifs)), - (std::istreambuf_iterator())); - } - - if (json[0] != '{') { - return result; - } - - // parse json - { - bool has_key = false; - bool in_token = false; - - std::string str_key = ""; - std::string str_val = ""; - - int n = json.size(); - for (int i = 1; i < n; ++i) { - if (!in_token) { - if (json[i] == ' ') continue; - if (json[i] == '"') { - in_token = true; - continue; - } - } else { - if (json[i] == '\\' && i+1 < n) { - if (has_key == false) { - str_key += json[i]; - } else { - str_val += json[i]; - } - ++i; - } else if (json[i] == '"') { - if (has_key == false) { - has_key = true; - ++i; - while (json[i] == ' ') ++i; - ++i; // : - while (json[i] == ' ') ++i; - if (json[i] != '\"') { - while (json[i] != ',' && json[i] != '}') { - str_val += json[i++]; - } - has_key = false; - } else { - in_token = true; - continue; - } - } else { - 
-
-
-void gpt_vocab::add_special_token(const std::string & token) {
-    special_tokens.push_back(token);
-}
-
-
-std::string convert_to_utf8(const std::wstring & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.to_bytes(input);
-}
-
-
-std::wstring convert_to_wstring(const std::string & input) {
-    try {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-        return converter.from_bytes(input);
-    } catch (const std::range_error& e) {
-        return L"";
-    } catch (...) {
-        return L"";
-    }
-}
-
-void gpt_split_words(std::string str, std::vector<std::string>& words) {
-    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-    const std::regex re(pattern);
-    std::smatch m;
-
-    while (std::regex_search(str, m, re)) {
-        for (auto x : m) {
-            words.push_back(x);
-        }
-        str = m.suffix();
-    }
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-
-        // Generate the subpattern from the special_tokens vector if it's not empty
-        if (!vocab.special_tokens.empty()) {
-            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
-            std::string special_tokens_subpattern;
-            for (const auto & token : vocab.special_tokens) {
-                if (!special_tokens_subpattern.empty()) {
-                    special_tokens_subpattern += "|";
-                }
-                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
-            }
-
-            std::regex re(special_tokens_subpattern);
-            std::smatch m;
-            // Split the text by special tokens.
-            while (std::regex_search(str, m, re)) {
-                // Split the substrings in-between special tokens into words.
-                gpt_split_words(m.prefix(), words);
-                // Add matched special tokens as words.
-                for (auto x : m) {
-                    words.push_back(x);
-                }
-                str = m.suffix();
-            }
-            // Remaining text without special tokens will be handled below.
-        }
-
-        gpt_split_words(str, words);
-    }
-
-    // find the longest token that forms each word in words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        for (unsigned long i = 0; i < word.size(); ){
-            for (unsigned long j = word.size() - 1; j >= i; j--){
-                auto cand = word.substr(i, j-i+1);
-                auto it = vocab.token_to_id.find(cand);
-                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
-                    tokens.push_back(it->second);
-                    i = j + 1;
-                    break;
-                }
-                else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
-                    i++;
-                }
-            }
-        }
-    }
-
-
-    return tokens;
-}
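gpt_tokenize above first applies the GPT-2 pre-tokenizer pattern via gpt_split_words (contraction suffixes, then optionally space-prefixed letter, digit, and punctuation runs; note that std::regex's default ECMAScript grammar supports the (?!\S) lookahead), then tokenizes each word greedily: at each position it takes the longest substring present in the vocabulary, and skips a single unmatched character with an "unknown token" message on stderr. A hedged sketch on a toy vocabulary (illustrative ids, not from any real model):

    // Sketch: greedy longest-match behaviour of gpt_tokenize.
    gpt_vocab vocab;
    vocab.token_to_id = { {"un", 0}, {"unbe", 1}, {"liev", 2}, {"able", 3} };

    std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, "unbelievable");
    // At each position the longest match wins, so "unbe" (id 1) beats
    // "un" (id 0); then "liev" and "able" follow: tokens == {1, 2, 3}.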
-
-bool should_transpose_layer(std::string name)
-{
-
-    if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
-       name.find(".attn.out_proj.weight")!=std::string::npos ||
-       name.find(".attn.q_proj.weight")!=std::string::npos ||
-       name.find(".attn.k_proj.weight")!=std::string::npos ||
-       name.find(".attn.v_proj.weight")!=std::string::npos ||
-       name.find("/attn/c_attn/w")!=std::string::npos ||
-       name.find("/attn/c_proj/w")!=std::string::npos ||
-       name.find("/mlp/c_fc/w")!=std::string::npos ||
-       name.find("/mlp/c_proj/w")!=std::string::npos)
-    {
-        return true;
-    }
-    return false;
-}
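should_transpose_layer flags the 2-D weight matrices that GPT-J-style (dot-separated) and GPT-2-style (slash-separated) checkpoint names store in the transposed orientation. A hedged usage sketch; the tensor name is illustrative and this is not the removed loader:

    // Sketch: illustrative use at model-load time.
    std::string tensor_name = "transformer.h.0.attn.q_proj.weight"; // GPT-J style
    if (should_transpose_layer(tensor_name)) {
        // a loader would transpose this weight before storing it
    }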