Mirror of https://gitlab.futo.org/keyboard/latinime.git (synced 2024-09-28 14:54:30 +01:00)

Commit: 43e55bebfe ("Remove AGPLv3 code")
Parent: 85ed8afec9
@@ -22,9 +22,9 @@ LATIN_IME_JNI_SRC_FILES := \

LATIN_IME_CORE_SRC_FILES := \
    ggml/ggml.c \
    ggml/utils.cpp \
    ggml/model_adapter.cpp \
    ggml/neox_v3.cpp \
    ggml/common.cpp \
    ggml/context.cpp \
    ggml/gpt_neox.cpp \
    $(addprefix dictionary/header/, \
        header_policy.cpp \
        header_read_write_utils.cpp) \
@ -38,7 +38,9 @@
#include "utils/profiler.h"
#include "utils/time_keeper.h"

#include "ggml/otherarch.h"
#include "ggml/gpt_neox.h"
#include "ggml/context.h"
#include "ggml/common.h"

#include <android/log.h>

@@ -81,13 +83,12 @@ class ProximityInfo;
struct GGMLDictionaryState {
    int n_threads = 3;

    std::vector<int> smartcontext;
    std::vector<gpt_vocab::id> current_context_tokens;
    transformer_context t_context;

    std::vector<float> logits;
    std::vector<gpt_vocab::id> bad_logits;

    size_t mem_per_token = 0;
    bool use_scratch = true;

    gpt_neox_model model;
    gpt_vocab vocab;
@@ -109,12 +110,10 @@ static jlong latinime_GGMLDictionary_open(JNIEnv *env, jclass clazz, jstring sou
    GGMLDictionaryState *state = new GGMLDictionaryState();

    std::string fname(sourceDirChars);
    FileFormat format = check_file_format(fname);
    assert(format == 405);

    ModelLoadResult result = gpt_neox_model_load(fname, state->model, state->vocab, format, 0);
    bool result = gpt_neox_model_load(fname, state->model, state->vocab);

    if(result != ModelLoadResult::SUCCESS) {
    if(!result) {
        AKLOGE("GGMLDict: Could not load model");
        free(state);
        return 0;
@@ -171,33 +170,28 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
        env->ReleaseStringUTFChars(partialWord, pwstr);
    }

    auto embd_inp = gpt_tokenize(state->vocab, contextString);
    token_sequence next_context = gpt_tokenize(state->vocab, contextString);

    //truncate to front of the prompt if its too long
    int32_t nctx = state->model.hparams.n_ctx;

    if (embd_inp.size() + 2 > nctx) {
        int offset = embd_inp.size() - nctx + 2;
        embd_inp = std::vector<int>(embd_inp.begin() + offset, embd_inp.end());
    if (next_context.size() + 2 > nctx) {
        int offset = next_context.size() - nctx + 2;
        next_context = std::vector<int>(next_context.begin() + offset, next_context.end());
    }

    size_t size = env->GetArrayLength(outPredictions);

    int n_past = 0;
    auto fastforward_info = transformer_context_fastforward(state->t_context, next_context);

    bool useSmartContext = true;
    ContextFastForward(state->current_context_tokens, embd_inp, n_past, nctx, state->smartcontext, useSmartContext, false);
    token_sequence &embd_inp = fastforward_info.first;
    int n_past = fastforward_info.second;

    if(embd_inp.empty()) return;

    state->current_context_tokens.resize(n_past);

    AKLOGI("npast = %d, size(embd) = %d\n", n_past, (int)embd_inp.size());
    gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token, state->use_scratch);
    gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token);

    for(auto token : embd_inp) {
        state->current_context_tokens.emplace_back(token);
    }
    transformer_context_apply(state->t_context, fastforward_info);

    int topid = std::min_element(state->logits.begin(),state->logits.end())-state->logits.begin();
    float zeroValue = (state->logits[topid] < 0 ? state->logits[topid] : 0);
@@ -249,6 +243,8 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
    }

    size_t size = env->GetArrayLength(outPredictions);

    // Get the array elements
    jint *probsArray = env->GetIntArrayElements(outProbabilities, nullptr);
native/jni/src/ggml/common.cpp (new file, 143 lines)
@@ -0,0 +1,143 @@
#include "common.h"

#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
}

std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length();
    }
    return result;
}

void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}

std::string convert_to_utf8(const std::wstring & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(input);
}

std::wstring convert_to_wstring(const std::string & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.from_bytes(input);
}

void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
    std::smatch m;

    while (std::regex_search(str, m, re)) {
        for (auto x : m) {
            words.push_back(x);
        }
        str = m.suffix();
    }
}

std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
            for (int j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}

float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    for (size_t i = 0; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
@@ -1,5 +1,3 @@
// Various helper functions and utilities

#pragma once

#include <string>
@@ -8,15 +6,6 @@
#include <random>
#include <thread>

//
// CLI argument parsing
//

//
// Vocab utils
//

struct gpt_vocab {
    using id = int32_t;
    using token = std::string;
@@ -28,16 +17,7 @@ struct gpt_vocab {
    void add_special_token(const std::string & token);
};

void utreplace(std::string & str, const std::string & needle, const std::string & replacement);

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

std::string convert_to_utf8(const std::wstring & input);

std::wstring convert_to_wstring(const std::string & input);

void gpt_split_words(std::string str, std::vector<std::string>& words);
typedef std::vector<gpt_vocab::id> token_sequence;

// split text into tokens
//
@@ -49,8 +29,5 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
token_sequence gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

bool should_transpose_layer(std::string name);
native/jni/src/ggml/context.cpp (new file, 29 lines)
@@ -0,0 +1,29 @@
#include "context.h"


std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context) {
    int npast = 0;

    // Compare the two sequences and find the first index at which they differ.
    int max_length = std::min(ctx.active_context.size(), next_context.size());
    for(int i=0; i<max_length; i++) {
        npast = i;
        if(ctx.active_context[i] != next_context[i]) {
            break;
        }
    }

    token_sequence new_context(next_context.size() - npast);
    new_context.assign(next_context.begin() + npast, next_context.end());

    return {new_context, npast};
}


void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info) {
    ctx.active_context.resize(fastforward_info.second);

    for(auto i : fastforward_info.first) {
        ctx.active_context.emplace_back(i);
    }
}
native/jni/src/ggml/context.h (new file, 12 lines)
@@ -0,0 +1,12 @@
#pragma once

#include <vector>

#include "common.h"

struct transformer_context {
    token_sequence active_context;
};

std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context);
void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info);
(File diff suppressed because it is too large.)
@ -65,7 +65,7 @@
|
||||
// ggml_set_f32(a, 3.0f);
|
||||
// ggml_set_f32(b, 4.0f);
|
||||
//
|
||||
// ggml_graph_compute(ctx0, &gf);
|
||||
// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
|
||||
//
|
||||
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
||||
//
|
||||
@ -201,6 +201,8 @@
|
||||
#define GGML_MAX_NAME 48
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
@ -209,6 +211,30 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
||||
// main purpose is to reduce code duplication and improve readability.
|
||||
//
|
||||
// example:
|
||||
//
|
||||
// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
||||
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
||||
//
|
||||
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
|
||||
const type prefix##0 = (pointer)->array[0]; \
|
||||
GGML_UNUSED(prefix##0);
|
||||
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
|
||||
const type prefix##1 = (pointer)->array[1]; \
|
||||
GGML_UNUSED(prefix##1);
|
||||
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
|
||||
const type prefix##2 = (pointer)->array[2]; \
|
||||
GGML_UNUSED(prefix##2);
|
||||
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
|
||||
const type prefix##3 = (pointer)->array[3]; \
|
||||
GGML_UNUSED(prefix##3);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -224,8 +250,8 @@ extern "C" {
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
@ -295,12 +321,15 @@ extern "C" {
|
||||
GGML_OP_SUM,
|
||||
GGML_OP_SUM_ROWS,
|
||||
GGML_OP_MEAN,
|
||||
GGML_OP_ARGMAX,
|
||||
GGML_OP_REPEAT,
|
||||
GGML_OP_REPEAT_BACK,
|
||||
GGML_OP_ABS,
|
||||
GGML_OP_SGN,
|
||||
GGML_OP_NEG,
|
||||
GGML_OP_STEP,
|
||||
GGML_OP_TANH,
|
||||
GGML_OP_ELU,
|
||||
GGML_OP_RELU,
|
||||
GGML_OP_GELU,
|
||||
GGML_OP_GELU_QUICK,
|
||||
@ -332,9 +361,8 @@ extern "C" {
|
||||
GGML_OP_ROPE_BACK,
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D_S1_PH,
|
||||
GGML_OP_CONV_1D_S2_PH,
|
||||
GGML_OP_CONV_2D_SK_P0,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
@ -390,9 +418,6 @@ extern "C" {
|
||||
struct ggml_tensor * src1;
|
||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||
|
||||
// thread scheduling
|
||||
int n_tasks;
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
int64_t perf_cycles;
|
||||
@ -404,19 +429,27 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[4];
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
|
||||
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
||||
int n_tasks[GGML_MAX_NODES];
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
int n_threads;
|
||||
|
||||
size_t work_size;
|
||||
struct ggml_tensor * work;
|
||||
|
||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||
@ -504,8 +537,6 @@ extern "C" {
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
|
||||
GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
@ -692,6 +723,11 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// argmax along rows
|
||||
GGML_API struct ggml_tensor * ggml_argmax(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// if a is the same shape as b, and a is not parameter, return a
|
||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||
GGML_API struct ggml_tensor * ggml_repeat(
|
||||
@ -736,6 +772,22 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_tanh(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_tanh_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_elu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_elu_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_relu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
@ -1086,58 +1138,33 @@ extern "C" {
|
||||
float min,
|
||||
float max);
|
||||
|
||||
// TODO: implement general-purpose convolutions
|
||||
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
// struct ggml_context * ctx,
|
||||
// struct ggml_tensor * a,
|
||||
// struct ggml_tensor * b,
|
||||
// int s0
|
||||
// int p0,
|
||||
// int d0);
|
||||
//
|
||||
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||
// struct ggml_context * ctx,
|
||||
// struct ggml_tensor * a,
|
||||
// struct ggml_tensor * b,
|
||||
// int s0,
|
||||
// int s1,
|
||||
// int p0,
|
||||
// int p1,
|
||||
// int d0,
|
||||
// int d1);
|
||||
|
||||
// padding = half
|
||||
// TODO: we don't support extra parameters for now
|
||||
// that's why we are hard-coding the stride, padding, and dilation
|
||||
// not great ..
|
||||
// example:
|
||||
// a: 3 80 768 1
|
||||
// b: 3000 80 1 1
|
||||
// res: 3000 768 1 1
|
||||
// used in whisper
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
int s0, // stride
|
||||
int p0, // padding
|
||||
int d0); // dilation
|
||||
|
||||
// used in whisper
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1);
|
||||
|
||||
// kernel size is a->ne[0] x a->ne[1]
|
||||
// stride is equal to kernel size
|
||||
// padding is zero
|
||||
// example:
|
||||
// a: 16 16 3 768
|
||||
// b: 1024 1024 3 1
|
||||
// res: 64 64 768 1
|
||||
// used in sam
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
||||
// conv_1d with padding = half
|
||||
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
||||
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
int s,
|
||||
int d);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
@ -1268,15 +1295,22 @@ extern "C" {
|
||||
|
||||
GGML_API void ggml_set_param(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
|
||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
@ -1493,25 +1527,24 @@ extern "C" {
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
|
||||
typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
enum ggml_type vec_dot_type;
|
||||
} quantize_fns_t;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_reference;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -1,49 +1,42 @@
|
||||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
#include "defines.h"
|
||||
#include "ggml/ggml.h"
|
||||
#include "gpt_neox.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
AKLOGI("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
AKLOGE("%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
AKLOGE("%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
if (magic != GGML_FILE_MAGIC) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t origmaxctx = model.hparams.n_ctx;
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
@ -59,17 +52,15 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
AKLOGI("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
AKLOGI("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
|
||||
AKLOGI("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
AKLOGI("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
AKLOGI("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
AKLOGI("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
AKLOGI("%s: par_res = %d\n", __func__, hparams.par_res);
|
||||
AKLOGI("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
AKLOGI("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx);
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
printf("%s: par_res = %d\n", __func__, hparams.par_res);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
@ -94,14 +85,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
AKLOGE("%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
return false;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
@ -142,25 +132,26 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
||||
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
||||
|
||||
ctx_size += (6 + 16*n_layer)*1024; // object overhead
|
||||
|
||||
AKLOGI("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
AKLOGE("%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -241,7 +232,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx);
|
||||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
@ -249,7 +240,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
AKLOGI("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
@ -257,7 +248,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
int n_tensors = 0;
|
||||
size_t total_size = 0;
|
||||
|
||||
AKLOGI("%s: ", __func__);
|
||||
printf("%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
@ -283,83 +274,52 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
AKLOGE("%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
AKLOGE("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
AKLOGE("%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
||||
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
return false;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
AKLOGI("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
AKLOGE("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
ggml_free(ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
return false;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
if (++n_tensors % 8 == 0) {
|
||||
AKLOGI(".");
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
AKLOGI(" done\n");
|
||||
printf(" done\n");
|
||||
|
||||
AKLOGI("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
//gpu offload
|
||||
#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
|
||||
if(gpulayers>0)
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
size_t vram_total = 0;
|
||||
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
||||
AKLOGE("%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
const auto & layer = model.layers[i];
|
||||
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
|
||||
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
|
||||
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
|
||||
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
||||
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
#else
|
||||
ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
||||
ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
#endif
|
||||
}
|
||||
AKLOGE("%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
}
|
||||
#endif
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -408,13 +368,12 @@ ggml_tensor * gpt_neox_ff(
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt_neox_eval(
|
||||
const gpt_neox_model & model,
|
||||
gpt_neox_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token,
|
||||
bool use_scratch) {
|
||||
const token_sequence & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
@ -426,43 +385,40 @@ bool gpt_neox_eval(
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_rot = hparams.n_rot;
|
||||
|
||||
// TODO: All of this allocates over 800 megabytes of memory, way more than the size of the model!
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
// use 2 scratch buffers
|
||||
// TODO: very hacky solution - reimplement in a more elegant way
|
||||
static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
|
||||
static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
|
||||
|
||||
static size_t scr0_size = 256u*1024*1024;
|
||||
static void * scr0 = malloc(scr0_size);
|
||||
|
||||
static size_t scr1_size = 256u*1024*1024;
|
||||
static void * scr1 = malloc(scr1_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//AKLOGI("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
if (buf_size_new > buf_size)
|
||||
{
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr)
|
||||
{
|
||||
AKLOGE("%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_size,
|
||||
/*.mem_buffer =*/ buf,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
@ -473,9 +429,7 @@ bool gpt_neox_eval(
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
if(use_scratch){
|
||||
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
@ -580,9 +534,7 @@ bool gpt_neox_eval(
|
||||
}
|
||||
}
|
||||
|
||||
if(use_scratch){
|
||||
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
||||
}
|
||||
|
||||
if (hparams.par_res == 0) {
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||
@ -606,9 +558,7 @@ bool gpt_neox_eval(
|
||||
}
|
||||
}
|
||||
|
||||
if(use_scratch){
|
||||
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
@ -622,9 +572,7 @@ bool gpt_neox_eval(
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
if(use_scratch){
|
||||
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
||||
}
|
||||
|
||||
// lm_head
|
||||
{
|
||||
@ -640,7 +588,18 @@ bool gpt_neox_eval(
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
if(model.work_buf.size() < plan.work_size) {
|
||||
model.work_buf.resize(plan.work_size);
|
||||
}
|
||||
|
||||
plan.work_data = model.work_buf.data();
|
||||
}
|
||||
|
||||
ggml_graph_compute(&gf, &plan);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
@ -657,7 +616,7 @@ bool gpt_neox_eval(
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//AKLOGI("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
native/jni/src/ggml/gpt_neox.h (new file, 86 lines)
@ -0,0 +1,86 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml/ggml.h"
|
||||
#include "common.h"
|
||||
|
||||
// default hparams (StableLM 3B)
|
||||
struct gpt_neox_hparams {
|
||||
int32_t n_vocab = 50257;
|
||||
int32_t n_ctx = 4096;
|
||||
int32_t n_embd = 4096;
|
||||
int32_t n_head = 32;
|
||||
int32_t n_layer = 16;
|
||||
int32_t n_rot = 32; // rotary_pct * (n_embd / n_head)
|
||||
int32_t par_res = 1; // 1 = true, 0 = false
|
||||
int32_t ftype = 1;
|
||||
};
|
||||
|
||||
struct gpt_neox_layer {
|
||||
// pre normalization
|
||||
struct ggml_tensor * ln_1_g;
|
||||
struct ggml_tensor * ln_1_b;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * c_attn_attn_w;
|
||||
struct ggml_tensor * c_attn_attn_b;
|
||||
|
||||
struct ggml_tensor * c_attn_proj_w;
|
||||
struct ggml_tensor * c_attn_proj_b;
|
||||
|
||||
// post normalization
|
||||
struct ggml_tensor * ln_2_g;
|
||||
struct ggml_tensor * ln_2_b;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * c_mlp_fc_w;
|
||||
struct ggml_tensor * c_mlp_fc_b;
|
||||
|
||||
struct ggml_tensor * c_mlp_proj_w;
|
||||
struct ggml_tensor * c_mlp_proj_b;
|
||||
};
|
||||
|
||||
struct gpt_neox_model {
|
||||
gpt_neox_hparams hparams;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ln_f_g;
|
||||
struct ggml_tensor * ln_f_b;
|
||||
|
||||
struct ggml_tensor * wte; // position embedding
|
||||
|
||||
struct ggml_tensor * lmh_g; // language model head
|
||||
//struct ggml_tensor * lmh_b; // language model bias
|
||||
|
||||
std::vector<gpt_neox_layer> layers;
|
||||
|
||||
// key + value memory
|
||||
struct ggml_tensor * memory_k;
|
||||
struct ggml_tensor * memory_v;
|
||||
|
||||
//
|
||||
struct ggml_context * ctx;
|
||||
std::map<std::string, struct ggml_tensor *> tensors;
|
||||
|
||||
std::vector<uint8_t> work_buf;
|
||||
};
|
||||
|
||||
|
||||
bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab);
|
||||
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - logits: the predicted logits for the next token
|
||||
//
|
||||
bool gpt_neox_eval(
|
||||
gpt_neox_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const token_sequence & embd_inp,
|
||||
std::vector<float> & logits,
|
||||
size_t & mem_per_token
|
||||
);
|
@ -1,466 +0,0 @@
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <math.h>
|
||||
#include <vector>
|
||||
|
||||
#include "model_adapter.h"
|
||||
|
||||
#include <chrono>
|
||||
|
||||
static auto bench_timer = std::chrono::high_resolution_clock().now();
|
||||
|
||||
void timer_start()
|
||||
{
|
||||
bench_timer = std::chrono::high_resolution_clock().now();
|
||||
}
|
||||
double timer_check()
|
||||
{
|
||||
auto endtime = std::chrono::high_resolution_clock().now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endtime - bench_timer);
|
||||
double time_taken = duration.count()/1000.0;
|
||||
return time_taken;
|
||||
}
|
||||
|
||||
void print_vec(std::vector<std::string> &embd)
|
||||
{
|
||||
std::cout << "[";
|
||||
bool first = true;
|
||||
for (auto i : embd)
|
||||
{
|
||||
if (!first)
|
||||
{
|
||||
std::cout << ',';
|
||||
}
|
||||
first = false;
|
||||
std::cout << i;
|
||||
}
|
||||
std::cout << "]\n";
|
||||
}
|
||||
void print_tok_vec(std::vector<int> &embd)
|
||||
{
|
||||
std::cout << "[";
|
||||
bool first = true;
|
||||
for (auto i : embd)
|
||||
{
|
||||
if (!first)
|
||||
{
|
||||
std::cout << ',';
|
||||
}
|
||||
first = false;
|
||||
std::cout << i;
|
||||
}
|
||||
std::cout << "]\n";
|
||||
}
|
||||
void print_tok_vec(std::vector<float> &embd)
|
||||
{
|
||||
std::cout << "[";
|
||||
bool first = true;
|
||||
int n = 0;
|
||||
for (auto i : embd)
|
||||
{
|
||||
if (!first)
|
||||
{
|
||||
std::cout << ',';
|
||||
}
|
||||
first = false;
|
||||
std::cout << i;
|
||||
if(++n>20)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
std::cout << "]\n";
|
||||
}
|
||||
|
||||
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
||||
FileFormat check_file_format(const std::string & fname)
|
||||
{
|
||||
std::vector<char> f_buf(1024*1024);
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return FileFormat::BADFORMAT;
|
||||
}
|
||||
|
||||
FileFormat fileformat = FileFormat::BADFORMAT;
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic == 0x67676d6c) { //v1 format ggml, alpaca, old gptj and gpt2 models
|
||||
fileformat = FileFormat::GGML;
|
||||
//we need to read more to determine
|
||||
int32_t vocabsiz = 0;
|
||||
fin.read((char *) &vocabsiz, sizeof(int32_t));
|
||||
if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
|
||||
{
|
||||
fileformat = FileFormat::MPT_1;
|
||||
}
|
||||
else if(vocabsiz==50400) //know GPT-J vocab size
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_1;
|
||||
uint32_t temp;
|
||||
fin.read((char *)&temp, sizeof(temp)); //ctx
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_head
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_layer
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_rot
|
||||
fin.read((char *)&temp, sizeof(temp)); //f16
|
||||
const int32_t qntvr = temp / 1000;
|
||||
temp %= 1000;
|
||||
if (qntvr != 0)
|
||||
{
|
||||
if (qntvr == 1)
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_4;
|
||||
}
|
||||
else
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_5;
|
||||
}
|
||||
}
|
||||
else if (temp != 0 && temp != 1)
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
|
||||
}
|
||||
}
|
||||
else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
|
||||
{
|
||||
fileformat = FileFormat::GPT2_1;
|
||||
uint32_t temp;
|
||||
fin.read((char *)&temp, sizeof(temp)); //ctx
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_head
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_layer
|
||||
fin.read((char *)&temp, sizeof(temp)); //f16
|
||||
const int32_t qntvr = temp / 1000;
|
||||
temp %= 1000;
|
||||
if (qntvr != 0)
|
||||
{
|
||||
if (qntvr == 1)
|
||||
{
|
||||
fileformat = FileFormat::GPT2_3;
|
||||
}
|
||||
else
|
||||
{
|
||||
fileformat = FileFormat::GPT2_4;
|
||||
}
|
||||
}
|
||||
else if (temp != 0 && temp != 1)
|
||||
{
|
||||
fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
|
||||
}
|
||||
}
|
||||
else if(vocabsiz < 31998 || vocabsiz > 33000)
|
||||
{
|
||||
//anything outside the llama v1 range is assumed to be NeoX
|
||||
fileformat = FileFormat::NEOX_6;
|
||||
uint32_t temp,temp2;
|
||||
fin.read((char *)&temp, sizeof(temp)); //ctx
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_head
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_layer
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_rot
|
||||
fin.read((char *)&temp, sizeof(temp)); //either par_res or ftype (for older ver)
|
||||
|
||||
if(temp!=0 && temp!=1){
|
||||
//must be ftype, means its an older model. par_res will be undefined
|
||||
fileformat = FileFormat::NEOX_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
//it could be a newer model, or an old f16/f32 model
|
||||
fin.read((char *)&temp2, sizeof(temp2)); //if previous was par_res, this is ftype. else unknown
|
||||
|
||||
//if it is new ftype, then it must have these properties: > 1000, low multiple of 1k and small remaineder
|
||||
bool isNewFtype = (temp2>=1000 && temp2<=9000 && temp2%1000<20);
|
||||
|
||||
if(!isNewFtype)
|
||||
{
|
||||
fileformat = FileFormat::NEOX_2;
|
||||
if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
|
||||
{
|
||||
//its a f16/f32 model in the new format
|
||||
fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int32_t qntvr = temp2 / 1000; //for future use
|
||||
//then temp was par_res, use_parallel_residual is false in RedPajama
|
||||
if(qntvr==1)
|
||||
{
|
||||
fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
|
||||
}
|
||||
else
|
||||
{
|
||||
fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else if(magic == 0x67676d66) //v2 format ggmf
|
||||
{
|
||||
fileformat = FileFormat::GGHF;
|
||||
uint32_t temp;
|
||||
fin.read((char *)&temp, sizeof(temp)); //file version
|
||||
if(temp==100)
|
||||
{
|
||||
fileformat = FileFormat::RWKV_1;
|
||||
}
|
||||
else if(temp==101)
|
||||
{
|
||||
fileformat = FileFormat::RWKV_2;
|
||||
}
|
||||
}
|
||||
else if(magic == 0x67676a74) //v3 format ggjt
|
||||
{
|
||||
fileformat = FileFormat::GGJT_3; //ggjt by default
|
||||
uint32_t ver, temp, ftype;
|
||||
fin.read((char *)&ver, sizeof(ver)); //file version
|
||||
fin.read((char *)&temp, sizeof(temp));//vocab
|
||||
fin.read((char *)&temp, sizeof(temp)); //embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //mult
|
||||
fin.read((char *)&temp, sizeof(temp));//head
|
||||
fin.read((char *)&temp, sizeof(temp));//layer
|
||||
fin.read((char *)&temp, sizeof(temp));//rot
|
||||
fin.read((char *)&ftype, sizeof(ftype));//filetype
|
||||
|
||||
if(ver==1)
|
||||
{
|
||||
fileformat = FileFormat::GGJT;
|
||||
}
|
||||
else if(ver==2)
|
||||
{
|
||||
fileformat = FileFormat::GGJT_2;
|
||||
}
|
||||
}
|
||||
fin.close();
|
||||
|
||||
return fileformat;
|
||||
}
|
||||
|
||||
bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq)
|
||||
{
|
||||
int ss = searchSeq.size();
|
||||
if(targetArray.size()<ss)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
for(int i=0;i<ss;++i)
|
||||
{
|
||||
if(targetArray[i]!=searchSeq[i])
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq)
|
||||
{
|
||||
int ss = searchSeq.size();
|
||||
int tas = targetArray.size();
|
||||
if(tas<ss)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
for(int i=0;i<tas;++i)
|
||||
{
|
||||
int srch = 0;
|
||||
bool fail = false;
|
||||
for(int srch=0;srch<ss;++srch)
|
||||
{
|
||||
if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
|
||||
{
|
||||
fail = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!fail)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y)
|
||||
{
|
||||
int m = x.size(), n = y.size();
|
||||
|
||||
//int LCSuff[m+1][n+1];
|
||||
std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
|
||||
|
||||
for (int j = 0; j <= n; j++)
|
||||
LCSuff[0][j] = 0;
|
||||
for (int i = 0; i <= m; i++)
|
||||
LCSuff[i][0] = 0;
|
||||
|
||||
for (int i = 1; i <= m; i++)
|
||||
{
|
||||
for (int j = 1; j <= n; j++)
|
||||
{
|
||||
if (x[i - 1] == y[j - 1])
|
||||
LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
|
||||
else
|
||||
LCSuff[i][j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int> longest;
|
||||
for (int i = 1; i <= m; i++)
|
||||
{
|
||||
for (int j = 1; j <= n; j++)
|
||||
{
|
||||
if (LCSuff[i][j] > longest.size())
|
||||
{
|
||||
auto off1 = ((i - LCSuff[i][j] + 1) - 1);
|
||||
auto off2 = off1 + LCSuff[i][j];
|
||||
longest.clear();
|
||||
// std::vector<int>().swap(longest);
|
||||
longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
|
||||
// x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return longest;
|
||||
}
|
||||
|
||||
void ContextFastForward(std::vector<int> ¤t_context_tokens, std::vector<int> &embd_inp,
|
||||
int &n_past, const int nctx, std::vector<int> &smartcontext,
|
||||
bool useSmartContext, const bool requireFullSubset)
|
||||
{
|
||||
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
|
||||
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
|
||||
const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
|
||||
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
|
||||
    const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext

    //fast forward the past based on identical tokens, stop once a divergence is noted
    int embd_inp_len = embd_inp.size();
    bool fastforwardok = true;

    for (int i = 0; i < current_context_tokens.size(); ++i)
    {
        if (current_context_tokens[i] == embd_inp[i])
        {
            n_past += 1;
        }
        else
        {
            if(requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
            {
                n_past = 0;
                fastforwardok = false;
            }
            break;
        }

        if (requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
        {
            if (i >= embd_inp_len)
            {
                n_past = 0;
                fastforwardok = false;
                break;
            }
        }
        else
        {
            if ((i + 2) >= embd_inp_len)
            {
                break;
            }
        }
    }

    if(fastforwardok)
    {
        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
        embd_inp_len = embd_inp.size();
    }

    //smart context mode, detect if we have a shifted context at max length
    //requirement: previous context was at least nctx/2 longer than current,
    //mode is on, and current context already maxed.

    if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
    {
        //see if smartcontext is still usable
        auto shared = LongestCommonSubseq(smartcontext, embd_inp);
        if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least 32 tokens in common
        {
            int found = ArrFindIndexOf(embd_inp, shared);
            if(found >= 0)
            {
                auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
                embd_inp = trimmed;
                embd_inp_len = embd_inp.size();
                printf("\n[Reusing Smart Context: %d allowance remaining]", found);

                int old_n_past = n_past;
                int offset_fix = old_n_past;
                if (current_context_tokens[n_past] != embd_inp[0])
                {
                    offset_fix = 0;
                }

                for (int i = n_past; i < current_context_tokens.size(); ++i)
                {
                    if (current_context_tokens[i] == embd_inp[i-offset_fix])
                    {
                        n_past += 1;
                    }
                    else
                    {
                        break;
                    }
                    if ((i + 2 - offset_fix) >= embd_inp_len)
                    {
                        break;
                    }
                }

                embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));

            }else{
                smartcontext.clear();
            }
        }
        else
        {
            smartcontext.clear();
        }
    }
    else
    {
        smartcontext.clear();
    }

    if(fastforwardok && useSmartContext
        && smartcontext.size()==0 && current_context_tokens.size() >= SCCtxLenThreshold
        && embd_inp_len >= SCInpLenThreshold
        && current_context_tokens.size() - n_past > SCPastLenThreshold)
    {
        //determine longest common substring after removing start part
        int shiftamt = embd_inp.size() * SCTruncationRatio;
        smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
        printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]", shiftamt);

        embd_inp = smartcontext;
        //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes LCS marker.
        //when a future prompt comes in, find the LCS again. If LCS > a length and LCS starts with memorized LCS
        //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift
        //if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B
    }
}
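
// Illustrative sketch (not part of the removed source): one way a caller could
// drive ContextFastForward to reuse a previously evaluated context. The names
// prev_tokens, new_tokens and kv_smartcontext are hypothetical.
static int context_fastforward_example(std::vector<int> &prev_tokens,
                                       std::vector<int> &new_tokens,
                                       std::vector<int> &kv_smartcontext,
                                       const int nctx)
{
    int n_past = 0;
    // On return, n_past counts how many cached tokens were kept, and new_tokens
    // holds only the suffix that still needs to be evaluated.
    ContextFastForward(prev_tokens, new_tokens, n_past, nctx,
                       kv_smartcontext, /*useSmartContext=*/true, /*requireFullSubset=*/false);
    return n_past;
}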
@ -1,67 +0,0 @@
#pragma once

#include <cassert>
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>
#include <vector>

enum FileFormat
{
    BADFORMAT=0, //unknown, uninit, or failed to load
    GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
    GGHF=2, // 2=(llama ggmf)
    GGJT=3, // 3=(llama ggjt)
    GGJT_2=4, //newer llama format unshuffled
    GGJT_3=5, //using 16bit scalar

    GPTJ_1=100, //the very first super old GPTJ format
    GPTJ_2=101, //pygmalion, uses old ggml lib
    GPTJ_3=102, //uses new ggml lib
    GPTJ_4=103, //unshuffled
    GPTJ_5=104, //using 16bit scalar

    GPT2_1=200,
    GPT2_2=201,
    GPT2_3=202, //unshuffled
    GPT2_4=203, //using 16bit scalar

    RWKV_1=300,
    RWKV_2=301,

    NEOX_1=400,
    NEOX_2=401,
    NEOX_3=402, //redpajama
    NEOX_4=403, //unshuffled
    NEOX_5=404, //unshuffled redpajama
    NEOX_6=405, //using 16bit scalar
    NEOX_7=406, //using 16bit scalar redpajama

    MPT_1=500, //first supported mpt version
};

enum ModelLoadResult
{
    FAIL = 0,
    SUCCESS = 1,
    RETRY_LOAD = 2, //used if it's suspected that the model is an older format
};

void timer_start();
double timer_check();
void print_tok_vec(std::vector<int> &embd);
void print_tok_vec(std::vector<float> &embd);
void print_vec(std::vector<std::string> &embd);
std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y);
bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq);
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);

FileFormat check_file_format(const std::string & fname);
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
                        int &n_past, const int nctx, std::vector<int> &smartcontext,
                        const bool useSmartContext, const bool requireFullSubset);
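
// Illustrative sketch (not part of the removed source): how the FileFormat and
// ModelLoadResult types above are meant to combine. The try_load callback and
// the single-retry policy are hypothetical; only check_file_format and the two
// enums come from this header.
static bool load_with_retry_example(const std::string &fname,
                                    ModelLoadResult (*try_load)(const std::string &, FileFormat))
{
    const FileFormat fmt = check_file_format(fname);
    if (fmt == BADFORMAT) {
        return false;
    }
    ModelLoadResult res = try_load(fname, fmt);
    if (res == RETRY_LOAD) {
        // RETRY_LOAD hints that the file is probably an older variant of the format,
        // so give the loader one more chance (e.g. with a legacy code path).
        res = try_load(fname, fmt);
    }
    return res == SUCCESS;
}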
@ -1,464 +0,0 @@
#pragma once

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "utils.h"
#include "model_adapter.h"


// default hparams (GPT-J 6B)
struct gptj_hparams {
    int32_t n_vocab = 50400;
    int32_t n_ctx = 2048;
    int32_t n_embd = 4096;
    int32_t n_head = 16;
    int32_t n_layer = 28;
    int32_t n_rot = 64;
    int32_t ftype = 1;
};

struct gptj_layer {
    // normalization
    struct ggml_tensor * ln_1_g;
    struct ggml_tensor * ln_1_b;

    // attention
    struct ggml_tensor * c_attn_q_proj_w;
    struct ggml_tensor * c_attn_k_proj_w;
    struct ggml_tensor * c_attn_v_proj_w;

    struct ggml_tensor * c_attn_proj_w;

    // ff
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
};

struct gptj_layer_v2 {
    // normalization
    struct ggml_v2_tensor * ln_1_g;
    struct ggml_v2_tensor * ln_1_b;

    // attention
    struct ggml_v2_tensor * c_attn_q_proj_w;
    struct ggml_v2_tensor * c_attn_k_proj_w;
    struct ggml_v2_tensor * c_attn_v_proj_w;

    struct ggml_v2_tensor * c_attn_proj_w;

    // ff
    struct ggml_v2_tensor * c_mlp_fc_w;
    struct ggml_v2_tensor * c_mlp_fc_b;

    struct ggml_v2_tensor * c_mlp_proj_w;
    struct ggml_v2_tensor * c_mlp_proj_w_trans; //for backwards compatibility
    struct ggml_v2_tensor * c_mlp_proj_b;
};

struct gptj_layer_v1 {
    // normalization
    struct ggml_v1_tensor * ln_1_g;
    struct ggml_v1_tensor * ln_1_b;

    // attention
    struct ggml_v1_tensor * c_attn_q_proj_w;
    struct ggml_v1_tensor * c_attn_k_proj_w;
    struct ggml_v1_tensor * c_attn_v_proj_w;

    struct ggml_v1_tensor * c_attn_proj_w;

    // ff
    struct ggml_v1_tensor * c_mlp_fc_w;
    struct ggml_v1_tensor * c_mlp_fc_b;

    struct ggml_v1_tensor * c_mlp_proj_w;
    struct ggml_v1_tensor * c_mlp_proj_w_trans; //for backwards compatibility
    struct ggml_v1_tensor * c_mlp_proj_b;
};

struct gptj_v1_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_v1_tensor * ln_f_g;
    struct ggml_v1_tensor * ln_f_b;

    struct ggml_v1_tensor * wte; // position embedding

    struct ggml_v1_tensor * lmh_g; // language model head
    struct ggml_v1_tensor * lmh_b; // language model bias

    std::vector<gptj_layer_v1> layers;

    // key + value memory
    struct ggml_v1_tensor * memory_k;
    struct ggml_v1_tensor * memory_v;

    //
    struct ggml_v1_context * ctx;
    std::map<std::string, struct ggml_v1_tensor *> tensors;
};

struct gptj_v2_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g;
    struct ggml_v2_tensor * ln_f_b;

    struct ggml_v2_tensor * wte; // position embedding

    struct ggml_v2_tensor * lmh_g; // language model head
    struct ggml_v2_tensor * lmh_b; // language model bias

    std::vector<gptj_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k;
    struct ggml_v2_tensor * memory_v;

    //
    struct ggml_v2_context * ctx;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};

struct gptj_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

    struct ggml_tensor * wte; // position embedding

    struct ggml_tensor * lmh_g; // language model head
    struct ggml_tensor * lmh_b; // language model bias

    std::vector<gptj_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    //
    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};

// default hparams (GPT-2 117M)
struct gpt2_hparams {
    int32_t n_vocab = 50257;
    int32_t n_ctx = 1024;
    int32_t n_embd = 768;
    int32_t n_head = 12;
    int32_t n_layer = 12;
    int32_t ftype = 1;
};

struct gpt2_v1_layer {
    // normalization
    struct ggml_v1_tensor * ln_1_g;
    struct ggml_v1_tensor * ln_1_b;

    struct ggml_v1_tensor * ln_2_g;
    struct ggml_v1_tensor * ln_2_b;

    // attention
    struct ggml_v1_tensor * c_attn_attn_w;
    struct ggml_v1_tensor * c_attn_attn_b;

    struct ggml_v1_tensor * c_attn_proj_w;
    struct ggml_v1_tensor * c_attn_proj_b;

    // mlp
    struct ggml_v1_tensor * c_mlp_fc_w;
    struct ggml_v1_tensor * c_mlp_fc_b;

    struct ggml_v1_tensor * c_mlp_proj_w_trans; // transposed for efficiency
    struct ggml_v1_tensor * c_mlp_proj_b;
};

struct gpt2_v1_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_v1_tensor * ln_f_g;
    struct ggml_v1_tensor * ln_f_b;

    struct ggml_v1_tensor * wte; // position embedding
    struct ggml_v1_tensor * wpe; // token embedding

    std::vector<gpt2_v1_layer> layers;

    // key + value memory
    struct ggml_v1_tensor * memory_k;
    struct ggml_v1_tensor * memory_v;

    //
    struct ggml_v1_context * ctx;
    std::map<std::string, struct ggml_v1_tensor *> tensors;
};

struct gpt2_layer_v2 {
    // normalization
    struct ggml_v2_tensor * ln_1_g;
    struct ggml_v2_tensor * ln_1_b;

    struct ggml_v2_tensor * ln_2_g;
    struct ggml_v2_tensor * ln_2_b;

    // attention
    struct ggml_v2_tensor * c_attn_attn_w;
    struct ggml_v2_tensor * c_attn_attn_b;

    struct ggml_v2_tensor * c_attn_proj_w;
    struct ggml_v2_tensor * c_attn_proj_b;

    // mlp
    struct ggml_v2_tensor * c_mlp_fc_w;
    struct ggml_v2_tensor * c_mlp_fc_b;

    struct ggml_v2_tensor * c_mlp_proj_w;
    struct ggml_v2_tensor * c_mlp_proj_b;
};

struct gpt2_v2_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g;
    struct ggml_v2_tensor * ln_f_b;

    struct ggml_v2_tensor * wte; // position embedding
    struct ggml_v2_tensor * wpe; // token embedding
    struct ggml_v2_tensor * lm_head; // language model head

    std::vector<gpt2_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k;
    struct ggml_v2_tensor * memory_v;

    //
    struct ggml_v2_context * ctx;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};

struct gpt2_layer {
    // normalization
    struct ggml_tensor * ln_1_g;
    struct ggml_tensor * ln_1_b;

    struct ggml_tensor * ln_2_g;
    struct ggml_tensor * ln_2_b;

    // attention
    struct ggml_tensor * c_attn_attn_w;
    struct ggml_tensor * c_attn_attn_b;

    struct ggml_tensor * c_attn_proj_w;
    struct ggml_tensor * c_attn_proj_b;

    // mlp
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
};

struct gpt2_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

    struct ggml_tensor * wte; // position embedding
    struct ggml_tensor * wpe; // token embedding
    struct ggml_tensor * lm_head; // language model head

    std::vector<gpt2_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    //
    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};

// default hparams (StableLM 3B)
struct gpt_neox_hparams {
    int32_t n_vocab = 50257;
    int32_t n_ctx = 4096;
    int32_t n_embd = 4096;
    int32_t n_head = 32;
    int32_t n_layer = 16;
    int32_t n_rot = 32; // rotary_pct * (n_embd / n_head)
    int32_t par_res = 1; // 1 = true, 0 = false
    int32_t ftype = 1;
};
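// For example, assuming rotary_pct = 0.25 with the defaults above:
// n_rot = 0.25 * (n_embd / n_head) = 0.25 * (4096 / 32) = 0.25 * 128 = 32.
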
struct gpt_neox_layer_v2 {
    // pre normalization
    struct ggml_v2_tensor * ln_1_g;
    struct ggml_v2_tensor * ln_1_b;

    // attention
    struct ggml_v2_tensor * c_attn_attn_w;
    struct ggml_v2_tensor * c_attn_attn_b;

    struct ggml_v2_tensor * c_attn_proj_w;
    struct ggml_v2_tensor * c_attn_proj_b;

    // post normalization
    struct ggml_v2_tensor * ln_2_g;
    struct ggml_v2_tensor * ln_2_b;

    // ff
    struct ggml_v2_tensor * c_mlp_fc_w;
    struct ggml_v2_tensor * c_mlp_fc_b;

    struct ggml_v2_tensor * c_mlp_proj_w;
    struct ggml_v2_tensor * c_mlp_proj_b;
};

struct gpt_neox_v2_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g;
    struct ggml_v2_tensor * ln_f_b;

    struct ggml_v2_tensor * wte; // position embedding

    struct ggml_v2_tensor * lmh_g; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k;
    struct ggml_v2_tensor * memory_v;

    //
    struct ggml_v2_context * ctx;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};

struct gpt_neox_layer {
    // pre normalization
    struct ggml_tensor * ln_1_g;
    struct ggml_tensor * ln_1_b;

    // attention
    struct ggml_tensor * c_attn_attn_w;
    struct ggml_tensor * c_attn_attn_b;

    struct ggml_tensor * c_attn_proj_w;
    struct ggml_tensor * c_attn_proj_b;

    // post normalization
    struct ggml_tensor * ln_2_g;
    struct ggml_tensor * ln_2_b;

    // ff
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
};

struct gpt_neox_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

    struct ggml_tensor * wte; // position embedding

    struct ggml_tensor * lmh_g; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    //
    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};


// no defaults for now
struct mpt_hparams {
    int32_t d_model = 0;
    int32_t max_seq_len = 0;
    int32_t n_heads = 0;
    int32_t n_layers = 0;
    int32_t n_vocab = 0;
    float alibi_bias_max = 0;
    float clip_qkv = 0;
    int32_t ftype = 0;
    int32_t n_ctx = 0;
};

struct mpt_layer {
    // pre normalization
    struct ggml_tensor * norm_1_weight;

    // attention
    struct ggml_tensor * c_attn_wqkv_weight;
    struct ggml_tensor * c_attn_out_proj_weight;

    // post normalization
    struct ggml_tensor * norm_2_weight;

    // ff
    struct ggml_tensor * ffn_up_proj;
    struct ggml_tensor * ffn_down_proj;
};

struct mpt_model {
    mpt_hparams hparams;

    struct ggml_tensor * wte_weight; // position embedding
    struct ggml_tensor * norm_f_weight; // language model head

    std::vector<mpt_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};


ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers);
bool gpt_neox_eval(
        const gpt_neox_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
        std::vector<float> & embd_w,
        size_t & mem_per_token,
        bool use_scratch);
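
// Illustrative sketch (not part of the removed source): the usual call order for
// the two declarations above. The names model_path, prompt and n_threads are
// hypothetical, and NEOX_6 / gpulayers=0 are just example arguments.
static bool gpt_neox_predict_example(const std::string &model_path, const std::string &prompt, int n_threads)
{
    gpt_neox_model model;
    gpt_vocab vocab;
    if (gpt_neox_model_load(model_path, model, vocab, NEOX_6, 0) != ModelLoadResult::SUCCESS) {
        return false;
    }

    std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, prompt);
    std::vector<float> logits;
    size_t mem_per_token = 0;
    // n_past = 0 evaluates the whole prompt; logits for the last position are
    // returned through `logits`.
    return gpt_neox_eval(model, n_threads, 0, tokens, logits, mem_per_token, /*use_scratch=*/true);
}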
@ -1,224 +0,0 @@
#include "utils.h"

#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>


void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
    size_t pos = 0;
    while ((pos = str.find(needle, pos)) != std::string::npos) {
        str.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
}

std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                           (std::istreambuf_iterator<char>()));
    }

    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key = false;
        bool in_token = false;

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    ::utreplace(str_key, "\\u0120", " " ); // \u0120 -> space
                    ::utreplace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    ::utreplace(str_key, "\\\"", "\""); // \\\" -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}

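// Illustrative note (not in the original source): json_parse is a minimal,
// hand-rolled reader for flat string-to-integer JSON objects such as a vocab
// file. For example, a hypothetical input of
//     {"hello": 1, "\u0120world": 2}
// would produce the map {"hello" -> 1, " world" -> 2}, because the utreplace
// calls above rewrite the escaped \u0120 prefix into a leading space.
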
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}


std::string convert_to_utf8(const std::wstring & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(input);
}


std::wstring convert_to_wstring(const std::string & input) {
    try {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
        return converter.from_bytes(input);
    } catch (const std::range_error& e) {
        return L"";
    } catch (...) {
        return L"";
    }
}

void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
    std::smatch m;

    while (std::regex_search(str, m, re)) {
        for (auto x : m) {
            words.push_back(x);
        }
        str = m.suffix();
    }
}

std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (unsigned long i = 0; i < word.size(); ){
            for (unsigned long j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}
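
// Illustrative note (not in the original source): gpt_tokenize first splits the
// text into GPT-style word pieces via gpt_split_words, then greedily matches the
// longest vocab entry from the start of each piece. With a hypothetical vocab
// containing "un", "believ" and "able", the word "unbelievable" would be emitted
// as the three ids for "un" + "believ" + "able".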

bool should_transpose_layer(std::string name)
{
    if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
       name.find(".attn.out_proj.weight")!=std::string::npos ||
       name.find(".attn.q_proj.weight")!=std::string::npos ||
       name.find(".attn.k_proj.weight")!=std::string::npos ||
       name.find(".attn.v_proj.weight")!=std::string::npos ||
       name.find("/attn/c_attn/w")!=std::string::npos ||
       name.find("/attn/c_proj/w")!=std::string::npos ||
       name.find("/mlp/c_fc/w")!=std::string::npos ||
       name.find("/mlp/c_proj/w")!=std::string::npos)
    {
        return true;
    }
    return false;
}