Remove AGPLv3 code

This commit is contained in:
abb128 2023-07-10 12:21:05 +03:00
parent 85ed8afec9
commit 43e55bebfe
14 changed files with 1495 additions and 3330 deletions

View File

@ -22,9 +22,9 @@ LATIN_IME_JNI_SRC_FILES := \
LATIN_IME_CORE_SRC_FILES := \
ggml/ggml.c \
ggml/utils.cpp \
ggml/model_adapter.cpp \
ggml/neox_v3.cpp \
ggml/common.cpp \
ggml/context.cpp \
ggml/gpt_neox.cpp \
$(addprefix dictionary/header/, \
header_policy.cpp \
header_read_write_utils.cpp) \

View File

@ -38,7 +38,9 @@
#include "utils/profiler.h"
#include "utils/time_keeper.h"
#include "ggml/otherarch.h"
#include "ggml/gpt_neox.h"
#include "ggml/context.h"
#include "ggml/common.h"
#include <android/log.h>
@ -81,13 +83,12 @@ class ProximityInfo;
struct GGMLDictionaryState {
int n_threads = 3;
std::vector<int> smartcontext;
std::vector<gpt_vocab::id> current_context_tokens;
transformer_context t_context;
std::vector<float> logits;
std::vector<gpt_vocab::id> bad_logits;
size_t mem_per_token = 0;
bool use_scratch = true;
gpt_neox_model model;
gpt_vocab vocab;
@ -109,12 +110,10 @@ static jlong latinime_GGMLDictionary_open(JNIEnv *env, jclass clazz, jstring sou
GGMLDictionaryState *state = new GGMLDictionaryState();
std::string fname(sourceDirChars);
FileFormat format = check_file_format(fname);
assert(format == 405);
ModelLoadResult result = gpt_neox_model_load(fname, state->model, state->vocab, format, 0);
bool result = gpt_neox_model_load(fname, state->model, state->vocab);
if(result != ModelLoadResult::SUCCESS) {
if(!result) {
AKLOGE("GGMLDict: Could not load model");
free(state);
return 0;
@ -171,33 +170,28 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
env->ReleaseStringUTFChars(partialWord, pwstr);
}
auto embd_inp = gpt_tokenize(state->vocab, contextString);
token_sequence next_context = gpt_tokenize(state->vocab, contextString);
//truncate to front of the prompt if its too long
int32_t nctx = state->model.hparams.n_ctx;
if (embd_inp.size() + 2 > nctx) {
int offset = embd_inp.size() - nctx + 2;
embd_inp = std::vector<int>(embd_inp.begin() + offset, embd_inp.end());
if (next_context.size() + 2 > nctx) {
int offset = next_context.size() - nctx + 2;
next_context = std::vector<int>(next_context.begin() + offset, next_context.end());
}
size_t size = env->GetArrayLength(outPredictions);
int n_past = 0;
auto fastforward_info = transformer_context_fastforward(state->t_context, next_context);
bool useSmartContext = true;
ContextFastForward(state->current_context_tokens, embd_inp, n_past, nctx, state->smartcontext, useSmartContext, false);
token_sequence &embd_inp = fastforward_info.first;
int n_past = fastforward_info.second;
if(embd_inp.empty()) return;
state->current_context_tokens.resize(n_past);
AKLOGI("npast = %d, size(embd) = %d\n", n_past, (int)embd_inp.size());
gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token, state->use_scratch);
gpt_neox_eval(state->model, state->n_threads, n_past, embd_inp, state->logits, state->mem_per_token);
for(auto token : embd_inp) {
state->current_context_tokens.emplace_back(token);
}
transformer_context_apply(state->t_context, fastforward_info);
int topid = std::min_element(state->logits.begin(),state->logits.end())-state->logits.begin();
float zeroValue = (state->logits[topid] < 0 ? state->logits[topid] : 0);
@ -249,6 +243,8 @@ static void latinime_GGMLDictionary_getSuggestions(JNIEnv *env, jclass clazz, jl
}
size_t size = env->GetArrayLength(outPredictions);
// Get the array elements
jint *probsArray = env->GetIntArrayElements(outProbabilities, nullptr);

View File

@ -0,0 +1,143 @@
#include "common.h"
#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// Returns a copy of `s` with leading and trailing whitespace removed.
// "Whitespace" is whatever the regex class \s matches; interior whitespace
// is left untouched.
std::string trim(const std::string & s) {
    // One alternation covers both ends: a run anchored at the start of the
    // string, or a run anchored at its end.
    const std::regex edge_whitespace("^\\s+|\\s+$");
    const std::string nothing;
    return std::regex_replace(s, edge_whitespace, nothing);
}
// Returns a copy of `s` in which every non-overlapping occurrence of `from`
// has been replaced by `to`, scanning left to right.
//
// Text inserted by a replacement is never rescanned, so a `to` that contains
// `from` (e.g. "a" -> "aa") terminates normally.
//
// An empty `from` is treated as "nothing to replace" and yields `s` unchanged:
// std::string::find("") succeeds at every position, which would otherwise keep
// the loop inserting `to` at the same spot forever.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;
    if (from.empty()) {
        return result;
    }
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        // resume after the inserted text so it is not matched again
        pos += to.length();
    }
    return result;
}
// Appends `token` to this vocabulary's special_tokens list. gpt_tokenize()
// consults that list to split special tokens out of the text before the
// regular word splitting runs.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.emplace_back(token);
}
// Encodes a wide string as UTF-8 bytes using std::codecvt_utf8<wchar_t>.
// Conversion errors are reported however std::wstring_convert reports them
// (no fallback string is configured here).
std::string convert_to_utf8(const std::wstring & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;
    return std::wstring_convert<utf8_codec>().to_bytes(input);
}
// Decodes UTF-8 bytes into a wide string using std::codecvt_utf8<wchar_t>.
// Conversion errors are reported however std::wstring_convert reports them
// (no fallback string is configured here).
std::wstring convert_to_wstring(const std::string & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;
    return std::wstring_convert<utf8_codec>().from_bytes(input);
}
// Splits `str` into GPT-2 style "words" and appends them to `words`
// (existing entries in `words` are kept).
//
// The pattern recognises, in priority order: English contraction suffixes
// ('s, 't, 're, ...), runs of letters, runs of digits, runs of other
// non-space characters (each optionally preceded by one space), and runs of
// whitespace. Text that matches none of the alternatives is dropped.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);

    // Walk the successive non-overlapping matches from left to right.
    const std::sregex_iterator no_more_matches;
    for (std::sregex_iterator hit(str.begin(), str.end(), re); hit != no_more_matches; ++hit) {
        // Every sub-match of the result is recorded; the pattern has no
        // capture groups, so that is just the whole match.
        for (const auto & piece : *hit) {
            words.emplace_back(piece.str());
        }
    }
}
// Converts `text` into a sequence of vocabulary ids.
//
// Stage 1 cuts the text into words: if the vocabulary has special tokens,
// the text is first split around them (each special token becomes a word of
// its own) and the stretches in between go through gpt_split_words(); any
// text after the last special token is split the same way.
//
// Stage 2 encodes each word greedily: at every position the longest
// substring present in vocab.token_to_id is emitted. A character that starts
// no known token is reported on stderr and skipped.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // ---- stage 1: text -> words ----
    std::string remaining = text;

    if (!vocab.special_tokens.empty()) {
        // Build one alternation out of all special tokens, backslash-escaping
        // regex metacharacters so the tokens match literally.
        const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
        std::string special_tokens_subpattern;
        for (const auto & token : vocab.special_tokens) {
            if (!special_tokens_subpattern.empty()) {
                special_tokens_subpattern += "|";
            }
            special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
        }

        const std::regex special_re(special_tokens_subpattern);
        std::smatch hit;
        while (std::regex_search(remaining, hit, special_re)) {
            // ordinary text ahead of the special token
            gpt_split_words(hit.prefix(), words);
            // the special token itself is kept as a single word
            for (const auto & piece : hit) {
                words.emplace_back(piece.str());
            }
            remaining = hit.suffix();
        }
        // whatever follows the last special token is handled below
    }

    gpt_split_words(remaining, words);

    // ---- stage 2: words -> ids (greedy longest match) ----
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        const int word_len = (int) word.size();
        int start = 0;
        while (start < word_len) {
            bool matched = false;
            // try the longest candidate first, shrinking from the right
            for (int last = word_len - 1; last >= start; --last) {
                const auto found = vocab.token_to_id.find(word.substr(start, last - start + 1));
                if (found != vocab.token_to_id.end()) {
                    tokens.push_back(found->second);
                    start = last + 1;
                    matched = true;
                    break;
                }
            }
            if (!matched) {
                // not even the single character at `start` is in the vocabulary
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(start, 1).data());
                ++start;
            }
        }
    }

    return tokens;
}
// Normalised edit-distance similarity of two strings.
//
// Computes the Levenshtein distance between `s0` and `s1` (unit cost for
// insertion, deletion and substitution) and returns
//     1 - distance / max(len(s0), len(s1))
// so identical strings score 1.0 and strings with nothing in common score 0.0.
// Two empty strings are identical and score 1.0 (the formula alone would
// divide zero by zero and produce NaN).
float similarity(const std::string & s0, const std::string & s1) {
    const size_t longest = std::max(s0.size(), s1.size());
    if (longest == 0) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // Two-row dynamic programme: prevCol is the finished row for the first
    // i-1 characters of s0, col is the row being filled for the first i.
    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    // Row for the empty prefix of s0: turning "" into s1[0..j) takes j inserts.
    for (size_t j = 0; j < len1; j++) {
        prevCol[j] = static_cast<int>(j);
    }

    for (size_t i = 1; i < len0; i++) {
        col[0] = static_cast<int>(i); // i deletions to reach the empty prefix of s1
        for (size_t j = 1; j < len1; j++) {
            const int insertion    = 1 + col[j - 1];
            const int deletion     = 1 + prevCol[j];
            const int substitution = prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1);
            col[j] = std::min(std::min(insertion, deletion), substitution);
        }
        col.swap(prevCol);
    }

    // after the final swap the last completed row lives in prevCol
    const float dist = static_cast<float>(prevCol[len1 - 1]);
    return 1.0f - (dist / static_cast<float>(longest));
}

View File

@ -1,5 +1,3 @@
// Various helper functions and utilities
#pragma once
#include <string>
@ -8,15 +6,6 @@
#include <random>
#include <thread>
//
// CLI argument parsing
//
//
// Vocab utils
//
struct gpt_vocab {
using id = int32_t;
using token = std::string;
@ -28,16 +17,7 @@ struct gpt_vocab {
void add_special_token(const std::string & token);
};
void utreplace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
std::string convert_to_utf8(const std::wstring & input);
std::wstring convert_to_wstring(const std::string & input);
void gpt_split_words(std::string str, std::vector<std::string>& words);
typedef std::vector<gpt_vocab::id> token_sequence;
// split text into tokens
//
@ -49,8 +29,5 @@ void gpt_split_words(std::string str, std::vector<std::string>& words);
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
token_sequence gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
bool should_transpose_layer(std::string name);

View File

@ -0,0 +1,29 @@
#include "context.h"
std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context) {
int npast = 0;
// Compare the two sequences and find the first index at which they differ.
int max_length = std::min(ctx.active_context.size(), next_context.size());
for(int i=0; i<max_length; i++) {
npast = i;
if(ctx.active_context[i] != next_context[i]) {
break;
}
}
token_sequence new_context(next_context.size() - npast);
new_context.assign(next_context.begin() + npast, next_context.end());
return {new_context, npast};
}
// Commits a fast-forward result to `ctx`: the stored sequence is cut (or
// padded, if it was shorter) to fastforward_info.second entries and the tokens
// in fastforward_info.first are appended after that point.
void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info) {
    const token_sequence &fresh_tokens = fastforward_info.first;
    token_sequence &active = ctx.active_context;

    active.resize(fastforward_info.second);
    active.insert(active.end(), fresh_tokens.begin(), fresh_tokens.end());
}

View File

@ -0,0 +1,12 @@
#pragma once
#include <vector>
#include "common.h"
// Records the token sequence a context currently holds.
// transformer_context_fastforward() compares it against an incoming sequence
// and transformer_context_apply() rewrites it from a fast-forward result.
struct transformer_context {
// tokens currently recorded for this context, in order
token_sequence active_context;
};
// Compares ctx.active_context with next_context and returns
// {tail of next_context starting at index npast, npast}. Does not modify ctx.
// The caller evaluates the returned tail with npast as the number of past
// tokens, then hands the pair to transformer_context_apply().
std::pair<token_sequence, int> transformer_context_fastforward(const transformer_context &ctx, const token_sequence &next_context);
// Updates ctx.active_context from a transformer_context_fastforward() result:
// resizes it to fastforward_info.second entries, then appends the tokens in
// fastforward_info.first.
void transformer_context_apply(transformer_context &ctx, const std::pair<token_sequence, int> &fastforward_info);

File diff suppressed because it is too large Load Diff

View File

@ -65,7 +65,7 @@
// ggml_set_f32(a, 3.0f);
// ggml_set_f32(b, 4.0f);
//
// ggml_graph_compute(ctx0, &gf);
// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
//
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
@ -201,6 +201,8 @@
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4
#define GGML_UNUSED(x) (void)(x)
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
@ -209,6 +211,30 @@
} \
} while (0)
// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
// example:
//
// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
//
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
const type prefix##0 = (pointer)->array[0]; \
GGML_UNUSED(prefix##0);
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
const type prefix##1 = (pointer)->array[1]; \
GGML_UNUSED(prefix##1);
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
const type prefix##2 = (pointer)->array[2]; \
GGML_UNUSED(prefix##2);
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
const type prefix##3 = (pointer)->array[3]; \
GGML_UNUSED(prefix##3);
#ifdef __cplusplus
extern "C" {
#endif
@ -224,8 +250,8 @@ extern "C" {
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
struct ggml_object;
struct ggml_context;
@ -295,12 +321,15 @@ extern "C" {
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
GGML_OP_STEP,
GGML_OP_TANH,
GGML_OP_ELU,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_GELU_QUICK,
@ -332,9 +361,8 @@ extern "C" {
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_1D_S1_PH,
GGML_OP_CONV_1D_S2_PH,
GGML_OP_CONV_2D_SK_P0,
GGML_OP_CONV_1D,
GGML_OP_CONV_2D,
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
@ -390,9 +418,6 @@ extern "C" {
struct ggml_tensor * src1;
struct ggml_tensor * opt[GGML_MAX_OPT];
// thread scheduling
int n_tasks;
// performance
int perf_runs;
int64_t perf_cycles;
@ -404,19 +429,27 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4];
char padding[8];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
int n_tasks[GGML_MAX_NODES];
};
// computation graph
struct ggml_cgraph {
int n_nodes;
int n_leafs;
int n_threads;
size_t work_size;
struct ggml_tensor * work;
struct ggml_tensor * nodes[GGML_MAX_NODES];
struct ggml_tensor * grads[GGML_MAX_NODES];
@ -504,8 +537,6 @@ extern "C" {
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@ -692,6 +723,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// argmax along rows
GGML_API struct ggml_tensor * ggml_argmax(
struct ggml_context * ctx,
struct ggml_tensor * a);
// if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
GGML_API struct ggml_tensor * ggml_repeat(
@ -736,6 +772,22 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_tanh(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_tanh_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_elu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_elu_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_relu(
struct ggml_context * ctx,
struct ggml_tensor * a);
@ -1086,58 +1138,33 @@ extern "C" {
float min,
float max);
// TODO: implement general-purpose convolutions
// GGML_API struct ggml_tensor * ggml_conv_1d(
// struct ggml_context * ctx,
// struct ggml_tensor * a,
// struct ggml_tensor * b,
// int s0
// int p0,
// int d0);
//
// GGML_API struct ggml_tensor * ggml_conv_2d(
// struct ggml_context * ctx,
// struct ggml_tensor * a,
// struct ggml_tensor * b,
// int s0,
// int s1,
// int p0,
// int p1,
// int d0,
// int d1);
// padding = half
// TODO: we don't support extra parameters for now
// that's why we are hard-coding the stride, padding, and dilation
// not great ..
// example:
// a: 3 80 768 1
// b: 3000 80 1 1
// res: 3000 768 1 1
// used in whisper
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b,
int s0, // stride
int p0, // padding
int d0); // dilation
// used in whisper
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
// kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size
// padding is zero
// example:
// a: 16 16 3 768
// b: 1024 1024 3 1
// res: 64 64 768 1
// used in sam
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
// conv_1d with padding = half
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b,
int s,
int d);
GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx,
@ -1268,15 +1295,22 @@ extern "C" {
GGML_API void ggml_set_param(
struct ggml_context * ctx,
struct ggml_tensor * tensor);
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
@ -1493,25 +1527,24 @@ extern "C" {
//
#ifdef __cplusplus
// restrict not standard in C++
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference;
quantize_row_q_t quantize_row_q_dot;
vec_dot_q_t vec_dot_q;
enum ggml_type vec_dot_type;
} quantize_fns_t;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
} ggml_type_traits_t;
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
#ifdef __cplusplus
}

View File

@ -1,49 +1,42 @@
#include "ggml.h"
#include "otherarch.h"
#include "utils.h"
#include "defines.h"
#include "ggml/ggml.h"
#include "gpt_neox.h"
#include "common.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// load the model's weights from a file
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
AKLOGI("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
AKLOGE("%s: failed to open '%s'\n", __func__, fname.c_str());
return ModelLoadResult::FAIL;
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
AKLOGE("%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return ModelLoadResult::FAIL;
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
int32_t origmaxctx = model.hparams.n_ctx;
// load hparams
{
auto & hparams = model.hparams;
@ -59,17 +52,15 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
AKLOGI("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
AKLOGI("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
AKLOGI("%s: n_embd = %d\n", __func__, hparams.n_embd);
AKLOGI("%s: n_head = %d\n", __func__, hparams.n_head);
AKLOGI("%s: n_layer = %d\n", __func__, hparams.n_layer);
AKLOGI("%s: n_rot = %d\n", __func__, hparams.n_rot);
AKLOGI("%s: par_res = %d\n", __func__, hparams.par_res);
AKLOGI("%s: ftype = %d\n", __func__, hparams.ftype);
AKLOGI("%s: qntvr = %d\n", __func__, qntvr);
hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx);
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
printf("%s: qntvr = %d\n", __func__, qntvr);
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
@ -94,14 +85,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
}
}
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
AKLOGE("%s: invalid model file '%s' (bad ftype value %d)\n",
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
return false;
}
auto & ctx = model.ctx;
@ -142,25 +132,26 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 16*n_layer)*1024; // object overhead
AKLOGI("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
AKLOGE("%s: ggml_init() failed\n", __func__);
return ModelLoadResult::FAIL;
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
@ -241,7 +232,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx);
const int64_t n_mem = n_layer*n_ctx;
const int64_t n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
@ -249,7 +240,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
AKLOGI("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
// load weights
@ -257,7 +248,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
int n_tensors = 0;
size_t total_size = 0;
AKLOGI("%s: ", __func__);
printf("%s: ", __func__);
while (true) {
int32_t n_dims;
@ -283,83 +274,52 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
AKLOGE("%s: unknown tensor '%s' in model file\n", __func__, name.data());
return ModelLoadResult::FAIL;
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
AKLOGE("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return ModelLoadResult::FAIL;
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
AKLOGE("%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return ModelLoadResult::FAIL;
return false;
}
// for debugging
if (0) {
AKLOGI("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
AKLOGE("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
ggml_free(ctx);
return ModelLoadResult::RETRY_LOAD;
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
total_size += ggml_nbytes(tensor);
if (++n_tensors % 8 == 0) {
AKLOGI(".");
printf(".");
fflush(stdout);
}
}
AKLOGI(" done\n");
printf(" done\n");
AKLOGI("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
}
fin.close();
//gpu offload
#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
AKLOGE("%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
#if defined(GGML_USE_CLBLAST)
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
#else
ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
#endif
}
AKLOGE("%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif
return ModelLoadResult::SUCCESS;
return true;
}
@ -408,13 +368,12 @@ ggml_tensor * gpt_neox_ff(
// - embd_w: the predicted logits for the next token
//
bool gpt_neox_eval(
const gpt_neox_model & model,
gpt_neox_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token,
bool use_scratch) {
const token_sequence & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
@ -426,43 +385,40 @@ bool gpt_neox_eval(
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_rot;
// TODO: All of this allocates over 800 megabytes of memory, way more than the size of the model!
static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
// use 2 scratch buffers
// TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
static size_t scr0_size = 256u*1024*1024;
static void * scr0 = malloc(scr0_size);
static size_t scr1_size = 256u*1024*1024;
static void * scr1 = malloc(scr1_size);
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
//AKLOGI("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate
if (buf_size_new > buf_size)
{
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr)
{
AKLOGE("%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
return false;
}
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
return false;
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@ -473,9 +429,7 @@ bool gpt_neox_eval(
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
if(use_scratch){
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
}
// self-attention
{
@ -580,9 +534,7 @@ bool gpt_neox_eval(
}
}
if(use_scratch){
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
}
if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
@ -606,9 +558,7 @@ bool gpt_neox_eval(
}
}
if(use_scratch){
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
}
// norm
{
@ -622,9 +572,7 @@ bool gpt_neox_eval(
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
if(use_scratch){
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
}
// lm_head
{
@ -640,7 +588,18 @@ bool gpt_neox_eval(
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
if (plan.work_size > 0) {
if(model.work_buf.size() < plan.work_size) {
model.work_buf.resize(plan.work_size);
}
plan.work_data = model.work_buf.data();
}
ggml_graph_compute(&gf, &plan);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
@ -657,7 +616,7 @@ bool gpt_neox_eval(
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//AKLOGI("used_mem = %zu\n", ggml_used_mem(ctx0));
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);

View File

@ -0,0 +1,86 @@
#pragma once
#include "ggml/ggml.h"
#include "common.h"
// Hyper-parameters of a GPT-NeoX style model.
// The defaults below correspond to StableLM 3B.
struct gpt_neox_hparams {
    int32_t n_vocab{50257};
    int32_t n_ctx{4096};
    int32_t n_embd{4096};
    int32_t n_head{32};
    int32_t n_layer{16};
    int32_t n_rot{32};   // rotary_pct * (n_embd / n_head)
    int32_t par_res{1};  // 1 = true, 0 = false
    int32_t ftype{1};
};
// Tensor handles belonging to one GPT-NeoX transformer layer.
// Every handle starts out null so a default-constructed layer never carries
// indeterminate pointers.
struct gpt_neox_layer {
    // pre normalization
    struct ggml_tensor * ln_1_g = nullptr;
    struct ggml_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_tensor * c_attn_attn_w = nullptr;
    struct ggml_tensor * c_attn_attn_b = nullptr;

    struct ggml_tensor * c_attn_proj_w = nullptr;
    struct ggml_tensor * c_attn_proj_b = nullptr;

    // post normalization
    struct ggml_tensor * ln_2_g = nullptr;
    struct ggml_tensor * ln_2_b = nullptr;

    // ff
    struct ggml_tensor * c_mlp_fc_w = nullptr;
    struct ggml_tensor * c_mlp_fc_b = nullptr;

    struct ggml_tensor * c_mlp_proj_w = nullptr;
    struct ggml_tensor * c_mlp_proj_b = nullptr;
};
// A loaded GPT-NeoX model: hyper-parameters, tensor handles and the buffers
// that evaluation needs. All raw handles start out null so that a model
// which has not been loaded yet never exposes indeterminate pointers.
struct gpt_neox_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g = nullptr;
    struct ggml_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_tensor * wte = nullptr;

    struct ggml_tensor * lmh_g = nullptr; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    //
    struct ggml_context * ctx = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;

    // work buffer that gpt_neox_eval grows on demand and hands to the ggml
    // compute plan as its work_data
    std::vector<uint8_t> work_buf;
};
// Loads the model file `fname`, filling `model` and `vocab`.
// Returns false when loading fails (the JNI caller logs "Could not load model"
// and aborts opening the dictionary in that case).
bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab);
// evaluate the transformer
//
// - model: the model (taken by non-const reference because its work_buf is
//   resized to fit the compute plan)
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the tokens to evaluate; they are copied into an I32 tensor, so
//   these are token ids rather than embedding vectors
// - logits: the predicted logits for the next token
// - mem_per_token: in/out memory estimate; when it is 0 on entry it is set to
//   the memory used by this evaluation divided by the number of tokens
//
// Returns false when the evaluation buffer cannot be (re)allocated.
//
bool gpt_neox_eval(
gpt_neox_model & model,
const int n_threads,
const int n_past,
const token_sequence & embd_inp,
std::vector<float> & logits,
size_t & mem_per_token
);

View File

@ -1,466 +0,0 @@
#include <cassert>
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>
#include <vector>
#include "model_adapter.h"
#include <chrono>
// Start of the current measurement. Initialised during static initialisation
// and reset by timer_start().
static auto bench_timer = std::chrono::high_resolution_clock::now();

// Restarts the benchmark stopwatch.
void timer_start()
{
    bench_timer = std::chrono::high_resolution_clock::now();
}

// Returns the time elapsed since the stopwatch was last (re)started, in
// seconds. The elapsed time is truncated to whole milliseconds first.
double timer_check()
{
    using std::chrono::milliseconds;

    const auto now = std::chrono::high_resolution_clock::now();
    const auto elapsed_ms = std::chrono::duration_cast<milliseconds>(now - bench_timer).count();
    return elapsed_ms / 1000.0;
}
// Debug helper: writes the strings of `embd` to stdout as "[a,b,c]" followed
// by a newline.
void print_vec(std::vector<std::string> &embd)
{
    std::cout << "[";
    for (std::size_t k = 0; k < embd.size(); ++k)
    {
        if (k != 0)
        {
            std::cout << ',';
        }
        std::cout << embd[k];
    }
    std::cout << "]\n";
}
// Debug helper: writes the token ids of `embd` to stdout as "[1,2,3]"
// followed by a newline.
void print_tok_vec(std::vector<int> &embd)
{
    std::cout << "[";
    for (std::size_t k = 0; k < embd.size(); ++k)
    {
        if (k != 0)
        {
            std::cout << ',';
        }
        std::cout << embd[k];
    }
    std::cout << "]\n";
}
// Debug helper: writes the leading values of `embd` to stdout as "[a,b,c]"
// followed by a newline. At most the first 21 values are printed.
void print_tok_vec(std::vector<float> &embd)
{
    const std::size_t max_shown = 21;
    const std::size_t shown = embd.size() < max_shown ? embd.size() : max_shown;

    std::cout << "[";
    for (std::size_t k = 0; k < shown; ++k)
    {
        if (k != 0)
        {
            std::cout << ',';
        }
        std::cout << embd[k];
    }
    std::cout << "]\n";
}
// Inspects the header of the model file `fname` and guesses which on-disk
// format / model family it uses.
//
// Returns FileFormat::BADFORMAT when the file cannot be opened or its magic
// number is not one of the three recognised values; otherwise the FileFormat
// value selected by the heuristics below.
//
// All header words are zero-initialised before being read, so a truncated
// file yields a deterministic answer instead of a decision based on
// indeterminate memory.
FileFormat check_file_format(const std::string & fname)
{
    std::vector<char> f_buf(1024*1024);

    auto fin = std::ifstream(fname, std::ios::binary);
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return FileFormat::BADFORMAT;
    }

    FileFormat fileformat = FileFormat::BADFORMAT;
    uint32_t magic = 0;
    fin.read((char *) &magic, sizeof(magic));
    if (magic == 0x67676d6c) { //v1 format ggml, alpaca, old gptj and gpt2 models
        fileformat = FileFormat::GGML;
        //we need to read more to determine
        int32_t vocabsiz = 0;
        fin.read((char *) &vocabsiz, sizeof(int32_t));
        if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
        {
            fileformat = FileFormat::MPT_1;
        }
        else if(vocabsiz==50400) //know GPT-J vocab size
        {
            fileformat = FileFormat::GPTJ_1;
            uint32_t temp = 0;
            fin.read((char *)&temp, sizeof(temp)); //ctx
            fin.read((char *)&temp, sizeof(temp)); //n_embd
            fin.read((char *)&temp, sizeof(temp)); //n_head
            fin.read((char *)&temp, sizeof(temp)); //n_layer
            fin.read((char *)&temp, sizeof(temp)); //n_rot
            fin.read((char *)&temp, sizeof(temp)); //f16
            const int32_t qntvr = temp / 1000;
            temp %= 1000;
            if (qntvr != 0)
            {
                if (qntvr == 1)
                {
                    fileformat = FileFormat::GPTJ_4;
                }
                else
                {
                    fileformat = FileFormat::GPTJ_5;
                }
            }
            else if (temp != 0 && temp != 1)
            {
                fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
            }
        }
        else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
        {
            fileformat = FileFormat::GPT2_1;
            uint32_t temp = 0;
            fin.read((char *)&temp, sizeof(temp)); //ctx
            fin.read((char *)&temp, sizeof(temp)); //n_embd
            fin.read((char *)&temp, sizeof(temp)); //n_head
            fin.read((char *)&temp, sizeof(temp)); //n_layer
            fin.read((char *)&temp, sizeof(temp)); //f16
            const int32_t qntvr = temp / 1000;
            temp %= 1000;
            if (qntvr != 0)
            {
                if (qntvr == 1)
                {
                    fileformat = FileFormat::GPT2_3;
                }
                else
                {
                    fileformat = FileFormat::GPT2_4;
                }
            }
            else if (temp != 0 && temp != 1)
            {
                fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
            }
        }
        else if(vocabsiz < 31998 || vocabsiz > 33000)
        {
            //anything outside the llama v1 range is assumed to be NeoX
            fileformat = FileFormat::NEOX_6;
            uint32_t temp = 0;
            uint32_t temp2 = 0;
            fin.read((char *)&temp, sizeof(temp)); //ctx
            fin.read((char *)&temp, sizeof(temp)); //n_embd
            fin.read((char *)&temp, sizeof(temp)); //n_head
            fin.read((char *)&temp, sizeof(temp)); //n_layer
            fin.read((char *)&temp, sizeof(temp)); //n_rot
            fin.read((char *)&temp, sizeof(temp)); //either par_res or ftype (for older ver)

            if(temp!=0 && temp!=1){
                //must be ftype, means its an older model. par_res will be undefined
                fileformat = FileFormat::NEOX_2;
            }
            else
            {
                //it could be a newer model, or an old f16/f32 model
                fin.read((char *)&temp2, sizeof(temp2)); //if previous was par_res, this is ftype. else unknown

                //if it is new ftype, then it must have these properties: > 1000, low multiple of 1k and small remainder
                bool isNewFtype = (temp2>=1000 && temp2<=9000 && temp2%1000<20);
                if(!isNewFtype)
                {
                    fileformat = FileFormat::NEOX_2;
                    if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
                    {
                        //its a f16/f32 model in the new format
                        fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
                    }
                }
                else
                {
                    const int32_t qntvr = temp2 / 1000; //for future use
                    //then temp was par_res, use_parallel_residual is false in RedPajama
                    if(qntvr==1)
                    {
                        fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
                    }
                    else
                    {
                        fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
                    }
                }
            }
        }
    }
    else if(magic == 0x67676d66) //v2 format ggmf
    {
        fileformat = FileFormat::GGHF;
        uint32_t temp = 0;
        fin.read((char *)&temp, sizeof(temp)); //file version
        if(temp==100)
        {
            fileformat = FileFormat::RWKV_1;
        }
        else if(temp==101)
        {
            fileformat = FileFormat::RWKV_2;
        }
    }
    else if(magic == 0x67676a74) //v3 format ggjt
    {
        fileformat = FileFormat::GGJT_3; //ggjt by default
        uint32_t ver = 0;
        uint32_t temp = 0;
        uint32_t ftype = 0;
        fin.read((char *)&ver, sizeof(ver)); //file version
        fin.read((char *)&temp, sizeof(temp));//vocab
        fin.read((char *)&temp, sizeof(temp)); //embd
        fin.read((char *)&temp, sizeof(temp)); //mult
        fin.read((char *)&temp, sizeof(temp));//head
        fin.read((char *)&temp, sizeof(temp));//layer
        fin.read((char *)&temp, sizeof(temp));//rot
        fin.read((char *)&ftype, sizeof(ftype));//filetype

        if(ver==1)
        {
            fileformat = FileFormat::GGJT;
        }
        else if(ver==2)
        {
            fileformat = FileFormat::GGJT_2;
        }
    }
    fin.close();

    return fileformat;
}
// Returns true when `targetArray` begins with every element of `searchSeq`,
// in order. An empty `searchSeq` is a prefix of anything.
bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq)
{
    const std::size_t prefix_len = searchSeq.size();
    if (prefix_len > targetArray.size())
    {
        return false;
    }

    std::size_t matched = 0;
    while (matched < prefix_len && targetArray[matched] == searchSeq[matched])
    {
        ++matched;
    }
    return matched == prefix_len;
}
// Returns the index of the first position in `targetArray` at which
// `searchSeq` occurs as a contiguous run, or -1 when there is none.
// An empty `searchSeq` is found at index 0 of a non-empty `targetArray`;
// an empty `targetArray` always yields -1.
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq)
{
    const int needle_len = searchSeq.size();
    const int hay_len = targetArray.size();
    if (needle_len > hay_len)
    {
        return -1;
    }

    // Only start positions that leave room for the whole needle can match.
    for (int start = 0; start < hay_len && start + needle_len <= hay_len; ++start)
    {
        int matched = 0;
        while (matched < needle_len && targetArray[start + matched] == searchSeq[matched])
        {
            ++matched;
        }
        if (matched == needle_len)
        {
            return start;
        }
    }
    return -1;
}
// Returns the longest run of consecutive elements that appears in both `x`
// and `y` (despite the name, the run must be contiguous in both inputs).
// When several runs share the maximum length, the one that ends earliest in
// `x` is returned. The result is empty when nothing is shared.
std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y)
{
    const int rows = x.size();
    const int cols = y.size();

    // run_len[i][j] = length of the common run ending at x[i-1] and y[j-1];
    // row 0 and column 0 stay zero.
    std::vector<std::vector<int>> run_len(rows + 1, std::vector<int>(cols + 1, 0));

    int best_len = 0;
    int best_end = 0; // one past the last index of the best run inside x

    for (int i = 1; i <= rows; ++i)
    {
        for (int j = 1; j <= cols; ++j)
        {
            if (x[i - 1] != y[j - 1])
            {
                continue; // entry keeps its zero
            }
            const int current = run_len[i - 1][j - 1] + 1;
            run_len[i][j] = current;
            if (current > best_len)
            {
                best_len = current;
                best_end = i;
            }
        }
    }

    return std::vector<int>(x.begin() + (best_end - best_len), x.begin() + best_end);
}
// Skips re-evaluation of the part of a new prompt that is already present in
// the previous context ("fast forward"), optionally using a remembered
// "smart context" window to survive prompts that were shifted at max length.
//
// - current_context_tokens: tokens of the previous context (read only here)
// - embd_inp: in/out, the new prompt; the already-evaluated prefix is erased
// - n_past: in/out, incremented once per token that is fast forwarded; reset
//   to 0 when requireFullSubset is set and the context is not a prefix of
//   the prompt
// - nctx: context length, used to derive the smart-context thresholds
// - smartcontext: in/out remembered window; cleared whenever it is not
//   reusable and refilled when a new smart context is triggered
// - useSmartContext: enables the smart-context logic
// - requireFullSubset: only fast forward when embd_inp contains everything in
//   the current context
//
// Bounds guards were added where the previous version indexed embd_inp or
// current_context_tokens past their end; results for inputs that stayed in
// bounds are unchanged.
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
                        int &n_past, const int nctx, std::vector<int> &smartcontext,
                        bool useSmartContext, const bool requireFullSubset)
{
    const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
    const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
    const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
    const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
    const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext

    //fast forward the past based on identical tokens, stop once a divergence is noted
    int embd_inp_len = embd_inp.size();
    bool fastforwardok = true;

    for (int i = 0; i < current_context_tokens.size(); ++i)
    {
        if (i >= embd_inp_len)
        {
            // The context is longer than the prompt: there is nothing left to
            // compare, and embd_inp[i] must not be read.
            if (requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
            {
                n_past = 0;
                fastforwardok = false;
            }
            break;
        }

        if (current_context_tokens[i] == embd_inp[i])
        {
            n_past += 1;
        }
        else
        {
            if(requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
            {
                n_past = 0;
                fastforwardok = false;
            }
            break;
        }

        // Outside full-subset mode always leave at least one prompt token to evaluate.
        if (!requireFullSubset && (i + 2) >= embd_inp_len)
        {
            break;
        }
    }

    if(fastforwardok)
    {
        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
        embd_inp_len = embd_inp.size();
    }

    //smart context mode, detect if we have a shifted context at max length
    //requirement: previous context was at least nctx/2 longer than current,
    //mode is on, and current context already maxed.

    if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
    {
        //see if smartcontext is still usable
        auto shared = LongestCommonSubseq(smartcontext, embd_inp);
        if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least 32 tokens in common
        {
            int found = ArrFindIndexOf(embd_inp,shared);
            if(found>=0)
            {
                auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
                embd_inp = trimmed;
                embd_inp_len = embd_inp.size();
                printf("\n[Reusing Smart Context: %d allowance remaining]", found);

                int old_n_past = n_past;
                int offset_fix = old_n_past;
                // When n_past already covers the whole context the loop below
                // does not run, so only index the context when it is in range.
                if (n_past >= static_cast<int>(current_context_tokens.size())
                    || current_context_tokens[n_past] != embd_inp[0])
                {
                    offset_fix = 0;
                }

                for (int i = n_past; i < current_context_tokens.size(); ++i)
                {
                    const int inp_idx = i - offset_fix;
                    if (inp_idx >= embd_inp_len)
                    {
                        break; // never read past the trimmed prompt
                    }
                    if (current_context_tokens[i] == embd_inp[inp_idx])
                    {
                        n_past += 1;
                    }
                    else
                    {
                        break;
                    }
                    if ((i + 2 - offset_fix) >= embd_inp_len)
                    {
                        break;
                    }
                }

                embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));

            }else{
                smartcontext.clear();
            }
        }
        else
        {
            smartcontext.clear();
        }
    }
    else
    {
        smartcontext.clear();
    }

    if(fastforwardok && useSmartContext
    && smartcontext.size()==0 && current_context_tokens.size() >= SCCtxLenThreshold
    && embd_inp_len >= SCInpLenThreshold
    && current_context_tokens.size() - n_past > SCPastLenThreshold)
    {
        //determine longest common substring after removing start part
        int shiftamt = embd_inp.size() * SCTruncationRatio;
        smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
        printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt);

        embd_inp = smartcontext;
        //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes LCS marker.
        //when a future prompt comes in, find the LCS again. If LCS > a length and LCS starts with memorized LCS
        //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift
        //if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B
    }
}

View File

@ -1,67 +0,0 @@
#pragma once
#include <cassert>
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>
#include <vector>
// On-disk model formats distinguished by check_file_format(). The hundreds
// digit groups the entries by model family.
enum FileFormat
{
    BADFORMAT = 0,  // unknown, uninitialised, or failed to load

    // llama-style containers
    GGML      = 1,  // original llama ggml, alpaca, GPT4ALL, GPTJ header
    GGHF      = 2,  // llama ggmf
    GGJT      = 3,  // llama ggjt
    GGJT_2    = 4,  // newer llama format, unshuffled
    GGJT_3    = 5,  // using 16bit scalar

    GPTJ_1    = 100, // the very first super old GPTJ format
    GPTJ_2    = 101, // pygmalion, uses old ggml lib
    GPTJ_3    = 102, // uses new ggml lib
    GPTJ_4    = 103, // unshuffled
    GPTJ_5    = 104, // using 16bit scalar

    GPT2_1    = 200,
    GPT2_2    = 201,
    GPT2_3    = 202, // unshuffled
    GPT2_4    = 203, // using 16bit scalar

    RWKV_1    = 300,
    RWKV_2    = 301,

    NEOX_1    = 400,
    NEOX_2    = 401,
    NEOX_3    = 402, // redpajama
    NEOX_4    = 403, // unshuffled
    NEOX_5    = 404, // unshuffled redpajama
    NEOX_6    = 405, // using 16bit scalar
    NEOX_7    = 406, // using 16bit scalar redpajama

    MPT_1     = 500, // first supported mpt version
};
// Outcome reported by the model loaders.
enum ModelLoadResult
{
    FAIL       = 0,
    SUCCESS    = 1,
    RETRY_LOAD = 2, // used if it's suspected that the model is an older format
};
// Restarts the benchmark stopwatch read by timer_check().
void timer_start();
// Returns the seconds elapsed since the stopwatch was last (re)started.
double timer_check();
// Debug helpers that print a vector to stdout as "[a,b,c]" plus a newline.
void print_tok_vec(std::vector<int> &embd);
// Float overload: prints at most the first 21 values.
void print_tok_vec(std::vector<float> &embd);
void print_vec(std::vector<std::string> &embd);
// Returns the longest run of consecutive elements shared by x and y
// (the implementation looks for a contiguous run, not a general subsequence).
std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> y);
// True when targetArray begins with searchSeq.
bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq);
// Index of the first occurrence of searchSeq inside targetArray, or -1.
int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);
// Guesses the on-disk format of the model file; BADFORMAT when it cannot be
// opened or its magic number is unknown.
FileFormat check_file_format(const std::string & fname);
// Erases from embd_inp the prefix already covered by current_context_tokens,
// advancing n_past accordingly, and maintains the smartcontext window.
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
int &n_past, const int nctx, std::vector<int> &smartcontext,
const bool useSmartContext, const bool requireFullSubset);

View File

@ -1,464 +0,0 @@
#pragma once
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "utils.h"
#include "model_adapter.h"
// Hyper-parameters of a GPT-J model; the defaults correspond to GPT-J 6B.
struct gptj_hparams {
    int32_t n_vocab{50400};
    int32_t n_ctx{2048};
    int32_t n_embd{4096};
    int32_t n_head{16};
    int32_t n_layer{28};
    int32_t n_rot{64};
    int32_t ftype{1};
};
// Tensor handles of one GPT-J layer (current ggml API). All handles start
// out null instead of indeterminate.
struct gptj_layer {
    // normalization
    struct ggml_tensor * ln_1_g = nullptr;
    struct ggml_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_tensor * c_attn_q_proj_w = nullptr;
    struct ggml_tensor * c_attn_k_proj_w = nullptr;
    struct ggml_tensor * c_attn_v_proj_w = nullptr;

    struct ggml_tensor * c_attn_proj_w = nullptr;

    // ff
    struct ggml_tensor * c_mlp_fc_w = nullptr;
    struct ggml_tensor * c_mlp_fc_b = nullptr;

    struct ggml_tensor * c_mlp_proj_w = nullptr;
    struct ggml_tensor * c_mlp_proj_b = nullptr;
};
// Tensor handles of one GPT-J layer for the ggml_v2 API. All handles start
// out null instead of indeterminate.
struct gptj_layer_v2 {
    // normalization
    struct ggml_v2_tensor * ln_1_g = nullptr;
    struct ggml_v2_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_v2_tensor * c_attn_q_proj_w = nullptr;
    struct ggml_v2_tensor * c_attn_k_proj_w = nullptr;
    struct ggml_v2_tensor * c_attn_v_proj_w = nullptr;

    struct ggml_v2_tensor * c_attn_proj_w = nullptr;

    // ff
    struct ggml_v2_tensor * c_mlp_fc_w = nullptr;
    struct ggml_v2_tensor * c_mlp_fc_b = nullptr;

    struct ggml_v2_tensor * c_mlp_proj_w = nullptr;
    struct ggml_v2_tensor * c_mlp_proj_w_trans = nullptr; //for backwards compatibility
    struct ggml_v2_tensor * c_mlp_proj_b = nullptr;
};
// Tensor handles of one GPT-J layer for the ggml_v1 API. All handles start
// out null instead of indeterminate.
struct gptj_layer_v1 {
    // normalization
    struct ggml_v1_tensor * ln_1_g = nullptr;
    struct ggml_v1_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_v1_tensor * c_attn_q_proj_w = nullptr;
    struct ggml_v1_tensor * c_attn_k_proj_w = nullptr;
    struct ggml_v1_tensor * c_attn_v_proj_w = nullptr;

    struct ggml_v1_tensor * c_attn_proj_w = nullptr;

    // ff
    struct ggml_v1_tensor * c_mlp_fc_w = nullptr;
    struct ggml_v1_tensor * c_mlp_fc_b = nullptr;

    struct ggml_v1_tensor * c_mlp_proj_w = nullptr;
    struct ggml_v1_tensor * c_mlp_proj_w_trans = nullptr; //for backwards compatibility
    struct ggml_v1_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-J model held through the ggml_v1 API. Raw handles start out null so
// an unloaded model never exposes indeterminate pointers.
struct gptj_v1_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_v1_tensor * ln_f_g = nullptr;
    struct ggml_v1_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_v1_tensor * wte = nullptr;

    struct ggml_v1_tensor * lmh_g = nullptr; // language model head
    struct ggml_v1_tensor * lmh_b = nullptr; // language model bias

    std::vector<gptj_layer_v1> layers;

    // key + value memory
    struct ggml_v1_tensor * memory_k = nullptr;
    struct ggml_v1_tensor * memory_v = nullptr;

    //
    struct ggml_v1_context * ctx = nullptr;
    std::map<std::string, struct ggml_v1_tensor *> tensors;
};
// A GPT-J model held through the ggml_v2 API. Raw handles start out null so
// an unloaded model never exposes indeterminate pointers.
struct gptj_v2_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g = nullptr;
    struct ggml_v2_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_v2_tensor * wte = nullptr;

    struct ggml_v2_tensor * lmh_g = nullptr; // language model head
    struct ggml_v2_tensor * lmh_b = nullptr; // language model bias

    std::vector<gptj_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k = nullptr;
    struct ggml_v2_tensor * memory_v = nullptr;

    //
    struct ggml_v2_context * ctx = nullptr;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};
// A GPT-J model held through the current ggml API. Raw handles start out
// null so an unloaded model never exposes indeterminate pointers.
struct gptj_model {
    gptj_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g = nullptr;
    struct ggml_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_tensor * wte = nullptr;

    struct ggml_tensor * lmh_g = nullptr; // language model head
    struct ggml_tensor * lmh_b = nullptr; // language model bias

    std::vector<gptj_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    //
    struct ggml_context * ctx = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;
};
// Hyper-parameters of a GPT-2 model; the defaults correspond to GPT-2 117M.
struct gpt2_hparams {
    int32_t n_vocab{50257};
    int32_t n_ctx{1024};
    int32_t n_embd{768};
    int32_t n_head{12};
    int32_t n_layer{12};
    int32_t ftype{1};
};
// Tensor handles of one GPT-2 layer for the ggml_v1 API. All handles start
// out null instead of indeterminate.
struct gpt2_v1_layer {
    // normalization
    struct ggml_v1_tensor * ln_1_g = nullptr;
    struct ggml_v1_tensor * ln_1_b = nullptr;

    struct ggml_v1_tensor * ln_2_g = nullptr;
    struct ggml_v1_tensor * ln_2_b = nullptr;

    // attention
    struct ggml_v1_tensor * c_attn_attn_w = nullptr;
    struct ggml_v1_tensor * c_attn_attn_b = nullptr;

    struct ggml_v1_tensor * c_attn_proj_w = nullptr;
    struct ggml_v1_tensor * c_attn_proj_b = nullptr;

    // mlp
    struct ggml_v1_tensor * c_mlp_fc_w = nullptr;
    struct ggml_v1_tensor * c_mlp_fc_b = nullptr;

    struct ggml_v1_tensor * c_mlp_proj_w_trans = nullptr; // transposed for efficiency
    struct ggml_v1_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-2 model held through the ggml_v1 API. Raw handles start out null so
// an unloaded model never exposes indeterminate pointers.
struct gpt2_v1_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_v1_tensor * ln_f_g = nullptr;
    struct ggml_v1_tensor * ln_f_b = nullptr;

    // NOTE(review): the original comments labelled wte "position embedding"
    // and wpe "token embedding"; the names suggest the opposite -- confirm.
    struct ggml_v1_tensor * wte = nullptr;
    struct ggml_v1_tensor * wpe = nullptr;

    std::vector<gpt2_v1_layer> layers;

    // key + value memory
    struct ggml_v1_tensor * memory_k = nullptr;
    struct ggml_v1_tensor * memory_v = nullptr;

    //
    struct ggml_v1_context * ctx = nullptr;
    std::map<std::string, struct ggml_v1_tensor *> tensors;
};
// Tensor handles of one GPT-2 layer for the ggml_v2 API. All handles start
// out null instead of indeterminate.
struct gpt2_layer_v2 {
    // normalization
    struct ggml_v2_tensor * ln_1_g = nullptr;
    struct ggml_v2_tensor * ln_1_b = nullptr;

    struct ggml_v2_tensor * ln_2_g = nullptr;
    struct ggml_v2_tensor * ln_2_b = nullptr;

    // attention
    struct ggml_v2_tensor * c_attn_attn_w = nullptr;
    struct ggml_v2_tensor * c_attn_attn_b = nullptr;

    struct ggml_v2_tensor * c_attn_proj_w = nullptr;
    struct ggml_v2_tensor * c_attn_proj_b = nullptr;

    // mlp
    struct ggml_v2_tensor * c_mlp_fc_w = nullptr;
    struct ggml_v2_tensor * c_mlp_fc_b = nullptr;

    struct ggml_v2_tensor * c_mlp_proj_w = nullptr;
    struct ggml_v2_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-2 model held through the ggml_v2 API. Raw handles start out null so
// an unloaded model never exposes indeterminate pointers.
struct gpt2_v2_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g = nullptr;
    struct ggml_v2_tensor * ln_f_b = nullptr;

    // NOTE(review): the original comments labelled wte "position embedding"
    // and wpe "token embedding"; the names suggest the opposite -- confirm.
    struct ggml_v2_tensor * wte = nullptr;
    struct ggml_v2_tensor * wpe = nullptr;
    struct ggml_v2_tensor * lm_head = nullptr; // language model head

    std::vector<gpt2_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k = nullptr;
    struct ggml_v2_tensor * memory_v = nullptr;

    //
    struct ggml_v2_context * ctx = nullptr;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};
// Tensor handles of one GPT-2 layer (current ggml API). All handles start
// out null instead of indeterminate.
struct gpt2_layer {
    // normalization
    struct ggml_tensor * ln_1_g = nullptr;
    struct ggml_tensor * ln_1_b = nullptr;

    struct ggml_tensor * ln_2_g = nullptr;
    struct ggml_tensor * ln_2_b = nullptr;

    // attention
    struct ggml_tensor * c_attn_attn_w = nullptr;
    struct ggml_tensor * c_attn_attn_b = nullptr;

    struct ggml_tensor * c_attn_proj_w = nullptr;
    struct ggml_tensor * c_attn_proj_b = nullptr;

    // mlp
    struct ggml_tensor * c_mlp_fc_w = nullptr;
    struct ggml_tensor * c_mlp_fc_b = nullptr;

    struct ggml_tensor * c_mlp_proj_w = nullptr;
    struct ggml_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-2 model held through the current ggml API. Raw handles start out
// null so an unloaded model never exposes indeterminate pointers.
struct gpt2_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g = nullptr;
    struct ggml_tensor * ln_f_b = nullptr;

    // NOTE(review): the original comments labelled wte "position embedding"
    // and wpe "token embedding"; the names suggest the opposite -- confirm.
    struct ggml_tensor * wte = nullptr;
    struct ggml_tensor * wpe = nullptr;
    struct ggml_tensor * lm_head = nullptr; // language model head

    std::vector<gpt2_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    //
    struct ggml_context * ctx = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;
};
// Hyper-parameters of a GPT-NeoX style model.
// The defaults below correspond to StableLM 3B.
struct gpt_neox_hparams {
    int32_t n_vocab{50257};
    int32_t n_ctx{4096};
    int32_t n_embd{4096};
    int32_t n_head{32};
    int32_t n_layer{16};
    int32_t n_rot{32};   // rotary_pct * (n_embd / n_head)
    int32_t par_res{1};  // 1 = true, 0 = false
    int32_t ftype{1};
};
// Tensor handles of one GPT-NeoX layer for the ggml_v2 API. All handles
// start out null instead of indeterminate.
struct gpt_neox_layer_v2 {
    // pre normalization
    struct ggml_v2_tensor * ln_1_g = nullptr;
    struct ggml_v2_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_v2_tensor * c_attn_attn_w = nullptr;
    struct ggml_v2_tensor * c_attn_attn_b = nullptr;

    struct ggml_v2_tensor * c_attn_proj_w = nullptr;
    struct ggml_v2_tensor * c_attn_proj_b = nullptr;

    // post normalization
    struct ggml_v2_tensor * ln_2_g = nullptr;
    struct ggml_v2_tensor * ln_2_b = nullptr;

    // ff
    struct ggml_v2_tensor * c_mlp_fc_w = nullptr;
    struct ggml_v2_tensor * c_mlp_fc_b = nullptr;

    struct ggml_v2_tensor * c_mlp_proj_w = nullptr;
    struct ggml_v2_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-NeoX model held through the ggml_v2 API. Raw handles start out null
// so an unloaded model never exposes indeterminate pointers.
struct gpt_neox_v2_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_v2_tensor * ln_f_g = nullptr;
    struct ggml_v2_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_v2_tensor * wte = nullptr;

    struct ggml_v2_tensor * lmh_g = nullptr; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer_v2> layers;

    // key + value memory
    struct ggml_v2_tensor * memory_k = nullptr;
    struct ggml_v2_tensor * memory_v = nullptr;

    //
    struct ggml_v2_context * ctx = nullptr;
    std::map<std::string, struct ggml_v2_tensor *> tensors;
};
// Tensor handles belonging to one GPT-NeoX transformer layer.
// Every handle starts out null so a default-constructed layer never carries
// indeterminate pointers.
struct gpt_neox_layer {
    // pre normalization
    struct ggml_tensor * ln_1_g = nullptr;
    struct ggml_tensor * ln_1_b = nullptr;

    // attention
    struct ggml_tensor * c_attn_attn_w = nullptr;
    struct ggml_tensor * c_attn_attn_b = nullptr;

    struct ggml_tensor * c_attn_proj_w = nullptr;
    struct ggml_tensor * c_attn_proj_b = nullptr;

    // post normalization
    struct ggml_tensor * ln_2_g = nullptr;
    struct ggml_tensor * ln_2_b = nullptr;

    // ff
    struct ggml_tensor * c_mlp_fc_w = nullptr;
    struct ggml_tensor * c_mlp_fc_b = nullptr;

    struct ggml_tensor * c_mlp_proj_w = nullptr;
    struct ggml_tensor * c_mlp_proj_b = nullptr;
};
// A GPT-NeoX model held through the current ggml API. Raw handles start out
// null so an unloaded model never exposes indeterminate pointers.
struct gpt_neox_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g = nullptr;
    struct ggml_tensor * ln_f_b = nullptr;

    // presumably the token embedding table (the original comment said
    // "position embedding") -- TODO confirm against the loader
    struct ggml_tensor * wte = nullptr;

    struct ggml_tensor * lmh_g = nullptr; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    //
    struct ggml_context * ctx = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;
};
// Hyper-parameters of an MPT model. There are no model-specific defaults for
// now; every field starts at zero.
struct mpt_hparams {
    int32_t d_model{0};
    int32_t max_seq_len{0};
    int32_t n_heads{0};
    int32_t n_layers{0};
    int32_t n_vocab{0};
    float alibi_bias_max{0.0f};
    float clip_qkv{0.0f};
    int32_t ftype{0};
    int32_t n_ctx{0};
};
// Tensor handles of one MPT layer. All handles start out null instead of
// indeterminate.
struct mpt_layer {
    // pre normalization
    struct ggml_tensor * norm_1_weight = nullptr;

    // attention
    struct ggml_tensor * c_attn_wqkv_weight = nullptr;
    struct ggml_tensor * c_attn_out_proj_weight = nullptr;

    // post normalization
    struct ggml_tensor * norm_2_weight = nullptr;

    // ff
    struct ggml_tensor * ffn_up_proj = nullptr;
    struct ggml_tensor * ffn_down_proj = nullptr;
};
// An MPT model held through the current ggml API. Raw handles start out null
// so an unloaded model never exposes indeterminate pointers.
struct mpt_model {
    mpt_hparams hparams;

    // NOTE(review): the original comments labelled wte_weight "position
    // embedding" and norm_f_weight "language model head"; the names suggest a
    // token embedding table and a final normalization weight -- confirm.
    struct ggml_tensor * wte_weight = nullptr;
    struct ggml_tensor * norm_f_weight = nullptr;

    std::vector<mpt_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    struct ggml_context * ctx = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;
};
// Loads the model file `fname` into `model` and `vocab`, interpreting it
// according to `file_format`. The caller treats any result other than
// ModelLoadResult::SUCCESS as a failed load.
// NOTE(review): `gpulayers` is presumably the number of layers to offload;
// the JNI caller passes 0 -- confirm against the definition.
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers);
// Evaluates the transformer on the tokens in `embd_inp`, given `n_past`
// tokens of existing context, using `n_threads` threads.
// - embd_w: output vector; NOTE(review): presumably receives the logits for
//   the next token -- confirm against the definition
// - mem_per_token: in/out memory estimate; set from the memory used by the
//   evaluation when it is 0 on entry
// - use_scratch: when true, ggml scratch buffers are installed around the
//   per-layer computations
// Returns false when the evaluation buffer cannot be (re)allocated.
bool gpt_neox_eval(
const gpt_neox_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token,
bool use_scratch);

View File

@ -1,224 +0,0 @@
#include "utils.h"
#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>
// Replaces every occurrence of `needle` in `str` with `replacement`, scanning
// left to right. Text inserted by a replacement is never rescanned, so the
// call terminates even when `replacement` contains `needle`.
// An empty `needle` leaves `str` untouched.
void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
    // An empty needle matches at every position without consuming input, so
    // the loop below would never terminate.
    if (needle.empty()) {
        return;
    }

    size_t pos = 0;
    while ((pos = str.find(needle, pos)) != std::string::npos) {
        str.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
}
// Reads the file `fname` and parses it as a flat JSON object that maps string
// keys to integers (a vocabulary file), returning the key -> value map.
//
// This is a minimal hand-written scanner, not a general JSON parser:
// - content that does not start with '{' yields an empty map;
// - entries whose value cannot be converted with std::stoi are skipped;
// - the escapes \u0120, \u010a and \" in keys are translated to a space, a
//   newline and a double quote respectively.
//
// The process is terminated with exit(1) when the file cannot be opened.
//
// Every scan is bounded by the length of the input, so truncated or malformed
// content can no longer walk past the end of the buffer.
std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    if (json.empty() || json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key  = false;
        bool in_token = false;

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    // keep the backslash; the escaped character is appended below
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        // closing quote of the key: skip to the value
                        has_key = true;
                        ++i;
                        while (i < n && json[i] == ' ') ++i;
                        ++i; // :
                        while (i < n && json[i] == ' ') ++i;
                        if (i >= n || json[i] != '\"') {
                            // unquoted value: runs up to the next ',' or '}'
                            while (i < n && json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            // quoted value: keep collecting it as a token
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    ::utreplace(str_key, "\\u0120", " " ); // \u0120 -> space
                    ::utreplace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    ::utreplace(str_key, "\\\"",    "\""); // \\\"   -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());

                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}
// Appends `token` to this vocabulary's list of special tokens.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.emplace_back(token);
}
// Encodes the wide string `input` as UTF-8. Conversion failures propagate as
// the exception raised by std::wstring_convert.
std::string convert_to_utf8(const std::wstring & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;
    return std::wstring_convert<utf8_codec>{}.to_bytes(input);
}
// Decodes the UTF-8 string `input` into a wide string. Any failure during the
// conversion (including invalid UTF-8) results in an empty string.
std::wstring convert_to_wstring(const std::string & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;
    try {
        return std::wstring_convert<utf8_codec>{}.from_bytes(input);
    } catch (...) {
        // std::range_error and everything else map to the same result
        return L"";
    }
}
// Splits `str` into GPT-2 style pre-tokenization pieces (contractions, runs of
// letters, runs of digits, runs of other symbols, whitespace) and appends the
// pieces to `words` in order of appearance.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::regex re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    // Walk the string with iterators; each search resumes right after the
    // previous match.
    std::smatch m;
    std::string::const_iterator cursor = str.cbegin();
    const std::string::const_iterator stop = str.cend();

    while (std::regex_search(cursor, stop, m, re)) {
        for (const auto & piece : m) {
            words.push_back(piece.str());
        }
        cursor = m.suffix().first;
    }
}
// Converts `text` into a sequence of token ids from `vocab`.
//
// The text is first cut at the vocabulary's special tokens (which become
// words of their own), the remaining stretches are split with
// gpt_split_words(), and each word is then covered greedily from left to
// right with the longest vocabulary entry that matches at the current
// position. A character that starts no vocabulary entry is reported on
// stderr and skipped.
//
// The matching loop counts candidate lengths downwards with a strictly
// positive bound. The previous unsigned `j >= i` index loop wrapped around
// when the first character of a word was unknown, which ended in a hang or
// unbounded token growth.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        size_t i = 0;
        while (i < word.size()) {
            bool matched = false;
            // try the longest candidate starting at i first
            for (size_t len = word.size() - i; len > 0; --len) {
                auto it = vocab.token_to_id.find(word.substr(i, len));
                if (it != vocab.token_to_id.end()) {
                    tokens.push_back(it->second);
                    i += len;
                    matched = true;
                    break;
                }
            }
            if (!matched) { // word.substr(i, 1) has no matching
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                i++;
            }
        }
    }

    return tokens;
}
// Returns true when the tensor name `name` contains one of the markers that
// identify a weight which has to be transposed.
bool should_transpose_layer(std::string name)
{
    const char * const markers[] = {
        ".mlp.fc_in.weight",
        ".attn.out_proj.weight",
        ".attn.q_proj.weight",
        ".attn.k_proj.weight",
        ".attn.v_proj.weight",
        "/attn/c_attn/w",
        "/attn/c_proj/w",
        "/mlp/c_fc/w",
        "/mlp/c_proj/w",
    };

    for (const char * marker : markers)
    {
        if (name.find(marker) != std::string::npos)
        {
            return true;
        }
    }
    return false;
}