2023-11-07 14:48:48 +00:00
|
|
|
//
|
|
|
|
// Created by alex on 11/7/23.
|
|
|
|
//
|
|
|
|
|
|
|
|
#include <chrono>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "org_futo_inputmethod_latin_xlm_AdapterTrainer.h"
#include "defines.h"
#include "jni_common.h"
#include "ggml/finetune.h"
#include "sentencepiece/sentencepiece_processor.h"
#include "jni_utils.h"
#include "ggml/ModelMeta.h"
|
2023-11-07 14:48:48 +00:00
|
|
|
|
|
|
|
namespace latinime {
|
|
|
|
struct AdapterTrainerState {
|
|
|
|
std::string baseModelPath;
|
2023-11-14 18:40:00 +00:00
|
|
|
std::string loraCachePath;
|
|
|
|
std::string outputModelPath;
|
|
|
|
float outputScale;
|
2023-11-07 14:48:48 +00:00
|
|
|
|
2024-01-30 15:14:02 +00:00
|
|
|
ModelMetadata metadata;
|
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
sentencepiece::SentencePieceProcessor spm;
|
|
|
|
struct train_params params;
|
|
|
|
|
2023-11-14 16:11:00 +00:00
|
|
|
static void OnLossCallback(void *userdata, float loss) {
|
|
|
|
auto *state = reinterpret_cast<AdapterTrainerState *>(userdata);
|
|
|
|
state->OnLoss(loss);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void OnProgressCallback(void *userdata, float progress) {
|
|
|
|
auto *state = reinterpret_cast<AdapterTrainerState *>(userdata);
|
|
|
|
state->OnProgress(progress);
|
|
|
|
}
|
|
|
|
|
|
|
|
JNIEnv *env;
|
|
|
|
jobject callbackObject;
|
|
|
|
jmethodID lossMethodId;
|
|
|
|
jmethodID progressMethodId;
|
|
|
|
void OnLoss(float loss) const {
|
|
|
|
env->CallVoidMethod(callbackObject, lossMethodId, loss);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnProgress(float progress) const {
|
|
|
|
env->CallVoidMethod(callbackObject, progressMethodId, progress);
|
|
|
|
}
|
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
bool Initialize() {
|
2024-01-30 15:14:02 +00:00
|
|
|
metadata = loadModelMetadata(baseModelPath.c_str());
|
|
|
|
|
|
|
|
// TODO: Gracefully handle errors
|
|
|
|
ASSERT(!metadata.error);
|
|
|
|
ASSERT(metadata.ext_tokenizer_type == ExternalTokenizerType::SentencePiece);
|
|
|
|
|
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
params = get_default_train_params();
|
|
|
|
params.common.fn_train_data = "";
|
|
|
|
params.common.fn_checkpoint_in = "";
|
|
|
|
params.common.fn_checkpoint_out = "";
|
|
|
|
params.fn_model_base = baseModelPath.c_str();
|
2023-11-14 18:40:00 +00:00
|
|
|
params.fn_lora_out = loraCachePath.c_str();
|
2023-11-07 14:48:48 +00:00
|
|
|
|
|
|
|
params.common.fill_with_next_samples = true;
|
2023-11-13 14:42:01 +00:00
|
|
|
params.common.n_threads = 6;
|
|
|
|
params.common.n_gradient_accumulation = 2;
|
|
|
|
params.common.n_batch = 2;
|
2023-11-21 15:07:43 +00:00
|
|
|
params.common.n_ctx = 64;
|
2023-11-13 14:42:01 +00:00
|
|
|
params.common.sample_random_offsets = true;
|
|
|
|
|
|
|
|
params.common.warmup = 10;
|
2023-11-14 09:43:36 +00:00
|
|
|
params.common.n_epochs = 1;
|
2023-11-07 14:48:48 +00:00
|
|
|
params.common.adam_alpha = 1e-3;
|
2023-11-21 15:07:43 +00:00
|
|
|
params.common.adam_n_iter = 128;
|
2023-11-13 14:42:01 +00:00
|
|
|
|
|
|
|
// Increasing/decreasing this doesn't appear to significantly affect training time
|
|
|
|
params.lora_r = 16;
|
|
|
|
params.lora_alpha = 16;
|
2023-11-07 14:48:48 +00:00
|
|
|
|
2023-11-14 16:11:00 +00:00
|
|
|
params.common.callbacks.userdata = this;
|
|
|
|
params.common.callbacks.loss = AdapterTrainerState::OnLossCallback;
|
|
|
|
params.common.callbacks.progress = AdapterTrainerState::OnProgressCallback;
|
|
|
|
|
2024-01-30 15:14:02 +00:00
|
|
|
if(!spm.LoadFromSerializedProto(metadata.ext_tokenizer_data).ok()){
|
|
|
|
AKLOGE("Failed to load tokenizer!");
|
2023-11-07 14:48:48 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void AddTrainingExample(const std::string &example) {
|
|
|
|
std::vector<llama_token> result = spm.EncodeAsIds(example);
|
|
|
|
params.training_data.push_back(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
int Train() const {
|
|
|
|
return finetune_train(params);
|
|
|
|
}
|
2024-01-30 15:14:02 +00:00
|
|
|
|
|
|
|
void UpdateHistoryAndCount(std::chrono::system_clock::time_point start, std::chrono::system_clock::time_point end) {
|
|
|
|
std::chrono::duration<double> elapsed_seconds = end - start;
|
|
|
|
|
|
|
|
int num_examples = params.training_data.size();
|
|
|
|
int num_tokens = 0;
|
|
|
|
for(const auto & example: params.training_data) {
|
|
|
|
num_tokens += example.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
time_t rawtime;
|
|
|
|
struct tm * timeinfo;
|
|
|
|
char date_time[32];
|
|
|
|
|
|
|
|
// Convert time_point to time_t
|
|
|
|
rawtime = std::chrono::system_clock::to_time_t(start);
|
|
|
|
// Convert time_t to tm struct
|
|
|
|
timeinfo = localtime(&rawtime);
|
|
|
|
|
|
|
|
// Format the date and time in ISO format
|
|
|
|
strftime(date_time, sizeof(date_time), "%Y-%m-%d %H:%M:%SZ", timeinfo);
|
|
|
|
|
|
|
|
// Create a stringstream object
|
|
|
|
std::stringstream ss;
|
|
|
|
|
|
|
|
// Format the string using the stringstream object
|
|
|
|
ss << "\n" << date_time << ": Fine-tuned on " << num_examples << " examples (" << num_tokens << " tokens), took "
|
|
|
|
<< std::fixed << std::setprecision(2) << elapsed_seconds.count() / 60.0 << " minutes";
|
|
|
|
|
|
|
|
// Convert the stringstream object to a std::string
|
|
|
|
std::string result = ss.str();
|
|
|
|
|
|
|
|
metadata.finetuning_count += 1;
|
|
|
|
metadata.history.append(result);
|
|
|
|
}
|
2023-11-07 14:48:48 +00:00
|
|
|
};
|
|
|
|
|
2024-01-30 15:14:02 +00:00
|
|
|
static jlong xlm_AdapterTrainer_open(JNIEnv *env, jclass clazz, jstring baseModelPathStr, jstring loraCacheStr, jstring outputModelPathStr, float outputScale) {
|
2023-11-07 14:48:48 +00:00
|
|
|
auto *state = new AdapterTrainerState();
|
2023-11-14 18:40:00 +00:00
|
|
|
state->baseModelPath = jstring2string(env, baseModelPathStr);
|
|
|
|
state->loraCachePath = jstring2string(env, loraCacheStr);
|
|
|
|
state->outputModelPath = jstring2string(env, outputModelPathStr);
|
|
|
|
state->outputScale = outputScale;
|
2023-11-07 14:48:48 +00:00
|
|
|
|
2023-11-14 16:11:00 +00:00
|
|
|
state->env = env;
|
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
if(!state->Initialize()) {
|
|
|
|
delete state;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return reinterpret_cast<jlong>(state);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xlm_AdapterTrainer_close(JNIEnv *env, jclass clazz, jlong statePtr) {
|
|
|
|
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
|
|
|
|
if(state == nullptr) return;
|
|
|
|
delete state;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xlm_AdapterTrainer_addExample(JNIEnv *env, jclass clazz, jlong statePtr, jstring exampleStr) {
|
|
|
|
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
|
|
|
|
state->AddTrainingExample(jstring2string(env, exampleStr));
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: Callback for progress
|
2023-11-14 16:11:00 +00:00
|
|
|
static void xlm_AdapterTrainer_train(JNIEnv *env, jobject instance, jlong statePtr) {
|
2024-01-30 15:14:02 +00:00
|
|
|
|
2023-11-14 16:11:00 +00:00
|
|
|
jclass clazz = env->GetObjectClass(instance);
|
2023-11-21 18:33:21 +00:00
|
|
|
ASSERT(clazz);
|
2023-11-14 16:11:00 +00:00
|
|
|
|
|
|
|
jmethodID progressMethodId = env->GetMethodID(clazz, "emitProgress", "(F)V");
|
|
|
|
jmethodID lossMethodId = env->GetMethodID(clazz, "emitLoss", "(F)V");
|
2023-11-21 18:33:21 +00:00
|
|
|
ASSERT(progressMethodId);
|
|
|
|
ASSERT(lossMethodId);
|
2023-11-14 16:11:00 +00:00
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
|
2023-11-14 16:11:00 +00:00
|
|
|
state->env = env;
|
|
|
|
state->lossMethodId = lossMethodId;
|
|
|
|
state->progressMethodId = progressMethodId;
|
|
|
|
state->callbackObject = instance;
|
|
|
|
|
2024-01-30 15:14:02 +00:00
|
|
|
std::chrono::system_clock::time_point start, end;
|
|
|
|
start = std::chrono::system_clock::now();
|
|
|
|
|
2023-11-07 14:48:48 +00:00
|
|
|
int result = state->Train();
|
|
|
|
if(result != 0) {
|
|
|
|
AKLOGE("train returned with non-zero code %d", result);
|
2023-11-14 18:40:00 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-01-30 15:14:02 +00:00
|
|
|
end = std::chrono::system_clock::now();
|
|
|
|
|
|
|
|
// Increment count and add history
|
|
|
|
state->UpdateHistoryAndCount(start, end);
|
|
|
|
|
2023-11-14 18:40:00 +00:00
|
|
|
// Apply LoRA
|
|
|
|
llama_model_params model_params = llama_model_default_params();
|
|
|
|
model_params.use_mmap = false;
|
|
|
|
|
|
|
|
llama_model *model = llama_load_model_from_file(state->baseModelPath.c_str(), model_params);
|
|
|
|
|
|
|
|
if(model == nullptr) {
|
|
|
|
AKLOGE("failed to load model for exporting LoRA");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
int err = llama_model_apply_lora_from_file(
|
|
|
|
model,
|
|
|
|
state->loraCachePath.c_str(),
|
|
|
|
state->outputScale,
|
|
|
|
nullptr,
|
|
|
|
4
|
|
|
|
);
|
|
|
|
if(err != 0) {
|
|
|
|
AKLOGE("Failed to apply lora: %d", err);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
int status = save_llama_model_file(
|
|
|
|
state->outputModelPath.c_str(),
|
|
|
|
state->baseModelPath.c_str(),
|
2024-01-30 15:14:02 +00:00
|
|
|
model,
|
|
|
|
state->metadata
|
2023-11-14 18:40:00 +00:00
|
|
|
);
|
|
|
|
if(status != 0) {
|
|
|
|
AKLOGE("Failed to save model! %d", status);
|
|
|
|
return;
|
2023-11-07 14:48:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static const JNINativeMethod sMethods[] = {
|
|
|
|
{
|
|
|
|
const_cast<char *>("openNative"),
|
2024-01-30 15:14:02 +00:00
|
|
|
const_cast<char *>("(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;F)J"),
|
2023-11-07 14:48:48 +00:00
|
|
|
reinterpret_cast<void *>(xlm_AdapterTrainer_open)
|
|
|
|
},
|
|
|
|
{
|
|
|
|
const_cast<char *>("closeNative"),
|
|
|
|
const_cast<char *>("(J)V"),
|
|
|
|
reinterpret_cast<void *>(xlm_AdapterTrainer_close)
|
|
|
|
},
|
|
|
|
{
|
|
|
|
const_cast<char *>("addExample"),
|
|
|
|
const_cast<char *>("(JLjava/lang/String;)V"),
|
|
|
|
reinterpret_cast<void *>(xlm_AdapterTrainer_addExample)
|
|
|
|
},
|
|
|
|
{
|
|
|
|
const_cast<char *>("train"),
|
|
|
|
const_cast<char *>("(J)V"),
|
|
|
|
reinterpret_cast<void *>(xlm_AdapterTrainer_train)
|
|
|
|
},
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
int register_AdapterTrainer(JNIEnv *env) {
|
|
|
|
const char *const kClassPathName = "org/futo/inputmethod/latin/xlm/AdapterTrainer";
|
|
|
|
return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods));
|
|
|
|
}
|
|
|
|
}
|