Initial fine-tuning

Aleksandras Kostarevas 2023-11-07 16:48:48 +02:00
parent 5778cd15a0
commit ee8a81f12c
19 changed files with 4340 additions and 7 deletions

View File

@@ -251,7 +251,6 @@ fun RowScope.SuggestionItems(words: SuggestedWords, onClick: (i: Int) -> Unit) {
var offset = 0
// Don't show what the user is typing
try {
val info = words.getInfo(0)
if (info.kind == KIND_TYPED && !info.isExactMatch && !info.isExactMatchWithIntentionalOmission) {

View File

@@ -8,6 +8,7 @@ import androidx.navigation.compose.rememberNavController
import org.futo.inputmethod.latin.uix.settings.pages.HomeScreen
import org.futo.inputmethod.latin.uix.settings.pages.PredictiveTextScreen
import org.futo.inputmethod.latin.uix.settings.pages.ThemeScreen
import org.futo.inputmethod.latin.uix.settings.pages.TrainDevScreen
import org.futo.inputmethod.latin.uix.settings.pages.TypingScreen
import org.futo.inputmethod.latin.uix.settings.pages.VoiceInputScreen
@@ -24,5 +25,6 @@ fun SettingsNavigator(
composable("typing") { TypingScreen(navController) }
composable("voiceInput") { VoiceInputScreen(navController) }
composable("themes") { ThemeScreen(navController) }
composable("trainDev") { TrainDevScreen(navController) }
}
}

View File

@@ -59,6 +59,13 @@ fun HomeScreen(navController: NavHostController = rememberNavController()) {
icon = painterResource(id = R.drawable.eye)
)
NavigationItem(
title = "Training",
style = NavigationItemStyle.HomeTertiary,
navigate = { navController.navigate("trainDev") },
icon = painterResource(id = R.drawable.delete)
)
/*
NavigationItem(
title = "Advanced",

View File

@@ -0,0 +1,144 @@
package org.futo.inputmethod.latin.uix.settings.pages
import android.content.Context
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.material3.Button
import androidx.compose.material3.ExperimentalMaterial3Api
import androidx.compose.material3.Text
import androidx.compose.material3.TextField
import androidx.compose.runtime.Composable
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.setValue
import androidx.compose.ui.Modifier
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalLifecycleOwner
import androidx.compose.ui.tooling.preview.Preview
import androidx.lifecycle.LifecycleCoroutineScope
import androidx.lifecycle.lifecycleScope
import androidx.navigation.NavHostController
import androidx.navigation.compose.rememberNavController
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import org.futo.inputmethod.latin.R
import org.futo.inputmethod.latin.uix.settings.ScreenTitle
import org.futo.inputmethod.latin.uix.settings.ScrollableList
import org.futo.inputmethod.latin.xlm.AdapterTrainerBuilder
import java.io.File
import java.io.FileOutputStream
import java.io.IOException
import java.io.OutputStream
private fun getPathToModelResource(
context: Context,
modelResource: Int,
tokenizerResource: Int,
forceDelete: Boolean
): Pair<String, String> {
val outputDir = context.cacheDir
val outputFile = File(outputDir, "ggml-model-$modelResource.gguf")
val outputFileTokenizer = File(
outputDir,
"tokenizer-$tokenizerResource.tokenizer"
)
if (forceDelete && outputFile.exists()) {
outputFile.delete()
outputFileTokenizer.delete()
}
if (!outputFile.exists() || forceDelete) {
// FIXME: We save this to a random temporary file so that we can have a path instead of an InputStream
val `is` = context.resources.openRawResource(modelResource)
val is_t = context.resources.openRawResource(tokenizerResource)
try {
val os: OutputStream = FileOutputStream(outputFile)
var read = 0
val bytes = ByteArray(1024)
while (`is`.read(bytes).also { read = it } != -1) {
os.write(bytes, 0, read)
}
os.flush()
os.close()
`is`.close()
val os_t: OutputStream = FileOutputStream(outputFileTokenizer)
read = 0
while (is_t.read(bytes).also { read = it } != -1) {
os_t.write(bytes, 0, read)
}
os_t.flush()
os_t.close()
is_t.close()
} catch (e: IOException) {
e.printStackTrace()
throw RuntimeException("Failed to write model asset to file")
}
}
return Pair(outputFile.absolutePath, outputFileTokenizer.absolutePath)
}
val exampleText = """
GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms.
Circles - A private photo sharing feed for families. Circles - A private photo sharing feed for families. Circles - A private photo sharing feed for families. Circles - A private photo sharing feed for families. Circles - A private photo sharing feed for families.
Live Captions - Accessible live captions that are completely private. Live Captions - Accessible live captions that are completely private. Live Captions - Accessible live captions that are completely private. Live Captions - Accessible live captions that are completely private. Live Captions - Accessible live captions that are completely private.
Polycentric - A distributed text-based social network centered around communities. Polycentric - A distributed text-based social network centered around communities. Polycentric - A distributed text-based social network centered around communities. Polycentric - A distributed text-based social network centered around communities. Polycentric - A distributed text-based social network centered around communities.
FUBS - A frictionless and modifiable software development system. FUBS - A frictionless and modifiable software development system. FUBS - A frictionless and modifiable software development system. FUBS - A frictionless and modifiable software development system. FUBS - A frictionless and modifiable software development system.
Harbor - An app for preserving identity on the internet. Harbor - An app for preserving identity on the internet. Harbor - An app for preserving identity on the internet. Harbor - An app for preserving identity on the internet. Harbor - An app for preserving identity on the internet.
FUTO Voice Input - A privacy-friendly voice input application. FUTO Voice Input - A privacy-friendly voice input application. FUTO Voice Input - A privacy-friendly voice input application. FUTO Voice Input - A privacy-friendly voice input application. FUTO Voice Input - A privacy-friendly voice input application.
GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms. GrayJay - A universal video app for following creators, not platforms.
""".trimIndent()
@OptIn(ExperimentalMaterial3Api::class)
@Preview
@Composable
fun TrainDevScreen(navController: NavHostController = rememberNavController()) {
var trainText by remember { mutableStateOf(exampleText.trim()) }
var isTraining by remember { mutableStateOf(false) }
val context = LocalContext.current
ScrollableList {
ScreenTitle("Training", showBack = true, navController)
TextField(
value = trainText,
onValueChange = { trainText = it },
enabled = !isTraining
)
val lifecycleOwner = LocalLifecycleOwner.current
Button(onClick = {
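// NOTE: the resource copy in getPathToModelResource and builder.loadAndPrepare() both run synchronously on the UI thread here; only train() itself is dispatched to a coroutine below.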
val result = getPathToModelResource(context, R.raw.ml4_f16, R.raw.ml3_tokenizer, true)
val outputDir = context.cacheDir
val outputFile = File(outputDir, "test-adapter.bin")
val builder = AdapterTrainerBuilder(
result.first,
result.second,
outputFile.absolutePath
)
builder.addExamples(trainText.lines())
val trainer = builder.loadAndPrepare()
lifecycleOwner.lifecycleScope.launch {
isTraining = true
println("Staring to train")
trainer.train()
println("Finished training")
isTraining = false
}
}, enabled = !isTraining) {
if(isTraining) {
Text("Currently training, check status in logcat")
} else {
Text("Train model")
}
}
}
}
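A side note on the byte-copy loops in getPathToModelResource above: the same logic can be written with the Kotlin standard library's copyTo, which also closes both streams via use. A minimal sketch, assuming the same resource IDs and output files:

    context.resources.openRawResource(modelResource).use { input ->
        FileOutputStream(outputFile).use { output ->
            input.copyTo(output) // buffered copy, replaces the manual read/write loop
        }
    }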

View File

@@ -0,0 +1,48 @@
package org.futo.inputmethod.latin.xlm
import kotlinx.coroutines.DelicateCoroutinesApi
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.newSingleThreadContext
import kotlinx.coroutines.withContext
@OptIn(DelicateCoroutinesApi::class)
val TrainingContext = newSingleThreadContext("AdapterTrainingContext")
class AdapterTrainer(baseModelPath: String, tokenizerPath: String, checkpointPath: String, examples: List<String>) {
private external fun openNative(baseModelPath: String, tokenizerPath: String, outputPath: String): Long
private external fun closeNative(handle: Long)
private external fun addExample(handle: Long, example: String)
private external fun train(handle: Long) // Long-running function
private var handle: Long = 0L
private fun isHandleValid() = handle != 0L
init {
handle = openNative(baseModelPath, tokenizerPath, checkpointPath)
if(!isHandleValid()) {
throw IllegalArgumentException("Failed to initialize AdapterTrainer with given parameters")
}
examples.forEach {
if(it.isNotBlank()) {
addExample(handle, it.trim())
}
}
}
suspend fun train() = withContext(TrainingContext) {
if(!isHandleValid()) throw IllegalStateException("Attempting to train with null handle")
train(handle)
}
}
class AdapterTrainerBuilder(val baseModelPath: String, val tokenizerPath: String, val checkpointPath: String) {
private val examples = mutableListOf<String>()
fun addExamples(newExamples: List<String>) {
examples.addAll(newExamples)
}
fun loadAndPrepare(): AdapterTrainer {
return AdapterTrainer(baseModelPath, tokenizerPath, checkpointPath, examples)
}
}
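For orientation, this is how the new API fits together from a caller's perspective; a hypothetical sketch (the paths, corpus, and coroutine scope are placeholders, mirroring what TrainDevScreen does above):

    val builder = AdapterTrainerBuilder(modelPath, tokenizerPath, loraOutputPath)
    builder.addExamples(corpus.lines())    // blank lines are skipped, examples are trimmed
    val trainer = builder.loadAndPrepare() // throws IllegalArgumentException if the native handle fails to open
    someScope.launch { trainer.train() }   // suspends; work runs on the single-threaded TrainingContext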

View File

@@ -95,16 +95,16 @@ public class LanguageModel extends Dictionary {
@Override public void run() {
if(mNativeState != 0) return;
- String modelPath = getPathToModelResource(context, R.raw.ml3_q8, R.raw.ml3_tokenizer, false);
+ String modelPath = getPathToModelResource(context, R.raw.ml4_f16, R.raw.ml3_tokenizer, true);
mNativeState = openNative(modelPath);
if(mNativeState == 0){
- modelPath = getPathToModelResource(context, R.raw.ml3_q8, R.raw.ml3_tokenizer, true);
+ modelPath = getPathToModelResource(context, R.raw.ml4_f16, R.raw.ml3_tokenizer, true);
mNativeState = openNative(modelPath);
}
if(mNativeState == 0){
- throw new RuntimeException("Failed to load R.raw.ml3_q8, R.raw.ml3_tokenizer model");
+ throw new RuntimeException("Failed to load R.raw.ml4_f16, R.raw.ml3_tokenizer model");
}
}
};

View File

@@ -14,10 +14,12 @@
LOCAL_PATH := $(call my-dir)
LOCAL_ARM_NEON := true
############ some local flags
# If you change any of those flags, you need to rebuild both libjni_latinime_common_static
# and the shared library that uses libjni_latinime_common_static.
- FLAG_DBG ?= false
+ FLAG_DBG ?= true
FLAG_DO_PROFILE ?= false
######################################

View File

@@ -18,6 +18,7 @@ LATIN_IME_JNI_SRC_FILES := \
org_futo_inputmethod_latin_BinaryDictionaryUtils.cpp \
org_futo_inputmethod_latin_DicTraverseSession.cpp \
org_futo_inputmethod_latin_xlm_LanguageModel.cpp \
org_futo_inputmethod_latin_xlm_AdapterTrainer.cpp \
jni_common.cpp
LOCAL_C_INCLUDES += $(LOCAL_PATH)/src/sentencepiece/builtin_pb
@@ -33,8 +34,11 @@ LATIN_IME_CORE_SRC_FILES := \
ggml/ggml-alloc.c \
ggml/ggml-quants.c \
ggml/ggml-backend.c \
- ggml/LanguageModel.cpp \
ggml/llama.cpp \
+ ggml/finetune.cpp \
+ ggml/train.cpp \
+ ggml/common.cpp \
+ ggml/LanguageModel.cpp \
third_party/protobuf-lite/arena.cc \
third_party/protobuf-lite/arenastring.cc \
third_party/protobuf-lite/bytestream.cc \

View File

@@ -24,6 +24,7 @@
#include "org_futo_inputmethod_latin_DicTraverseSession.h"
#include "org_futo_inputmethod_latin_xlm_LanguageModel.h"
#include "defines.h"
#include "org_futo_inputmethod_latin_xlm_AdapterTrainer.h"
/*
* Returns the JNI version on success, -1 on failure.
@@ -60,6 +61,10 @@ jint JNI_OnLoad(JavaVM *vm, void *reserved) {
AKLOGE("ERROR: LanguageModel native registration failed");
return -1;
}
if (!latinime::register_AdapterTrainer(env)) {
AKLOGE("ERROR: AdapterTrainer native registration failed");
return -1;
}
/* success -- return valid version number */
return JNI_VERSION_1_6;
}

View File

@@ -0,0 +1,132 @@
//
// Created by alex on 11/7/23.
//
#include <string>
#include "org_futo_inputmethod_latin_xlm_AdapterTrainer.h"
#include "defines.h"
#include "jni_common.h"
#include "ggml/finetune.h"
#include "sentencepiece/sentencepiece_processor.h"
std::string jstring2string(JNIEnv *env, jstring jStr) {
const jsize stringUtf8Length = env->GetStringUTFLength(jStr);
if (stringUtf8Length <= 0) {
AKLOGE("Can't get jStr");
return "";
}
char stringChars[stringUtf8Length + 1];
env->GetStringUTFRegion(jStr, 0, env->GetStringLength(jStr), stringChars);
stringChars[stringUtf8Length] = '\0';
return {stringChars};
}
namespace latinime {
struct AdapterTrainerState {
std::string baseModelPath;
std::string tokenizerPath;
std::string outputPath;
sentencepiece::SentencePieceProcessor spm;
struct train_params params;
bool Initialize() {
params = get_default_train_params();
params.common.fn_train_data = "";
params.common.fn_checkpoint_in = "";
params.common.fn_checkpoint_out = "";
params.fn_model_base = baseModelPath.c_str();
params.fn_lora_out = outputPath.c_str();
params.common.fill_with_next_samples = true;
params.common.n_threads = 8;
params.common.warmup = 4;
params.common.adam_alpha = 1e-3;
params.common.adam_n_iter = 32;
// TODO: Check model path valid / try to pre-load resources?
if(!spm.Load(tokenizerPath).ok()){
AKLOGE("Failed to load tokenizer at path %s!", tokenizerPath.c_str());
return false;
}
return true;
}
void AddTrainingExample(const std::string &example) {
std::vector<llama_token> result = spm.EncodeAsIds(example);
params.training_data.push_back(result);
}
int Train() const {
return finetune_train(params);
}
};
static jlong xlm_AdapterTrainer_open(JNIEnv *env, jclass clazz, jstring baseModelPathStr, jstring tokenizerPathStr, jstring outputPathStr) {
auto *state = new AdapterTrainerState();
state->baseModelPath = jstring2string(env, baseModelPathStr);
state->tokenizerPath = jstring2string(env, tokenizerPathStr);
state->outputPath = jstring2string(env, outputPathStr);
if(!state->Initialize()) {
delete state;
return 0;
}
return reinterpret_cast<jlong>(state);
}
static void xlm_AdapterTrainer_close(JNIEnv *env, jclass clazz, jlong statePtr) {
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
if(state == nullptr) return;
delete state;
}
static void xlm_AdapterTrainer_addExample(JNIEnv *env, jclass clazz, jlong statePtr, jstring exampleStr) {
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
state->AddTrainingExample(jstring2string(env, exampleStr));
}
// TODO: Callback for progress
static void xlm_AdapterTrainer_train(JNIEnv *env, jclass clazz, jlong statePtr) {
auto *state = reinterpret_cast<AdapterTrainerState *>(statePtr);
int result = state->Train();
if(result != 0) {
AKLOGE("train returned with non-zero code %d", result);
}
}
static const JNINativeMethod sMethods[] = {
{
const_cast<char *>("openNative"),
const_cast<char *>("(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)J"),
reinterpret_cast<void *>(xlm_AdapterTrainer_open)
},
{
const_cast<char *>("closeNative"),
const_cast<char *>("(J)V"),
reinterpret_cast<void *>(xlm_AdapterTrainer_close)
},
{
const_cast<char *>("addExample"),
const_cast<char *>("(JLjava/lang/String;)V"),
reinterpret_cast<void *>(xlm_AdapterTrainer_addExample)
},
{
const_cast<char *>("train"),
const_cast<char *>("(J)V"),
reinterpret_cast<void *>(xlm_AdapterTrainer_train)
},
};
int register_AdapterTrainer(JNIEnv *env) {
const char *const kClassPathName = "org/futo/inputmethod/latin/xlm/AdapterTrainer";
return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods));
}
}
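A reading aid for the JNI descriptor strings above, since they must match the Kotlin external declarations exactly: J is Long, V is Unit/void, and Ljava/lang/String; is String. For example:

    // (Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)J -> three String arguments, returns Long
    private external fun openNative(baseModelPath: String, tokenizerPath: String, outputPath: String): Long
    // (J)V -> takes the Long handle, returns nothing
    private external fun closeNative(handle: Long)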

View File

@@ -0,0 +1,14 @@
//
// Created by alex on 11/7/23.
//
#ifndef LATINIME_ORG_FUTO_INPUTMETHOD_LATIN_XLM_ADAPTERTRAINER_H
#define LATINIME_ORG_FUTO_INPUTMETHOD_LATIN_XLM_ADAPTERTRAINER_H
#include "jni.h"
namespace latinime {
int register_AdapterTrainer(JNIEnv *env);
} // namespace latinime
#endif //LATINIME_ORG_FUTO_INPUTMETHOD_LATIN_XLM_ADAPTERTRAINER_H

View File

@@ -397,6 +397,7 @@ struct LanguageModelState {
std::vector<std::pair<float, std::string>> PredictNextWord(const std::string &context) {
token_sequence next_context = model->tokenize(trim(context) + " ");
next_context.insert(next_context.begin(), 1); // BOS
//model->updateContext(next_context);
auto results = Sample(next_context, 3);
@@ -415,6 +416,7 @@
next_context = model->tokenize(trim(context) + " ");
}
next_context.insert(next_context.begin(), 1); // BOS
next_context.push_back(specialTokens.XBU);
for(char c : trim(word)) {
@@ -458,7 +460,7 @@ namespace latinime {
LanguageModelState *state = new LanguageModelState();
if(!state->Initialize(sourceDirChars)) {
- free(state);
+ delete state;
return 0;
}

View File

@@ -59,6 +59,7 @@ LanguageModel *LlamaAdapter::createLanguageModel(const std::string &paths) {
ctx_params.n_threads_batch = 1;
llama_model_params model_params = llama_model_default_params();
model_params.use_mmap = false;
adapter->model = llama_load_model_from_file(modelPath.c_str(), model_params);
@@ -67,6 +68,15 @@
return nullptr;
}
int err = llama_model_apply_lora_from_file(adapter->model,
"/data/user/0/org.futo.inputmethod.latin/cache/test-adapter.bin",
1.0,
NULL,
4);
if(err != 0) {
AKLOGE("Failed to apply lora: %d", err);
}
adapter->context = llama_new_context_with_model(adapter->model, ctx_params);
//adapter->spm = sentencepiece::SentencePieceProcessor();
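The hardcoded adapter path above is the same file TrainDevScreen writes into the app's cache directory; on the Kotlin side it would be derived rather than spelled out. A sketch of the corresponding expression (hypothetical here, since this C++ layer has no Context):

    // Resolves to /data/user/0/org.futo.inputmethod.latin/cache/test-adapter.bin for user 0
    val adapterPath = File(context.cacheDir, "test-adapter.bin").absolutePath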

View File

@@ -0,0 +1,369 @@
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iterator>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
#include <cinttypes>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (!siblings.empty()) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
//TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
switch (input[++input_idx]) {
case 'n': input[output_idx++] = '\n'; break;
case 'r': input[output_idx++] = '\r'; break;
case 't': input[output_idx++] = '\t'; break;
case '\'': input[output_idx++] = '\''; break;
case '\"': input[output_idx++] = '\"'; break;
case '\\': input[output_idx++] = '\\'; break;
default: input[output_idx++] = '\\';
input[output_idx++] = input[input_idx]; break;
}
} else {
input[output_idx++] = input[input_idx];
}
}
input.resize(output_idx);
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
}
GGML_UNREACHABLE();
}
void llama_batch_clear(struct llama_batch & batch) {
batch.n_tokens = 0;
}
void llama_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits) {
batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos;
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
for (size_t i = 0; i < seq_ids.size(); ++i) {
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
}
batch.logits [batch.n_tokens] = logits;
batch.n_tokens++;
}
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool special) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
}
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool special) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
// remove the leading space of the first non-BOS token
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
piece = piece.substr(1);
}
result += piece;
}
return result;
}
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
result += piece;
}
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return result;
}
//
// YAML utils
//
// returns true if successful, false otherwise
bool create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return true;
}
size_t pos_slash = 0;
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
const DWORD attributes = GetFileAttributesW(subpath.c_str());
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return false;
}
} else {
return false;
}
}
pos_slash += 1;
}
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
size_t pos_slash = 1; // skip leading slashes for directory creation
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
const std::string subpath = path.substr(0, pos_slash);
struct stat info;
// if the path already exists, ensure that it's a directory
if (stat(subpath.c_str(), &info) == 0) {
if (!S_ISDIR(info.st_mode)) {
return false;
}
} else {
// create parent directories
const int ret = mkdir(subpath.c_str(), 0755);
if (ret != 0) {
return false;
}
}
pos_slash += 1;
}
return true;
#endif // _WIN32
}
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%e, ", data[i]);
}
fprintf(stream, "%e]\n", data.back());
}
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%d, ", data[i]);
}
fprintf(stream, "%d]\n", data.back());
}
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
std::string data_str(data == NULL ? "" : data);
if (data_str.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
size_t pos_start = 0;
size_t pos_found = 0;
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
data_str = "\"" + data_str + "\"";
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}
if (data_str.find('\n') == std::string::npos) {
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}
fprintf(stream, "%s: |\n", prop_name);
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
pos_start = pos_found + 1;
}
}
std::string get_sortable_timestamp() {
using clock = std::chrono::system_clock;
const clock::time_point current_time = clock::now();
const time_t as_time_t = clock::to_time_t(current_time);
char timestamp_no_ns[100];
std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
current_time.time_since_epoch() % 1000000000).count();
char timestamp_ns[11];
snprintf(timestamp_ns, 11, "%09" PRId64, ns);
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}

View File

@@ -0,0 +1,114 @@
// Various helper functions and utilities
#pragma once
#include "llama.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
template<typename ... Args>
std::string string_format( const std::string& format, Args ... args )
{
int size_s = std::snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0'
if( size_s <= 0 ){ throw std::runtime_error( "Error during formatting." ); }
auto size = static_cast<size_t>( size_s );
std::unique_ptr<char[]> buf( new char[ size ] );
std::snprintf( buf.get(), size, format.c_str(), args ... );
return std::string( buf.get(), buf.get() + size - 1 ); // We don't want the '\0' inside
}
#define die(msg) do { throw std::runtime_error(msg); } while (0)
#define die_fmt(fmt, ...) do { throw std::runtime_error(string_format(fmt, __VA_ARGS__)); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
} while(0)
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
//
// Model utils
//
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
void llama_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits);
//
// Vocab utils
//
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool special = false);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
//
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token
std::string llama_detokenize_spm(
llama_context * ctx,
const std::vector<llama_token> & tokens);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();

File diff suppressed because it is too large.

View File

@@ -0,0 +1,110 @@
#ifndef LATINIME_FINETUNE_H
#define LATINIME_FINETUNE_H
#include "ggml.h"
#include "ggml-alloc.h"
#include "llama.h"
#include "common.h"
#include "train.h"
struct train_params {
struct train_params_common common;
std::vector<std::vector<llama_token>> training_data;
const char * fn_model_base;
const char * fn_lora_out;
bool only_write_lora;
float f_norm_rms_eps;
float rope_freq_base;
float rope_freq_scale;
bool custom_f_norm_rms_eps;
bool custom_rope_freq_base;
bool custom_rope_freq_scale;
int32_t lora_r;
int32_t lora_alpha;
bool custom_lora_alpha;
uint32_t n_rank_attention_norm;
uint32_t n_rank_wq;
uint32_t n_rank_wk;
uint32_t n_rank_wv;
uint32_t n_rank_wo;
uint32_t n_rank_ffn_norm;
uint32_t n_rank_w1;
uint32_t n_rank_w2;
uint32_t n_rank_w3;
uint32_t n_rank_tok_embeddings;
uint32_t n_rank_norm;
uint32_t n_rank_output;
bool custom_n_rank_attention_norm;
bool custom_n_rank_wq;
bool custom_n_rank_wk;
bool custom_n_rank_wv;
bool custom_n_rank_wo;
bool custom_n_rank_ffn_norm;
bool custom_n_rank_w1;
bool custom_n_rank_w2;
bool custom_n_rank_w3;
bool custom_n_rank_tok_embeddings;
bool custom_n_rank_norm;
bool custom_n_rank_output;
};
static struct train_params get_default_train_params() {
struct train_params params;
params.common = get_default_train_params_common();
params.fn_model_base = "";
params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf";
params.only_write_lora = false;
params.f_norm_rms_eps = 1e-5f;
params.rope_freq_base = 10000.0f;
params.rope_freq_scale = 1.0f;
params.custom_f_norm_rms_eps = false;
params.custom_rope_freq_base = false;
params.custom_rope_freq_scale = false;
params.lora_r = 4;
params.lora_alpha = 4;
params.custom_lora_alpha = false;
params.n_rank_attention_norm = 1;
params.n_rank_wq = 4;
params.n_rank_wk = 4;
params.n_rank_wv = 4;
params.n_rank_wo = 4;
params.n_rank_ffn_norm = 1;
params.n_rank_w1 = 4;
params.n_rank_w2 = 4;
params.n_rank_w3 = 4;
params.n_rank_tok_embeddings = 4;
params.n_rank_norm = 1;
params.n_rank_output = 4;
params.custom_n_rank_attention_norm = false;
params.custom_n_rank_wq = false;
params.custom_n_rank_wk = false;
params.custom_n_rank_wv = false;
params.custom_n_rank_wo = false;
params.custom_n_rank_ffn_norm = false;
params.custom_n_rank_w1 = false;
params.custom_n_rank_w2 = false;
params.custom_n_rank_w3 = false;
params.custom_n_rank_tok_embeddings = false;
params.custom_n_rank_norm = false;
params.custom_n_rank_output = false;
return params;
}
int finetune_train(struct train_params params);
#endif //LATINIME_FINETUNE_H
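For context on the rank fields above, assuming the standard LoRA formulation (the exact scaling in ggml's finetune code is not verified here): a frozen weight $W \in \mathbb{R}^{d \times k}$ is adapted as

$$W' = W + \frac{\alpha}{r} BA, \qquad B \in \mathbb{R}^{d \times r},\ A \in \mathbb{R}^{r \times k},$$

so each adapted matrix contributes only $r(d+k)$ trainable parameters. With the defaults here (lora_r = 4, lora_alpha = 4, per-tensor ranks of 4, and rank 1 for the norm vectors), the adapter stays small enough to train and store on-device.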

File diff suppressed because it is too large.

native/jni/src/ggml/train.h Normal file
View File

@@ -0,0 +1,231 @@
// Various helper functions and utilities for training
#pragma once
#include <string>
#include <random>
#include <vector>
#include "ggml.h"
#include "llama.h"
typedef std::string mt19937_state;
struct train_state {
struct ggml_opt_context * opt;
uint64_t train_its;
uint64_t train_samples;
uint64_t train_tokens;
uint64_t train_epochs;
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
mt19937_state shuffle_rng_state_current;
mt19937_state shuffle_rng_state_next;
size_t shuffle_sample_count;
size_t shuffle_next_sample;
};
struct train_params_common {
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * pattern_fn_it;
const char * fn_latest;
bool print_usage;
int save_every;
uint32_t seed;
int n_ctx;
int n_threads;
int n_batch;
int n_gradient_accumulation;
int n_epochs;
int n_gpu_layers;
bool custom_n_ctx;
bool use_flash;
bool use_checkpointing;
std::string sample_start;
bool include_sample_start;
bool escape;
bool overlapping_samples;
bool fill_with_next_samples;
bool separate_with_eos;
bool separate_with_bos;
bool sample_random_offsets;
bool force_reshuffle;
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_min;
bool enable_restart;
int opt_past;
float opt_delta;
int opt_max_no_improvement;
int adam_n_iter;
float adam_alpha;
float adam_min_alpha;
float adam_decay;
int adam_decay_min_ndim;
float adam_beta1;
float adam_beta2;
float adam_gclip;
float adam_eps_f;
};
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
struct train_opt_callback_data {
struct train_params_common * params;
struct train_state * train;
save_train_files_callback save_cb;
void * save_data;
struct llama_context * lctx;
int last_save_iter;
llama_token * tokens_data;
size_t tokens_size;
size_t * samples_begin;
size_t * samples_size;
size_t * shuffled_samples_offs;
size_t * shuffled_samples_begin;
size_t * shuffled_samples_size;
size_t samples_count;
struct ggml_tensor * tokens_input;
struct ggml_tensor * target_probs;
int first_iter;
int first_epoch;
int iter_at_last_epoch;
int64_t last_time;
double millis_per_iter;
};
struct train_state * init_train_state();
void free_train_state(struct train_state * state);
struct train_params_common get_default_train_params_common();
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
void finish_processing_train_args(struct train_params_common * params);
struct random_normal_distribution;
struct random_uniform_distribution;
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
void free_random_normal_distribution (struct random_normal_distribution * rnd);
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
// generate random float in interval [0,1)
float frand();
float frand_normal (struct random_normal_distribution * rnd);
float frand_uniform(struct random_uniform_distribution * rnd);
int clamp (const int v, const int min, const int max);
float fclamp(const float v, const float min, const float max);
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
size_t tokenize_file(
struct llama_context * lctx,
const char * filename,
const std::string & sample_start,
bool include_sample_start,
bool overlapping_samples,
unsigned context_length,
std::vector<llama_token> & out_tokens,
std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size);
int64_t get_example_targets_batch(
struct llama_context * lctx,
struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs,
int64_t example_id,
const size_t * samples_offs,
const size_t * samples_begin,
const size_t * samples_size,
size_t samples_count,
const llama_token * train_data,
size_t n_train_data,
bool separate_with_eos,
bool separate_with_bos,
bool fill_with_next_samples,
bool sample_random_offsets);
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
mt19937_state mt19937_get_state(const std::mt19937& rng);
mt19937_state mt19937_seed_to_state(unsigned seed);
mt19937_state shuffle_samples(
const mt19937_state & rng_state,
size_t * shuffled_offs,
size_t * shuffled_begins,
size_t * shuffled_sizes,
const size_t * begins,
const size_t * sizes,
size_t count);
size_t hash_combine(size_t h1, size_t h2);
size_t compute_samples_hash(
const char* fn,
const size_t* samples_begin,
const size_t* samples_size,
size_t sample_count);
std::string replace_str(const char * s, const char * needle, const char * replacement);
void print_duration(double milliseconds);
float cosine_decay(
int64_t step,
int64_t decay_steps,
float minimum);
float cosine_decay_restart(
int64_t step,
int64_t decay_steps,
float minimum,
float restart_step_mult);
float learning_schedule(
int64_t step,
int64_t warmup_steps,
int64_t decay_steps,
float learning_rate,
float overall_minimum,
float cos_decay_minimum,
float cos_decay_restart_step_mult,
bool enable_restart);
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
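Finally, a note on the schedule fields above (warmup, cos_decay_steps, cos_decay_min, enable_restart): the names suggest the conventional warmup-plus-cosine-decay schedule, which is assumed here rather than verified against train.cpp:

$$\mathrm{lr}(t) = \begin{cases} \eta \, t / T_{\mathrm{warmup}} & t < T_{\mathrm{warmup}} \\ \eta \left( m + (1-m)\,\tfrac{1}{2}\left(1 + \cos(\pi t' / T_{\mathrm{decay}})\right) \right) & \text{otherwise} \end{cases}$$

with $\eta$ = adam_alpha, $m$ = cos_decay_min, and $t'$ the step count past warmup; with enable_restart, each completed cycle restarts the cosine and scales $T_{\mathrm{decay}}$ by cos_decay_restart.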