Add TrainingDataGenerator

2024-09-28 14:54:30 +01:00 · 2023-11-08 16:24:10 +02:00 · 2023-11-08 16:24:10 +02:00 · 1d50ae9f22
commit 1d50ae9f22
parent ee8a81f12c
1 changed files with 213 additions and 0 deletions
--- a/java/src/org/futo/inputmethod/latin/xlm/TrainingDataGenerator.kt
+++ b/java/src/org/futo/inputmethod/latin/xlm/TrainingDataGenerator.kt
@@ -0,0 +1,213 @@
 package org.futo.inputmethod.latin.xlm
 import kotlin.math.PI
 import kotlin.math.ceil
 import kotlin.math.cos
 import kotlin.math.ln
 import kotlin.math.pow
 import kotlin.math.sqrt
 import kotlin.random.Random
 import kotlin.random.nextInt
 /**
  * Immutable pair of single-precision coordinates with basic vector arithmetic.
  */
 class Vector2(val x: Float, val y: Float) {
    /** Component-wise sum of this vector and [other]. */
    operator fun plus(other: Vector2): Vector2 = Vector2(this.x + other.x, this.y + other.y)

    /** Component-wise difference: this vector minus [other]. */
    operator fun minus(other: Vector2): Vector2 = Vector2(this.x - other.x, this.y - other.y)

    /** Squared Euclidean length; no square root is taken. */
    fun magnitudeSquared(): Float = x * x + y * y
 }
 /**
  * Draws one sample from a normal distribution using the Box-Muller transform.
  *
  * @param mean centre of the distribution.
  * @param standardDeviation scale applied to the standard-normal sample.
  * @return the sample, computed in double precision and narrowed to [Float].
  */
 fun randomNormal(mean: Float, standardDeviation: Float): Float {
    // Random.nextFloat() lies in [0, 1). Flipping it to (0, 1] keeps ln(u1) finite;
    // u1 == 0 would give ln(u1) = -Infinity and an infinite or NaN sample.
    val u1 = 1.0f - Random.nextFloat()
    val u2 = Random.nextFloat()
    val randStdNormal = sqrt(-2.0 * ln(u1.toDouble())) * cos(2.0 * PI * u2.toDouble())
    return (mean + standardDeviation * randStdNormal).toFloat()
 }
 /**
  * Geometry of an on-screen keyboard, used to simulate imprecise key taps.
  */
 private interface KeyboardLayout {
    /** Per-axis tap size; callers scale it by a temperature to get the standard deviation of tap noise. */
    val tapSize: Vector2
    /** Returns the position of the key for [character], or null when the layout has no such key. */
    fun getKeyPosition(character: Char): Vector2?
    /** Returns the character of the key nearest to [position]. */
    fun getClosestKey(position: Vector2): Char
 }
 // Placeholder characters that represent the non-letter keys in a layout's key table.
 // Both are control characters: U+000F for shift and U+0008 (backspace) for backspace.
 const val SHIFT_KEY = '\u000f'
 const val BACKSPACE_KEY = '\u0008'
 /**
  * Approximate QWERTY layout: the 26 letter keys plus shift and backspace, each
  * mapped to a hand-estimated position.
  */
 object QWERTYKeyboardLayout : KeyboardLayout {
    override val tapSize: Vector2 = Vector2(80.0f, 80.0f)

    // Positions were estimated by eye, so treat them as rough.
    private val keyCenters = hashMapOf(
        // Top row
        'q' to Vector2(75.0f, 106.0f),
        'w' to Vector2(214.0f, 106.0f),
        'e' to Vector2(363.0f, 106.0f),
        'r' to Vector2(499.0f, 106.0f),
        't' to Vector2(645.0f, 106.0f),
        'y' to Vector2(789.0f, 106.0f),
        'u' to Vector2(928.0f, 106.0f),
        'i' to Vector2(1073.0f, 106.0f),
        'o' to Vector2(1216.0f, 106.0f),
        'p' to Vector2(1357.0f, 106.0f),
        // Middle row
        'a' to Vector2(150.0f, 312.0f),
        's' to Vector2(291.0f, 312.0f),
        'd' to Vector2(434.0f, 312.0f),
        'f' to Vector2(574.0f, 312.0f),
        'g' to Vector2(717.0f, 312.0f),
        'h' to Vector2(859.0f, 312.0f),
        'j' to Vector2(1005.0f, 312.0f),
        'k' to Vector2(1140.0f, 312.0f),
        'l' to Vector2(1288.0f, 312.0f),
        // Bottom row, flanked by shift and backspace
        SHIFT_KEY to Vector2(113.0f, 515.0f),
        'z' to Vector2(287.0f, 515.0f),
        'x' to Vector2(434.0f, 515.0f),
        'c' to Vector2(576.0f, 515.0f),
        'v' to Vector2(718.0f, 515.0f),
        'b' to Vector2(860.0f, 515.0f),
        'n' to Vector2(1003.0f, 515.0f),
        'm' to Vector2(1145.0f, 515.0f),
        BACKSPACE_KEY to Vector2(1329.0f, 515.0f),
    )

    /** Looks up the position of [character]'s key; null when the key is not in the table. */
    override fun getKeyPosition(character: Char): Vector2? = keyCenters[character]

    /** Finds the key whose position has the smallest squared distance to [position]. */
    override fun getClosestKey(position: Vector2): Char {
        val nearest = keyCenters.minBy { (_, center) ->
            val offset = center - position
            offset.magnitudeSquared()
        }
        return nearest.key
    }
 }
 /**
  * Simulates typing mistakes so that generated training text contains
  * plausible misspellings.
  */
 private object WordMisspelling {
    /**
     * Re-types [word] on [layout], adding Gaussian noise to every tap.
     *
     * The word is lowercased first. Characters that have no key in [layout] are
     * dropped. A noisy tap that lands on shift emits nothing; one that lands on
     * backspace deletes the previously emitted character, if any.
     *
     * @param layout keyboard geometry used for key positions and nearest-key lookup.
     * @param word the word to re-type.
     * @param temperature multiplier on the layout's tap size that gives the
     *   standard deviation of the tap noise on each axis.
     * @return the characters produced by the noisy taps.
     */
    fun substituteKeyboardLetters(layout: KeyboardLayout, word: String, temperature: Float = 0.6f): String {
        // A StringBuilder is used rather than MutableList<Char>.removeLast(): that call can
        // bind to java.util.List.removeLast(), which is missing on older Android runtimes.
        val typed = StringBuilder(word.length)
        for (char in word.lowercase()) {
            val position = layout.getKeyPosition(char) ?: continue
            val tap = Vector2(
                randomNormal(position.x, temperature * layout.tapSize.x),
                randomNormal(position.y, temperature * layout.tapSize.y),
            )
            when (val hitKey = layout.getClosestKey(tap)) {
                SHIFT_KEY -> {
                    // next char should be uppercased, but it currently doesn't matter
                }
                BACKSPACE_KEY -> {
                    if (typed.isNotEmpty()) typed.setLength(typed.length - 1)
                }
                else -> typed.append(hitKey)
            }
        }
        return typed.toString()
    }

    /**
     * Produces a misspelled and possibly truncated variant of [word].
     *
     * The word is trimmed, lowercased and stripped of apostrophes, then re-typed
     * on the QWERTY layout with a random temperature. Afterwards it may be cut
     * short, never below two characters.
     *
     * @param word the correctly spelled word.
     * @param correctness exponent applied to every uniform random draw used here.
     * @return the misspelled word; may be empty.
     */
    fun misspellWord(word: String, correctness: Float = 0.8f): String {
        val normalized = word.trim().lowercase().replace("'", "")
        val getRand = { Random.nextFloat().pow(correctness) }

        // TODO: Random word transformations - substituting letters, deleting, repeating, adding, transposing

        // Substitute the word's characters with nearby ones randomly
        val mistyped = substituteKeyboardLetters(QWERTYKeyboardLayout, normalized, temperature = getRand())

        // Trim word randomly as if the user hasn't finished writing the word yet
        // This helps the model learn to complete partially-written words
        if (getRand() > 0.33 && mistyped.length >= 2) {
            val keptFraction = 1.0 - (getRand() * getRand())
            val newLength = ceil(keptFraction * mistyped.length).toInt().coerceIn(2, mistyped.length)
            return mistyped.take(newLength)
        }
        return mistyped
    }
 }
 // Marker tokens that frame a misspelling sample in the generated text:
 //   <XBU> per-letter tokens of the misspelled input <XBC> correct word <XEC>
 const val TOKENIZER_BEGIN_USER_INPUT = "<XBU>"
 const val TOKENIZER_BEGIN_CORRECTION = "<XBC>"
 const val TOKENIZER_END_CORRECTION   = "<XEC>"
 // Maps each lowercase ASCII letter to its dedicated tokenizer token,
 // e.g. 'a' -> "<CHAR_A>" through 'z' -> "<CHAR_Z>".
 private val TOKENIZER_LETTER_MAPPING: HashMap<Char, String> =
    ('a'..'z').associateWithTo(HashMap<Char, String>()) { letter -> "<CHAR_${letter.uppercaseChar()}>" }
 /**
  * Encodes [misspelledWord] as a run of per-letter tokens, preceded by the
  * begin-user-input marker and followed by the begin-correction marker.
  * Characters that have no letter token are skipped.
  */
 private fun tokenizerFormatUserInput(misspelledWord: String): String = buildString {
    append(TOKENIZER_BEGIN_USER_INPUT)
    for (letter in misspelledWord) {
        TOKENIZER_LETTER_MAPPING[letter]?.let { token -> append(token) }
    }
    append(TOKENIZER_BEGIN_CORRECTION)
 }
 /**
  * Produces training text in which some words are replaced by a simulated
  * misspelling followed by the correct word, framed by tokenizer markers.
  */
 object TrainingDataGenerator {
    private val permittedCharacters = "abcdefghijklmnopqrstuvwxyz'-".toHashSet()

    /**
     * Formats one misspelling sample for [word]: the begin-user-input marker, the
     * letter tokens of a random misspelling, the begin-correction marker, the
     * trimmed word, a space, and the end-correction marker.
     */
    fun wordMisspelling(word: String): String {
        val misspelled = WordMisspelling.misspellWord(word)
        // Space after word is required for the tokenizer
        return tokenizerFormatUserInput(misspelled) + word.trim() + " " + TOKENIZER_END_CORRECTION
    }

    /**
     * Returns true when [word], lowercased, consists only of a-z, apostrophes and
     * hyphens and contains at least one letter.
     *
     * The letter requirement rejects the empty strings that consecutive spaces
     * produce when splitting text, as well as bare punctuation such as "-"; both
     * would otherwise yield a sample with nothing meaningful to correct.
     */
    fun suitableToMisspell(word: String): Boolean {
        val lowered = word.lowercase()
        return lowered.any { it in 'a'..'z' } && lowered.all { it in permittedCharacters }
    }

    /**
     * Replaces a random subset of the space-separated words in [text] with
     * misspelling samples.
     *
     * Up to `floor(wordCount * proportion)` picks are made. Each pick draws
     * uniformly from the words not yet chosen; a pick that lands on an unsuitable
     * word is spent without choosing anything, so fewer words may be replaced.
     *
     * @param text input text; words are delimited by single spaces.
     * @param proportion fraction of the word count used as the number of picks.
     * @return the text with the chosen words replaced and marker spacing normalized.
     */
    fun randomlyMisspellWords(text: String, proportion: Float = 0.333f): String {
        val words = text.split(" ").toMutableList()
        val attempts = (words.size * proportion).toInt()

        // Pool of indices that have not been chosen yet. Keeping it up to date in place
        // avoids rebuilding sets and lists from scratch on every attempt.
        val unchosen = words.indices.toMutableList()
        val chosen = mutableListOf<Int>()
        for (attempt in 0 until attempts) {
            if (unchosen.isEmpty()) break
            val slot = Random.nextInt(unchosen.size)
            val candidate = unchosen[slot]
            // An unsuitable pick uses up the attempt and stays in the pool.
            if (!suitableToMisspell(words[candidate])) continue
            chosen.add(candidate)
            // Swap-remove: pool order is irrelevant because picks are uniform.
            unchosen[slot] = unchosen[unchosen.lastIndex]
            unchosen.removeAt(unchosen.lastIndex)
        }

        for (index in chosen) {
            words[index] = wordMisspelling(words[index])
        }

        return words.joinToString(separator = " ").trim()
            .replace("  ", " ")
            .replace("  ", " ")
            // Do not put spaces after these tokens, as it messes up tokenization
            .replace("$TOKENIZER_BEGIN_CORRECTION ", TOKENIZER_BEGIN_CORRECTION)
            .replace("$TOKENIZER_END_CORRECTION ", TOKENIZER_END_CORRECTION)
    }
 }