Load voice input settings instead of hardcoding

2024-09-28 14:54:30 +01:00 · 2023-09-01 08:51:42 +03:00 · 2023-09-01 08:51:42 +03:00 · 7f656bb622
commit 7f656bb622
parent af42223a0c
10 changed files with 246 additions and 190 deletions
--- a/java/src/org/futo/inputmethod/latin/uix/Settings.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/Settings.kt
@ -69,5 +69,22 @@ fun <T> LifecycleOwner.deferSetSetting(key: Preferences.Key<T>, value: T): Job {
    }
 }

+/**
+ * A typed DataStore preferences key bundled with its fallback value.
+ *
+ * @param key the preferences key the value is stored under
+ * @param default the value [getSetting] returns when nothing is stored under [key]
+ */
+data class SettingsKey<T>(
+    val key: Preferences.Key<T>,
+    val default: T,
+)
+
+/**
+ * Reads the current value of [key] from this context's preferences DataStore.
+ *
+ * Suspends until the store emits a preferences snapshot, then returns the stored value, or
+ * [SettingsKey.default] when the snapshot has no entry for the key.
+ */
+suspend fun <T> Context.getSetting(key: SettingsKey<T>): T {
+    // first() stops collecting after a single emission by itself, so no take(1) is needed.
+    val preferences = this.dataStore.data.first()
+    return preferences[key.key] ?: key.default
+}
+
+/**
+ * Stores [value] under [key] in this context's preferences DataStore.
+ */
+suspend fun <T> Context.setSetting(key: SettingsKey<T>, value: T) {
+    val preferencesKey = key.key
+    dataStore.edit { editable -> editable[preferencesKey] = value }
+}

 val THEME_KEY = stringPreferencesKey("activeThemeOption")
--- a/java/src/org/futo/inputmethod/latin/uix/VoiceInputSettingKeys.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/VoiceInputSettingKeys.kt
@ -0,0 +1,45 @@
+package org.futo.inputmethod.latin.uix
+
+import androidx.datastore.preferences.core.booleanPreferencesKey
+import androidx.datastore.preferences.core.intPreferencesKey
+import androidx.datastore.preferences.core.stringSetPreferencesKey
+
+/** Whether the voice input action plays its start and cancel sounds. */
+val ENABLE_SOUND: SettingsKey<Boolean> = SettingsKey(
+    key = booleanPreferencesKey("enable_sounds"),
+    default = true,
+)
+
+/** Passed to the recognizer view settings as `shouldShowVerboseFeedback`. */
+val VERBOSE_PROGRESS: SettingsKey<Boolean> = SettingsKey(
+    key = booleanPreferencesKey("verbose_progress"),
+    default = false,
+)
+
+/** Whether an English-specific model is registered in addition to the primary model. */
+val ENABLE_ENGLISH: SettingsKey<Boolean> = SettingsKey(
+    key = booleanPreferencesKey("enable_english"),
+    default = true,
+)
+
+/** Whether the primary model is picked from the multilingual list instead of the English one. */
+val ENABLE_MULTILINGUAL: SettingsKey<Boolean> = SettingsKey(
+    key = booleanPreferencesKey("enable_multilingual"),
+    default = false,
+)
+
+/** Passed to the decoding configuration as `suppressSymbols`. */
+val DISALLOW_SYMBOLS: SettingsKey<Boolean> = SettingsKey(
+    key = booleanPreferencesKey("disallow_symbols"),
+    default = true,
+)
+
+/** Index into `ENGLISH_MODELS` selecting the English model. */
+val ENGLISH_MODEL_INDEX: SettingsKey<Int> = SettingsKey(
+    key = intPreferencesKey("english_model_index"),
+    default = 0,
+)
+
+/** Index into `MULTILINGUAL_MODELS` selecting the multilingual model. */
+val MULTILINGUAL_MODEL_INDEX: SettingsKey<Int> = SettingsKey(
+    key = intPreferencesKey("multilingual_model_index"),
+    default = 1,
+)
+
+/**
+ * Language strings that are parsed with `getLanguageFromWhisperString` and handed to the
+ * decoding configuration as its `languages`.
+ */
+val LANGUAGE_TOGGLES: SettingsKey<Set<String>> = SettingsKey(
+    key = stringSetPreferencesKey("enabled_languages"),
+    default = emptySet(),
+)
--- a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
@ -3,8 +3,6 @@ package org.futo.inputmethod.latin.uix.actions
 import androidx.compose.foundation.clickable
 import androidx.compose.foundation.interaction.MutableInteractionSource
 import androidx.compose.foundation.layout.Box
-import androidx.compose.foundation.layout.Column
-import androidx.compose.foundation.layout.ColumnScope
 import androidx.compose.foundation.layout.fillMaxSize
 import androidx.compose.runtime.Composable
 import androidx.compose.runtime.MutableState
@ -13,17 +11,40 @@ import androidx.compose.runtime.remember
 import androidx.compose.ui.Alignment
 import androidx.compose.ui.Modifier
 import androidx.compose.ui.res.stringResource
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.async
+import kotlinx.coroutines.coroutineScope
+import kotlinx.coroutines.launch
+import kotlinx.coroutines.withContext
+import kotlinx.coroutines.yield
 import org.futo.inputmethod.latin.R
 import org.futo.inputmethod.latin.uix.Action
 import org.futo.inputmethod.latin.uix.ActionInputTransaction
 import org.futo.inputmethod.latin.uix.ActionWindow
+import org.futo.inputmethod.latin.uix.DISALLOW_SYMBOLS
+import org.futo.inputmethod.latin.uix.ENABLE_ENGLISH
+import org.futo.inputmethod.latin.uix.ENABLE_MULTILINGUAL
+import org.futo.inputmethod.latin.uix.ENABLE_SOUND
+import org.futo.inputmethod.latin.uix.ENGLISH_MODEL_INDEX
 import org.futo.inputmethod.latin.uix.KeyboardManagerForAction
+import org.futo.inputmethod.latin.uix.LANGUAGE_TOGGLES
+import org.futo.inputmethod.latin.uix.MULTILINGUAL_MODEL_INDEX
 import org.futo.inputmethod.latin.uix.PersistentActionState
+import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
+import org.futo.inputmethod.latin.uix.getSetting
+import org.futo.voiceinput.shared.ENGLISH_MODELS
+import org.futo.voiceinput.shared.MULTILINGUAL_MODELS
+import org.futo.voiceinput.shared.ModelDoesNotExistException
 import org.futo.voiceinput.shared.RecognizerView
 import org.futo.voiceinput.shared.RecognizerViewListener
 import org.futo.voiceinput.shared.RecognizerViewSettings
 import org.futo.voiceinput.shared.SoundPlayer
+import org.futo.voiceinput.shared.types.Language
+import org.futo.voiceinput.shared.types.ModelLoader
+import org.futo.voiceinput.shared.types.getLanguageFromWhisperString
+import org.futo.voiceinput.shared.whisper.DecodingConfiguration
 import org.futo.voiceinput.shared.whisper.ModelManager
+import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration

 val SystemVoiceInputAction = Action(
    icon = R.drawable.mic_fill,
@ -44,34 +65,86 @@ class VoiceInputPersistentState(val manager: KeyboardManagerForAction) : Persist
        modelManager.cleanUp()
    }
 }
-val VoiceInputAction = Action(
-    icon = R.drawable.mic_fill,
-    name = R.string.voice_input_action_title,
-    simplePressImpl = null,
-    persistentState = { VoiceInputPersistentState(it) },

-    windowImpl = { manager, persistentState ->
-        val state = persistentState as VoiceInputPersistentState
-        object : ActionWindow, RecognizerViewListener {
-            private val recognizerView = RecognizerView(
-                context = manager.getContext(),
-                listener = this,
-                settings = RecognizerViewSettings(
+private class VoiceInputActionWindow(
+    val manager: KeyboardManagerForAction, val state: VoiceInputPersistentState
+) : ActionWindow, RecognizerViewListener {
+    val context = manager.getContext()
+
+    private var shouldPlaySounds: Boolean = false
+    /**
+     * Loads every voice input preference and assembles the [RecognizerViewSettings] for this
+     * window.
+     *
+     * Each preference is read in its own `async` child of the surrounding `coroutineScope`. As a
+     * side effect, [shouldPlaySounds] is updated from the [ENABLE_SOUND] preference.
+     */
+    private suspend fun loadSettings(): RecognizerViewSettings = coroutineScope {
+        val enableSound = async { context.getSetting(ENABLE_SOUND) }
+        val verboseFeedback = async { context.getSetting(VERBOSE_PROGRESS) }
+        val disallowSymbols = async { context.getSetting(DISALLOW_SYMBOLS) }
+        val enableEnglish = async { context.getSetting(ENABLE_ENGLISH) }
+        val englishModelIdx = async { context.getSetting(ENGLISH_MODEL_INDEX) }
+        val enableMultilingual = async { context.getSetting(ENABLE_MULTILINGUAL) }
+        val multilingualModelIdx = async { context.getSetting(MULTILINGUAL_MODEL_INDEX) }
+        val allowedLanguages = async {
+            context.getSetting(LANGUAGE_TOGGLES).mapNotNull { getLanguageFromWhisperString(it) }
+                .toSet()
+        }
+
+        // NOTE(review): the stored indices are used unchecked; presumably an index outside the
+        // model list would throw here — confirm the settings UI can never persist one.
+        val primaryModel = if (enableMultilingual.await()) {
+            MULTILINGUAL_MODELS[multilingualModelIdx.await()]
+        } else {
+            ENGLISH_MODELS[englishModelIdx.await()]
+        }
+
+        // English gets a dedicated model entry only when ENABLE_ENGLISH is set.
+        val languageSpecificModels = mutableMapOf<Language, ModelLoader>()
+        if (enableEnglish.await()) {
+            languageSpecificModels[Language.English] = ENGLISH_MODELS[englishModelIdx.await()]
+        }
+
+        shouldPlaySounds = enableSound.await()
+
+        return@coroutineScope RecognizerViewSettings(
            shouldShowInlinePartialResult = false,
-                    shouldShowVerboseFeedback = true
+            shouldShowVerboseFeedback = verboseFeedback.await(),
+            modelRunConfiguration = MultiModelRunConfiguration(
+                primaryModel = primaryModel, languageSpecificModels = languageSpecificModels
            ),
+            decodingConfiguration = DecodingConfiguration(
+                languages = allowedLanguages.await(), suppressSymbols = disallowSymbols.await()
+            )
+        )
+    }
+
+    private var recognizerView: MutableState<RecognizerView?> = mutableStateOf(null)
+
+    /**
+     * Start-up coroutine launched on the manager's lifecycle scope: loads the settings on
+     * [Dispatchers.IO], constructs the [RecognizerView], publishes it through the
+     * [recognizerView] state, then resets and starts it.
+     */
+    private val initJob = manager.getLifecycleScope().launch {
+        yield()
+        val settings = withContext(Dispatchers.IO) {
+            loadSettings()
+        }
+
+        yield()
+        val recognizerView = try {
+            RecognizerView(
                context = manager.getContext(),
+                listener = this@VoiceInputActionWindow,
+                settings = settings,
                lifecycleScope = manager.getLifecycleScope(),
                modelManager = state.modelManager
            )
+        } catch(e: ModelDoesNotExistException) {
+            // TODO: Show an error to the user, with an option to download
+            // close() cancels this job; no view has been published to the state yet at this point.
+            close()
+            return@launch
+        }

-            init {
+        this@VoiceInputActionWindow.recognizerView.value = recognizerView
+
+        yield()
        recognizerView.reset()
+
+        yield()
        recognizerView.start()
    }

    private var inputTransaction: ActionInputTransaction? = null
    private fun getOrStartInputTransaction(): ActionInputTransaction {
-                if(inputTransaction == null) {
+        if (inputTransaction == null) {
            inputTransaction = manager.createInputTransaction(true)
        }

@ -87,35 +160,38 @@ val VoiceInputAction = Action(
    override fun WindowContents() {
+        // The whole window is a click target that calls finish() on the recognizer view. Both the
+        // click and the content are no-ops while the recognizerView state still holds null.
        Box(modifier = Modifier
            .fillMaxSize()
-                    .clickable(
-                        enabled = true,
+            .clickable(enabled = true,
                onClickLabel = null,
-                        onClick = { recognizerView.finish() },
+                onClick = { recognizerView.value?.finish() },
                role = null,
                indication = null,
-                        interactionSource = remember { MutableInteractionSource() }
-                    )) {
+                interactionSource = remember { MutableInteractionSource() })) {
            Box(modifier = Modifier.align(Alignment.Center)) {
-                        recognizerView.Content()
+                recognizerView.value?.Content()
            }
        }
    }

    override fun close() {
-                recognizerView.cancel()
+        // Stop the start-up coroutine first, then cancel the recognizer view if one was created.
+        initJob.cancel()
+        recognizerView.value?.cancel()
    }

    private var wasFinished = false
    override fun cancelled() {
+        // Nothing to do when finished() has already set wasFinished.
-                if(!wasFinished) {
+        if (!wasFinished) {
+            // The cancel sound is played only when the ENABLE_SOUND preference allowed it.
+            if (shouldPlaySounds) {
                state.soundPlayer.playCancelSound()
+            }
            getOrStartInputTransaction().cancel()
        }
    }

    override fun recordingStarted() {
+        // Play the start sound only when loadSettings() found ENABLE_SOUND turned on.
+        if (shouldPlaySounds) {
            state.soundPlayer.playStartSound()
        }
+    }

    override fun finished(result: String) {
        wasFinished = true
@ -131,6 +207,15 @@ val VoiceInputAction = Action(
    // Always answers false and never invokes either callback.
    override fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean = false
-        }
+}
+
+/**
+ * Keyboard action for voice input. It has no simple-press behaviour (`simplePressImpl` is null);
+ * its window is a [VoiceInputActionWindow] that shares a [VoiceInputPersistentState].
+ */
+val VoiceInputAction = Action(icon = R.drawable.mic_fill,
+    name = R.string.voice_input_action_title,
+    simplePressImpl = null,
+    persistentState = { VoiceInputPersistentState(it) },
+    windowImpl = { manager, persistentState ->
+        VoiceInputActionWindow(
+            manager = manager, state = persistentState as VoiceInputPersistentState
+        )
    }
 )
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
@ -21,9 +21,7 @@ import com.konovalov.vad.config.SampleRate
 import com.konovalov.vad.models.VadModel
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.Job
-import kotlinx.coroutines.cancelAndJoin
 import kotlinx.coroutines.launch
-import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.withContext
 import kotlinx.coroutines.yield
 import org.futo.voiceinput.shared.types.AudioRecognizerListener
@ -51,11 +49,11 @@ data class AudioRecognizerSettings(
 class ModelDoesNotExistException(val models: List<ModelLoader>) : Throwable()

 class AudioRecognizer(
-    val context: Context,
-    val lifecycleScope: LifecycleCoroutineScope,
-    val modelManager: ModelManager,
-    val listener: AudioRecognizerListener,
-    val settings: AudioRecognizerSettings
+    private val context: Context,
+    private val lifecycleScope: LifecycleCoroutineScope,
+    modelManager: ModelManager,
+    private val listener: AudioRecognizerListener,
+    private val settings: AudioRecognizerSettings
 ) {
    private var isRecording = false
    private var recorder: AudioRecord? = null
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt
@ -1,15 +1,10 @@
 package org.futo.voiceinput.shared

 import android.content.Context
-import android.media.AudioAttributes
-import android.media.AudioAttributes.CONTENT_TYPE_SONIFICATION
-import android.media.AudioAttributes.USAGE_ASSISTANCE_SONIFICATION
-import android.media.SoundPool
 import androidx.compose.foundation.layout.Column
 import androidx.compose.runtime.Composable
 import androidx.compose.runtime.mutableStateOf
 import androidx.lifecycle.LifecycleCoroutineScope
-import kotlinx.coroutines.launch
 import org.futo.voiceinput.shared.types.AudioRecognizerListener
 import org.futo.voiceinput.shared.types.InferenceState
 import org.futo.voiceinput.shared.types.Language
@ -18,16 +13,16 @@ import org.futo.voiceinput.shared.ui.InnerRecognize
 import org.futo.voiceinput.shared.ui.PartialDecodingResult
 import org.futo.voiceinput.shared.ui.RecognizeLoadingCircle
 import org.futo.voiceinput.shared.ui.RecognizeMicError
-import org.futo.voiceinput.shared.util.ENABLE_SOUND
-import org.futo.voiceinput.shared.util.VERBOSE_PROGRESS
-import org.futo.voiceinput.shared.util.ValueFromSettings
 import org.futo.voiceinput.shared.whisper.DecodingConfiguration
 import org.futo.voiceinput.shared.whisper.ModelManager
 import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration

 data class RecognizerViewSettings(
    val shouldShowVerboseFeedback: Boolean,
-    val shouldShowInlinePartialResult: Boolean
+    val shouldShowInlinePartialResult: Boolean,
+
+    // Both configurations are forwarded unchanged into the AudioRecognizerSettings that
+    // RecognizerView builds for its AudioRecognizer.
+    val modelRunConfiguration: MultiModelRunConfiguration,
+    val decodingConfiguration: DecodingConfiguration
 )

 private val VerboseAnnotations = hashMapOf(
@ -192,14 +187,14 @@ class RecognizerView(
        }
    }

-    // TODO: Dummy settings, should get them from constructor
    private val recognizer: AudioRecognizer = AudioRecognizer(
-        context, lifecycleScope, modelManager, audioRecognizerListener, AudioRecognizerSettings(
-            modelRunConfiguration = MultiModelRunConfiguration(
-                primaryModel = ENGLISH_MODELS[0], languageSpecificModels = mapOf()
-            ), decodingConfiguration = DecodingConfiguration(
-                languages = setOf(), suppressSymbols = true
-            )
+        context = context,
+        lifecycleScope = lifecycleScope,
+        modelManager = modelManager,
+        listener = audioRecognizerListener,
+        // Model and decoding configuration come from the settings object of this view.
+        settings = AudioRecognizerSettings(
+            modelRunConfiguration = settings.modelRunConfiguration,
+            decodingConfiguration = settings.decodingConfiguration
        )
    )

--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/Tokens.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/Tokens.kt
@ -2,10 +2,6 @@ package org.futo.voiceinput.shared.types

 import org.futo.voiceinput.shared.whisper.stringifyUnicode

-enum class SpecialTokenKind {
-    StartOfTranscript, EndOfText, Translate, Transcribe, NoCaptions, NoTimestamps,
-}
-
 // Based on https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L236
 private val SYMBOLS = "#()*+/:;<=>@[\\]^_`{|}~「」『』".chunked(1) + listOf(
    "<<",
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/ArrayUtils.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/ArrayUtils.kt
@ -12,10 +12,6 @@ fun Array<DoubleArray>.shape(): IntArray {
    return arrayOf(size, this[0].size).toIntArray()
 }

-fun DoubleArray.toFloatArray(): FloatArray {
-    return this.map { it.toFloat() }.toFloatArray()
-}
-
 /** Widens every element of this array to a [Double], keeping the element order. */
 fun FloatArray.toDoubleArray(): DoubleArray = DoubleArray(size) { index -> this[index].toDouble() }
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/AudioFeatureExtraction.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/AudioFeatureExtraction.kt
@ -69,10 +69,6 @@ fun melToFreq(mels: DoubleArray, melScale: MelScale): DoubleArray {
    return mels.map { melToFreq(it, melScale) }.toDoubleArray()
 }

-fun freqToMel(freqs: DoubleArray, melScale: MelScale): DoubleArray {
-    return freqs.map { freqToMel(it, melScale) }.toDoubleArray()
-}
-
 fun linspace(min: Double, max: Double, num: Int): DoubleArray {
    val array = DoubleArray(num)
    val spacing = (max - min) / ((num - 1).toDouble())
@ -170,11 +166,11 @@ fun melFilterBank(
 fun padY(yValues: DoubleArray, nFFT: Int): DoubleArray {
    val ypad = DoubleArray(nFFT + yValues.size)
+    // Fill nFFT / 2 slots on each side with the input mirrored around its first and last sample
+    // (the edge sample itself is not repeated).
+    // NOTE(review): reads yValues[i + 1] and yValues[size - 2 - i] for i < nFFT / 2, so this
+    // presumably requires yValues.size > nFFT / 2 — confirm callers guarantee it.
    for (i in 0 until nFFT / 2) {
-        ypad[nFFT / 2 - i - 1] = yValues[i + 1].toDouble()
-        ypad[nFFT / 2 + yValues.size + i] = yValues[yValues.size - 2 - i].toDouble()
+        ypad[nFFT / 2 - i - 1] = yValues[i + 1]
+        ypad[nFFT / 2 + yValues.size + i] = yValues[yValues.size - 2 - i]
    }
+    // Copy the original samples into the middle of the padded buffer.
    for (j in yValues.indices) {
-        ypad[nFFT / 2 + j] = yValues[j].toDouble()
+        ypad[nFFT / 2 + j] = yValues[j]
    }
    return ypad
 }
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/Settings.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/util/Settings.kt
@ -1,58 +0,0 @@
-package org.futo.voiceinput.shared.util
-
-import android.content.Context
-import androidx.datastore.core.DataStore
-import androidx.datastore.preferences.core.Preferences
-import androidx.datastore.preferences.core.booleanPreferencesKey
-import androidx.datastore.preferences.core.intPreferencesKey
-import androidx.datastore.preferences.core.stringSetPreferencesKey
-import androidx.datastore.preferences.preferencesDataStore
-import kotlinx.coroutines.flow.Flow
-import kotlinx.coroutines.flow.first
-import kotlinx.coroutines.flow.map
-import kotlinx.coroutines.flow.take
-
-class ValueFromSettings<T>(val key: Preferences.Key<T>, val default: T) {
-    private var _value = default
-
-    val value: T
-        get() {
-            return _value
-        }
-
-    suspend fun load(context: Context, onResult: ((T) -> Unit)? = null) {
-        val valueFlow: Flow<T> =
-            context.dataStore.data.map { preferences -> preferences[key] ?: default }.take(1)
-
-        valueFlow.collect {
-            _value = it
-
-            if (onResult != null) {
-                onResult(it)
-            }
-        }
-    }
-
-    suspend fun get(context: Context): T {
-        val valueFlow: Flow<T> =
-            context.dataStore.data.map { preferences -> preferences[key] ?: default }.take(1)
-
-        return valueFlow.first()
-    }
-}
-
-
-val Context.dataStore: DataStore<Preferences> by preferencesDataStore(name = "settingsVoice")
-val ENABLE_SOUND = booleanPreferencesKey("enable_sounds")
-val VERBOSE_PROGRESS = booleanPreferencesKey("verbose_progress")
-val ENABLE_ENGLISH = booleanPreferencesKey("enable_english")
-val ENABLE_MULTILINGUAL = booleanPreferencesKey("enable_multilingual")
-val DISALLOW_SYMBOLS = booleanPreferencesKey("disallow_symbols")
-
-val ENGLISH_MODEL_INDEX = intPreferencesKey("english_model_index")
-val ENGLISH_MODEL_INDEX_DEFAULT = 0
-
-val MULTILINGUAL_MODEL_INDEX = intPreferencesKey("multilingual_model_index")
-val MULTILINGUAL_MODEL_INDEX_DEFAULT = 1
-
-val LANGUAGE_TOGGLES = stringSetPreferencesKey("enabled_languages")
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/whisper/Tokenizer.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/whisper/Tokenizer.kt
@ -6,7 +6,6 @@ import kotlinx.serialization.json.int
 import kotlinx.serialization.json.jsonObject
 import kotlinx.serialization.json.jsonPrimitive
 import org.futo.voiceinput.shared.types.Language
-import org.futo.voiceinput.shared.types.SpecialTokenKind
 import org.futo.voiceinput.shared.types.getLanguageFromWhisperString
 import org.futo.voiceinput.shared.types.getSymbolTokens
 import org.futo.voiceinput.shared.util.loadTextFromFile
@ -14,8 +13,8 @@ import org.futo.voiceinput.shared.util.loadTextFromResource
 import java.io.File

 class Tokenizer(tokenJson: String) {
-    val idToToken: Array<String?>
-    val tokenToId: HashMap<String, Int> = hashMapOf()
+    private val idToToken: Array<String?>
+    private val tokenToId: HashMap<String, Int> = hashMapOf()

    val symbolTokens: IntArray

@ -26,8 +25,8 @@ class Tokenizer(tokenJson: String) {
    val noTimestampsToken: Int
    val transcribeToken: Int

-    val startOfLanguages: Int
-    val endOfLanguages: Int
+    private val startOfLanguages: Int
+    private val endOfLanguages: Int

    init {
        val data = Json.parseToJsonElement(tokenJson)
@ -65,19 +64,6 @@ class Tokenizer(tokenJson: String) {
        return tokenToId[token]
    }

-
-    fun toSpecialToken(token: Int): SpecialTokenKind? {
-        return when (token) {
-            decodeStartToken -> SpecialTokenKind.StartOfTranscript
-            decodeEndToken -> SpecialTokenKind.EndOfText
-            translateToken -> SpecialTokenKind.Translate
-            noCaptionsToken -> SpecialTokenKind.NoCaptions
-            noTimestampsToken -> SpecialTokenKind.NoTimestamps
-            transcribeToken -> SpecialTokenKind.Transcribe
-            else -> null
-        }
-    }
-
    fun toLanguage(token: Int): Language? {
        if ((token < startOfLanguages) || (token > endOfLanguages)) return null