Load voice input settings instead of hardcoding

This commit is contained in:
Aleksandras Kostarevas 2023-09-01 08:51:42 +03:00
parent af42223a0c
commit 7f656bb622
10 changed files with 246 additions and 190 deletions

View File

@ -69,5 +69,22 @@ fun <T> LifecycleOwner.deferSetSetting(key: Preferences.Key<T>, value: T): Job {
}
}
/**
 * Pairs a DataStore preference key with the value to use when nothing is stored under it.
 *
 * @param T type of the stored value.
 * @property key the underlying [Preferences.Key] used to read and write the value.
 * @property default value handed back to readers when the preference is absent.
 */
data class SettingsKey<T>(val key: Preferences.Key<T>, val default: T)
/**
 * Reads the current value of [key] from this context's preferences data store.
 *
 * Suspends until the data store emits its first snapshot, then looks the key up in it.
 *
 * @param key the setting to read.
 * @return the stored value, or [SettingsKey.default] when nothing is stored under the key.
 */
suspend fun <T> Context.getSetting(key: SettingsKey<T>): T {
    val snapshot = dataStore.data.first()
    return snapshot[key.key] ?: key.default
}
/**
 * Stores [value] under [key] in this context's preferences data store.
 *
 * Suspends until the data store's `edit` call returns.
 *
 * @param key the setting to write.
 * @param value the new value to persist.
 */
suspend fun <T> Context.setSetting(key: SettingsKey<T>, value: T) {
    dataStore.edit { mutablePrefs -> mutablePrefs[key.key] = value }
}
/**
 * String preference stored under the name "activeThemeOption".
 * NOTE(review): presumably identifies the currently selected theme — confirm against the theme code.
 */
val THEME_KEY: Preferences.Key<String> = stringPreferencesKey("activeThemeOption")

View File

@ -0,0 +1,45 @@
package org.futo.inputmethod.latin.uix
import androidx.datastore.preferences.core.booleanPreferencesKey
import androidx.datastore.preferences.core.intPreferencesKey
import androidx.datastore.preferences.core.stringSetPreferencesKey
/** Boolean preference "enable_sounds" (default `true`); the voice input action reads it to decide whether to play sounds. */
val ENABLE_SOUND: SettingsKey<Boolean> = SettingsKey(
    key = booleanPreferencesKey("enable_sounds"),
    default = true
)

/** Boolean preference "verbose_progress" (default `false`); forwarded as the recognizer's verbose-feedback flag. */
val VERBOSE_PROGRESS: SettingsKey<Boolean> = SettingsKey(
    key = booleanPreferencesKey("verbose_progress"),
    default = false
)

/** Boolean preference "enable_english" (default `true`); when set, an English-specific model is registered. */
val ENABLE_ENGLISH: SettingsKey<Boolean> = SettingsKey(
    key = booleanPreferencesKey("enable_english"),
    default = true
)

/** Boolean preference "enable_multilingual" (default `false`); selects a multilingual model as the primary model. */
val ENABLE_MULTILINGUAL: SettingsKey<Boolean> = SettingsKey(
    key = booleanPreferencesKey("enable_multilingual"),
    default = false
)

/** Boolean preference "disallow_symbols" (default `true`); forwarded as the decoder's symbol-suppression flag. */
val DISALLOW_SYMBOLS: SettingsKey<Boolean> = SettingsKey(
    key = booleanPreferencesKey("disallow_symbols"),
    default = true
)

/** Int preference "english_model_index" (default `0`); used as an index into the list of English models. */
val ENGLISH_MODEL_INDEX: SettingsKey<Int> = SettingsKey(
    key = intPreferencesKey("english_model_index"),
    default = 0
)

/** Int preference "multilingual_model_index" (default `1`); used as an index into the list of multilingual models. */
val MULTILINGUAL_MODEL_INDEX: SettingsKey<Int> = SettingsKey(
    key = intPreferencesKey("multilingual_model_index"),
    default = 1
)

/** String-set preference "enabled_languages" (default: empty set); entries are parsed as Whisper language strings. */
val LANGUAGE_TOGGLES: SettingsKey<Set<String>> = SettingsKey(
    key = stringSetPreferencesKey("enabled_languages"),
    default = emptySet()
)

View File

@ -3,8 +3,6 @@ package org.futo.inputmethod.latin.uix.actions
import androidx.compose.foundation.clickable
import androidx.compose.foundation.interaction.MutableInteractionSource
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.ColumnScope
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.runtime.Composable
import androidx.compose.runtime.MutableState
@ -13,17 +11,40 @@ import androidx.compose.runtime.remember
import androidx.compose.ui.Alignment
import androidx.compose.ui.Modifier
import androidx.compose.ui.res.stringResource
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.coroutines.yield
import org.futo.inputmethod.latin.R
import org.futo.inputmethod.latin.uix.Action
import org.futo.inputmethod.latin.uix.ActionInputTransaction
import org.futo.inputmethod.latin.uix.ActionWindow
import org.futo.inputmethod.latin.uix.DISALLOW_SYMBOLS
import org.futo.inputmethod.latin.uix.ENABLE_ENGLISH
import org.futo.inputmethod.latin.uix.ENABLE_MULTILINGUAL
import org.futo.inputmethod.latin.uix.ENABLE_SOUND
import org.futo.inputmethod.latin.uix.ENGLISH_MODEL_INDEX
import org.futo.inputmethod.latin.uix.KeyboardManagerForAction
import org.futo.inputmethod.latin.uix.LANGUAGE_TOGGLES
import org.futo.inputmethod.latin.uix.MULTILINGUAL_MODEL_INDEX
import org.futo.inputmethod.latin.uix.PersistentActionState
import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
import org.futo.inputmethod.latin.uix.getSetting
import org.futo.voiceinput.shared.ENGLISH_MODELS
import org.futo.voiceinput.shared.MULTILINGUAL_MODELS
import org.futo.voiceinput.shared.ModelDoesNotExistException
import org.futo.voiceinput.shared.RecognizerView
import org.futo.voiceinput.shared.RecognizerViewListener
import org.futo.voiceinput.shared.RecognizerViewSettings
import org.futo.voiceinput.shared.SoundPlayer
import org.futo.voiceinput.shared.types.Language
import org.futo.voiceinput.shared.types.ModelLoader
import org.futo.voiceinput.shared.types.getLanguageFromWhisperString
import org.futo.voiceinput.shared.whisper.DecodingConfiguration
import org.futo.voiceinput.shared.whisper.ModelManager
import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration
val SystemVoiceInputAction = Action(
icon = R.drawable.mic_fill,
@ -44,93 +65,157 @@ class VoiceInputPersistentState(val manager: KeyboardManagerForAction) : Persist
modelManager.cleanUp()
}
}
val VoiceInputAction = Action(
icon = R.drawable.mic_fill,
name = R.string.voice_input_action_title,
simplePressImpl = null,
persistentState = { VoiceInputPersistentState(it) },
windowImpl = { manager, persistentState ->
val state = persistentState as VoiceInputPersistentState
object : ActionWindow, RecognizerViewListener {
private val recognizerView = RecognizerView(
private class VoiceInputActionWindow(
val manager: KeyboardManagerForAction, val state: VoiceInputPersistentState
) : ActionWindow, RecognizerViewListener {
val context = manager.getContext()
private var shouldPlaySounds: Boolean = false
/**
 * Reads the voice-input preferences and assembles the [RecognizerViewSettings] built from them.
 *
 * All preference reads are started up front with `async` and awaited where their value is needed.
 * Side effect: [shouldPlaySounds] is updated from the [ENABLE_SOUND] preference.
 *
 * NOTE(review): the stored model indices are used unchecked to index [ENGLISH_MODELS] /
 * [MULTILINGUAL_MODELS]; confirm an out-of-range stored value cannot occur.
 *
 * @return settings with inline partial results disabled and everything else taken from preferences.
 */
private suspend fun loadSettings(): RecognizerViewSettings = coroutineScope {
    val soundPref = async { context.getSetting(ENABLE_SOUND) }
    val verbosePref = async { context.getSetting(VERBOSE_PROGRESS) }
    val suppressSymbolsPref = async { context.getSetting(DISALLOW_SYMBOLS) }
    val englishPref = async { context.getSetting(ENABLE_ENGLISH) }
    val englishIndexPref = async { context.getSetting(ENGLISH_MODEL_INDEX) }
    val multilingualPref = async { context.getSetting(ENABLE_MULTILINGUAL) }
    val multilingualIndexPref = async { context.getSetting(MULTILINGUAL_MODEL_INDEX) }
    val languagesPref = async {
        val storedNames = context.getSetting(LANGUAGE_TOGGLES)
        // Entries that do not map to a known language are dropped.
        storedNames.mapNotNull { getLanguageFromWhisperString(it) }.toSet()
    }

    // The primary model comes from the multilingual list only when multilingual is enabled.
    val primaryModel = when (multilingualPref.await()) {
        true -> MULTILINGUAL_MODELS[multilingualIndexPref.await()]
        false -> ENGLISH_MODELS[englishIndexPref.await()]
    }

    val perLanguageModels = mutableMapOf<Language, ModelLoader>()
    if (englishPref.await()) {
        perLanguageModels[Language.English] = ENGLISH_MODELS[englishIndexPref.await()]
    }

    shouldPlaySounds = soundPref.await()

    val modelConfig = MultiModelRunConfiguration(
        primaryModel = primaryModel,
        languageSpecificModels = perLanguageModels
    )
    val decodingConfig = DecodingConfiguration(
        languages = languagesPref.await(),
        suppressSymbols = suppressSymbolsPref.await()
    )

    RecognizerViewSettings(
        shouldShowInlinePartialResult = false,
        shouldShowVerboseFeedback = verbosePref.await(),
        modelRunConfiguration = modelConfig,
        decodingConfiguration = decodingConfig
    )
}
private var recognizerView: MutableState<RecognizerView?> = mutableStateOf(null)
private val initJob = manager.getLifecycleScope().launch {
yield()
val settings = withContext(Dispatchers.IO) {
loadSettings()
}
yield()
val recognizerView = try {
RecognizerView(
context = manager.getContext(),
listener = this,
settings = RecognizerViewSettings(
shouldShowInlinePartialResult = false,
shouldShowVerboseFeedback = true
),
listener = this@VoiceInputActionWindow,
settings = settings,
lifecycleScope = manager.getLifecycleScope(),
modelManager = state.modelManager
)
} catch(e: ModelDoesNotExistException) {
// TODO: Show an error to the user, with an option to download
close()
return@launch
}
init {
recognizerView.reset()
recognizerView.start()
}
this@VoiceInputActionWindow.recognizerView.value = recognizerView
private var inputTransaction: ActionInputTransaction? = null
private fun getOrStartInputTransaction(): ActionInputTransaction {
if(inputTransaction == null) {
inputTransaction = manager.createInputTransaction(true)
}
yield()
recognizerView.reset()
return inputTransaction!!
}
yield()
recognizerView.start()
}
@Composable
override fun windowName(): String {
return stringResource(R.string.voice_input_action_title)
}
private var inputTransaction: ActionInputTransaction? = null
private fun getOrStartInputTransaction(): ActionInputTransaction {
if (inputTransaction == null) {
inputTransaction = manager.createInputTransaction(true)
}
@Composable
override fun WindowContents() {
Box(modifier = Modifier
.fillMaxSize()
.clickable(
enabled = true,
onClickLabel = null,
onClick = { recognizerView.finish() },
role = null,
indication = null,
interactionSource = remember { MutableInteractionSource() }
)) {
Box(modifier = Modifier.align(Alignment.Center)) {
recognizerView.Content()
}
}
}
return inputTransaction!!
}
override fun close() {
recognizerView.cancel()
}
@Composable
override fun windowName(): String {
return stringResource(R.string.voice_input_action_title)
}
private var wasFinished = false
override fun cancelled() {
if(!wasFinished) {
state.soundPlayer.playCancelSound()
getOrStartInputTransaction().cancel()
}
}
override fun recordingStarted() {
state.soundPlayer.playStartSound()
}
override fun finished(result: String) {
wasFinished = true
getOrStartInputTransaction().commit(result)
manager.closeActionWindow()
}
override fun partialResult(result: String) {
getOrStartInputTransaction().updatePartial(result)
}
override fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean {
return false
/**
 * Draws the action window: a full-size area that finishes recognition when tapped, with the
 * recognizer's own content centred inside it. Nothing is drawn in the centre (and taps do
 * nothing) while no recognizer view has been created yet.
 */
@Composable
override fun WindowContents() {
    val interactionSource = remember { MutableInteractionSource() }
    // No indication is passed, so the tap target shows no visual press feedback.
    val tapToFinish = Modifier
        .fillMaxSize()
        .clickable(
            interactionSource = interactionSource,
            indication = null,
            enabled = true,
            onClickLabel = null,
            role = null,
            onClick = { recognizerView.value?.finish() }
        )

    Box(modifier = tapToFinish) {
        Box(modifier = Modifier.align(Alignment.Center)) {
            recognizerView.value?.Content()
        }
    }
}
/**
 * Shuts the window down: cancels the pending initialisation job, then cancels the recognizer
 * view if one has already been created.
 */
override fun close() {
    initJob.cancel()

    val activeView = recognizerView.value
    activeView?.cancel()
}
private var wasFinished = false
/**
 * Recognizer callback for a cancelled session. Does nothing once [finished] has run; otherwise
 * plays the cancel sound (when sounds are enabled) and cancels the input transaction.
 */
override fun cancelled() {
    if (wasFinished) return

    if (shouldPlaySounds) state.soundPlayer.playCancelSound()
    getOrStartInputTransaction().cancel()
}
/** Recognizer callback for the start of recording; plays the start sound when sounds are enabled. */
override fun recordingStarted() {
    if (!shouldPlaySounds) return
    state.soundPlayer.playStartSound()
}
/**
 * Recognizer callback carrying the final transcription.
 *
 * Sets [wasFinished] first, so a later [cancelled] callback becomes a no-op, then commits
 * [result] to the input transaction and asks the manager to close the action window.
 *
 * @param result the final recognized text.
 */
override fun finished(result: String) {
    wasFinished = true

    val transaction = getOrStartInputTransaction()
    transaction.commit(result)

    manager.closeActionWindow()
}
/**
 * Recognizer callback carrying an intermediate transcription; forwards it to the input
 * transaction, creating the transaction on first use.
 *
 * @param result the text recognized so far.
 */
override fun partialResult(result: String) {
    val transaction = getOrStartInputTransaction()
    transaction.updatePartial(result)
}
/**
 * Recognizer callback asking for a permission. This implementation never invokes
 * [onGranted] or [onRejected] and always returns `false`.
 * NOTE(review): presumably `false` tells the recognizer the request was not handled — confirm
 * against RecognizerViewListener.
 */
override fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean = false
}
/**
 * Keyboard action that opens the voice input window.
 *
 * It has no simple-press behaviour; its persistent state is a [VoiceInputPersistentState] and
 * its window is a [VoiceInputActionWindow] sharing that state.
 */
val VoiceInputAction = Action(
    icon = R.drawable.mic_fill,
    name = R.string.voice_input_action_title,
    simplePressImpl = null,
    persistentState = { keyboardManager -> VoiceInputPersistentState(keyboardManager) },
    windowImpl = { manager, persistentState ->
        // The state handed back here is the one created by `persistentState` above.
        val voiceState = persistentState as VoiceInputPersistentState
        VoiceInputActionWindow(manager = manager, state = voiceState)
    }
)

View File

@ -21,9 +21,7 @@ import com.konovalov.vad.config.SampleRate
import com.konovalov.vad.models.VadModel
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.cancelAndJoin
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.withContext
import kotlinx.coroutines.yield
import org.futo.voiceinput.shared.types.AudioRecognizerListener
@ -51,11 +49,11 @@ data class AudioRecognizerSettings(
/**
 * Error carrying the list of [models] it concerns.
 * NOTE(review): presumably raised when required model files are missing on the device — confirm
 * at the throw site.
 *
 * This type extends [Throwable] directly rather than `Exception`, so a `catch (e: Exception)`
 * clause will not catch it.
 */
class ModelDoesNotExistException(val models: List<ModelLoader>) : Throwable()
class AudioRecognizer(
val context: Context,
val lifecycleScope: LifecycleCoroutineScope,
val modelManager: ModelManager,
val listener: AudioRecognizerListener,
val settings: AudioRecognizerSettings
private val context: Context,
private val lifecycleScope: LifecycleCoroutineScope,
modelManager: ModelManager,
private val listener: AudioRecognizerListener,
private val settings: AudioRecognizerSettings
) {
private var isRecording = false
private var recorder: AudioRecord? = null

View File

@ -1,15 +1,10 @@
package org.futo.voiceinput.shared
import android.content.Context
import android.media.AudioAttributes
import android.media.AudioAttributes.CONTENT_TYPE_SONIFICATION
import android.media.AudioAttributes.USAGE_ASSISTANCE_SONIFICATION
import android.media.SoundPool
import androidx.compose.foundation.layout.Column
import androidx.compose.runtime.Composable
import androidx.compose.runtime.mutableStateOf
import androidx.lifecycle.LifecycleCoroutineScope
import kotlinx.coroutines.launch
import org.futo.voiceinput.shared.types.AudioRecognizerListener
import org.futo.voiceinput.shared.types.InferenceState
import org.futo.voiceinput.shared.types.Language
@ -18,16 +13,16 @@ import org.futo.voiceinput.shared.ui.InnerRecognize
import org.futo.voiceinput.shared.ui.PartialDecodingResult
import org.futo.voiceinput.shared.ui.RecognizeLoadingCircle
import org.futo.voiceinput.shared.ui.RecognizeMicError
import org.futo.voiceinput.shared.util.ENABLE_SOUND
import org.futo.voiceinput.shared.util.VERBOSE_PROGRESS
import org.futo.voiceinput.shared.util.ValueFromSettings
import org.futo.voiceinput.shared.whisper.DecodingConfiguration
import org.futo.voiceinput.shared.whisper.ModelManager
import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration
data class RecognizerViewSettings(
val shouldShowVerboseFeedback: Boolean,
val shouldShowInlinePartialResult: Boolean
val shouldShowInlinePartialResult: Boolean,
val modelRunConfiguration: MultiModelRunConfiguration,
val decodingConfiguration: DecodingConfiguration
)
private val VerboseAnnotations = hashMapOf(
@ -192,14 +187,14 @@ class RecognizerView(
}
}
// TODO: Dummy settings, should get them from constructor
private val recognizer: AudioRecognizer = AudioRecognizer(
context, lifecycleScope, modelManager, audioRecognizerListener, AudioRecognizerSettings(
modelRunConfiguration = MultiModelRunConfiguration(
primaryModel = ENGLISH_MODELS[0], languageSpecificModels = mapOf()
), decodingConfiguration = DecodingConfiguration(
languages = setOf(), suppressSymbols = true
)
context = context,
lifecycleScope = lifecycleScope,
modelManager = modelManager,
listener = audioRecognizerListener,
settings = AudioRecognizerSettings(
modelRunConfiguration = settings.modelRunConfiguration,
decodingConfiguration = settings.decodingConfiguration
)
)

View File

@ -2,10 +2,6 @@ package org.futo.voiceinput.shared.types
import org.futo.voiceinput.shared.whisper.stringifyUnicode
/** Kinds of special (non-text) tokens a token id can be classified as. */
enum class SpecialTokenKind {
    StartOfTranscript,
    EndOfText,
    Translate,
    Transcribe,
    NoCaptions,
    NoTimestamps,
}
// Based on https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L236
private val SYMBOLS = "#()*+/:;<=>@[\\]^_`{|}~「」『』".chunked(1) + listOf(
"<<",

View File

@ -12,10 +12,6 @@ fun Array<DoubleArray>.shape(): IntArray {
return arrayOf(size, this[0].size).toIntArray()
}
/**
 * Converts every element to [Float] and returns the result as a new [FloatArray] of the same size.
 *
 * The array is filled directly through the sized constructor, so no intermediate boxed list
 * is allocated.
 */
fun DoubleArray.toFloatArray(): FloatArray {
    return FloatArray(size) { index -> this[index].toFloat() }
}
/**
 * Converts every element to [Double] and returns the result as a new [DoubleArray] of the same size.
 *
 * The array is filled directly through the sized constructor, so no intermediate boxed list
 * is allocated.
 */
fun FloatArray.toDoubleArray(): DoubleArray {
    return DoubleArray(size) { index -> this[index].toDouble() }
}

View File

@ -69,10 +69,6 @@ fun melToFreq(mels: DoubleArray, melScale: MelScale): DoubleArray {
return mels.map { melToFreq(it, melScale) }.toDoubleArray()
}
/**
 * Element-wise array form of the scalar `freqToMel`: converts each entry of [freqs] using
 * [melScale] and returns the results in a new array of the same length.
 */
fun freqToMel(freqs: DoubleArray, melScale: MelScale): DoubleArray {
    return DoubleArray(freqs.size) { index -> freqToMel(freqs[index], melScale) }
}
fun linspace(min: Double, max: Double, num: Int): DoubleArray {
val array = DoubleArray(num)
val spacing = (max - min) / ((num - 1).toDouble())
@ -170,11 +166,11 @@ fun melFilterBank(
/**
 * Pads [yValues] with `nFFT / 2` mirrored samples on each side.
 *
 * The left padding mirrors the samples after the first one (`yValues[1]`, `yValues[2]`, …,
 * outward from the signal); the right padding mirrors the samples before the last one. The
 * edge samples themselves are not repeated.
 *
 * The result always has `nFFT + yValues.size` elements. When [nFFT] is odd, only
 * `2 * (nFFT / 2)` padding slots are written, so the final element stays `0.0`.
 *
 * @param yValues the signal to pad; must contain at least `nFFT / 2 + 1` samples whenever
 *   `nFFT / 2 > 0`, otherwise indexing fails with an out-of-bounds exception.
 * @param nFFT the FFT window size that determines the amount of padding.
 * @return a new array holding the padded signal.
 */
fun padY(yValues: DoubleArray, nFFT: Int): DoubleArray {
    val half = nFFT / 2
    val ypad = DoubleArray(nFFT + yValues.size)

    for (i in 0 until half) {
        ypad[half - i - 1] = yValues[i + 1]
        ypad[half + yValues.size + i] = yValues[yValues.size - 2 - i]
    }

    // The original signal sits between the two mirrored borders.
    yValues.copyInto(ypad, destinationOffset = half)
    return ypad
}

View File

@ -1,58 +0,0 @@
package org.futo.voiceinput.shared.util
import android.content.Context
import androidx.datastore.core.DataStore
import androidx.datastore.preferences.core.Preferences
import androidx.datastore.preferences.core.booleanPreferencesKey
import androidx.datastore.preferences.core.intPreferencesKey
import androidx.datastore.preferences.core.stringSetPreferencesKey
import androidx.datastore.preferences.preferencesDataStore
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.first
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
/**
 * A preference [key] together with its [default], plus a cached copy of the last loaded value.
 *
 * [value] starts out as [default] and only changes when [load] is called; [get] reads the
 * data store without touching the cache.
 */
class ValueFromSettings<T>(val key: Preferences.Key<T>, val default: T) {
    private var _value = default

    /** The value cached by the most recent [load], or [default] if [load] has not completed yet. */
    val value: T
        get() = _value

    /**
     * Reads the preference once from [context]'s data store, caches it in [value] and, when
     * given, passes it to [onResult].
     */
    suspend fun load(context: Context, onResult: ((T) -> Unit)? = null) {
        context.dataStore.data
            .take(1)
            .collect { preferences ->
                val loaded = preferences[key] ?: default
                _value = loaded
                onResult?.invoke(loaded)
            }
    }

    /** Reads the preference once from [context]'s data store and returns it without caching. */
    suspend fun get(context: Context): T {
        val snapshot = context.dataStore.data.first()
        return snapshot[key] ?: default
    }
}
/** Preferences data store named "settingsVoice", exposed as an extension property on [Context]. */
val Context.dataStore: DataStore<Preferences> by preferencesDataStore(name = "settingsVoice")

/** Boolean preference stored as "enable_sounds". */
val ENABLE_SOUND: Preferences.Key<Boolean> = booleanPreferencesKey("enable_sounds")

/** Boolean preference stored as "verbose_progress". */
val VERBOSE_PROGRESS: Preferences.Key<Boolean> = booleanPreferencesKey("verbose_progress")

/** Boolean preference stored as "enable_english". */
val ENABLE_ENGLISH: Preferences.Key<Boolean> = booleanPreferencesKey("enable_english")

/** Boolean preference stored as "enable_multilingual". */
val ENABLE_MULTILINGUAL: Preferences.Key<Boolean> = booleanPreferencesKey("enable_multilingual")

/** Boolean preference stored as "disallow_symbols". */
val DISALLOW_SYMBOLS: Preferences.Key<Boolean> = booleanPreferencesKey("disallow_symbols")

/** Int preference stored as "english_model_index"; see [ENGLISH_MODEL_INDEX_DEFAULT]. */
val ENGLISH_MODEL_INDEX: Preferences.Key<Int> = intPreferencesKey("english_model_index")

/** Value intended for use when [ENGLISH_MODEL_INDEX] has not been stored. */
val ENGLISH_MODEL_INDEX_DEFAULT: Int = 0

/** Int preference stored as "multilingual_model_index"; see [MULTILINGUAL_MODEL_INDEX_DEFAULT]. */
val MULTILINGUAL_MODEL_INDEX: Preferences.Key<Int> = intPreferencesKey("multilingual_model_index")

/** Value intended for use when [MULTILINGUAL_MODEL_INDEX] has not been stored. */
val MULTILINGUAL_MODEL_INDEX_DEFAULT: Int = 1

/** String-set preference stored as "enabled_languages". */
val LANGUAGE_TOGGLES: Preferences.Key<Set<String>> = stringSetPreferencesKey("enabled_languages")

View File

@ -6,7 +6,6 @@ import kotlinx.serialization.json.int
import kotlinx.serialization.json.jsonObject
import kotlinx.serialization.json.jsonPrimitive
import org.futo.voiceinput.shared.types.Language
import org.futo.voiceinput.shared.types.SpecialTokenKind
import org.futo.voiceinput.shared.types.getLanguageFromWhisperString
import org.futo.voiceinput.shared.types.getSymbolTokens
import org.futo.voiceinput.shared.util.loadTextFromFile
@ -14,8 +13,8 @@ import org.futo.voiceinput.shared.util.loadTextFromResource
import java.io.File
class Tokenizer(tokenJson: String) {
val idToToken: Array<String?>
val tokenToId: HashMap<String, Int> = hashMapOf()
private val idToToken: Array<String?>
private val tokenToId: HashMap<String, Int> = hashMapOf()
val symbolTokens: IntArray
@ -26,8 +25,8 @@ class Tokenizer(tokenJson: String) {
val noTimestampsToken: Int
val transcribeToken: Int
val startOfLanguages: Int
val endOfLanguages: Int
private val startOfLanguages: Int
private val endOfLanguages: Int
init {
val data = Json.parseToJsonElement(tokenJson)
@ -65,19 +64,6 @@ class Tokenizer(tokenJson: String) {
return tokenToId[token]
}
/**
 * Classifies [token] as one of the tokenizer's special tokens.
 *
 * The token id is compared against the tokenizer's special-token ids in the order listed; the
 * first match wins.
 *
 * @param token the token id to classify.
 * @return the matching [SpecialTokenKind], or `null` when [token] is not one of them.
 */
fun toSpecialToken(token: Int): SpecialTokenKind? = when (token) {
    decodeStartToken -> SpecialTokenKind.StartOfTranscript
    decodeEndToken -> SpecialTokenKind.EndOfText
    translateToken -> SpecialTokenKind.Translate
    noCaptionsToken -> SpecialTokenKind.NoCaptions
    noTimestampsToken -> SpecialTokenKind.NoTimestamps
    transcribeToken -> SpecialTokenKind.Transcribe
    else -> null
}
fun toLanguage(token: Int): Language? {
if ((token < startOfLanguages) || (token > endOfLanguages)) return null