From 3acb8b5e44e5a7c40ea6151bc5c260a529eb3c45 Mon Sep 17 00:00:00 2001 From: Aleksandras Kostarevas Date: Thu, 31 Aug 2023 19:15:50 +0300 Subject: [PATCH] Create SoundPlayer for persistent state --- .../org/futo/inputmethod/latin/uix/Action.kt | 6 + .../latin/uix/actions/VoiceInputAction.kt | 73 +++++--- .../futo/voiceinput/shared/AudioRecognizer.kt | 53 ++---- .../futo/voiceinput/shared/RecognizerView.kt | 174 ++++++++---------- .../org/futo/voiceinput/shared/SoundPlayer.kt | 62 +++++++ .../shared/types/AudioRecognizerListener.kt | 21 +++ .../voiceinput/shared/ui/RecognizeViews.kt | 2 +- 7 files changed, 226 insertions(+), 165 deletions(-) create mode 100644 voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt create mode 100644 voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt diff --git a/java/src/org/futo/inputmethod/latin/uix/Action.kt b/java/src/org/futo/inputmethod/latin/uix/Action.kt index 38d607798..57ce0ec30 100644 --- a/java/src/org/futo/inputmethod/latin/uix/Action.kt +++ b/java/src/org/futo/inputmethod/latin/uix/Action.kt @@ -36,6 +36,12 @@ interface ActionWindow { } interface PersistentActionState { + /** + * When called, the device may be on low memory and is requesting the action to clean up its + * state. You can close any resources that may not be necessary anymore. This will never be + * called when the action window is currently open. The PersistentActionState will stick around + * after this. 
+ */ suspend fun cleanUp() } diff --git a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt index e1b7c009e..d6f38a76d 100644 --- a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt +++ b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt @@ -15,6 +15,9 @@ import org.futo.inputmethod.latin.uix.ActionWindow import org.futo.inputmethod.latin.uix.KeyboardManagerForAction import org.futo.inputmethod.latin.uix.PersistentActionState import org.futo.voiceinput.shared.RecognizerView +import org.futo.voiceinput.shared.RecognizerViewListener +import org.futo.voiceinput.shared.RecognizerViewSettings +import org.futo.voiceinput.shared.SoundPlayer import org.futo.voiceinput.shared.whisper.ModelManager val SystemVoiceInputAction = Action( @@ -29,7 +32,8 @@ val SystemVoiceInputAction = Action( class VoiceInputPersistentState(val manager: KeyboardManagerForAction) : PersistentActionState { - var modelManager: ModelManager = ModelManager(manager.getContext()) + val modelManager = ModelManager(manager.getContext()) + val soundPlayer = SoundPlayer(manager.getContext()) override suspend fun cleanUp() { modelManager.cleanUp() @@ -43,29 +47,21 @@ val VoiceInputAction = Action( windowImpl = { manager, persistentState -> val state = persistentState as VoiceInputPersistentState - object : ActionWindow, RecognizerView(manager.getContext(), manager.getLifecycleScope(), state.modelManager) { + object : ActionWindow, RecognizerViewListener { + private val recognizerView = RecognizerView( + context = manager.getContext(), + listener = this, + settings = RecognizerViewSettings( + shouldShowInlinePartialResult = false, + shouldShowVerboseFeedback = true + ), + lifecycleScope = manager.getLifecycleScope(), + modelManager = state.modelManager + ) + init { - this.reset() - this.init() - } - - override fun onCancel() { - this.reset() - manager.closeActionWindow() - } - - override 
fun sendResult(result: String) { - manager.typeText(result) - onCancel() - } - - override fun sendPartialResult(result: String): Boolean { - manager.typePartialText(result) - return true - } - - override fun requestPermission() { - permissionResultRejected() + recognizerView.reset() + recognizerView.start() } @Composable @@ -77,14 +73,39 @@ val VoiceInputAction = Action( override fun WindowContents() { Box(modifier = Modifier.fillMaxSize()) { Box(modifier = Modifier.align(Alignment.Center)) { - Content() + recognizerView.Content() } } } override fun close() { - this.reset() - //soundPool.release() + recognizerView.cancel() + } + + private var wasFinished = false + override fun cancelled() { + if(!wasFinished) { + state.soundPlayer.playCancelSound() + } + } + + override fun recordingStarted() { + state.soundPlayer.playStartSound() + } + + override fun finished(result: String) { + wasFinished = true + + manager.typeText(result) + manager.closeActionWindow() + } + + override fun partialResult(result: String) { + manager.typePartialText(result) + } + + override fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean { + return false } } } diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt index fbf51b8d4..b819a87ec 100644 --- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt +++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt @@ -26,8 +26,10 @@ import kotlinx.coroutines.launch import kotlinx.coroutines.runBlocking import kotlinx.coroutines.withContext import kotlinx.coroutines.yield +import org.futo.voiceinput.shared.types.AudioRecognizerListener import org.futo.voiceinput.shared.types.InferenceState import org.futo.voiceinput.shared.types.Language +import org.futo.voiceinput.shared.types.MagnitudeState import org.futo.voiceinput.shared.types.ModelInferenceCallback 
import org.futo.voiceinput.shared.types.ModelLoader import org.futo.voiceinput.shared.whisper.DecodingConfiguration @@ -41,27 +43,6 @@ import kotlin.math.min import kotlin.math.pow import kotlin.math.sqrt -enum class MagnitudeState { - NOT_TALKED_YET, MIC_MAY_BE_BLOCKED, TALKING -} - -interface AudioRecognizerListener { - fun cancelled() - fun finished(result: String) - fun languageDetected(language: Language) - fun partialResult(result: String) - fun decodingStatus(status: InferenceState) - - fun loading() - fun needPermission() - fun permissionRejected() - - fun recordingStarted() - fun updateMagnitude(magnitude: Float, state: MagnitudeState) - - fun processing() -} - data class AudioRecognizerSettings( val modelRunConfiguration: MultiModelRunConfiguration, val decodingConfiguration: DecodingConfiguration @@ -69,8 +50,6 @@ data class AudioRecognizerSettings( class ModelDoesNotExistException(val models: List) : Throwable() -// Ideally this shouldn't load the models at all, we should have something else that loads it -// and gives the model to AudioRecognizer class AudioRecognizer( val context: Context, val lifecycleScope: LifecycleCoroutineScope, @@ -122,11 +101,11 @@ class AudioRecognizer( isRecording = false } - fun finishRecognizer() { + fun finish() { onFinishRecording() } - fun cancelRecognizer() { + fun cancel() { reset() listener.cancelled() } @@ -142,25 +121,25 @@ class AudioRecognizer( myAppSettings.flags = Intent.FLAG_ACTIVITY_NEW_TASK context.startActivity(myAppSettings) - cancelRecognizer() + cancel() } - fun create() { + fun start() { listener.loading() if (context.checkSelfPermission(Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) { - listener.needPermission() + requestPermission() } else { startRecording() } } - fun permissionResultGranted() { - startRecording() - } - - fun permissionResultRejected() { - listener.permissionRejected() + private fun requestPermission() { + listener.needPermission { wasGranted -> + 
if(wasGranted) { + startRecording() + } + } } @Throws(SecurityException::class) @@ -219,7 +198,7 @@ class AudioRecognizer( if (isRunningOutOfSpace || hasNotTalkedRecently) { yield() withContext(Dispatchers.Main) { - finishRecognizer() + finish() } return } @@ -305,7 +284,7 @@ class AudioRecognizer( if (floatSamples.remaining() < nRead2) { yield() withContext(Dispatchers.Main) { - finishRecognizer() + finish() } break } @@ -333,7 +312,7 @@ class AudioRecognizer( createAudioRecorder() } catch (e: SecurityException) { // It's possible we may have lost permission, so let's just ask for permission again - listener.needPermission() + requestPermission() return } diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt index fb0c0d7fc..06b90d27e 100644 --- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt +++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt @@ -10,8 +10,10 @@ import androidx.compose.runtime.Composable import androidx.compose.runtime.mutableStateOf import androidx.lifecycle.LifecycleCoroutineScope import kotlinx.coroutines.launch +import org.futo.voiceinput.shared.types.AudioRecognizerListener import org.futo.voiceinput.shared.types.InferenceState import org.futo.voiceinput.shared.types.Language +import org.futo.voiceinput.shared.types.MagnitudeState import org.futo.voiceinput.shared.ui.InnerRecognize import org.futo.voiceinput.shared.ui.PartialDecodingResult import org.futo.voiceinput.shared.ui.RecognizeLoadingCircle @@ -23,50 +25,49 @@ import org.futo.voiceinput.shared.whisper.DecodingConfiguration import org.futo.voiceinput.shared.whisper.ModelManager import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration -abstract class RecognizerView( +data class RecognizerViewSettings( + val shouldShowVerboseFeedback: Boolean, + val shouldShowInlinePartialResult: Boolean +) + +private 
val VerboseAnnotations = hashMapOf( + InferenceState.ExtractingMel to R.string.extracting_features, + InferenceState.LoadingModel to R.string.loading_model, + InferenceState.Encoding to R.string.encoding, + InferenceState.DecodingLanguage to R.string.decoding, + InferenceState.SwitchingModel to R.string.switching_model, + InferenceState.DecodingStarted to R.string.decoding +) + +private val DefaultAnnotations = hashMapOf( + InferenceState.ExtractingMel to R.string.processing, + InferenceState.LoadingModel to R.string.processing, + InferenceState.Encoding to R.string.processing, + InferenceState.DecodingLanguage to R.string.processing, + InferenceState.SwitchingModel to R.string.switching_model, + InferenceState.DecodingStarted to R.string.processing +) + +interface RecognizerViewListener { + fun cancelled() + + fun recordingStarted() + + fun finished(result: String) + + fun partialResult(result: String) + + // Return true if a permission modal was shown, otherwise return false + fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean +} + +class RecognizerView( private val context: Context, - private val lifecycleScope: LifecycleCoroutineScope, - private val modelManager: ModelManager + private val listener: RecognizerViewListener, + private val settings: RecognizerViewSettings, + lifecycleScope: LifecycleCoroutineScope, + modelManager: ModelManager ) { - // TODO: Should not get settings here, pass settings to constructor - private val shouldPlaySounds: ValueFromSettings = ValueFromSettings(ENABLE_SOUND, true) - private val shouldBeVerbose: ValueFromSettings = - ValueFromSettings(VERBOSE_PROGRESS, false) - - // TODO: SoundPool should be managed by parent, not by view, as the view is short-lived - /* val soundPool: SoundPool = SoundPool.Builder().setMaxStreams(2).setAudioAttributes( - AudioAttributes.Builder().setUsage(USAGE_ASSISTANCE_SONIFICATION) - .setContentType(CONTENT_TYPE_SONIFICATION).build() - ).build()*/ - - private var startSoundId: 
Int = -1 - private var cancelSoundId: Int = -1 - - abstract fun onCancel() - abstract fun sendResult(result: String) - abstract fun sendPartialResult(result: String): Boolean - abstract fun requestPermission() - - companion object { - private val verboseAnnotations = hashMapOf( - InferenceState.ExtractingMel to R.string.extracting_features, - InferenceState.LoadingModel to R.string.loading_model, - InferenceState.Encoding to R.string.encoding, - InferenceState.DecodingLanguage to R.string.decoding, - InferenceState.SwitchingModel to R.string.switching_model, - InferenceState.DecodingStarted to R.string.decoding - ) - - private val defaultAnnotations = hashMapOf( - InferenceState.ExtractingMel to R.string.processing, - InferenceState.LoadingModel to R.string.processing, - InferenceState.Encoding to R.string.processing, - InferenceState.DecodingLanguage to R.string.processing, - InferenceState.SwitchingModel to R.string.switching_model, - InferenceState.DecodingStarted to R.string.processing - ) - } - private val magnitudeState = mutableStateOf(0.0f) private val statusState = mutableStateOf(MagnitudeState.NOT_TALKED_YET) @@ -96,7 +97,7 @@ abstract class RecognizerView( CurrentView.InnerRecognize -> { Column { InnerRecognize( - onFinish = { recognizer.finishRecognizer() }, + onFinish = { recognizer.finish() }, magnitude = magnitudeState, state = statusState ) @@ -111,37 +112,17 @@ abstract class RecognizerView( } } - fun onClose() { - recognizer.cancelRecognizer() + fun cancel() { + recognizer.cancel() } - private val listener = object : AudioRecognizerListener { - // Tries to play a sound. 
If it's not yet ready, plays it when it's ready - private fun playSound(id: Int) { - /* - lifecycleScope.launch { - shouldPlaySounds.load(context) { - if (it) { - if (soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f) == 0) { - soundPool.setOnLoadCompleteListener { soundPool, sampleId, status -> - if ((sampleId == id) && (status == 0)) { - soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f) - } - } - } - } - } - } - */ - } - + private val audioRecognizerListener = object : AudioRecognizerListener { override fun cancelled() { - playSound(cancelSoundId) - onCancel() + listener.cancelled() } override fun finished(result: String) { - sendResult(result) + listener.finished(result) } override fun languageDetected(language: Language) { @@ -149,20 +130,19 @@ abstract class RecognizerView( } override fun partialResult(result: String) { - if (!sendPartialResult(result)) { - if (result.isNotBlank()) { - partialDecodingText.value = result - currentViewState.value = CurrentView.PartialDecodingResult - } + listener.partialResult(result) + if (settings.shouldShowInlinePartialResult && result.isNotBlank()) { + partialDecodingText.value = result + currentViewState.value = CurrentView.PartialDecodingResult } } override fun decodingStatus(status: InferenceState) { val text = context.getString( - when (shouldBeVerbose.value) { - true -> verboseAnnotations[status]!! - false -> defaultAnnotations[status]!! + when (settings.shouldShowVerboseFeedback) { + true -> VerboseAnnotations[status]!! + false -> DefaultAnnotations[status]!! 
} ) @@ -175,18 +155,25 @@ abstract class RecognizerView( currentViewState.value = CurrentView.LoadingCircle } - override fun needPermission() { - requestPermission() - } + override fun needPermission(onResult: (Boolean) -> Unit) { + val shown = listener.requestPermission( + onGranted = { + onResult(true) + }, + onRejected = { + onResult(false) + currentViewState.value = CurrentView.PermissionError + } + ) - override fun permissionRejected() { - currentViewState.value = CurrentView.PermissionError + if(!shown) { + currentViewState.value = CurrentView.PermissionError + } } override fun recordingStarted() { updateMagnitude(0.0f, MagnitudeState.NOT_TALKED_YET) - - playSound(startSoundId) + listener.recordingStarted() } override fun updateMagnitude(magnitude: Float, state: MagnitudeState) { @@ -203,7 +190,7 @@ abstract class RecognizerView( // TODO: Dummy settings, should get them from constructor private val recognizer: AudioRecognizer = AudioRecognizer( - context, lifecycleScope, modelManager, listener, AudioRecognizerSettings( + context, lifecycleScope, modelManager, audioRecognizerListener, AudioRecognizerSettings( modelRunConfiguration = MultiModelRunConfiguration( primaryModel = ENGLISH_MODELS[0], languageSpecificModels = mapOf() ), decodingConfiguration = DecodingConfiguration( @@ -216,22 +203,7 @@ abstract class RecognizerView( recognizer.reset() } - fun init() { - lifecycleScope.launch { - shouldBeVerbose.load(context) - } - - //startSoundId = soundPool.load(this.context, R.raw.start, 0) - //cancelSoundId = soundPool.load(this.context, R.raw.cancel, 0) - - recognizer.create() - } - - fun permissionResultGranted() { - recognizer.permissionResultGranted() - } - - fun permissionResultRejected() { - recognizer.permissionResultRejected() + fun start() { + recognizer.start() } } diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt new file mode 100644 index 
000000000..56a9bf0d4 --- /dev/null +++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt @@ -0,0 +1,62 @@ +package org.futo.voiceinput.shared + +import android.content.Context +import android.media.AudioAttributes +import android.media.AudioAttributes.CONTENT_TYPE_SONIFICATION +import android.media.AudioAttributes.USAGE_ASSISTANCE_SONIFICATION +import android.media.SoundPool +import java.io.Closeable + +// soundPool.play returns 0 on failure +private const val SoundPoolPlayFailure = 0 + +// status in OnLoadCompleteListener is 0 when successful +private const val LoadStatusSuccess = 0 + +class SoundPlayer( + private val context: Context +): Closeable { + private val soundPool: SoundPool = SoundPool.Builder().setMaxStreams(2).setAudioAttributes( + AudioAttributes.Builder().setUsage(USAGE_ASSISTANCE_SONIFICATION) + .setContentType(CONTENT_TYPE_SONIFICATION).build() + ).build() + + private var startSound: Int = -1 + private var cancelSound: Int = -1 + + init { + startSound = soundPool.load(this.context, R.raw.start, 0) + cancelSound = soundPool.load(this.context, R.raw.cancel, 0) + } + + override fun close() { + soundPool.release() + } + + // Returns true if successful, false if failed + private fun playSound(id: Int): Boolean { + return when(soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f)) { + SoundPoolPlayFailure -> false + else -> true + } + } + + // Tries to play a sound. 
If it's not yet ready, plays it when it's ready + private fun playSoundOrLoad(id: Int) { + if (!playSound(id)) { + soundPool.setOnLoadCompleteListener { _, sampleId, status -> + if ((sampleId == id) && (status == LoadStatusSuccess)) { + playSound(id) + } + } + } + } + + fun playStartSound() { + playSoundOrLoad(startSound) + } + + fun playCancelSound() { + playSoundOrLoad(cancelSound) + } +} \ No newline at end of file diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt new file mode 100644 index 000000000..15a4dccef --- /dev/null +++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt @@ -0,0 +1,21 @@ +package org.futo.voiceinput.shared.types + +enum class MagnitudeState { + NOT_TALKED_YET, MIC_MAY_BE_BLOCKED, TALKING +} + +interface AudioRecognizerListener { + fun cancelled() + fun finished(result: String) + fun languageDetected(language: Language) + fun partialResult(result: String) + fun decodingStatus(status: InferenceState) + + fun loading() + fun needPermission(onResult: (Boolean) -> Unit) + + fun recordingStarted() + fun updateMagnitude(magnitude: Float, state: MagnitudeState) + + fun processing() +} \ No newline at end of file diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt index dfb879ce5..cc40d0a72 100644 --- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt +++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt @@ -27,8 +27,8 @@ import androidx.compose.ui.res.painterResource import androidx.compose.ui.res.stringResource import androidx.compose.ui.text.style.TextAlign import androidx.compose.ui.unit.dp -import org.futo.voiceinput.shared.MagnitudeState import 
org.futo.voiceinput.shared.R +import org.futo.voiceinput.shared.types.MagnitudeState import org.futo.voiceinput.shared.ui.theme.Typography