From 3acb8b5e44e5a7c40ea6151bc5c260a529eb3c45 Mon Sep 17 00:00:00 2001
From: Aleksandras Kostarevas <alex@futo.org>
Date: Thu, 31 Aug 2023 19:15:50 +0300
Subject: [PATCH] Create SoundPlayer for persistent state

---
 .../org/futo/inputmethod/latin/uix/Action.kt  |   6 +
 .../latin/uix/actions/VoiceInputAction.kt     |  73 +++++---
 .../futo/voiceinput/shared/AudioRecognizer.kt |  53 ++----
 .../futo/voiceinput/shared/RecognizerView.kt  | 174 ++++++++----------
 .../org/futo/voiceinput/shared/SoundPlayer.kt |  62 +++++++
 .../shared/types/AudioRecognizerListener.kt   |  21 +++
 .../voiceinput/shared/ui/RecognizeViews.kt    |   2 +-
 7 files changed, 226 insertions(+), 165 deletions(-)
 create mode 100644 voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt
 create mode 100644 voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt

diff --git a/java/src/org/futo/inputmethod/latin/uix/Action.kt b/java/src/org/futo/inputmethod/latin/uix/Action.kt
index 38d607798..57ce0ec30 100644
--- a/java/src/org/futo/inputmethod/latin/uix/Action.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/Action.kt
@@ -36,6 +36,12 @@ interface ActionWindow {
 }
 
 interface PersistentActionState {
+    /**
+     * When called, the device may be low on memory and is requesting the action to clean up its
+     * state. You can close any resources that may not be necessary anymore. This will never be
+     * called when the action window is currently open. The PersistentActionState will stick around
+     * after this.
+     */
     suspend fun cleanUp()
 }
 
diff --git a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
index e1b7c009e..d6f38a76d 100644
--- a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
@@ -15,6 +15,9 @@ import org.futo.inputmethod.latin.uix.ActionWindow
 import org.futo.inputmethod.latin.uix.KeyboardManagerForAction
 import org.futo.inputmethod.latin.uix.PersistentActionState
 import org.futo.voiceinput.shared.RecognizerView
+import org.futo.voiceinput.shared.RecognizerViewListener
+import org.futo.voiceinput.shared.RecognizerViewSettings
+import org.futo.voiceinput.shared.SoundPlayer
 import org.futo.voiceinput.shared.whisper.ModelManager
 
 val SystemVoiceInputAction = Action(
@@ -29,7 +32,8 @@ val SystemVoiceInputAction = Action(
 
 
 class VoiceInputPersistentState(val manager: KeyboardManagerForAction) : PersistentActionState {
-    var modelManager: ModelManager = ModelManager(manager.getContext())
+    val modelManager = ModelManager(manager.getContext())
+    val soundPlayer = SoundPlayer(manager.getContext())
 
     override suspend fun cleanUp() {
         modelManager.cleanUp()
@@ -43,29 +47,21 @@ val VoiceInputAction = Action(
 
     windowImpl = { manager, persistentState ->
         val state = persistentState as VoiceInputPersistentState
-        object : ActionWindow, RecognizerView(manager.getContext(), manager.getLifecycleScope(), state.modelManager) {
+        object : ActionWindow, RecognizerViewListener {
+            private val recognizerView = RecognizerView(
+                context = manager.getContext(),
+                listener = this,
+                settings = RecognizerViewSettings(
+                    shouldShowInlinePartialResult = false,
+                    shouldShowVerboseFeedback = true
+                ),
+                lifecycleScope = manager.getLifecycleScope(),
+                modelManager = state.modelManager
+            )
+
             init {
-                this.reset()
-                this.init()
-            }
-
-            override fun onCancel() {
-                this.reset()
-                manager.closeActionWindow()
-            }
-
-            override fun sendResult(result: String) {
-                manager.typeText(result)
-                onCancel()
-            }
-
-            override fun sendPartialResult(result: String): Boolean {
-                manager.typePartialText(result)
-                return true
-            }
-
-            override fun requestPermission() {
-                permissionResultRejected()
+                recognizerView.reset()
+                recognizerView.start()
             }
 
             @Composable
@@ -77,14 +73,39 @@ val VoiceInputAction = Action(
             override fun WindowContents() {
                 Box(modifier = Modifier.fillMaxSize()) {
                     Box(modifier = Modifier.align(Alignment.Center)) {
-                        Content()
+                        recognizerView.Content()
                     }
                 }
             }
 
             override fun close() {
-                this.reset()
-                //soundPool.release()
+                recognizerView.cancel()
+            }
+
+            private var wasFinished = false
+            override fun cancelled() {
+                if(!wasFinished) {
+                    state.soundPlayer.playCancelSound()
+                }
+            }
+
+            override fun recordingStarted() {
+                state.soundPlayer.playStartSound()
+            }
+
+            override fun finished(result: String) {
+                wasFinished = true
+
+                manager.typeText(result)
+                manager.closeActionWindow()
+            }
+
+            override fun partialResult(result: String) {
+                manager.typePartialText(result)
+            }
+
+            override fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean {
+                return false
             }
         }
     }
diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
index fbf51b8d4..b819a87ec 100644
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
@@ -26,8 +26,10 @@ import kotlinx.coroutines.launch
 import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.withContext
 import kotlinx.coroutines.yield
+import org.futo.voiceinput.shared.types.AudioRecognizerListener
 import org.futo.voiceinput.shared.types.InferenceState
 import org.futo.voiceinput.shared.types.Language
+import org.futo.voiceinput.shared.types.MagnitudeState
 import org.futo.voiceinput.shared.types.ModelInferenceCallback
 import org.futo.voiceinput.shared.types.ModelLoader
 import org.futo.voiceinput.shared.whisper.DecodingConfiguration
@@ -41,27 +43,6 @@ import kotlin.math.min
 import kotlin.math.pow
 import kotlin.math.sqrt
 
-enum class MagnitudeState {
-    NOT_TALKED_YET, MIC_MAY_BE_BLOCKED, TALKING
-}
-
-interface AudioRecognizerListener {
-    fun cancelled()
-    fun finished(result: String)
-    fun languageDetected(language: Language)
-    fun partialResult(result: String)
-    fun decodingStatus(status: InferenceState)
-
-    fun loading()
-    fun needPermission()
-    fun permissionRejected()
-
-    fun recordingStarted()
-    fun updateMagnitude(magnitude: Float, state: MagnitudeState)
-
-    fun processing()
-}
-
 data class AudioRecognizerSettings(
     val modelRunConfiguration: MultiModelRunConfiguration,
     val decodingConfiguration: DecodingConfiguration
@@ -69,8 +50,6 @@ data class AudioRecognizerSettings(
 
 class ModelDoesNotExistException(val models: List<ModelLoader>) : Throwable()
 
-// Ideally this shouldn't load the models at all, we should have something else that loads it
-// and gives the model to AudioRecognizer
 class AudioRecognizer(
     val context: Context,
     val lifecycleScope: LifecycleCoroutineScope,
@@ -122,11 +101,11 @@ class AudioRecognizer(
         isRecording = false
     }
 
-    fun finishRecognizer() {
+    fun finish() {
         onFinishRecording()
     }
 
-    fun cancelRecognizer() {
+    fun cancel() {
         reset()
         listener.cancelled()
     }
@@ -142,25 +121,25 @@ class AudioRecognizer(
         myAppSettings.flags = Intent.FLAG_ACTIVITY_NEW_TASK
         context.startActivity(myAppSettings)
 
-        cancelRecognizer()
+        cancel()
     }
 
-    fun create() {
+    fun start() {
         listener.loading()
 
         if (context.checkSelfPermission(Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
-            listener.needPermission()
+            requestPermission()
         } else {
             startRecording()
         }
     }
 
-    fun permissionResultGranted() {
-        startRecording()
-    }
-
-    fun permissionResultRejected() {
-        listener.permissionRejected()
+    private fun requestPermission() {
+        listener.needPermission { wasGranted ->
+            if(wasGranted) {
+                startRecording()
+            }
+        }
     }
 
     @Throws(SecurityException::class)
@@ -219,7 +198,7 @@ class AudioRecognizer(
             if (isRunningOutOfSpace || hasNotTalkedRecently) {
                 yield()
                 withContext(Dispatchers.Main) {
-                    finishRecognizer()
+                    finish()
                 }
                 return
             }
@@ -305,7 +284,7 @@ class AudioRecognizer(
                     if (floatSamples.remaining() < nRead2) {
                         yield()
                         withContext(Dispatchers.Main) {
-                            finishRecognizer()
+                            finish()
                         }
                         break
                     }
@@ -333,7 +312,7 @@ class AudioRecognizer(
             createAudioRecorder()
         } catch (e: SecurityException) {
             // It's possible we may have lost permission, so let's just ask for permission again
-            listener.needPermission()
+            requestPermission()
             return
         }
 
diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt
index fb0c0d7fc..06b90d27e 100644
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/RecognizerView.kt
@@ -10,8 +10,10 @@ import androidx.compose.runtime.Composable
 import androidx.compose.runtime.mutableStateOf
 import androidx.lifecycle.LifecycleCoroutineScope
 import kotlinx.coroutines.launch
+import org.futo.voiceinput.shared.types.AudioRecognizerListener
 import org.futo.voiceinput.shared.types.InferenceState
 import org.futo.voiceinput.shared.types.Language
+import org.futo.voiceinput.shared.types.MagnitudeState
 import org.futo.voiceinput.shared.ui.InnerRecognize
 import org.futo.voiceinput.shared.ui.PartialDecodingResult
 import org.futo.voiceinput.shared.ui.RecognizeLoadingCircle
@@ -23,50 +25,49 @@ import org.futo.voiceinput.shared.whisper.DecodingConfiguration
 import org.futo.voiceinput.shared.whisper.ModelManager
 import org.futo.voiceinput.shared.whisper.MultiModelRunConfiguration
 
-abstract class RecognizerView(
+data class RecognizerViewSettings(
+    val shouldShowVerboseFeedback: Boolean,
+    val shouldShowInlinePartialResult: Boolean
+)
+
+private val VerboseAnnotations = hashMapOf(
+    InferenceState.ExtractingMel to R.string.extracting_features,
+    InferenceState.LoadingModel to R.string.loading_model,
+    InferenceState.Encoding to R.string.encoding,
+    InferenceState.DecodingLanguage to R.string.decoding,
+    InferenceState.SwitchingModel to R.string.switching_model,
+    InferenceState.DecodingStarted to R.string.decoding
+)
+
+private val DefaultAnnotations = hashMapOf(
+    InferenceState.ExtractingMel to R.string.processing,
+    InferenceState.LoadingModel to R.string.processing,
+    InferenceState.Encoding to R.string.processing,
+    InferenceState.DecodingLanguage to R.string.processing,
+    InferenceState.SwitchingModel to R.string.switching_model,
+    InferenceState.DecodingStarted to R.string.processing
+)
+
+interface RecognizerViewListener {
+    fun cancelled()
+
+    fun recordingStarted()
+
+    fun finished(result: String)
+
+    fun partialResult(result: String)
+
+    // Return true if a permission modal was shown, otherwise return false
+    fun requestPermission(onGranted: () -> Unit, onRejected: () -> Unit): Boolean
+}
+
+class RecognizerView(
     private val context: Context,
-    private val lifecycleScope: LifecycleCoroutineScope,
-    private val modelManager: ModelManager
+    private val listener: RecognizerViewListener,
+    private val settings: RecognizerViewSettings,
+    lifecycleScope: LifecycleCoroutineScope,
+    modelManager: ModelManager
 ) {
-    // TODO: Should not get settings here, pass settings to constructor
-    private val shouldPlaySounds: ValueFromSettings<Boolean> = ValueFromSettings(ENABLE_SOUND, true)
-    private val shouldBeVerbose: ValueFromSettings<Boolean> =
-        ValueFromSettings(VERBOSE_PROGRESS, false)
-
-    // TODO: SoundPool should be managed by parent, not by view, as the view is short-lived
-    /* val soundPool: SoundPool = SoundPool.Builder().setMaxStreams(2).setAudioAttributes(
-        AudioAttributes.Builder().setUsage(USAGE_ASSISTANCE_SONIFICATION)
-            .setContentType(CONTENT_TYPE_SONIFICATION).build()
-    ).build()*/
-
-    private var startSoundId: Int = -1
-    private var cancelSoundId: Int = -1
-
-    abstract fun onCancel()
-    abstract fun sendResult(result: String)
-    abstract fun sendPartialResult(result: String): Boolean
-    abstract fun requestPermission()
-
-    companion object {
-        private val verboseAnnotations = hashMapOf(
-            InferenceState.ExtractingMel to R.string.extracting_features,
-            InferenceState.LoadingModel to R.string.loading_model,
-            InferenceState.Encoding to R.string.encoding,
-            InferenceState.DecodingLanguage to R.string.decoding,
-            InferenceState.SwitchingModel to R.string.switching_model,
-            InferenceState.DecodingStarted to R.string.decoding
-        )
-
-        private val defaultAnnotations = hashMapOf(
-            InferenceState.ExtractingMel to R.string.processing,
-            InferenceState.LoadingModel to R.string.processing,
-            InferenceState.Encoding to R.string.processing,
-            InferenceState.DecodingLanguage to R.string.processing,
-            InferenceState.SwitchingModel to R.string.switching_model,
-            InferenceState.DecodingStarted to R.string.processing
-        )
-    }
-
     private val magnitudeState = mutableStateOf(0.0f)
     private val statusState = mutableStateOf(MagnitudeState.NOT_TALKED_YET)
 
@@ -96,7 +97,7 @@ abstract class RecognizerView(
             CurrentView.InnerRecognize -> {
                 Column {
                     InnerRecognize(
-                        onFinish = { recognizer.finishRecognizer() },
+                        onFinish = { recognizer.finish() },
                         magnitude = magnitudeState,
                         state = statusState
                     )
@@ -111,37 +112,17 @@ abstract class RecognizerView(
         }
     }
 
-    fun onClose() {
-        recognizer.cancelRecognizer()
+    fun cancel() {
+        recognizer.cancel()
     }
 
-    private val listener = object : AudioRecognizerListener {
-        // Tries to play a sound. If it's not yet ready, plays it when it's ready
-        private fun playSound(id: Int) {
-            /*
-            lifecycleScope.launch {
-                shouldPlaySounds.load(context) {
-                    if (it) {
-                        if (soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f) == 0) {
-                            soundPool.setOnLoadCompleteListener { soundPool, sampleId, status ->
-                                if ((sampleId == id) && (status == 0)) {
-                                    soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f)
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            */
-        }
-
+    private val audioRecognizerListener = object : AudioRecognizerListener {
         override fun cancelled() {
-            playSound(cancelSoundId)
-            onCancel()
+            listener.cancelled()
         }
 
         override fun finished(result: String) {
-            sendResult(result)
+            listener.finished(result)
         }
 
         override fun languageDetected(language: Language) {
@@ -149,20 +130,19 @@ abstract class RecognizerView(
         }
 
         override fun partialResult(result: String) {
-            if (!sendPartialResult(result)) {
-                if (result.isNotBlank()) {
-                    partialDecodingText.value = result
-                    currentViewState.value = CurrentView.PartialDecodingResult
-                }
+            listener.partialResult(result)
+            if (settings.shouldShowInlinePartialResult && result.isNotBlank()) {
+                partialDecodingText.value = result
+                currentViewState.value = CurrentView.PartialDecodingResult
             }
         }
 
 
         override fun decodingStatus(status: InferenceState) {
             val text = context.getString(
-                when (shouldBeVerbose.value) {
-                    true -> verboseAnnotations[status]!!
-                    false -> defaultAnnotations[status]!!
+                when (settings.shouldShowVerboseFeedback) {
+                    true -> VerboseAnnotations[status]!!
+                    false -> DefaultAnnotations[status]!!
                 }
             )
 
@@ -175,18 +155,25 @@ abstract class RecognizerView(
             currentViewState.value = CurrentView.LoadingCircle
         }
 
-        override fun needPermission() {
-            requestPermission()
-        }
+        override fun needPermission(onResult: (Boolean) -> Unit) {
+            val shown = listener.requestPermission(
+                onGranted = {
+                    onResult(true)
+                },
+                onRejected = {
+                    onResult(false)
+                    currentViewState.value = CurrentView.PermissionError
+                }
+            )
 
-        override fun permissionRejected() {
-            currentViewState.value = CurrentView.PermissionError
+            if(!shown) {
+                currentViewState.value = CurrentView.PermissionError
+            }
         }
 
         override fun recordingStarted() {
             updateMagnitude(0.0f, MagnitudeState.NOT_TALKED_YET)
-
-            playSound(startSoundId)
+            listener.recordingStarted()
         }
 
         override fun updateMagnitude(magnitude: Float, state: MagnitudeState) {
@@ -203,7 +190,7 @@ abstract class RecognizerView(
 
     // TODO: Dummy settings, should get them from constructor
     private val recognizer: AudioRecognizer = AudioRecognizer(
-        context, lifecycleScope, modelManager, listener, AudioRecognizerSettings(
+        context, lifecycleScope, modelManager, audioRecognizerListener, AudioRecognizerSettings(
             modelRunConfiguration = MultiModelRunConfiguration(
                 primaryModel = ENGLISH_MODELS[0], languageSpecificModels = mapOf()
             ), decodingConfiguration = DecodingConfiguration(
@@ -216,22 +203,7 @@ abstract class RecognizerView(
         recognizer.reset()
     }
 
-    fun init() {
-        lifecycleScope.launch {
-            shouldBeVerbose.load(context)
-        }
-
-        //startSoundId = soundPool.load(this.context, R.raw.start, 0)
-        //cancelSoundId = soundPool.load(this.context, R.raw.cancel, 0)
-
-        recognizer.create()
-    }
-
-    fun permissionResultGranted() {
-        recognizer.permissionResultGranted()
-    }
-
-    fun permissionResultRejected() {
-        recognizer.permissionResultRejected()
+    fun start() {
+        recognizer.start()
     }
 }
diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt
new file mode 100644
index 000000000..56a9bf0d4
--- /dev/null
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/SoundPlayer.kt
@@ -0,0 +1,62 @@
+package org.futo.voiceinput.shared
+
+import android.content.Context
+import android.media.AudioAttributes
+import android.media.AudioAttributes.CONTENT_TYPE_SONIFICATION
+import android.media.AudioAttributes.USAGE_ASSISTANCE_SONIFICATION
+import android.media.SoundPool
+import java.io.Closeable
+
+// soundPool.play returns 0 on failure
+private const val SoundPoolPlayFailure = 0
+
+// status in OnLoadCompleteListener is 0 when successful
+private const val LoadStatusSuccess = 0
+
+class SoundPlayer(
+    private val context: Context
+): Closeable {
+    private val soundPool: SoundPool = SoundPool.Builder().setMaxStreams(2).setAudioAttributes(
+        AudioAttributes.Builder().setUsage(USAGE_ASSISTANCE_SONIFICATION)
+            .setContentType(CONTENT_TYPE_SONIFICATION).build()
+    ).build()
+
+    private var startSound: Int = -1
+    private var cancelSound: Int = -1
+
+    init {
+        startSound = soundPool.load(this.context, R.raw.start, 0)
+        cancelSound = soundPool.load(this.context, R.raw.cancel, 0)
+    }
+
+    override fun close() {
+        soundPool.release()
+    }
+
+    // Returns true if the sound started playing, false if SoundPool.play reported failure
+    private fun playSound(id: Int): Boolean {
+        return when(soundPool.play(id, 1.0f, 1.0f, 0, 0, 1.0f)) {
+            SoundPoolPlayFailure -> false
+            else -> true
+        }
+    }
+
+    // Tries to play a sound. If playing fails (e.g. still loading), plays it once its load completes
+    private fun playSoundOrLoad(id: Int) {
+        if (!playSound(id)) {
+            soundPool.setOnLoadCompleteListener { _, sampleId, status ->
+                if ((sampleId == id) && (status == LoadStatusSuccess)) {
+                    playSound(id)
+                }
+            }
+        }
+    }
+
+    fun playStartSound() {
+        playSoundOrLoad(startSound)
+    }
+
+    fun playCancelSound() {
+        playSoundOrLoad(cancelSound)
+    }
+}
\ No newline at end of file
diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt
new file mode 100644
index 000000000..15a4dccef
--- /dev/null
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/types/AudioRecognizerListener.kt
@@ -0,0 +1,21 @@
+package org.futo.voiceinput.shared.types
+
+enum class MagnitudeState {
+    NOT_TALKED_YET, MIC_MAY_BE_BLOCKED, TALKING
+}
+
+interface AudioRecognizerListener {
+    fun cancelled()
+    fun finished(result: String)
+    fun languageDetected(language: Language)
+    fun partialResult(result: String)
+    fun decodingStatus(status: InferenceState)
+
+    fun loading()
+    fun needPermission(onResult: (Boolean) -> Unit)
+
+    fun recordingStarted()
+    fun updateMagnitude(magnitude: Float, state: MagnitudeState)
+
+    fun processing()
+}
\ No newline at end of file
diff --git a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt
index dfb879ce5..cc40d0a72 100644
--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/ui/RecognizeViews.kt
@@ -27,8 +27,8 @@ import androidx.compose.ui.res.painterResource
 import androidx.compose.ui.res.stringResource
 import androidx.compose.ui.text.style.TextAlign
 import androidx.compose.ui.unit.dp
-import org.futo.voiceinput.shared.MagnitudeState
 import org.futo.voiceinput.shared.R
+import org.futo.voiceinput.shared.types.MagnitudeState
 import org.futo.voiceinput.shared.ui.theme.Typography