Compare commits
1 Commits
StreamingO...master

| Author | SHA1 | Date |
| --- | --- | --- |
|  | 4be8da6b2c |  |
@@ -12,7 +12,7 @@ android {
         minSdk = 28
         targetSdk = 36
         versionCode = 10
-        versionName = "1.0"
+        versionName = "1.1"

         testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
     }
@@ -25,11 +25,6 @@ android {
                 "proguard-rules.pro"
             )
         }
-        debug {
-            applicationIdSuffix = ".streaming"
-            // This changes the app name on your homescreen to "MyApp (Dev)"
-            resValue("string", "app_name", "Speech To Keyboard (Streaming)")
-        }
     }
     compileOptions {
         sourceCompatibility = JavaVersion.VERSION_11
@@ -89,7 +89,7 @@ class MainActivity : AppCompatActivity() {
                 var entry = zipInputStream.nextEntry
                 var foundEncoder = false
                 var foundDecoder = false
-                var foundJoiner = false
+                // var foundJoiner = false - removed for true Whisper model use
                 var foundTokens = false

                 while (entry != null) {
@@ -100,7 +100,7 @@ class MainActivity : AppCompatActivity() {
                     val targetFileName = when {
                         name.contains("encoder") && name.endsWith(".onnx") -> "encoder.onnx"
                         name.contains("decoder") && name.endsWith(".onnx") -> "decoder.onnx"
-                        name.contains("joiner") && name.endsWith(".onnx") -> "joiner.onnx"
+                        // name.contains("joiner") && name.endsWith(".onnx") -> "joiner.onnx" - removed for true Whisper model use
                         name.contains("tokens.txt") -> "tokens.txt"
                         else -> null
                     }
@@ -115,7 +115,7 @@ class MainActivity : AppCompatActivity() {
                         when (targetFileName) {
                             "encoder.onnx" -> foundEncoder = true
                             "decoder.onnx" -> foundDecoder = true
-                            "joiner.onnx" -> foundJoiner = true
+                            // "joiner.onnx" -> foundJoiner = true - removed for true Whisper model use
                             "tokens.txt" -> foundTokens = true
                         }
                     }
@@ -124,7 +124,8 @@ class MainActivity : AppCompatActivity() {
                 }

                 runOnUiThread {
-                    if (foundEncoder && foundDecoder && foundJoiner && foundTokens) {
+                    // if (foundEncoder && foundDecoder && foundJoiner && foundTokens) { - removed for true Whisper model use
+                    if (foundEncoder && foundDecoder && foundTokens) {
                         statusText.text = "Model Installed Successfully!"
                         Toast.makeText(this, "Ready to use!", Toast.LENGTH_SHORT).show()
                     } else {
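Note: the MainActivity hunks above all make the same change: a Whisper bundle only needs encoder.onnx, decoder.onnx and tokens.txt, so the joiner handling left over from the transducer setup is commented out. A minimal sketch of that extraction loop, assuming the bundle is already opened as a ZipInputStream and modelDir is the target directory (the helper name and signature are illustrative, not the exact code in this commit):

```kotlin
import java.io.File
import java.io.FileOutputStream
import java.util.zip.ZipInputStream

// Extracts a Whisper model bundle: only encoder, decoder and tokens are kept.
// Returns true when all three required files were found (no joiner.onnx needed).
fun extractWhisperBundle(zipInputStream: ZipInputStream, modelDir: File): Boolean {
    val found = mutableSetOf<String>()
    var entry = zipInputStream.nextEntry
    while (entry != null) {
        val name = entry.name
        val targetFileName = when {
            name.contains("encoder") && name.endsWith(".onnx") -> "encoder.onnx"
            name.contains("decoder") && name.endsWith(".onnx") -> "decoder.onnx"
            name.contains("tokens.txt") -> "tokens.txt"
            else -> null // joiner.onnx is ignored for Whisper
        }
        if (targetFileName != null) {
            FileOutputStream(File(modelDir, targetFileName)).use { out ->
                zipInputStream.copyTo(out)
            }
            found += targetFileName
        }
        zipInputStream.closeEntry()
        entry = zipInputStream.nextEntry
    }
    return found.containsAll(listOf("encoder.onnx", "decoder.onnx", "tokens.txt"))
}
```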
@@ -20,11 +20,23 @@ import com.hoho.android.usbserial.driver.UsbSerialProber
 import com.hoho.android.usbserial.util.SerialInputOutputManager
 import com.k2fsa.sherpa.onnx.EndpointConfig
 import com.k2fsa.sherpa.onnx.EndpointRule
+/*
 import com.k2fsa.sherpa.onnx.FeatureConfig
 import com.k2fsa.sherpa.onnx.OnlineModelConfig
 import com.k2fsa.sherpa.onnx.OnlineRecognizer
 import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
 import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
 import com.k2fsa.sherpa.onnx.OnlineStream
+
+*/
+// Below for the "offline" libraries and the true Whisper integration
+import com.k2fsa.sherpa.onnx.OfflineRecognizer
+import com.k2fsa.sherpa.onnx.OfflineStream
+import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
+import com.k2fsa.sherpa.onnx.OfflineModelConfig
+import com.k2fsa.sherpa.onnx.OfflineWhisperModelConfig
+import com.k2fsa.sherpa.onnx.FeatureConfig
+
 import java.io.File
+
 class TestModelActivity : AppCompatActivity() {
@@ -34,8 +46,10 @@ class TestModelActivity : AppCompatActivity() {
     private lateinit var micButton: ImageButton

     // Sherpa (Whisper) Components
-    private var recognizer: OnlineRecognizer? = null
-    private var stream: OnlineStream? = null
+    // private var recognizer: OnlineRecognizer? = null // - Removed for true Whisper model use
+    // private var stream: OnlineStream? = null // - Removed for true Whisper model use
+    private var recognizer: OfflineRecognizer? = null // Was OnlineRecognizer
+    private var stream: OfflineStream? = null // Was OnlineStream
     private var isRecording = false
     private var recordingThread: Thread? = null

@@ -76,46 +90,37 @@ class TestModelActivity : AppCompatActivity() {
             return
         }

+        // 1. Point to your files
+        val encoderPath = File(modelDir, "encoder.onnx").absolutePath
+        val decoderPath = File(modelDir, "decoder.onnx").absolutePath
+        val tokensPath = File(modelDir, "tokens.txt").absolutePath
+
         try {
-            // 1. Define Model Paths
-            val transducerConfig = OnlineTransducerModelConfig(
-                encoder = File(modelDir, "encoder.onnx").absolutePath,
-                decoder = File(modelDir, "decoder.onnx").absolutePath,
-                joiner = File(modelDir, "joiner.onnx").absolutePath
-            )
-
-            // 2. Define General Config
-            val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
-                transducer = transducerConfig,
-                tokens = File(modelDir, "tokens.txt").absolutePath,
-                numThreads = 1,
-                debug = false,
-                modelType = "zipformer"
-            )
-
-            // 3. Define Endpoint Rule (The fix for your error)
-            // rule1 = detected silence after speech. We set this to 2.4 seconds.
-            val silenceRule = EndpointRule(
-                mustContainNonSilence = false,
-                minTrailingSilence = 2.4f,
-                minUtteranceLength = 0.0f
-            )
-
-            // 4. Create Recognizer Config
-            val config = OnlineRecognizerConfig(
+            // CONFIGURATION FOR WHISPER (OFFLINE)
+            val config = OfflineRecognizerConfig(
                 featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
-                modelConfig = onlineModelConfig,
-                endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
-                enableEndpoint = true,
+                modelConfig = OfflineModelConfig(
+                    // This parameter 'whisper' exists here!
+                    whisper = OfflineWhisperModelConfig(
+                        encoder = encoderPath,
+                        decoder = decoderPath,
+                        // tokenizer is not strictly needed in config here if passed in tokens param below
+                        // but usually standard offline config uses just these two:
+                    ),
+                    tokens = tokensPath,
+                    modelType = "whisper",
+                    debug = false,
+                    numThreads = 1
+                ),
                 decodingMethod = "greedy_search",
                 maxActivePaths = 4
             )

-            // recognizer = OnlineRecognizer(assetManager = assets, config = config)
-            recognizer = OnlineRecognizer(config = config)
+            // Initialize OFFLINE Engine
+            recognizer = OfflineRecognizer(config = config)
             stream = recognizer?.createStream()

-            outputText.text = "Engine Loaded. Ready to Stream."
+            outputText.text = "Whisper Engine Ready."

         } catch (e: Exception) {
             Log.e("Sherpa", "Init Error", e)
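Note: the core of this hunk is that the transducer and endpoint configuration is replaced by a single OfflineRecognizerConfig carrying a `whisper` model config. A condensed sketch of that initialization, using the same sherpa-onnx Kotlin classes this diff imports (parameter names follow the usage shown above; treat them as an approximation of the binding's actual signatures, not a verified reference):

```kotlin
import com.k2fsa.sherpa.onnx.FeatureConfig
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWhisperModelConfig
import java.io.File

// Builds an offline Whisper recognizer from files previously unpacked into modelDir.
fun createWhisperRecognizer(modelDir: File): OfflineRecognizer {
    val config = OfflineRecognizerConfig(
        // 16 kHz mono audio and 80 mel bins, which the exported Whisper ONNX models expect.
        featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
        modelConfig = OfflineModelConfig(
            // The 'whisper' parameter is what selects the Whisper backend.
            whisper = OfflineWhisperModelConfig(
                encoder = File(modelDir, "encoder.onnx").absolutePath,
                decoder = File(modelDir, "decoder.onnx").absolutePath
            ),
            tokens = File(modelDir, "tokens.txt").absolutePath,
            modelType = "whisper",
            numThreads = 1,
            debug = false
        ),
        decodingMethod = "greedy_search"
    )
    return OfflineRecognizer(config = config)
}
```

The EndpointConfig/EndpointRule imports are left in place by the diff even though the offline recognizer no longer takes an endpoint rule.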
@@ -140,12 +145,9 @@ class TestModelActivity : AppCompatActivity() {
             return
         }

-        // FIX 1: CLEAR THE BUFFER
-        // This prevents the "ghost text" from the previous session appearing
-        // when you hit record again.
-        stream?.let { activeStream ->
-            recognizer?.reset(activeStream)
-        }
+        // Reset the stream for a new session
+        // Note: Sherpa streams can be persistent, but resetting ensures clean start
+        // If you want continuous conversation, don't reset 'committedText'

        isRecording = true
        micButton.setColorFilter(android.graphics.Color.RED)
@@ -159,19 +161,46 @@ class TestModelActivity : AppCompatActivity() {

     private fun stopRecording() {
         isRecording = false
-        recordingThread?.join()
+        try {
+            recordingThread?.join() // Wait for loop to finish
+        } catch (e: InterruptedException) {
+            // Handle interruption if necessary
+        }

         micButton.clearColorFilter()

-        // Just show what we have, don't overwrite with "[Stopped]"
-        // to prevent visual jarring.
-        outputText.append("\n[Stopped]")
+        // FIX: Safely unwrap 'stream' before passing it to getResult
+        // This reads: "If stream is NOT null, call getResult. Otherwise return empty string."
+        val finalCurrentText = stream?.let { activeStream ->
+            recognizer?.getResult(activeStream)?.text
+        } ?: ""
+
+        val cleanFinal = finalCurrentText.lowercase()
+
+        if (cleanFinal.isNotEmpty()) {
+            // 1. Commit to history
+            committedText += "$cleanFinal "
+
+            // 2. Send to Pico
+            sendToPico("$cleanFinal ")
+
+            // 3. Update UI
+            outputText.text = "$committedText \n[Stopped]"
+
+            // 4. Reset for next time
+            // We release the old stream and create a fresh one for the next sentence
+            stream?.release()
+            stream = recognizer?.createStream()
+        } else {
+            outputText.append("\n[Stopped - No Text]")
+        }
     }

     private fun processAudioLoop() {
         val sampleRate = 16000
         val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)

-        // Guard clauses
+        // 1. GUARD CLAUSE (Safely unwrap nullables)
         val localRec = recognizer ?: return
         val localStream = stream ?: return
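Note: with the offline API the stream is not reset in place; the pattern this hunk settles on is read the result, release the stream, create a fresh one. A small self-contained sketch of that finishing step, factored into a hypothetical helper (the sherpa-onnx calls mirror those used in the diff):

```kotlin
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineStream

// End-of-recording step from the hunk above, as a standalone helper:
// read the final Whisper result, release the used stream, hand back a fresh one.
fun finishSession(recognizer: OfflineRecognizer, stream: OfflineStream): Pair<String, OfflineStream> {
    val finalText = recognizer.getResult(stream).text.lowercase()
    stream.release()                             // the old stream's audio buffer is no longer needed
    val freshStream = recognizer.createStream()  // ready for the next utterance
    return finalText to freshStream
}
```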
@@ -189,39 +218,24 @@ class TestModelActivity : AppCompatActivity() {
                 if (ret > 0) {
                     val samples = FloatArray(ret) { buffer[it] / 32768.0f }

+                    // 2. Feed Audio
                     localStream.acceptWaveform(samples, sampleRate)

-                    while (localRec.isReady(localStream)) {
-                        localRec.decode(localStream)
-                    }
+                    // 3. Decode (No isReady check needed for Offline)
+                    localRec.decode(localStream)

+                    // 4. Get Current Text
+                    // Whisper updates this string constantly as it hears more
                     val text = localRec.getResult(localStream).text
-                    val isEndpoint = localRec.isEndpoint(localStream)

                     if (text.isNotEmpty()) {
                         val cleanText = text.lowercase()

-                        if (isEndpoint) {
-                            // FIX 2: THE ORDER OF OPERATIONS
-
-                            // A. Update UI first
-                            runOnUiThread {
-                                committedText += "$cleanText "
-                                outputText.text = committedText
-                                sendToPico("$cleanText ")
-                            }
-
-                            // B. RESET IMMEDIATELY ON BACKGROUND THREAD
-                            // We do this HERE, not inside runOnUiThread.
-                            // This guarantees the stream is clean BEFORE the loop
-                            // reads the next chunk of audio.
-                            localRec.reset(localStream)
-
-                        } else {
-                            // Standard partial update
-                            runOnUiThread {
-                                outputText.text = "$committedText $cleanText"
-                            }
-                        }
+                        runOnUiThread {
+                            // Update the screen so user sees what is happening
+                            // We do NOT send to USB yet, because Whisper might change this text
+                            // as you keep speaking.
+                            outputText.text = "$committedText $cleanText"
+                        }
                     }
                 }
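Note: the net effect of this last hunk is that the endpoint/isReady machinery from the streaming recognizer disappears; every audio chunk is appended to the offline stream and the whole buffer is re-decoded. A self-contained sketch of that loop, assuming the sherpa-onnx calls behave as they are used in this diff ('isRecording' and 'onPartial' are illustrative stand-ins for the activity's state flag and UI update):

```kotlin
import android.annotation.SuppressLint
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineStream

// Capture loop following the pattern in the hunk above: feed each PCM chunk,
// decode the accumulated buffer, and report the latest partial transcript.
// Assumes the RECORD_AUDIO permission has already been granted.
@SuppressLint("MissingPermission")
fun runOfflineDecodeLoop(
    recognizer: OfflineRecognizer,
    stream: OfflineStream,
    isRecording: () -> Boolean,
    onPartial: (String) -> Unit
) {
    val sampleRate = 16000
    val bufferSize = AudioRecord.getMinBufferSize(
        sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT
    )
    val recorder = AudioRecord(
        MediaRecorder.AudioSource.MIC, sampleRate,
        AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize
    )
    val buffer = ShortArray(bufferSize / 2)

    recorder.startRecording()
    try {
        while (isRecording()) {
            val ret = recorder.read(buffer, 0, buffer.size)
            if (ret > 0) {
                // Convert 16-bit PCM to the [-1, 1] floats sherpa-onnx expects.
                val samples = FloatArray(ret) { buffer[it] / 32768.0f }
                stream.acceptWaveform(samples, sampleRate)
                // Offline decoding: no isReady()/isEndpoint() loop, just decode and read back.
                recognizer.decode(stream)
                onPartial(recognizer.getResult(stream).text)
            }
        }
    } finally {
        recorder.stop()
        recorder.release()
    }
}
```

Re-decoding the growing buffer on every chunk is simple but gets progressively more expensive; the stop-and-commit step in the previous hunk keeps utterances short, which is what makes this approach workable.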