1 Commit

5 changed files with 92 additions and 113 deletions

View File

@@ -4,10 +4,10 @@
   <selectionStates>
     <SelectionState runConfigName="app">
       <option name="selectionMode" value="DROPDOWN" />
-      <DropdownSelection timestamp="2026-01-23T01:29:57.710335816Z">
+      <DropdownSelection timestamp="2026-01-22T04:36:45.393638454Z">
         <Target type="DEFAULT_BOOT">
           <handle>
-            <DeviceId pluginId="PhysicalDevice" identifier="serial=DKTAB13NEU0019483" />
+            <DeviceId pluginId="LocalEmulator" identifier="path=/home/michael/.android/avd/Pixel_5_API_31_Android_12_.avd" />
           </handle>
         </Target>
       </DropdownSelection>

View File

@@ -11,7 +11,7 @@ android {
        applicationId = "net.mmanningau.speechtokeyboard"
        minSdk = 28
        targetSdk = 36
-       versionCode = 12
+       versionCode = 10
        versionName = "1.1"
        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
@@ -25,11 +25,6 @@ android {
                "proguard-rules.pro"
            )
        }
-       debug {
-           applicationIdSuffix = ".streaming"
-           // This changes the app name on your homescreen to "MyApp (Dev)"
-           resValue("string", "app_name", "Speech To Keyboard (Streaming)")
-       }
    }
    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_11

View File

@@ -37,8 +37,6 @@
        <activity
            android:name=".TestModelActivity"
            android:parentActivityName=".MainActivity"
-           android:exported="false"
-           android:configChanges="orientation|screenSize|screenLayout|keyboardHidden"
            android:label="Test Microphone" />
    </application>

View File

@@ -89,7 +89,7 @@ class MainActivity : AppCompatActivity() {
                var entry = zipInputStream.nextEntry
                var foundEncoder = false
                var foundDecoder = false
-               var foundJoiner = false
+               // var foundJoiner = false - removed for true Whisper model use
                var foundTokens = false
                while (entry != null) {
@@ -100,7 +100,7 @@ class MainActivity : AppCompatActivity() {
                    val targetFileName = when {
                        name.contains("encoder") && name.endsWith(".onnx") -> "encoder.onnx"
                        name.contains("decoder") && name.endsWith(".onnx") -> "decoder.onnx"
-                       name.contains("joiner") && name.endsWith(".onnx") -> "joiner.onnx"
+                       // name.contains("joiner") && name.endsWith(".onnx") -> "joiner.onnx" - removed for true Whisper model use
                        name.contains("tokens.txt") -> "tokens.txt"
                        else -> null
                    }
@@ -115,7 +115,7 @@ class MainActivity : AppCompatActivity() {
                    when (targetFileName) {
                        "encoder.onnx" -> foundEncoder = true
                        "decoder.onnx" -> foundDecoder = true
-                       "joiner.onnx" -> foundJoiner = true
+                       // "joiner.onnx" -> foundJoiner = true - removed for true Whisper model use
                        "tokens.txt" -> foundTokens = true
                    }
                }
@@ -124,7 +124,8 @@ class MainActivity : AppCompatActivity() {
                }
                runOnUiThread {
-                   if (foundEncoder && foundDecoder && foundJoiner && foundTokens) {
+                   // if (foundEncoder && foundDecoder && foundJoiner && foundTokens) { - removed for true Whisper model use
+                   if (foundEncoder && foundDecoder && foundTokens) {
                        statusText.text = "Model Installed Successfully!"
                        Toast.makeText(this, "Ready to use!", Toast.LENGTH_SHORT).show()
                    } else {
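
Review note: with joiner.onnx no longer required, the unpack check reduces to the three Whisper artifacts. A minimal sketch of that check as a standalone helper, assuming plain java.io.File access to the unpacked model directory (the helper name is illustrative, not part of this commit):

import java.io.File

// Hypothetical helper: true when an unpacked model directory contains
// everything the offline Whisper recognizer needs (no joiner.onnx anymore).
fun isWhisperModelComplete(modelDir: File): Boolean {
    val required = listOf("encoder.onnx", "decoder.onnx", "tokens.txt")
    return required.all { name -> File(modelDir, name).isFile }
}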

View File

@@ -20,16 +20,24 @@ import com.hoho.android.usbserial.driver.UsbSerialProber
import com.hoho.android.usbserial.util.SerialInputOutputManager
import com.k2fsa.sherpa.onnx.EndpointConfig
import com.k2fsa.sherpa.onnx.EndpointRule
+/*
import com.k2fsa.sherpa.onnx.FeatureConfig
+import com.k2fsa.sherpa.onnx.OnlineModelConfig
import com.k2fsa.sherpa.onnx.OnlineRecognizer
import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
import com.k2fsa.sherpa.onnx.OnlineStream
-import java.io.File
-import com.k2fsa.sherpa.onnx.OfflinePunctuation
-import com.k2fsa.sherpa.onnx.OfflinePunctuationConfig
-import com.k2fsa.sherpa.onnx.OfflinePunctuationModelConfig
+import com.k2fsa.sherpa.onnx.OfflinePunctuation */
+// Below for the "offline" libraries and the true Whisper integration
+import com.k2fsa.sherpa.onnx.OfflineRecognizer
+import com.k2fsa.sherpa.onnx.OfflineStream
+import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
+import com.k2fsa.sherpa.onnx.OfflineModelConfig
+import com.k2fsa.sherpa.onnx.OfflineWhisperModelConfig
+import com.k2fsa.sherpa.onnx.FeatureConfig
+import java.io.File
class TestModelActivity : AppCompatActivity() {
@@ -38,14 +46,13 @@ class TestModelActivity : AppCompatActivity() {
    private lateinit var micButton: ImageButton
    // Sherpa (Whisper) Components
-   private var recognizer: OnlineRecognizer? = null
-   private var stream: OnlineStream? = null
+   // private var recognizer: OnlineRecognizer? = null // - Removed for true Whisper model use
+   // private var stream: OnlineStream? = null // - Removed for true Whisper model use
+   private var recognizer: OfflineRecognizer? = null // Was OnlineRecognizer
+   private var stream: OfflineStream? = null // Was OnlineStream
    private var isRecording = false
    private var recordingThread: Thread? = null
-   // Punctuation variables
-   private var punctuator: OfflinePunctuation? = null
    // USB Components
    private var usbPort: UsbSerialPort? = null
@@ -83,63 +90,37 @@ class TestModelActivity : AppCompatActivity() {
            return
        }
+       // 1. Point to your files
+       val encoderPath = File(modelDir, "encoder.onnx").absolutePath
+       val decoderPath = File(modelDir, "decoder.onnx").absolutePath
+       val tokensPath = File(modelDir, "tokens.txt").absolutePath
        try {
-           // 1. Define Model Paths
-           val transducerConfig = OnlineTransducerModelConfig(
-               encoder = File(modelDir, "encoder.onnx").absolutePath,
-               decoder = File(modelDir, "decoder.onnx").absolutePath,
-               joiner = File(modelDir, "joiner.onnx").absolutePath
-           )
-           // 2. Define General Config
-           val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
-               transducer = transducerConfig,
-               tokens = File(modelDir, "tokens.txt").absolutePath,
-               numThreads = 1,
-               debug = false,
-               modelType = "zipformer"
-           )
-           // 3. Define Endpoint Rule (The fix for your error)
-           // rule1 = detected silence after speech. We set this to 2.4 seconds.
-           val silenceRule = EndpointRule(
-               mustContainNonSilence = false,
-               minTrailingSilence = 2.4f,
-               minUtteranceLength = 0.0f
-           )
-           // 4. Create Recognizer Config
-           val config = OnlineRecognizerConfig(
+           // CONFIGURATION FOR WHISPER (OFFLINE)
+           val config = OfflineRecognizerConfig(
                featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
-               modelConfig = onlineModelConfig,
-               endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
-               enableEndpoint = true,
+               modelConfig = OfflineModelConfig(
+                   // This parameter 'whisper' exists here!
+                   whisper = OfflineWhisperModelConfig(
+                       encoder = encoderPath,
+                       decoder = decoderPath,
+                       // tokenizer is not strictly needed in config here if passed in tokens param below
+                       // but usually standard offline config uses just these two:
+                   ),
+                   tokens = tokensPath,
+                   modelType = "whisper",
+                   debug = false,
+                   numThreads = 1
+               ),
                decodingMethod = "greedy_search",
                maxActivePaths = 4
            )
-           // recognizer = OnlineRecognizer(assetManager = assets, config = config)
-           recognizer = OnlineRecognizer(config = config)
+           // Initialize OFFLINE Engine
+           recognizer = OfflineRecognizer(config = config)
            stream = recognizer?.createStream()
-           outputText.text = "Engine Loaded. Ready to Stream."
-           // ... existing recognizer init code ...
-           // 5. Initialize Punctuation Engine
-           val punctPath = File(modelDir, "punct_model.onnx").absolutePath
-           if (File(punctPath).exists()) {
-               // CORRECTED: Wrap the path inside 'OfflinePunctuationModelConfig'
-               val punctConfig = OfflinePunctuationConfig(
-                   model = OfflinePunctuationModelConfig(ctTransformer = punctPath)
-               )
-               punctuator = OfflinePunctuation(config = punctConfig)
-               outputText.append("\n+ Punctuation Ready")
-           } else {
-               outputText.append("\n(No Punctuation model found)")
-           }
+           outputText.text = "Whisper Engine Ready."
        } catch (e: Exception) {
            Log.e("Sherpa", "Init Error", e)
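
Review note: stripped of the diff markers, the new initialization reduces to the block below. This is a sketch that assumes the sherpa-onnx Kotlin API exactly as it is used in this file (OfflineRecognizerConfig, OfflineModelConfig, OfflineWhisperModelConfig, FeatureConfig, OfflineRecognizer); the helper name buildWhisperRecognizer is illustrative, not part of this commit.

import com.k2fsa.sherpa.onnx.FeatureConfig
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWhisperModelConfig
import java.io.File

// Hypothetical helper: wires up the offline Whisper recognizer from an
// unpacked model directory, mirroring the config built in the try block above.
fun buildWhisperRecognizer(modelDir: File): OfflineRecognizer {
    val config = OfflineRecognizerConfig(
        featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
        modelConfig = OfflineModelConfig(
            whisper = OfflineWhisperModelConfig(
                encoder = File(modelDir, "encoder.onnx").absolutePath,
                decoder = File(modelDir, "decoder.onnx").absolutePath
            ),
            tokens = File(modelDir, "tokens.txt").absolutePath,
            modelType = "whisper",
            debug = false,
            numThreads = 1
        ),
        decodingMethod = "greedy_search",
        maxActivePaths = 4
    )
    return OfflineRecognizer(config = config)
}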
@@ -164,12 +145,9 @@ class TestModelActivity : AppCompatActivity() {
            return
        }
-       // FIX 1: CLEAR THE BUFFER
-       // This prevents the "ghost text" from the previous session appearing
-       // when you hit record again.
-       stream?.let { activeStream ->
-           recognizer?.reset(activeStream)
-       }
+       // Reset the stream for a new session
+       // Note: Sherpa streams can be persistent, but resetting ensures clean start
+       // If you want continuous conversation, don't reset 'committedText'
        isRecording = true
        micButton.setColorFilter(android.graphics.Color.RED)
@@ -183,19 +161,46 @@ class TestModelActivity : AppCompatActivity() {
    private fun stopRecording() {
        isRecording = false
-       recordingThread?.join()
+       try {
+           recordingThread?.join() // Wait for loop to finish
+       } catch (e: InterruptedException) {
+           // Handle interruption if necessary
+       }
        micButton.clearColorFilter()
-       // Just show what we have, don't overwrite with "[Stopped]"
-       // to prevent visual jarring.
-       outputText.append("\n[Stopped]")
+       // FIX: Safely unwrap 'stream' before passing it to getResult
+       // This reads: "If stream is NOT null, call getResult. Otherwise return empty string."
+       val finalCurrentText = stream?.let { activeStream ->
+           recognizer?.getResult(activeStream)?.text
+       } ?: ""
+       val cleanFinal = finalCurrentText.lowercase()
+       if (cleanFinal.isNotEmpty()) {
+           // 1. Commit to history
+           committedText += "$cleanFinal "
+           // 2. Send to Pico
+           sendToPico("$cleanFinal ")
+           // 3. Update UI
+           outputText.text = "$committedText \n[Stopped]"
+           // 4. Reset for next time
+           // We release the old stream and create a fresh one for the next sentence
+           stream?.release()
+           stream = recognizer?.createStream()
+       } else {
+           outputText.append("\n[Stopped - No Text]")
+       }
    }
    private fun processAudioLoop() {
        val sampleRate = 16000
        val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
-       // Guard clauses
+       // 1. GUARD CLAUSE (Safely unwrap nullables)
        val localRec = recognizer ?: return
        val localStream = stream ?: return
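
Review note: the stop path above defines the stream lifecycle this commit relies on: one OfflineStream per utterance, finalized on stop, then replaced with a fresh one. A sketch of that pattern in isolation, using only calls already present in this file (getResult, release, createStream); the helper name and return type are illustrative.

import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineStream

// Hypothetical helper: pulls the final text out of the current stream, then
// swaps in a fresh stream so the next recording starts clean.
fun finalizeUtterance(
    recognizer: OfflineRecognizer,
    stream: OfflineStream
): Pair<String, OfflineStream> {
    val finalText = recognizer.getResult(stream).text.lowercase()
    stream.release()                           // drop the audio already decoded
    val nextStream = recognizer.createStream() // ready for the next utterance
    return finalText to nextStream
}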
@@ -213,44 +218,24 @@ class TestModelActivity : AppCompatActivity() {
                if (ret > 0) {
                    val samples = FloatArray(ret) { buffer[it] / 32768.0f }
+                   // 2. Feed Audio
                    localStream.acceptWaveform(samples, sampleRate)
-                   while (localRec.isReady(localStream)) {
-                       localRec.decode(localStream)
-                   }
+                   // 3. Decode (No isReady check needed for Offline)
+                   localRec.decode(localStream)
+                   // 4. Get Current Text
+                   // Whisper updates this string constantly as it hears more
                    val text = localRec.getResult(localStream).text
-                   val isEndpoint = localRec.isEndpoint(localStream)
                    if (text.isNotEmpty()) {
                        val cleanText = text.lowercase()
-                       if (isEndpoint) {
-                           // FIX 2: THE ORDER OF OPERATIONS
-                           // A. Update UI first
-                           // 1. PUNCTUATE
-                           // We pass the raw text to the punctuator
-                           val punctuatedText = punctuator?.addPunctuation(cleanText) ?: cleanText
-                           runOnUiThread {
-                               // 2. Commit the BEAUTIFUL text
-                               committedText += "$punctuatedText "
-                               outputText.text = committedText
-                               sendToPico("$punctuatedText ")
-                           }
-                           // B. RESET IMMEDIATELY ON BACKGROUND THREAD
-                           // We do this HERE, not inside runOnUiThread.
-                           // This guarantees the stream is clean BEFORE the loop
-                           // reads the next chunk of audio.
-                           localRec.reset(localStream)
-                       } else {
-                           // Standard partial update
-                           runOnUiThread {
-                               outputText.text = "$committedText $cleanText"
-                           }
+                       runOnUiThread {
+                           // Update the screen so user sees what is happening
+                           // We do NOT send to USB yet, because Whisper might change this text
+                           // as you keep speaking.
+                           outputText.text = "$committedText $cleanText"
                        }
                    }
                }
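
Review note: the loop above is the heart of the pseudo-streaming approach: each PCM chunk is appended to the same OfflineStream, the accumulated audio is re-decoded, and the partial text may keep changing until stop. A compact sketch of that capture-and-decode loop under the same assumptions (16 kHz mono PCM, RECORD_AUDIO already granted, sherpa-onnx offline API as used in this file); the function and callback names are illustrative.

import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineStream

// Hypothetical sketch mirroring the commit's pattern: repeatedly accept and
// decode on one offline stream, handing the (still changing) partial text out.
fun captureAndDecode(
    recognizer: OfflineRecognizer,
    stream: OfflineStream,
    isRecording: () -> Boolean,
    onPartial: (String) -> Unit
) {
    val sampleRate = 16000
    val bufferSize = AudioRecord.getMinBufferSize(
        sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT
    )
    val recorder = AudioRecord(
        MediaRecorder.AudioSource.MIC, sampleRate,
        AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize
    )
    val buffer = ShortArray(bufferSize / 2)
    recorder.startRecording()
    while (isRecording()) {
        val ret = recorder.read(buffer, 0, buffer.size)
        if (ret > 0) {
            // 16-bit PCM -> float in [-1, 1], the format acceptWaveform expects
            val samples = FloatArray(ret) { buffer[it] / 32768.0f }
            stream.acceptWaveform(samples, sampleRate)
            recognizer.decode(stream)          // no isReady() gate for offline
            onPartial(recognizer.getResult(stream).text)
        }
    }
    recorder.stop()
    recorder.release()
}

Design note: if acceptWaveform accumulates audio on the offline stream (which is what the "updates this string constantly" comment above relies on), each decode() pass re-processes the whole utterance, so per-chunk cost grows with utterance length; that is the trade-off of driving the offline Whisper recognizer in this pseudo-streaming way.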