Initial steps to migrate to the whisper/sherpa setup.

Still missing major changes to the TestModelActivity.kt file, which are coming next...
2026-01-22 19:49:48 +11:00
parent 12c0508713
commit 404bc55ed3
4 changed files with 115 additions and 28 deletions

MainActivity.kt

@@ -9,8 +9,8 @@ import android.widget.TextView
import android.widget.Toast
import androidx.activity.result.contract.ActivityResultContracts
import androidx.appcompat.app.AppCompatActivity
import org.vosk.Model
import org.vosk.android.SpeechService
// import org.vosk.Model
// import org.vosk.android.SpeechService --- removed as part of migration to whisper.cpp
import java.io.File
import java.util.zip.ZipInputStream
@@ -19,9 +19,9 @@ class MainActivity : AppCompatActivity() {
// UI Components
private lateinit var statusText: TextView
// Vosk Components
private var model: Model? = null
private var speechService: SpeechService? = null
// Vosk Components - removed as part of whisper migration
// private var model: Model? = null
// private var speechService: SpeechService? = null
// 1. THE FILE PICKER REGISTRY
// This handles the result when the user picks a ZIP file
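The extraction code itself is outside this diff, but for context, unpacking the picked ZIP into private storage with the already-imported ZipInputStream typically looks something like the sketch below (the function name and error handling are illustrative, not part of this commit):

    // Illustrative only: unpack a user-picked ZIP into app-private storage.
    private fun extractZip(input: java.io.InputStream, targetDir: File) {
        ZipInputStream(input).use { zip ->
            var entry = zip.nextEntry
            while (entry != null) {
                val outFile = File(targetDir, entry.name)
                if (entry.isDirectory) {
                    outFile.mkdirs()
                } else {
                    outFile.parentFile?.mkdirs()
                    outFile.outputStream().use { out -> zip.copyTo(out) }
                }
                zip.closeEntry()
                entry = zip.nextEntry
            }
        }
    }

A production version would also validate entry names so an archive cannot write outside targetDir.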
@@ -43,7 +43,7 @@ class MainActivity : AppCompatActivity() {
// ADD THIS LINE AT THE BOTTOM:
// This attempts to load the model immediately if files exist
initVoskModel()
// initVoskModel() - removed as part of whisper migration
}
// 2. SETUP THE MENU
@@ -113,7 +113,7 @@ class MainActivity : AppCompatActivity() {
// Back to UI Thread to say success
runOnUiThread {
statusText.text = "Model Installed! Initializing..."
initVoskModel()
// initVoskModel() - removed as part of the whisper migration
}
} catch (e: Exception) {
@@ -127,7 +127,9 @@ class MainActivity : AppCompatActivity() {
// 6. INITIALIZE VOSK "BRAIN"
// Replace your existing initVoskModel with this updated version
/*
private fun initVoskModel() {
val modelPath = File(filesDir, "vosk-model")
// Check if the directory exists before trying to load
@@ -146,4 +148,5 @@ class MainActivity : AppCompatActivity() {
statusText.text = "Error loading saved model: ${e.message}"
}
}
*/
}
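For reference, a minimal sketch of what could eventually stand in for initVoskModel() here, assuming the sherpa/whisper ONNX files get installed into filesDir the same way the Vosk model was. The function name and file names below are placeholders mirroring the example paths used in TestModelActivity; they are not part of this commit:

    // Hypothetical replacement for initVoskModel(): verify the ONNX files exist
    // in private storage before reporting the model as ready.
    private fun checkSherpaFiles() {
        val required = listOf(
            "encoder-epoch-99-avg-1.onnx", // example names, matching TestModelActivity
            "decoder-epoch-99-avg-1.onnx",
            "joiner-epoch-99-avg-1.onnx"
        )
        val missing = required.filter { !File(filesDir, it).exists() }
        statusText.text = if (missing.isEmpty()) {
            "Model files found. Ready to test."
        } else {
            "Missing model files: ${missing.joinToString()}"
        }
    }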

TestModelActivity.kt

@@ -10,10 +10,14 @@ import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import androidx.core.content.ContextCompat
import org.json.JSONObject
import org.vosk.Model
import org.vosk.Recognizer
import org.vosk.android.RecognitionListener
import org.vosk.android.SpeechService
import com.k2fsa.sherpa.onnx.* // import for whisper sherpa wrapper
import android.media.AudioFormat // android.media imports used by the AudioRecord capture loop in startRecordingLoop()
import android.media.AudioRecord
import android.media.MediaRecorder
// import org.vosk.Model --- migration to whisper removals
// import org.vosk.Recognizer
// import org.vosk.android.RecognitionListener
// import org.vosk.android.SpeechService
import java.io.File
import android.content.Context
@@ -22,14 +26,20 @@ import com.hoho.android.usbserial.driver.UsbSerialPort
import com.hoho.android.usbserial.driver.UsbSerialProber
import com.hoho.android.usbserial.util.SerialInputOutputManager
class TestModelActivity : AppCompatActivity(), RecognitionListener {
// class TestModelActivity : AppCompatActivity(), RecognitionListener {
class TestModelActivity : AppCompatActivity() {
private lateinit var outputText: TextView
private lateinit var micButton: ImageButton
// Vosk Components
private var model: Model? = null
private var speechService: SpeechService? = null
// Whisper/Sherpa wrapper setup variables here
// private var audioRecorder: AudioRecorder? = null // placeholder for a future recorder helper; startRecordingLoop() below manages its own AudioRecord
private var recognizer: OnlineRecognizer? = null
private var stream: OnlineStream? = null
// Vosk Components - removed as part of the whisper migration
// private var model: Model? = null
// private var speechService: SpeechService? = null
private var isListening = false
// USB Components
@@ -94,8 +104,82 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
}
}
// Whisper/Sherpa implementation functions here....
private fun initSherpaModel() {
    // Sherpa requires specific configuration.
    // NOTE: this uses sherpa-onnx's streaming (Online) transducer API with example
    // model file names; depending on the sherpa-onnx version, the transducer paths
    // may need to be nested inside a model config along with a tokens file.
    val config = OnlineRecognizerConfig(
        featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80),
        transducerModelConfig = OnlineTransducerModelConfig(
            encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path
            decoder = "$filesDir/decoder-epoch-99-avg-1.onnx",
            joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
        ),
        enableEndpoint = true, // Detects when you stop speaking
        ruleFsts = "",
        decodingMethod = "greedy_search",
        maxActivePaths = 4
    )
    try {
        recognizer = OnlineRecognizer(assetManager = assets, config = config)
        stream = recognizer?.createStream()
        outputText.text = "Whisper/Sherpa Ready!"
    } catch (e: Exception) {
        outputText.text = "Error: ${e.message}"
    }
}
private fun startRecordingLoop() {
val sampleRate = 16000
val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
record.startRecording()
isListening = true
Thread {
val buffer = ShortArray(bufferSize / 2)
while (isListening) {
val read = record.read(buffer, 0, buffer.size)
if (read > 0) {
// 1. Feed audio to engine
val floatSamples = FloatArray(read) { buffer[it] / 32768.0f } // Normalize
stream?.acceptWaveform(floatSamples, sampleRate)
// 2. Decode
while (recognizer?.isReady(stream) == true) {
recognizer?.decode(stream)
}
// 3. Get Result
val result = recognizer?.getResult(stream)
val text = result?.text ?: ""
if (text.isNotEmpty()) {
// Update UI
runOnUiThread {
// Sherpa returns the FULL string so far, not just chunks
// So we just overwrite the "Current" view
val cleanText = text.lowercase()
outputText.text = cleanText
// Check if sentence is "Final" (Endpoint detected)
if (recognizer?.isEndpoint(stream) == true) {
sendToPico(cleanText)
recognizer?.reset(stream) // Clear buffer for next sentence
}
}
}
}
}
record.stop()
record.release()
}.start()
}
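A minimal sketch of how these two functions could be wired to the existing mic button, assuming RECORD_AUDIO has already been granted via checkAudioPermission() and that micButton simply toggles listening (the real click handler is not shown in this diff):

    // Hypothetical wiring; not part of this commit.
    micButton.setOnClickListener {
        if (!isListening) {
            if (recognizer == null) initSherpaModel() // build config + recognizer once
            startRecordingLoop()                      // spins up the capture/decode thread
        } else {
            isListening = false                       // the recording thread sees this, stops, and releases the AudioRecord
        }
    }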
/* ---- removed as part of the whisper migration
private fun initModel() {
// We look for the folder inside private storage (same logic as MainActivity)
val modelPath = File(filesDir, "vosk-model")
@@ -121,7 +205,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
}
}.start()
}
*/
private fun sendToPico(text: String) {
if (usbPort == null) return // Safety check
@@ -173,6 +257,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
}
// --- Vosk Listener Callbacks ---
/* removed as part of migration to whisper
override fun onResult(hypothesis: String?) {
hypothesis?.let {
@@ -205,6 +290,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
}
}
}
*/
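// TODO: onFinalResult below is still a Vosk RecognitionListener callback; it needs the same treatment as part of the remaining TestModelActivity changes.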
override fun onFinalResult(hypothesis: String?) {
// Final flush when stopping
@@ -215,7 +301,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
}
}
}
/* Whisper migration removals
override fun onError(exception: Exception?) {
outputText.append("\nError: ${exception?.message}")
}
@@ -224,15 +310,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener {
outputText.append("\nTimeout.")
}
// Helper to clean JSON: {"text": "hello world"} -> "hello world"
private fun parseVoskResult(json: String): String {
return try {
JSONObject(json).optString("text", "")
} catch (e: Exception) {
""
}
}
*/
// Permission Helper
private fun checkAudioPermission() {
if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {