Most of the changes needed to compile with the Whisper/Sherpa setup - made prior to the changes in the MainActivity file, which will automatically unzip the new model files for us ;-)

2026-01-22 20:30:14 +11:00
parent 404bc55ed3
commit ce72ef7a16
1 changed files with 197 additions and 265 deletions
--- a/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt
+++ b/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt
@@ -1,52 +1,49 @@
 package net.mmanningau.speechtokeyboard
 import android.Manifest
 import android.content.Context
 import android.content.pm.PackageManager
 import android.hardware.usb.UsbManager
 import android.media.AudioFormat
 import android.media.AudioRecord
 import android.media.MediaRecorder
 import android.os.Bundle
 import android.util.Log
 import android.widget.ImageButton
 import android.widget.TextView
 import android.widget.Toast
 import androidx.appcompat.app.AppCompatActivity
 import androidx.core.app.ActivityCompat
 import androidx.core.content.ContextCompat
 import org.json.JSONObject
 import com.k2fsa.sherpa.onnx.*  // import for whisper sherpa wrapper
 // import org.vosk.Model  --- migration to whisper removals
 //  import org.vosk.Recognizer
 // import org.vosk.android.RecognitionListener
 // import org.vosk.android.SpeechService
 import java.io.File
 import android.content.Context
 import android.hardware.usb.UsbManager
 import com.hoho.android.usbserial.driver.UsbSerialPort
 import com.hoho.android.usbserial.driver.UsbSerialProber
 import com.hoho.android.usbserial.util.SerialInputOutputManager
 import com.k2fsa.sherpa.onnx.EndpointConfig
 import com.k2fsa.sherpa.onnx.EndpointRule
 import com.k2fsa.sherpa.onnx.FeatureConfig
 import com.k2fsa.sherpa.onnx.OnlineRecognizer
 import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
 import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
 import com.k2fsa.sherpa.onnx.OnlineStream
 import java.io.File
 // class TestModelActivity : AppCompatActivity(), RecognitionListener {
 class TestModelActivity : AppCompatActivity() {
    // UI Components
    private lateinit var outputText: TextView
    private lateinit var micButton: ImageButton
-    // Whisper/Sherpa wrapper setup variables here
+    // Sherpa (Whisper) Components
    private var audioRecorder: AudioRecorder? = null // You'll need a new recorder helper
    private var recognizer: OnlineRecognizer? = null
    private var stream: OnlineStream? = null
-
+    private var isRecording = false
-    // Vosk Components - now removed as whisper migration
+    private var recordingThread: Thread? = null
    // private var model: Model? = null
    // private var speechService: SpeechService? = null
    private var isListening = false
    // USB Components
    private var usbPort: UsbSerialPort? = null
    private var usbIoManager: SerialInputOutputManager? = null // Handles the data flow
-    private var committedText = "" // Stores the finalized sentences
+    // Text History
    private var committedText = ""
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
@@ -56,117 +53,160 @@ class TestModelActivity : AppCompatActivity() {
        outputText = findViewById(R.id.text_output_log)
        micButton = findViewById(R.id.btn_mic_toggle)
        // Check Permissions immediately
        checkAudioPermission()
        connectToPico() // Try to auto-connect USB on start
-        // Setup Button Listener
+        // Initialize Engine
        initSherpaModel()
        // Setup Button
        micButton.setOnClickListener {
-            toggleListening()
+            toggleRecording()
        }
    }
-        // Initialize the model in background
+    // ----------------------------------------------------------------
-        initModel()
+    // 1. ENGINE INITIALIZATION (The "Missing Code")
-    }
+    // ----------------------------------------------------------------
    private fun initSherpaModel() {
        val modelDir = File(filesDir, "sherpa-model")
-    private fun connectToPico() {
+        if (!File(modelDir, "encoder.onnx").exists()) {
-        val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
+            outputText.text = "Error: Sherpa Model files missing in /sherpa-model/"
        // 1. Find the Device
        // (This probes specifically for devices listed in your device_filter.xml)
        val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
        if (availableDrivers.isEmpty()) {
            outputText.append("\n> No USB device found.")
            return
        }
        // Assume the first device found is the Pico
        val driver = availableDrivers[0]
        val connection = usbManager.openDevice(driver.device)
        if (connection == null) {
            outputText.append("\n> Permission denied. Re-plug device?")
            return
        }
        // 2. Open the Port
        // Most Picos use port 0.
        usbPort = driver.ports[0]
        try {
-            usbPort?.open(connection)
+            // 1. Define Model Paths
-            // 3. Set Parameters (Must match your Pico's C/Python code!)
+            val transducerConfig = OnlineTransducerModelConfig(
-            // 115200 Baud, 8 Data bits, 1 Stop bit, No Parity
+                encoder = File(modelDir, "encoder.onnx").absolutePath,
-            usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
+                decoder = File(modelDir, "decoder.onnx").absolutePath,
                joiner = File(modelDir, "joiner.onnx").absolutePath
            )
-            outputText.append("\n> USB Connected to Pico!")
+            // 2. Define General Config
-        } catch (e: Exception) {
+            val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
-            outputText.append("\n> USB Error: ${e.message}")
+                transducer = transducerConfig,
-        }
+                tokens = File(modelDir, "tokens.txt").absolutePath,
-    }
+                numThreads = 1,
                debug = false,
                modelType = "zipformer"
            )
-    //Whisper/Sherpa implementation functions here....
+            // 3. Define Endpoint Rule (The fix for your error)
-    private fun initSherpaModel() {
+            // rule1 = detected silence after speech. We set this to 2.4 seconds.
-        // Sherpa requires specific configuration
+            val silenceRule = EndpointRule(
                mustContainNonSilence = false,
                minTrailingSilence = 2.4f,
                minUtteranceLength = 0.0f
            )
            // 4. Create Recognizer Config
            val config = OnlineRecognizerConfig(
-            featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80),
+                featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
-            transducerModelConfig = OnlineTransducerModelConfig(
+                modelConfig = onlineModelConfig,
-                encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path
+                endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
-                decoder = "$filesDir/decoder-epoch-99-avg-1.onnx",
+                enableEndpoint = true,
                joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
            ),
            enableEndpoint = true, // Detects when you stop speaking
            ruleFsts = "",
                decodingMethod = "greedy_search",
                maxActivePaths = 4
            )
        try {
            recognizer = OnlineRecognizer(assetManager = assets, config = config)
            stream = recognizer?.createStream()
-            outputText.text = "Whisper/Sherpa Ready!"
+
            outputText.text = "Engine Loaded. Ready to Stream."
        } catch (e: Exception) {
-            outputText.text = "Error: ${e.message}"
+            Log.e("Sherpa", "Init Error", e)
            outputText.text = "Init Error: ${e.message}"
        }
    }
-    private fun startRecordingLoop() {
+    // ----------------------------------------------------------------
    // 2. AUDIO LOOP (The "Manual" Listener)
    // ----------------------------------------------------------------
    /**
     * Mic-button handler: flips between the recording and idle states.
     *
     * Calls [stopRecording] when [isRecording] is currently true, otherwise [startRecording].
     */
    private fun toggleRecording() {
        when {
            isRecording -> stopRecording()
            else -> startRecording()
        }
    }
    /**
     * Starts a capture session.
     *
     * When [recognizer] is null, shows an "Engine not ready" toast and returns without
     * changing any state. Otherwise it sets [isRecording], tints the mic button red,
     * shows the listening status after [committedText], and runs [processAudioLoop]
     * on a new thread that is kept in [recordingThread].
     */
    private fun startRecording() {
        if (recognizer == null) {
            Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
            return
        }

        // committedText is left untouched here, so earlier sentences stay on screen.
        isRecording = true
        micButton.setColorFilter(android.graphics.Color.RED)
        outputText.text = "$committedText [Listening...]"

        // Keep a handle on the worker so stopRecording() can wait for it.
        val worker = Thread { processAudioLoop() }
        recordingThread = worker
        worker.start()
    }
    /**
     * Ends the current capture session.
     *
     * Clears [isRecording] so the loop in [processAudioLoop] exits, waits for the worker
     * thread to finish, drops the reference to it, restores the mic button tint and
     * shows the stopped status after [committedText].
     */
    private fun stopRecording() {
        isRecording = false

        // Wait for the audio loop to finish, then forget the finished thread so a
        // later call cannot join a stale reference.
        // NOTE(review): this blocks the calling thread until the loop notices the flag;
        // isRecording is a plain var, so confirm the worker reliably sees the update.
        recordingThread?.join()
        recordingThread = null

        micButton.clearColorFilter()

        // The worker publishes partial results via runOnUiThread. Any of those still
        // queued when join() returns would run after a direct assignment here and
        // overwrite the status, so queue the final text behind them instead.
        outputText.post {
            outputText.text = "$committedText [Stopped]"
        }
    }
    private fun processAudioLoop() {
        val sampleRate = 16000
        val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
        val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
-        record.startRecording()
+        if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
-        isListening = true
+            return
        Thread {
            val buffer = ShortArray(bufferSize / 2)
            while (isListening) {
                val read = record.read(buffer, 0, buffer.size)
                if (read > 0) {
                    // 1. Feed audio to engine
                    val floatSamples = FloatArray(read) { buffer[it] / 32768.0f } // Normalize
                    stream?.acceptWaveform(floatSamples, sampleRate)
                    // 2. Decode
                    while (recognizer?.isReady(stream) == true) {
                        recognizer?.decode(stream)
        }
-                    // 3. Get Result
+        // --- FIX START ---
-                    val result = recognizer?.getResult(stream)
+        // Capture global variables into local non-null variables.
-                    val text = result?.text ?: ""
+        // If either is null, we just exit the loop safely.
        val activeStream = stream ?: return
        val activeRecognizer = recognizer ?: return
        // --- FIX END ---
        val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
        record.startRecording()
        val buffer = ShortArray(bufferSize)
        while (isRecording) {
            val ret = record.read(buffer, 0, buffer.size)
            if (ret > 0) {
                val samples = FloatArray(ret) { buffer[it] / 32768.0f }
                // Use 'activeStream' and 'activeRecognizer' (No ? needed anymore)
                activeStream.acceptWaveform(samples, sampleRate)
                while (activeRecognizer.isReady(activeStream)) {
                    activeRecognizer.decode(activeStream)
                }
                val text = activeRecognizer.getResult(activeStream).text
                if (text.isNotEmpty()) {
                        // Update UI
                        runOnUiThread {
                            // Sherpa returns the FULL string so far, not just chunks
                            // So we just overwrite the "Current" view
                    val cleanText = text.lowercase()
-                            outputText.text = cleanText
+                    runOnUiThread {
                        outputText.text = "$committedText $cleanText"
                    }
-                            // Check if sentence is "Final" (Endpoint detected)
+                    if (activeRecognizer.isEndpoint(activeStream)) {
-                            if (recognizer?.isEndpoint(stream) == true) {
+                        if (cleanText.isNotBlank()) {
-                                sendToPico(cleanText)
+                            committedText += "$cleanText "
-                                recognizer?.reset(stream) // Clear buffer for next sentence
+                            sendToPico("$cleanText ")
                            // Reset the stream
                            activeRecognizer.reset(activeStream)
                        }
                    }
                }
@@ -174,160 +214,52 @@ class TestModelActivity : AppCompatActivity() {
        }
        record.stop()
        record.release()
        }.start()
    }
    // ----------------------------------------------------------------
    // 3. USB LOGIC (Unchanged from before)
    // ----------------------------------------------------------------
    private fun connectToPico() {
        val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
        val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
        if (availableDrivers.isEmpty()) return
-    /*      ---- removed as part of the whisper migration
+        val driver = availableDrivers[0]
-     private fun initModel() {
+        val connection = usbManager.openDevice(driver.device) ?: return
        // We look for the folder inside private storage (same logic as MainActivity)
        val modelPath = File(filesDir, "vosk-model")
-        if (!modelPath.exists()) {
+        usbPort = driver.ports[0]
            outputText.text = "Error: Model not found. Please go back and load a model first."
            micButton.isEnabled = false
            return
        }
        Thread {
        try {
-                // Find the actual model folder inside
+            usbPort?.open(connection)
-                val actualModelDir = modelPath.listFiles()?.firstOrNull { it.isDirectory } ?: modelPath
+            usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
-                model = Model(actualModelDir.absolutePath)
+            outputText.append("\n> USB Connected")
                runOnUiThread {
                    outputText.append("\n\n> Model Loaded. Ready.")
                }
        } catch (e: Exception) {
-                runOnUiThread {
+            outputText.append("\n> USB Error: ${e.message}")
                    outputText.text = "Error loading model: ${e.message}"
        }
    }
-        }.start()
+
    }
 */
    private fun sendToPico(text: String) {
-        if (usbPort == null) return // Safety check
+        if (usbPort == null) return
        try {
-            // Convert text to bytes and send
+            usbPort?.write(text.toByteArray(Charsets.UTF_8), 500)
            val data = text.toByteArray(Charsets.UTF_8)
            usbPort?.write(data, 1000) // 1000ms timeout
        } catch (e: Exception) {
-            outputText.append("\n[Send Failed: ${e.message}]")
+            // Log error
        }
    }
-    private fun toggleListening() {
+    // ----------------------------------------------------------------
-        if (model == null) {
+    // 4. CLEANUP
-            Toast.makeText(this, "Model not loaded yet", Toast.LENGTH_SHORT).show()
+    // ----------------------------------------------------------------
            return
        }
        if (isListening) {
            stopRecognition()
        } else {
            startRecognition()
        }
    }
    /**
     * Legacy Vosk start path.
     *
     * Builds a Vosk recognizer from `model` at 16 kHz, wraps it in a speech service,
     * starts listening with this activity as the listener argument, and switches the
     * UI to the listening state (red mic button, cleared output, "> Listening..." line).
     * Any exception is reported by appending its message to the output view.
     *
     * NOTE(review): `model` and `speechService` appear only as commented-out
     * declarations in this file and the org.vosk imports are commented out — confirm
     * whether this function is still meant to compile after the Sherpa migration.
     */
    private fun startRecognition() {
        try {
            val recognizer = Recognizer(model, 16000.0f) // 16kHz is standard for Vosk
            speechService = SpeechService(recognizer, 16000.0f)
            //speechService?.addListener(this)           <----- removed this as it generated an error
            speechService?.startListening(this)
            isListening = true
            micButton.setColorFilter(android.graphics.Color.RED) // Turn button red
            outputText.text = "" // Clear previous text
            outputText.append("> Listening...\n")
        } catch (e: Exception) {
            // Surface the failure in the output view instead of crashing.
            outputText.append("\nError starting mic: ${e.message}")
        }
    }
    /**
     * Legacy Vosk stop path.
     *
     * Stops the speech service if one exists, clears the reference, resets
     * [isListening], restores the mic button tint and appends "> Stopped." to the output.
     *
     * NOTE(review): `speechService` appears only as a commented-out declaration in this
     * file — confirm whether this function is still live after the Sherpa migration.
     */
    private fun stopRecognition() {
        speechService?.stop()
        speechService = null
        isListening = false
        micButton.clearColorFilter() // Reset button color
        outputText.append("\n> Stopped.")
    }
    // --- Vosk Listener Callbacks ---
 /* removed as part of migration to whisper
    override fun onResult(hypothesis: String?) {
        hypothesis?.let {
            val text = parseVoskResult(it)
            if (text.isNotEmpty()) {
                // 1. Update the UI History
                // Add the new sentence to our history
                committedText += "$text. "
                // Update screen
                outputText.text = "$committedText"
                // 2. SEND TO PICO
                // We append a space because speech engines strip trailing spaces,
                // and you don't want "helloworld" typed into your computer.
                sendToPico("$text ")
            }
        }
    }
    override fun onPartialResult(hypothesis: String?) {
        // Optional: Shows words as they are being spoken (streaming)
        // You can enable this if you want to see "typing" effect
        hypothesis?.let {
            // Parse the "partial" JSON key
            val partial = JSONObject(it).optString("partial", "")
            if (partial.isNotEmpty()) {
                // Display: [History] + [Current Streaming Guess]
                outputText.text = "$committedText $partial..."
            }
        }
    }
 */
    /**
     * Final-result callback from the legacy Vosk listener interface.
     *
     * Ignores a null [hypothesis]. Otherwise parses it with `parseVoskResult` and, when
     * the parsed text is non-empty, appends it followed by a newline to the output view.
     *
     * NOTE(review): this is marked `override`, but the class header visible in this file
     * extends only AppCompatActivity (the RecognitionListener variant is commented out),
     * and `parseVoskResult` is not defined in the visible source — confirm this still compiles.
     */
    override fun onFinalResult(hypothesis: String?) {
        // Final flush when stopping
        hypothesis?.let {
            val text = parseVoskResult(it)
            if (text.isNotEmpty()) {
                outputText.append("$text\n")
            }
        }
    }
 /* Whisper migration removals
    override fun onError(exception: Exception?) {
        outputText.append("\nError: ${exception?.message}")
    }
    override fun onTimeout() {
        outputText.append("\nTimeout.")
    }
 */
    // Permission Helper
    /**
     * Requests the RECORD_AUDIO runtime permission when it has not been granted yet.
     *
     * Does nothing if the permission is already held; otherwise issues the request
     * with request code 1.
     */
    private fun checkAudioPermission() {
        val micPermission = Manifest.permission.RECORD_AUDIO
        val alreadyGranted =
            ContextCompat.checkSelfPermission(this, micPermission) == PackageManager.PERMISSION_GRANTED
        if (alreadyGranted) return

        ActivityCompat.requestPermissions(this, arrayOf(micPermission), 1)
    }
    // Cleanup on exit
    override fun onDestroy() {
        super.onDestroy()
-        speechService?.shutdown()
+        isRecording = false
-
+        stream?.release()
-        // Close USB
+        recognizer?.release()
-        try {
+        try { usbPort?.close() } catch (e: Exception) {}
            usbPort?.close()
        } catch (e: Exception) {
            // Ignore errors on close
        }
    }
 }