Most of the changes now compile using the Whisper/Sherpa setup - but this is prior to the changes in the MainActivity file which will unzip the new model files for us automagically ;-)

2026-01-22 20:30:14 +11:00
parent 404bc55ed3
commit ce72ef7a16
1 changed files with 197 additions and 265 deletions
--- a/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt
+++ b/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt
@@ -1,52 +1,49 @@
 package net.mmanningau.speechtokeyboard

 import android.Manifest
+import android.content.Context
 import android.content.pm.PackageManager
+import android.hardware.usb.UsbManager
+import android.media.AudioFormat
+import android.media.AudioRecord
+import android.media.MediaRecorder
 import android.os.Bundle
+import android.util.Log
 import android.widget.ImageButton
 import android.widget.TextView
 import android.widget.Toast
 import androidx.appcompat.app.AppCompatActivity
 import androidx.core.app.ActivityCompat
 import androidx.core.content.ContextCompat
-import org.json.JSONObject
-
-import com.k2fsa.sherpa.onnx.*  // import for whisper sherpa wrapper
-
-// import org.vosk.Model  --- migration to whisper removals
-//  import org.vosk.Recognizer
-// import org.vosk.android.RecognitionListener
-// import org.vosk.android.SpeechService
-
-import java.io.File
-
-import android.content.Context
-import android.hardware.usb.UsbManager
 import com.hoho.android.usbserial.driver.UsbSerialPort
 import com.hoho.android.usbserial.driver.UsbSerialProber
 import com.hoho.android.usbserial.util.SerialInputOutputManager
+import com.k2fsa.sherpa.onnx.EndpointConfig
+import com.k2fsa.sherpa.onnx.EndpointRule
+import com.k2fsa.sherpa.onnx.FeatureConfig
+import com.k2fsa.sherpa.onnx.OnlineRecognizer
+import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
+import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
+import com.k2fsa.sherpa.onnx.OnlineStream
+import java.io.File

-// class TestModelActivity : AppCompatActivity(), RecognitionListener {
 class TestModelActivity : AppCompatActivity() {

+    // UI Components
    private lateinit var outputText: TextView
    private lateinit var micButton: ImageButton

-    // Whisper/Sherpa wrapper setup variables here
-    private var audioRecorder: AudioRecorder? = null // You'll need a new recorder helper
+    // Sherpa (Whisper) Components
    private var recognizer: OnlineRecognizer? = null
    private var stream: OnlineStream? = null
-
-    // Vosk Components - now removed as whisper migration
-    // private var model: Model? = null
-    // private var speechService: SpeechService? = null
-    private var isListening = false
+    private var isRecording = false
+    private var recordingThread: Thread? = null

    // USB Components
    private var usbPort: UsbSerialPort? = null
-    private var usbIoManager: SerialInputOutputManager? = null // Handles the data flow

-    private var committedText = "" // Stores the finalized sentences
+    // Text History
+    private var committedText = ""

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
@@ -56,278 +53,213 @@ class TestModelActivity : AppCompatActivity() {
        outputText = findViewById(R.id.text_output_log)
        micButton = findViewById(R.id.btn_mic_toggle)

-        // Check Permissions immediately
        checkAudioPermission()
+        connectToPico() // Try to auto-connect USB on start

-        // Setup Button Listener
+        // Initialize Engine
+        initSherpaModel()
+
+        // Setup Button
        micButton.setOnClickListener {
-            toggleListening()
+            toggleRecording()
        }
-
-        // Initialize the model in background
-        initModel()
    }

-    private fun connectToPico() {
-        val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
+    // ----------------------------------------------------------------
+    // 1. ENGINE INITIALIZATION (builds the Sherpa-ONNX streaming recognizer)
+    // ----------------------------------------------------------------
+    private fun initSherpaModel() {
+        val modelDir = File(filesDir, "sherpa-model")

-        // 1. Find the Device
-        // (This probes specifically for devices listed in your device_filter.xml)
-        val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
-        if (availableDrivers.isEmpty()) {
-            outputText.append("\n> No USB device found.")
+        if (!File(modelDir, "encoder.onnx").exists()) {
+            outputText.text = "Error: Sherpa Model files missing in /sherpa-model/"
            return
        }

-        // Assume the first device found is the Pico
-        val driver = availableDrivers[0]
-        val connection = usbManager.openDevice(driver.device)
-
-        if (connection == null) {
-            outputText.append("\n> Permission denied. Re-plug device?")
-            return
-        }
-
-        // 2. Open the Port
-        // Most Picos use port 0.
-        usbPort = driver.ports[0]
-
        try {
-            usbPort?.open(connection)
-            // 3. Set Parameters (Must match your Pico's C/Python code!)
-            // 115200 Baud, 8 Data bits, 1 Stop bit, No Parity
-            usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
+            // 1. Define Model Paths
+            val transducerConfig = OnlineTransducerModelConfig(
+                encoder = File(modelDir, "encoder.onnx").absolutePath,
+                decoder = File(modelDir, "decoder.onnx").absolutePath,
+                joiner = File(modelDir, "joiner.onnx").absolutePath
+            )

-            outputText.append("\n> USB Connected to Pico!")
+            // 2. Define General Config
+            val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
+                transducer = transducerConfig,
+                tokens = File(modelDir, "tokens.txt").absolutePath,
+                numThreads = 1,
+                debug = false,
+                modelType = "zipformer"
+            )
+
+            // 3. Define Endpoint Rule (passed to EndpointConfig as rule1)
+            // rule1 = trailing silence, with or without preceding speech (mustContainNonSilence = false). We set this to 2.4 seconds.
+            val silenceRule = EndpointRule(
+                mustContainNonSilence = false,
+                minTrailingSilence = 2.4f,
+                minUtteranceLength = 0.0f
+            )
+
+            // 4. Create Recognizer Config
+            val config = OnlineRecognizerConfig(
+                featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
+                modelConfig = onlineModelConfig,
+                endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
+                enableEndpoint = true,
+                decodingMethod = "greedy_search",
+                maxActivePaths = 4
+            )
+
+            recognizer = OnlineRecognizer(assetManager = assets, config = config)
+            stream = recognizer?.createStream()
+
+            outputText.text = "Engine Loaded. Ready to Stream."
+
+        } catch (e: Exception) {
+            Log.e("Sherpa", "Init Error", e)
+            outputText.text = "Init Error: ${e.message}"
+        }
+    }
+
+    // ----------------------------------------------------------------
+    // 2. AUDIO LOOP (The "Manual" Listener)
+    // ----------------------------------------------------------------
+    private fun toggleRecording() {
+        if (isRecording) {
+            stopRecording()
+        } else {
+            startRecording()
+        }
+    }
+
+    private fun startRecording() {
+        if (recognizer == null) {
+            Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
+            return
+        }
+
+        // Note: nothing is reset here. The existing Sherpa stream is reused
+        // and 'committedText' is kept, so each session continues the previous
+        // conversation; the stream is reset in the audio loop after an endpoint.
+
+        isRecording = true
+        micButton.setColorFilter(android.graphics.Color.RED)
+        outputText.text = "$committedText [Listening...]"
+
+        recordingThread = Thread {
+            processAudioLoop()
+        }
+        recordingThread?.start()
+    }
+
+    private fun stopRecording() {
+        isRecording = false
+        recordingThread?.join() // Wait for loop to finish
+        micButton.clearColorFilter()
+        outputText.text = "$committedText [Stopped]"
+    }
+
+    private fun processAudioLoop() {
+        val sampleRate = 16000
+        val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
+
+        if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
+            return
+        }
+
+        // --- FIX START ---
+        // Capture the nullable 'stream' and 'recognizer' properties into local non-null variables.
+        // If either is null, we return before any audio is recorded.
+        val activeStream = stream ?: return
+        val activeRecognizer = recognizer ?: return
+        // --- FIX END ---
+
+        val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
+        record.startRecording()
+
+        val buffer = ShortArray(bufferSize)
+
+        while (isRecording) {
+            val ret = record.read(buffer, 0, buffer.size)
+            if (ret > 0) {
+                val samples = FloatArray(ret) { buffer[it] / 32768.0f }
+
+                // Use 'activeStream' and 'activeRecognizer' (No ? needed anymore)
+                activeStream.acceptWaveform(samples, sampleRate)
+
+                while (activeRecognizer.isReady(activeStream)) {
+                    activeRecognizer.decode(activeStream)
+                }
+
+                val text = activeRecognizer.getResult(activeStream).text
+
+                if (text.isNotEmpty()) {
+                    val cleanText = text.lowercase()
+                    runOnUiThread {
+                        outputText.text = "$committedText $cleanText"
+                    }
+
+                    if (activeRecognizer.isEndpoint(activeStream)) {
+                        if (cleanText.isNotBlank()) {
+                            committedText += "$cleanText "
+                            sendToPico("$cleanText ")
+
+                            // Reset the stream
+                            activeRecognizer.reset(activeStream)
+                        }
+                    }
+                }
+            }
+        }
+        record.stop()
+        record.release()
+    }
+
+    // ----------------------------------------------------------------
+    // 3. USB LOGIC (condensed from the previous version)
+    // ----------------------------------------------------------------
+    private fun connectToPico() {
+        val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
+        val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
+        if (availableDrivers.isEmpty()) return
+
+        val driver = availableDrivers[0]
+        val connection = usbManager.openDevice(driver.device) ?: return
+
+        usbPort = driver.ports[0]
+        try {
+            usbPort?.open(connection)
+            usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
+            outputText.append("\n> USB Connected")
        } catch (e: Exception) {
            outputText.append("\n> USB Error: ${e.message}")
        }
    }

-    //Whisper/Sherpa implementation functions here....
-    private fun initSherpaModel() {
-        // Sherpa requires specific configuration
-        val config = OnlineRecognizerConfig(
-            featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80),
-            transducerModelConfig = OnlineTransducerModelConfig(
-                encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path
-                decoder = "$filesDir/decoder-epoch-99-avg-1.onnx",
-                joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
-            ),
-            enableEndpoint = true, // Detects when you stop speaking
-            ruleFsts = "",
-            decodingMethod = "greedy_search",
-            maxActivePaths = 4
-        )
-
-        try {
-            recognizer = OnlineRecognizer(assetManager = assets, config = config)
-            stream = recognizer?.createStream()
-            outputText.text = "Whisper/Sherpa Ready!"
-        } catch (e: Exception) {
-            outputText.text = "Error: ${e.message}"
-        }
-    }
-
-    private fun startRecordingLoop() {
-        val sampleRate = 16000
-        val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
-        val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
-
-        record.startRecording()
-        isListening = true
-
-        Thread {
-            val buffer = ShortArray(bufferSize / 2)
-            while (isListening) {
-                val read = record.read(buffer, 0, buffer.size)
-                if (read > 0) {
-                    // 1. Feed audio to engine
-                    val floatSamples = FloatArray(read) { buffer[it] / 32768.0f } // Normalize
-                    stream?.acceptWaveform(floatSamples, sampleRate)
-
-                    // 2. Decode
-                    while (recognizer?.isReady(stream) == true) {
-                        recognizer?.decode(stream)
-                    }
-
-                    // 3. Get Result
-                    val result = recognizer?.getResult(stream)
-                    val text = result?.text ?: ""
-
-                    if (text.isNotEmpty()) {
-                        // Update UI
-                        runOnUiThread {
-                            // Sherpa returns the FULL string so far, not just chunks
-                            // So we just overwrite the "Current" view
-                            val cleanText = text.lowercase()
-                            outputText.text = cleanText
-
-                            // Check if sentence is "Final" (Endpoint detected)
-                            if (recognizer?.isEndpoint(stream) == true) {
-                                sendToPico(cleanText)
-                                recognizer?.reset(stream) // Clear buffer for next sentence
-                            }
-                        }
-                    }
-                }
-            }
-            record.stop()
-            record.release()
-        }.start()
-    }
-
-
-    /*      ---- removed as part of the whisper migration
-     private fun initModel() {
-        // We look for the folder inside private storage (same logic as MainActivity)
-        val modelPath = File(filesDir, "vosk-model")
-
-        if (!modelPath.exists()) {
-            outputText.text = "Error: Model not found. Please go back and load a model first."
-            micButton.isEnabled = false
-            return
-        }
-
-        Thread {
-            try {
-                // Find the actual model folder inside
-                val actualModelDir = modelPath.listFiles()?.firstOrNull { it.isDirectory } ?: modelPath
-                model = Model(actualModelDir.absolutePath)
-
-                runOnUiThread {
-                    outputText.append("\n\n> Model Loaded. Ready.")
-                }
-            } catch (e: Exception) {
-                runOnUiThread {
-                    outputText.text = "Error loading model: ${e.message}"
-                }
-            }
-        }.start()
-    }
-*/
    private fun sendToPico(text: String) {
-        if (usbPort == null) return // Safety check
-
+        if (usbPort == null) return
        try {
-            // Convert text to bytes and send
-            val data = text.toByteArray(Charsets.UTF_8)
-            usbPort?.write(data, 1000) // 1000ms timeout
+            usbPort?.write(text.toByteArray(Charsets.UTF_8), 500)
        } catch (e: Exception) {
-            outputText.append("\n[Send Failed: ${e.message}]")
+            // TODO: log the error (send failures are currently ignored)
        }
    }

-    private fun toggleListening() {
-        if (model == null) {
-            Toast.makeText(this, "Model not loaded yet", Toast.LENGTH_SHORT).show()
-            return
-        }
-
-        if (isListening) {
-            stopRecognition()
-        } else {
-            startRecognition()
-        }
-    }
-
-    private fun startRecognition() {
-        try {
-            val recognizer = Recognizer(model, 16000.0f) // 16kHz is standard for Vosk
-            speechService = SpeechService(recognizer, 16000.0f)
-            //speechService?.addListener(this)           <----- removed this as it generated an error
-            speechService?.startListening(this)
-
-            isListening = true
-            micButton.setColorFilter(android.graphics.Color.RED) // Turn button red
-            outputText.text = "" // Clear previous text
-            outputText.append("> Listening...\n")
-
-        } catch (e: Exception) {
-            outputText.append("\nError starting mic: ${e.message}")
-        }
-    }
-
-    private fun stopRecognition() {
-        speechService?.stop()
-        speechService = null
-        isListening = false
-        micButton.clearColorFilter() // Reset button color
-        outputText.append("\n> Stopped.")
-    }
-
-    // --- Vosk Listener Callbacks ---
-/* removed as part of migration to whisper
-
-    override fun onResult(hypothesis: String?) {
-        hypothesis?.let {
-            val text = parseVoskResult(it)
-            if (text.isNotEmpty()) {
-                // 1. Update the UI History
-                // Add the new sentence to our history
-                committedText += "$text. "
-                // Update screen
-                outputText.text = "$committedText"
-
-                // 2. SEND TO PICO
-                // We append a space because speech engines strip trailing spaces,
-                // and you don't want "helloworld" typed into your computer.
-                sendToPico("$text ")
-            }
-        }
-    }
-
-    override fun onPartialResult(hypothesis: String?) {
-        // Optional: Shows words as they are being spoken (streaming)
-        // You can enable this if you want to see "typing" effect
-        hypothesis?.let {
-            // Parse the "partial" JSON key
-            val partial = JSONObject(it).optString("partial", "")
-
-            if (partial.isNotEmpty()) {
-                // Display: [History] + [Current Streaming Guess]
-                outputText.text = "$committedText $partial..."
-            }
-        }
-    }
-*/
-
-    override fun onFinalResult(hypothesis: String?) {
-        // Final flush when stopping
-        hypothesis?.let {
-            val text = parseVoskResult(it)
-            if (text.isNotEmpty()) {
-                outputText.append("$text\n")
-            }
-        }
-    }
-/* Whispoer migration removals
-    override fun onError(exception: Exception?) {
-        outputText.append("\nError: ${exception?.message}")
-    }
-
-    override fun onTimeout() {
-        outputText.append("\nTimeout.")
-    }
-
-*/
-    // Permission Helper
+    // ----------------------------------------------------------------
+    // 4. PERMISSIONS & CLEANUP
+    // ----------------------------------------------------------------
    private fun checkAudioPermission() {
        if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
            ActivityCompat.requestPermissions(this, arrayOf(Manifest.permission.RECORD_AUDIO), 1)
        }
    }

-    // Cleanup on exit
    override fun onDestroy() {
        super.onDestroy()
-        speechService?.shutdown()
-
-        // Close USB
-        try {
-            usbPort?.close()
-        } catch (e: Exception) {
-            // Ignore errors on close
-        }
+        isRecording = false
+        stream?.release()
+        recognizer?.release()
+        try { usbPort?.close() } catch (e: Exception) {}
    }
 }