Most changes now compile using the Whisper/Sherpa setup — but this is prior to the changes in the MainActivity file, which will unzip the new model files for us automagically ;-)

This commit is contained in:
2026-01-22 20:30:14 +11:00
parent 404bc55ed3
commit ce72ef7a16

View File

@@ -1,52 +1,49 @@
package net.mmanningau.speechtokeyboard
import android.Manifest
import android.content.Context
import android.content.pm.PackageManager
import android.hardware.usb.UsbManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.os.Bundle
import android.util.Log
import android.widget.ImageButton
import android.widget.TextView
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import androidx.core.content.ContextCompat
import org.json.JSONObject
import com.k2fsa.sherpa.onnx.* // import for whisper sherpa wrapper
// import org.vosk.Model --- migration to whisper removals
// import org.vosk.Recognizer
// import org.vosk.android.RecognitionListener
// import org.vosk.android.SpeechService
import java.io.File
import android.content.Context
import android.hardware.usb.UsbManager
import com.hoho.android.usbserial.driver.UsbSerialPort
import com.hoho.android.usbserial.driver.UsbSerialProber
import com.hoho.android.usbserial.util.SerialInputOutputManager
import com.k2fsa.sherpa.onnx.EndpointConfig
import com.k2fsa.sherpa.onnx.EndpointRule
import com.k2fsa.sherpa.onnx.FeatureConfig
import com.k2fsa.sherpa.onnx.OnlineRecognizer
import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
import com.k2fsa.sherpa.onnx.OnlineStream
import java.io.File
// Activity that streams microphone audio through a Sherpa-ONNX online recognizer
// and forwards each finalized sentence to a Raspberry Pi Pico over USB serial.
// NOTE(review): this region is a garbled diff merge — old (Vosk) and new (Sherpa)
// field declarations are interleaved below and some duplicates will not compile.
// class TestModelActivity : AppCompatActivity(), RecognitionListener {
class TestModelActivity : AppCompatActivity() {
// UI Components
private lateinit var outputText: TextView
private lateinit var micButton: ImageButton
// Whisper/Sherpa wrapper setup variables here
// NOTE(review): 'AudioRecorder' is not defined anywhere in this file — TODO confirm
// whether this helper class exists elsewhere or whether this field is dead.
private var audioRecorder: AudioRecorder? = null // You'll need a new recorder helper
// Sherpa (Whisper) Components
private var recognizer: OnlineRecognizer? = null
private var stream: OnlineStream? = null
// Vosk Components - now removed as whisper migration
// private var model: Model? = null
// private var speechService: SpeechService? = null
private var isListening = false
private var isRecording = false
private var recordingThread: Thread? = null
// USB Components
private var usbPort: UsbSerialPort? = null
private var usbIoManager: SerialInputOutputManager? = null // Handles the data flow
// NOTE(review): 'committedText' is declared twice (old and new diff lines) — a
// duplicate property declaration will not compile; one of these must be deleted.
private var committedText = "" // Stores the finalized sentences
// Text History
private var committedText = ""
// Activity entry point: binds the views, requests the mic permission,
// auto-connects the USB serial link, loads the Sherpa engine, and wires
// the mic button to toggle recording.
// NOTE(review): this body is diff residue — it contains a stray '@@' hunk
// header and calls BOTH the old (toggleListening/initModel) and new
// (toggleRecording/initSherpaModel) entry points; only the new pair should remain.
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
@@ -56,278 +53,213 @@ class TestModelActivity : AppCompatActivity() {
outputText = findViewById(R.id.text_output_log)
micButton = findViewById(R.id.btn_mic_toggle)
// Check Permissions immediately
checkAudioPermission()
connectToPico() // Try to auto-connect USB on start
// Setup Button Listener
// Initialize Engine
initSherpaModel()
// Setup Button
micButton.setOnClickListener {
// NOTE(review): both the removed and the replacement click handler survive the
// merge; toggleListening() belongs to the deleted Vosk path and must go.
toggleListening()
toggleRecording()
}
// Initialize the model in background
// NOTE(review): initModel() was deleted in this commit (see commented block below);
// this call is a leftover diff line and will not compile.
initModel()
}
// NOTE(review): everything from here to the closing brace is a corrupted diff
// merge of TWO functions — the OLD connectToPico() (USB probing/open, being
// removed) and the NEW initSherpaModel() (model-file checks and recognizer
// construction, being added). The lines are interleaved and the braces do not
// pair up; this region cannot compile as-is. The intact new versions appear
// further down the file. Comments below tag which function each piece belongs to.
private fun connectToPico() {
val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
// ----------------------------------------------------------------
// 1. ENGINE INITIALIZATION (The "Missing Code")
// ----------------------------------------------------------------
private fun initSherpaModel() {
// [initSherpaModel] model files are expected under app-private storage.
val modelDir = File(filesDir, "sherpa-model")
// 1. Find the Device
// (This probes specifically for devices listed in your device_filter.xml)
val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
if (availableDrivers.isEmpty()) {
outputText.append("\n> No USB device found.")
// [initSherpaModel] bail out early when the unzipped model is missing.
if (!File(modelDir, "encoder.onnx").exists()) {
outputText.text = "Error: Sherpa Model files missing in /sherpa-model/"
return
}
// Assume the first device found is the Pico
val driver = availableDrivers[0]
val connection = usbManager.openDevice(driver.device)
if (connection == null) {
outputText.append("\n> Permission denied. Re-plug device?")
return
}
// 2. Open the Port
// Most Picos use port 0.
usbPort = driver.ports[0]
try {
usbPort?.open(connection)
// 3. Set Parameters (Must match your Pico's C/Python code!)
// 115200 Baud, 8 Data bits, 1 Stop bit, No Parity
usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
// 1. Define Model Paths
val transducerConfig = OnlineTransducerModelConfig(
encoder = File(modelDir, "encoder.onnx").absolutePath,
decoder = File(modelDir, "decoder.onnx").absolutePath,
joiner = File(modelDir, "joiner.onnx").absolutePath
)
outputText.append("\n> USB Connected to Pico!")
// 2. Define General Config
val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
transducer = transducerConfig,
tokens = File(modelDir, "tokens.txt").absolutePath,
numThreads = 1,
debug = false,
modelType = "zipformer"
)
// 3. Define Endpoint Rule (The fix for your error)
// rule1 = detected silence after speech. We set this to 2.4 seconds.
val silenceRule = EndpointRule(
mustContainNonSilence = false,
minTrailingSilence = 2.4f,
minUtteranceLength = 0.0f
)
// 4. Create Recognizer Config
val config = OnlineRecognizerConfig(
featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
modelConfig = onlineModelConfig,
endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
enableEndpoint = true,
decodingMethod = "greedy_search",
maxActivePaths = 4
)
recognizer = OnlineRecognizer(assetManager = assets, config = config)
stream = recognizer?.createStream()
outputText.text = "Engine Loaded. Ready to Stream."
} catch (e: Exception) {
Log.e("Sherpa", "Init Error", e)
outputText.text = "Init Error: ${e.message}"
}
}
// ----------------------------------------------------------------
// 2. AUDIO LOOP (The "Manual" Listener)
// ----------------------------------------------------------------
// Mic-button handler: stops the capture session when one is running,
// starts a new one otherwise.
private fun toggleRecording() {
    when {
        isRecording -> stopRecording()
        else -> startRecording()
    }
}
// Begins a capture session: refuses to start while the Sherpa engine is not
// yet loaded, flips the UI into "listening" mode (red mic button), and runs
// the blocking audio loop on a dedicated worker thread.
private fun startRecording() {
    if (recognizer == null) {
        Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
        return
    }
    // Sherpa streams can be persistent; 'committedText' is deliberately kept so
    // the session continues from the previously finalized sentences.
    isRecording = true
    micButton.setColorFilter(android.graphics.Color.RED)
    outputText.text = "$committedText [Listening...]"
    val worker = Thread { processAudioLoop() }
    recordingThread = worker
    worker.start()
}
// Ends the capture session: flags the audio loop to exit, waits for it,
// and restores the idle UI.
private fun stopRecording() {
isRecording = false
// NOTE(review): this is presumably called from the mic-button click (UI thread);
// join() blocks until processAudioLoop() finishes its current read — normally a
// fraction of a second, but a stalled AudioRecord.read() would freeze the UI
// and risk an ANR. Consider join(timeout) — TODO confirm call sites.
recordingThread?.join() // Wait for loop to finish
micButton.clearColorFilter()
outputText.text = "$committedText [Stopped]"
}
// Blocking capture/decode loop, run on a worker thread (see startRecording()).
// Reads 16 kHz mono 16-bit PCM from the mic, feeds normalized floats to the
// Sherpa online recognizer, mirrors the running hypothesis to the UI, and —
// whenever the endpoint detector fires — commits the sentence and forwards it
// to the Pico over USB. Exits when isRecording is cleared or on early guards.
private fun processAudioLoop() {
    val sampleRate = 16000
    val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
        return
    }
    // Capture the nullable fields into locals: if either is null the engine was
    // never initialized, so exit quietly; afterwards no '?.' is needed in the loop.
    val activeStream = stream ?: return
    val activeRecognizer = recognizer ?: return
    val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
    // FIX: guard against an uninitialized AudioRecord (mic held by another app,
    // bad params) — calling startRecording() on it throws IllegalStateException.
    if (record.state != AudioRecord.STATE_INITIALIZED) {
        record.release()
        return
    }
    try {
        record.startRecording()
        val buffer = ShortArray(bufferSize)
        while (isRecording) {
            val ret = record.read(buffer, 0, buffer.size)
            if (ret > 0) {
                // Normalize signed 16-bit PCM to [-1, 1) floats for Sherpa.
                val samples = FloatArray(ret) { buffer[it] / 32768.0f }
                activeStream.acceptWaveform(samples, sampleRate)
                while (activeRecognizer.isReady(activeStream)) {
                    activeRecognizer.decode(activeStream)
                }
                val text = activeRecognizer.getResult(activeStream).text
                if (text.isNotEmpty()) {
                    val cleanText = text.lowercase()
                    runOnUiThread {
                        outputText.text = "$committedText $cleanText"
                    }
                    if (activeRecognizer.isEndpoint(activeStream)) {
                        if (cleanText.isNotBlank()) {
                            // Trailing space so consecutive sentences don't run together.
                            committedText += "$cleanText "
                            sendToPico("$cleanText ")
                        }
                        // FIX: reset on EVERY endpoint, not only when the text was
                        // non-blank — the original skipped the reset for blank text,
                        // leaving stale endpoint state in the stream.
                        activeRecognizer.reset(activeStream)
                    }
                }
            }
        }
    } finally {
        // FIX: always hand the mic back, even if read/decode throws; the original
        // leaked the AudioRecord on any exception inside the loop.
        try {
            record.stop()
        } catch (ignored: IllegalStateException) {
            // stop() throws if recording never actually started — safe to ignore.
        }
        record.release()
    }
}
// ----------------------------------------------------------------
// 3. USB LOGIC (Unchanged from before)
// ----------------------------------------------------------------
// Attempts to open a serial link to the Pico over USB.
// Returns silently when no serial device is attached or the device cannot be
// opened (e.g. missing permission); port-setup success or failure is reported
// in the on-screen log.
private fun connectToPico() {
    val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
    val driver = UsbSerialProber.getDefaultProber()
        .findAllDrivers(usbManager)
        .firstOrNull() ?: return
    val connection = usbManager.openDevice(driver.device) ?: return
    val port = driver.ports[0]
    usbPort = port
    try {
        port.open(connection)
        // 115200 baud, 8 data bits, 1 stop bit, no parity — must match the Pico firmware.
        port.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
        outputText.append("\n> USB Connected")
    } catch (e: Exception) {
        outputText.append("\n> USB Error: ${e.message}")
    }
}
//Whisper/Sherpa implementation functions here....
// NOTE(review): this is the OLD initSherpaModel() being removed by this commit —
// a second, newer definition exists earlier in the file (a class cannot compile
// with both). Differences worth noting before deleting: hard-coded
// 'epoch-99-avg-1' filenames instead of the 'sherpa-model' directory, a float
// sampleRate (16000.0f vs 16000 in the new version — TODO confirm which the
// sherpa-onnx FeatureConfig expects), and no EndpointRule configuration.
private fun initSherpaModel() {
// Sherpa requires specific configuration
val config = OnlineRecognizerConfig(
featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80),
transducerModelConfig = OnlineTransducerModelConfig(
encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path
decoder = "$filesDir/decoder-epoch-99-avg-1.onnx",
joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
),
enableEndpoint = true, // Detects when you stop speaking
ruleFsts = "",
decodingMethod = "greedy_search",
maxActivePaths = 4
)
try {
recognizer = OnlineRecognizer(assetManager = assets, config = config)
stream = recognizer?.createStream()
outputText.text = "Whisper/Sherpa Ready!"
} catch (e: Exception) {
outputText.text = "Error: ${e.message}"
}
}
// Legacy streaming loop (superseded by processAudioLoop()): spins up its own
// worker thread, feeds mic audio to the Sherpa recognizer, shows the running
// hypothesis, and pushes each finalized sentence to the Pico.
private fun startRecordingLoop() {
    val sampleRate = 16000
    val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
    // FIX: guard the RECORD_AUDIO permission before constructing AudioRecord,
    // consistent with processAudioLoop(); without it construction/recording fails.
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
        return
    }
    val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
    record.startRecording()
    isListening = true
    Thread {
        val buffer = ShortArray(bufferSize / 2)
        try {
            while (isListening) {
                val read = record.read(buffer, 0, buffer.size)
                if (read > 0) {
                    // 1. Feed audio to engine, normalized to [-1, 1) floats
                    val floatSamples = FloatArray(read) { buffer[it] / 32768.0f }
                    stream?.acceptWaveform(floatSamples, sampleRate)
                    // 2. Decode everything that is ready
                    while (recognizer?.isReady(stream) == true) {
                        recognizer?.decode(stream)
                    }
                    // 3. Get result — Sherpa returns the FULL string so far, not chunks
                    val result = recognizer?.getResult(stream)
                    val text = result?.text ?: ""
                    if (text.isNotEmpty()) {
                        val cleanText = text.lowercase()
                        // FIX: endpoint detection, the USB write, and the stream reset
                        // now run on the audio thread. The original did all three inside
                        // runOnUiThread, racing reset() against the next
                        // acceptWaveform()/decode() on this thread and doing blocking
                        // serial I/O on the UI thread.
                        if (recognizer?.isEndpoint(stream) == true) {
                            sendToPico(cleanText)
                            recognizer?.reset(stream) // Clear buffer for next sentence
                        }
                        runOnUiThread {
                            outputText.text = cleanText
                        }
                    }
                }
            }
        } finally {
            // FIX: release the mic even if feeding/decoding throws; the original
            // leaked the AudioRecord on any exception inside the loop.
            try {
                record.stop()
            } catch (ignored: IllegalStateException) {
                // stop() throws if recording never started — safe to ignore.
            }
            record.release()
        }
    }.start()
}
/* ---- removed as part of the whisper migration
private fun initModel() {
// We look for the folder inside private storage (same logic as MainActivity)
val modelPath = File(filesDir, "vosk-model")
if (!modelPath.exists()) {
outputText.text = "Error: Model not found. Please go back and load a model first."
micButton.isEnabled = false
return
}
Thread {
try {
// Find the actual model folder inside
val actualModelDir = modelPath.listFiles()?.firstOrNull { it.isDirectory } ?: modelPath
model = Model(actualModelDir.absolutePath)
runOnUiThread {
outputText.append("\n\n> Model Loaded. Ready.")
}
} catch (e: Exception) {
runOnUiThread {
outputText.text = "Error loading model: ${e.message}"
}
}
}.start()
}
*/
// Transmits [text] to the Pico as UTF-8 over the open serial port.
// No-op when the USB link is not connected; write failures are surfaced in the
// on-screen log rather than thrown, so a disconnected cable cannot crash the app.
private fun sendToPico(text: String) {
    // FIX: the merged diff left two identical null checks and, worse, TWO
    // write() calls — every sentence was transmitted twice. Keep a single
    // null guard and the newer single write (500 ms timeout).
    val port = usbPort ?: return
    try {
        port.write(text.toByteArray(Charsets.UTF_8), 500)
    } catch (e: Exception) {
        outputText.append("\n[Send Failed: ${e.message}]")
    }
}
// Legacy Vosk mic-button handler: refuses to act until the model is loaded,
// then flips between starting and stopping recognition.
// NOTE(review): 'model' is commented out above — this is dead migration code
// that no longer compiles; toggleRecording() is its replacement.
private fun toggleListening() {
    if (model == null) {
        Toast.makeText(this, "Model not loaded yet", Toast.LENGTH_SHORT).show()
        return
    }
    when {
        isListening -> stopRecognition()
        else -> startRecognition()
    }
}
// Legacy Vosk start path: builds a Recognizer + SpeechService and begins
// streaming, turning the mic button red while active.
// NOTE(review): the Recognizer/SpeechService imports are commented out at the
// top of the file (Whisper migration), so this function no longer compiles —
// diff leftover slated for removal alongside toggleListening()/stopRecognition().
private fun startRecognition() {
try {
val recognizer = Recognizer(model, 16000.0f) // 16kHz is standard for Vosk
speechService = SpeechService(recognizer, 16000.0f)
//speechService?.addListener(this) <----- removed this as it generated an error
speechService?.startListening(this)
isListening = true
micButton.setColorFilter(android.graphics.Color.RED) // Turn button red
outputText.text = "" // Clear previous text
outputText.append("> Listening...\n")
} catch (e: Exception) {
outputText.append("\nError starting mic: ${e.message}")
}
}
// Legacy Vosk stop path: halts and drops the speech service, clears the
// listening flag, and restores the idle UI.
// NOTE(review): 'speechService' is commented out above — dead migration code.
private fun stopRecognition() {
    isListening = false
    speechService?.stop()
    speechService = null
    micButton.clearColorFilter() // Restore the default mic-button tint
    outputText.append("\n> Stopped.")
}
// --- Vosk Listener Callbacks ---
/* removed as part of migration to whisper
override fun onResult(hypothesis: String?) {
hypothesis?.let {
val text = parseVoskResult(it)
if (text.isNotEmpty()) {
// 1. Update the UI History
// Add the new sentence to our history
committedText += "$text. "
// Update screen
outputText.text = "$committedText"
// 2. SEND TO PICO
// We append a space because speech engines strip trailing spaces,
// and you don't want "helloworld" typed into your computer.
sendToPico("$text ")
}
}
}
override fun onPartialResult(hypothesis: String?) {
// Optional: Shows words as they are being spoken (streaming)
// You can enable this if you want to see "typing" effect
hypothesis?.let {
// Parse the "partial" JSON key
val partial = JSONObject(it).optString("partial", "")
if (partial.isNotEmpty()) {
// Display: [History] + [Current Streaming Guess]
outputText.text = "$committedText $partial..."
}
}
}
*/
// Legacy Vosk callback: flushes the last hypothesis to the log when listening stops.
// NOTE(review): the class no longer implements RecognitionListener (see the
// commented-out class header), so this 'override' will not compile; and
// parseVoskResult() is not defined anywhere in this file. Diff leftover —
// should be deleted (or commented out) with the other Vosk callbacks.
override fun onFinalResult(hypothesis: String?) {
// Final flush when stopping
hypothesis?.let {
val text = parseVoskResult(it)
if (text.isNotEmpty()) {
outputText.append("$text\n")
}
}
}
/* Whispoer migration removals
override fun onError(exception: Exception?) {
outputText.append("\nError: ${exception?.message}")
}
override fun onTimeout() {
outputText.append("\nTimeout.")
}
*/
// Permission Helper
// ----------------------------------------------------------------
// 4. CLEANUP
// ----------------------------------------------------------------
// Requests the RECORD_AUDIO runtime permission (request code 1) when it has
// not been granted yet; the result arrives via the standard permission callback.
private fun checkAudioPermission() {
    val granted = ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) ==
        PackageManager.PERMISSION_GRANTED
    if (!granted) {
        ActivityCompat.requestPermissions(this, arrayOf(Manifest.permission.RECORD_AUDIO), 1)
    }
}
// Cleanup on exit
// Activity teardown: stops the capture thread first, then releases the native
// Sherpa handles, and finally closes the USB serial port.
override fun onDestroy() {
    super.onDestroy()
    // FIX: signal the audio loop to exit and wait (bounded) for it BEFORE
    // releasing the native stream/recognizer — the original released them while
    // processAudioLoop() could still be decoding, a use-after-free on the
    // Sherpa native handles.
    isRecording = false
    try {
        recordingThread?.join(1000)
    } catch (ie: InterruptedException) {
        Thread.currentThread().interrupt() // preserve interrupt status
    }
    // FIX: dropped the speechService?.shutdown() call — the Vosk field was
    // removed in this migration, so the call no longer compiles.
    stream?.release()
    recognizer?.release()
    // FIX: the merged diff closed the USB port twice; close it once, best-effort.
    try {
        usbPort?.close()
    } catch (e: Exception) {
        // Ignore errors on close
    }
}
}