Most of the changes now compile using the Whisper/Sherpa setup, but this commit predates the changes to the MainActivity file that will automatically unzip the new model files for us ;-)

This commit is contained in:
2026-01-22 20:30:14 +11:00
parent 404bc55ed3
commit ce72ef7a16

View File

@@ -1,52 +1,49 @@
package net.mmanningau.speechtokeyboard package net.mmanningau.speechtokeyboard
import android.Manifest import android.Manifest
import android.content.Context
import android.content.pm.PackageManager import android.content.pm.PackageManager
import android.hardware.usb.UsbManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.os.Bundle import android.os.Bundle
import android.util.Log
import android.widget.ImageButton import android.widget.ImageButton
import android.widget.TextView import android.widget.TextView
import android.widget.Toast import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat import androidx.core.app.ActivityCompat
import androidx.core.content.ContextCompat import androidx.core.content.ContextCompat
import org.json.JSONObject
import com.k2fsa.sherpa.onnx.* // import for whisper sherpa wrapper
// import org.vosk.Model --- migration to whisper removals
// import org.vosk.Recognizer
// import org.vosk.android.RecognitionListener
// import org.vosk.android.SpeechService
import java.io.File
import android.content.Context
import android.hardware.usb.UsbManager
import com.hoho.android.usbserial.driver.UsbSerialPort import com.hoho.android.usbserial.driver.UsbSerialPort
import com.hoho.android.usbserial.driver.UsbSerialProber import com.hoho.android.usbserial.driver.UsbSerialProber
import com.hoho.android.usbserial.util.SerialInputOutputManager import com.hoho.android.usbserial.util.SerialInputOutputManager
import com.k2fsa.sherpa.onnx.EndpointConfig
import com.k2fsa.sherpa.onnx.EndpointRule
import com.k2fsa.sherpa.onnx.FeatureConfig
import com.k2fsa.sherpa.onnx.OnlineRecognizer
import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
import com.k2fsa.sherpa.onnx.OnlineStream
import java.io.File
// class TestModelActivity : AppCompatActivity(), RecognitionListener {
class TestModelActivity : AppCompatActivity() { class TestModelActivity : AppCompatActivity() {
// UI Components
private lateinit var outputText: TextView private lateinit var outputText: TextView
private lateinit var micButton: ImageButton private lateinit var micButton: ImageButton
// Whisper/Sherpa wrapper setup variables here // Sherpa (Whisper) Components
private var audioRecorder: AudioRecorder? = null // You'll need a new recorder helper
private var recognizer: OnlineRecognizer? = null private var recognizer: OnlineRecognizer? = null
private var stream: OnlineStream? = null private var stream: OnlineStream? = null
private var isRecording = false
// Vosk Components - now removed as whisper migration private var recordingThread: Thread? = null
// private var model: Model? = null
// private var speechService: SpeechService? = null
private var isListening = false
// USB Components // USB Components
private var usbPort: UsbSerialPort? = null private var usbPort: UsbSerialPort? = null
private var usbIoManager: SerialInputOutputManager? = null // Handles the data flow
private var committedText = "" // Stores the finalized sentences // Text History
private var committedText = ""
override fun onCreate(savedInstanceState: Bundle?) { override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState) super.onCreate(savedInstanceState)
@@ -56,117 +53,160 @@ class TestModelActivity : AppCompatActivity() {
outputText = findViewById(R.id.text_output_log) outputText = findViewById(R.id.text_output_log)
micButton = findViewById(R.id.btn_mic_toggle) micButton = findViewById(R.id.btn_mic_toggle)
// Check Permissions immediately
checkAudioPermission() checkAudioPermission()
connectToPico() // Try to auto-connect USB on start
// Setup Button Listener // Initialize Engine
initSherpaModel()
// Setup Button
micButton.setOnClickListener { micButton.setOnClickListener {
toggleListening() toggleRecording()
}
} }
// Initialize the model in background // ----------------------------------------------------------------
initModel() // 1. ENGINE INITIALIZATION (The "Missing Code")
} // ----------------------------------------------------------------
private fun initSherpaModel() {
val modelDir = File(filesDir, "sherpa-model")
private fun connectToPico() { if (!File(modelDir, "encoder.onnx").exists()) {
val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager outputText.text = "Error: Sherpa Model files missing in /sherpa-model/"
// 1. Find the Device
// (This probes specifically for devices listed in your device_filter.xml)
val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
if (availableDrivers.isEmpty()) {
outputText.append("\n> No USB device found.")
return return
} }
// Assume the first device found is the Pico
val driver = availableDrivers[0]
val connection = usbManager.openDevice(driver.device)
if (connection == null) {
outputText.append("\n> Permission denied. Re-plug device?")
return
}
// 2. Open the Port
// Most Picos use port 0.
usbPort = driver.ports[0]
try { try {
usbPort?.open(connection) // 1. Define Model Paths
// 3. Set Parameters (Must match your Pico's C/Python code!) val transducerConfig = OnlineTransducerModelConfig(
// 115200 Baud, 8 Data bits, 1 Stop bit, No Parity encoder = File(modelDir, "encoder.onnx").absolutePath,
usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE) decoder = File(modelDir, "decoder.onnx").absolutePath,
joiner = File(modelDir, "joiner.onnx").absolutePath
)
outputText.append("\n> USB Connected to Pico!") // 2. Define General Config
} catch (e: Exception) { val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
outputText.append("\n> USB Error: ${e.message}") transducer = transducerConfig,
} tokens = File(modelDir, "tokens.txt").absolutePath,
} numThreads = 1,
debug = false,
modelType = "zipformer"
)
//Whisper/Sherpa implementation functions here.... // 3. Define Endpoint Rule (The fix for your error)
private fun initSherpaModel() { // rule1 = detected silence after speech. We set this to 2.4 seconds.
// Sherpa requires specific configuration val silenceRule = EndpointRule(
mustContainNonSilence = false,
minTrailingSilence = 2.4f,
minUtteranceLength = 0.0f
)
// 4. Create Recognizer Config
val config = OnlineRecognizerConfig( val config = OnlineRecognizerConfig(
featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80), featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
transducerModelConfig = OnlineTransducerModelConfig( modelConfig = onlineModelConfig,
encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
decoder = "$filesDir/decoder-epoch-99-avg-1.onnx", enableEndpoint = true,
joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
),
enableEndpoint = true, // Detects when you stop speaking
ruleFsts = "",
decodingMethod = "greedy_search", decodingMethod = "greedy_search",
maxActivePaths = 4 maxActivePaths = 4
) )
try {
recognizer = OnlineRecognizer(assetManager = assets, config = config) recognizer = OnlineRecognizer(assetManager = assets, config = config)
stream = recognizer?.createStream() stream = recognizer?.createStream()
outputText.text = "Whisper/Sherpa Ready!"
outputText.text = "Engine Loaded. Ready to Stream."
} catch (e: Exception) { } catch (e: Exception) {
outputText.text = "Error: ${e.message}" Log.e("Sherpa", "Init Error", e)
outputText.text = "Init Error: ${e.message}"
} }
} }
private fun startRecordingLoop() { // ----------------------------------------------------------------
// 2. AUDIO LOOP (The "Manual" Listener)
// ----------------------------------------------------------------
private fun toggleRecording() {
if (isRecording) {
stopRecording()
} else {
startRecording()
}
}
private fun startRecording() {
if (recognizer == null) {
Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
return
}
// Reset the stream for a new session
// Note: Sherpa streams can be persistent, but resetting ensures clean start
// If you want continuous conversation, don't reset 'committedText'
isRecording = true
micButton.setColorFilter(android.graphics.Color.RED)
outputText.text = "$committedText [Listening...]"
recordingThread = Thread {
processAudioLoop()
}
recordingThread?.start()
}
private fun stopRecording() {
isRecording = false
recordingThread?.join() // Wait for loop to finish
micButton.clearColorFilter()
outputText.text = "$committedText [Stopped]"
}
private fun processAudioLoop() {
val sampleRate = 16000 val sampleRate = 16000
val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT) val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
record.startRecording() if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
isListening = true return
Thread {
val buffer = ShortArray(bufferSize / 2)
while (isListening) {
val read = record.read(buffer, 0, buffer.size)
if (read > 0) {
// 1. Feed audio to engine
val floatSamples = FloatArray(read) { buffer[it] / 32768.0f } // Normalize
stream?.acceptWaveform(floatSamples, sampleRate)
// 2. Decode
while (recognizer?.isReady(stream) == true) {
recognizer?.decode(stream)
} }
// 3. Get Result // --- FIX START ---
val result = recognizer?.getResult(stream) // Capture global variables into local non-null variables.
val text = result?.text ?: "" // If either is null, we just exit the loop safely.
val activeStream = stream ?: return
val activeRecognizer = recognizer ?: return
// --- FIX END ---
val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
record.startRecording()
val buffer = ShortArray(bufferSize)
while (isRecording) {
val ret = record.read(buffer, 0, buffer.size)
if (ret > 0) {
val samples = FloatArray(ret) { buffer[it] / 32768.0f }
// Use 'activeStream' and 'activeRecognizer' (No ? needed anymore)
activeStream.acceptWaveform(samples, sampleRate)
while (activeRecognizer.isReady(activeStream)) {
activeRecognizer.decode(activeStream)
}
val text = activeRecognizer.getResult(activeStream).text
if (text.isNotEmpty()) { if (text.isNotEmpty()) {
// Update UI
runOnUiThread {
// Sherpa returns the FULL string so far, not just chunks
// So we just overwrite the "Current" view
val cleanText = text.lowercase() val cleanText = text.lowercase()
outputText.text = cleanText runOnUiThread {
outputText.text = "$committedText $cleanText"
}
// Check if sentence is "Final" (Endpoint detected) if (activeRecognizer.isEndpoint(activeStream)) {
if (recognizer?.isEndpoint(stream) == true) { if (cleanText.isNotBlank()) {
sendToPico(cleanText) committedText += "$cleanText "
recognizer?.reset(stream) // Clear buffer for next sentence sendToPico("$cleanText ")
// Reset the stream
activeRecognizer.reset(activeStream)
} }
} }
} }
@@ -174,160 +214,52 @@ class TestModelActivity : AppCompatActivity() {
} }
record.stop() record.stop()
record.release() record.release()
}.start()
} }
// ----------------------------------------------------------------
// 3. USB LOGIC (Unchanged from before)
// ----------------------------------------------------------------
private fun connectToPico() {
val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
if (availableDrivers.isEmpty()) return
/* ---- removed as part of the whisper migration val driver = availableDrivers[0]
private fun initModel() { val connection = usbManager.openDevice(driver.device) ?: return
// We look for the folder inside private storage (same logic as MainActivity)
val modelPath = File(filesDir, "vosk-model")
if (!modelPath.exists()) { usbPort = driver.ports[0]
outputText.text = "Error: Model not found. Please go back and load a model first."
micButton.isEnabled = false
return
}
Thread {
try { try {
// Find the actual model folder inside usbPort?.open(connection)
val actualModelDir = modelPath.listFiles()?.firstOrNull { it.isDirectory } ?: modelPath usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
model = Model(actualModelDir.absolutePath) outputText.append("\n> USB Connected")
runOnUiThread {
outputText.append("\n\n> Model Loaded. Ready.")
}
} catch (e: Exception) { } catch (e: Exception) {
runOnUiThread { outputText.append("\n> USB Error: ${e.message}")
outputText.text = "Error loading model: ${e.message}"
} }
} }
}.start()
}
*/
private fun sendToPico(text: String) { private fun sendToPico(text: String) {
if (usbPort == null) return // Safety check if (usbPort == null) return
try { try {
// Convert text to bytes and send usbPort?.write(text.toByteArray(Charsets.UTF_8), 500)
val data = text.toByteArray(Charsets.UTF_8)
usbPort?.write(data, 1000) // 1000ms timeout
} catch (e: Exception) { } catch (e: Exception) {
outputText.append("\n[Send Failed: ${e.message}]") // Log error
} }
} }
private fun toggleListening() { // ----------------------------------------------------------------
if (model == null) { // 4. CLEANUP
Toast.makeText(this, "Model not loaded yet", Toast.LENGTH_SHORT).show() // ----------------------------------------------------------------
return
}
if (isListening) {
stopRecognition()
} else {
startRecognition()
}
}
private fun startRecognition() {
try {
val recognizer = Recognizer(model, 16000.0f) // 16kHz is standard for Vosk
speechService = SpeechService(recognizer, 16000.0f)
//speechService?.addListener(this) <----- removed this as it generated an error
speechService?.startListening(this)
isListening = true
micButton.setColorFilter(android.graphics.Color.RED) // Turn button red
outputText.text = "" // Clear previous text
outputText.append("> Listening...\n")
} catch (e: Exception) {
outputText.append("\nError starting mic: ${e.message}")
}
}
private fun stopRecognition() {
speechService?.stop()
speechService = null
isListening = false
micButton.clearColorFilter() // Reset button color
outputText.append("\n> Stopped.")
}
// --- Vosk Listener Callbacks ---
/* removed as part of migration to whisper
override fun onResult(hypothesis: String?) {
hypothesis?.let {
val text = parseVoskResult(it)
if (text.isNotEmpty()) {
// 1. Update the UI History
// Add the new sentence to our history
committedText += "$text. "
// Update screen
outputText.text = "$committedText"
// 2. SEND TO PICO
// We append a space because speech engines strip trailing spaces,
// and you don't want "helloworld" typed into your computer.
sendToPico("$text ")
}
}
}
override fun onPartialResult(hypothesis: String?) {
// Optional: Shows words as they are being spoken (streaming)
// You can enable this if you want to see "typing" effect
hypothesis?.let {
// Parse the "partial" JSON key
val partial = JSONObject(it).optString("partial", "")
if (partial.isNotEmpty()) {
// Display: [History] + [Current Streaming Guess]
outputText.text = "$committedText $partial..."
}
}
}
*/
override fun onFinalResult(hypothesis: String?) {
// Final flush when stopping
hypothesis?.let {
val text = parseVoskResult(it)
if (text.isNotEmpty()) {
outputText.append("$text\n")
}
}
}
/* Whisper migration removals
override fun onError(exception: Exception?) {
outputText.append("\nError: ${exception?.message}")
}
override fun onTimeout() {
outputText.append("\nTimeout.")
}
*/
// Permission Helper
private fun checkAudioPermission() { private fun checkAudioPermission() {
if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) { if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
ActivityCompat.requestPermissions(this, arrayOf(Manifest.permission.RECORD_AUDIO), 1) ActivityCompat.requestPermissions(this, arrayOf(Manifest.permission.RECORD_AUDIO), 1)
} }
} }
// Cleanup on exit
override fun onDestroy() { override fun onDestroy() {
super.onDestroy() super.onDestroy()
speechService?.shutdown() isRecording = false
stream?.release()
// Close USB recognizer?.release()
try { try { usbPort?.close() } catch (e: Exception) {}
usbPort?.close()
} catch (e: Exception) {
// Ignore errors on close
}
} }
} }