Most changes which compile using the Whisper Sherpa setup - but prior to the changes in the MainActivity file which will unzip the new model files for us automagically ;-)
This commit is contained in:
@@ -1,52 +1,49 @@
|
||||
package net.mmanningau.speechtokeyboard
|
||||
|
||||
import android.Manifest
|
||||
import android.content.Context
|
||||
import android.content.pm.PackageManager
|
||||
import android.hardware.usb.UsbManager
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import android.os.Bundle
|
||||
import android.util.Log
|
||||
import android.widget.ImageButton
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import androidx.core.content.ContextCompat
|
||||
import org.json.JSONObject
|
||||
|
||||
import com.k2fsa.sherpa.onnx.* // import for whisper sherpa wrapper
|
||||
|
||||
// import org.vosk.Model --- migration to whisper removals
|
||||
// import org.vosk.Recognizer
|
||||
// import org.vosk.android.RecognitionListener
|
||||
// import org.vosk.android.SpeechService
|
||||
|
||||
import java.io.File
|
||||
|
||||
import android.content.Context
|
||||
import android.hardware.usb.UsbManager
|
||||
import com.hoho.android.usbserial.driver.UsbSerialPort
|
||||
import com.hoho.android.usbserial.driver.UsbSerialProber
|
||||
import com.hoho.android.usbserial.util.SerialInputOutputManager
|
||||
import com.k2fsa.sherpa.onnx.EndpointConfig
|
||||
import com.k2fsa.sherpa.onnx.EndpointRule
|
||||
import com.k2fsa.sherpa.onnx.FeatureConfig
|
||||
import com.k2fsa.sherpa.onnx.OnlineRecognizer
|
||||
import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
|
||||
import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
|
||||
import com.k2fsa.sherpa.onnx.OnlineStream
|
||||
import java.io.File
|
||||
|
||||
// class TestModelActivity : AppCompatActivity(), RecognitionListener {
|
||||
class TestModelActivity : AppCompatActivity() {
|
||||
|
||||
// UI Components
|
||||
private lateinit var outputText: TextView
|
||||
private lateinit var micButton: ImageButton
|
||||
|
||||
// Whisper/Sherpa wrapper setup variables here
|
||||
private var audioRecorder: AudioRecorder? = null // You'll need a new recorder helper
|
||||
// Sherpa (Whisper) Components
|
||||
private var recognizer: OnlineRecognizer? = null
|
||||
private var stream: OnlineStream? = null
|
||||
|
||||
// Vosk Components - now removed as whisper migration
|
||||
// private var model: Model? = null
|
||||
// private var speechService: SpeechService? = null
|
||||
private var isListening = false
|
||||
private var isRecording = false
|
||||
private var recordingThread: Thread? = null
|
||||
|
||||
// USB Components
|
||||
private var usbPort: UsbSerialPort? = null
|
||||
private var usbIoManager: SerialInputOutputManager? = null // Handles the data flow
|
||||
|
||||
private var committedText = "" // Stores the finalized sentences
|
||||
// Text History
|
||||
private var committedText = ""
|
||||
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
@@ -56,278 +53,213 @@ class TestModelActivity : AppCompatActivity() {
|
||||
outputText = findViewById(R.id.text_output_log)
|
||||
micButton = findViewById(R.id.btn_mic_toggle)
|
||||
|
||||
// Check Permissions immediately
|
||||
checkAudioPermission()
|
||||
connectToPico() // Try to auto-connect USB on start
|
||||
|
||||
// Setup Button Listener
|
||||
// Initialize Engine
|
||||
initSherpaModel()
|
||||
|
||||
// Setup Button
|
||||
micButton.setOnClickListener {
|
||||
toggleListening()
|
||||
toggleRecording()
|
||||
}
|
||||
|
||||
// Initialize the model in background
|
||||
initModel()
|
||||
}
|
||||
|
||||
private fun connectToPico() {
|
||||
val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
|
||||
// ----------------------------------------------------------------
|
||||
// 1. ENGINE INITIALIZATION (The "Missing Code")
|
||||
// ----------------------------------------------------------------
|
||||
private fun initSherpaModel() {
|
||||
val modelDir = File(filesDir, "sherpa-model")
|
||||
|
||||
// 1. Find the Device
|
||||
// (This probes specifically for devices listed in your device_filter.xml)
|
||||
val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
|
||||
if (availableDrivers.isEmpty()) {
|
||||
outputText.append("\n> No USB device found.")
|
||||
if (!File(modelDir, "encoder.onnx").exists()) {
|
||||
outputText.text = "Error: Sherpa Model files missing in /sherpa-model/"
|
||||
return
|
||||
}
|
||||
|
||||
// Assume the first device found is the Pico
|
||||
val driver = availableDrivers[0]
|
||||
val connection = usbManager.openDevice(driver.device)
|
||||
|
||||
if (connection == null) {
|
||||
outputText.append("\n> Permission denied. Re-plug device?")
|
||||
return
|
||||
}
|
||||
|
||||
// 2. Open the Port
|
||||
// Most Picos use port 0.
|
||||
usbPort = driver.ports[0]
|
||||
|
||||
try {
|
||||
usbPort?.open(connection)
|
||||
// 3. Set Parameters (Must match your Pico's C/Python code!)
|
||||
// 115200 Baud, 8 Data bits, 1 Stop bit, No Parity
|
||||
usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
|
||||
// 1. Define Model Paths
|
||||
val transducerConfig = OnlineTransducerModelConfig(
|
||||
encoder = File(modelDir, "encoder.onnx").absolutePath,
|
||||
decoder = File(modelDir, "decoder.onnx").absolutePath,
|
||||
joiner = File(modelDir, "joiner.onnx").absolutePath
|
||||
)
|
||||
|
||||
outputText.append("\n> USB Connected to Pico!")
|
||||
// 2. Define General Config
|
||||
val onlineModelConfig = com.k2fsa.sherpa.onnx.OnlineModelConfig(
|
||||
transducer = transducerConfig,
|
||||
tokens = File(modelDir, "tokens.txt").absolutePath,
|
||||
numThreads = 1,
|
||||
debug = false,
|
||||
modelType = "zipformer"
|
||||
)
|
||||
|
||||
// 3. Define Endpoint Rule (The fix for your error)
|
||||
// rule1 = detected silence after speech. We set this to 2.4 seconds.
|
||||
val silenceRule = EndpointRule(
|
||||
mustContainNonSilence = false,
|
||||
minTrailingSilence = 2.4f,
|
||||
minUtteranceLength = 0.0f
|
||||
)
|
||||
|
||||
// 4. Create Recognizer Config
|
||||
val config = OnlineRecognizerConfig(
|
||||
featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
|
||||
modelConfig = onlineModelConfig,
|
||||
endpointConfig = EndpointConfig(rule1 = silenceRule), // Pass the rule object here
|
||||
enableEndpoint = true,
|
||||
decodingMethod = "greedy_search",
|
||||
maxActivePaths = 4
|
||||
)
|
||||
|
||||
recognizer = OnlineRecognizer(assetManager = assets, config = config)
|
||||
stream = recognizer?.createStream()
|
||||
|
||||
outputText.text = "Engine Loaded. Ready to Stream."
|
||||
|
||||
} catch (e: Exception) {
|
||||
Log.e("Sherpa", "Init Error", e)
|
||||
outputText.text = "Init Error: ${e.message}"
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// 2. AUDIO LOOP (The "Manual" Listener)
|
||||
// ----------------------------------------------------------------
|
||||
/** Mic-button handler: stops the capture loop when one is running, otherwise starts one. */
private fun toggleRecording() {
    when {
        isRecording -> stopRecording()
        else -> startRecording()
    }
}
|
||||
|
||||
/**
 * Marks the activity as recording, tints the mic button red and launches
 * [processAudioLoop] on a fresh background thread.
 *
 * Does nothing except show a toast when the recognizer has not been created.
 * [committedText] is left untouched, so earlier sentences stay on screen.
 */
private fun startRecording() {
    recognizer ?: run {
        Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
        return
    }

    isRecording = true
    micButton.setColorFilter(android.graphics.Color.RED)
    outputText.text = "$committedText [Listening...]"

    // The loop blocks on microphone reads, so it gets its own thread.
    val worker = Thread { processAudioLoop() }
    recordingThread = worker
    recordingThread?.start()
}
|
||||
|
||||
/**
 * Clears the recording flag, waits for the capture thread to finish and
 * restores the idle look of the mic button and status text.
 */
private fun stopRecording() {
    // The audio loop polls this flag and exits once it is false.
    isRecording = false
    recordingThread?.let { worker -> worker.join() }

    micButton.clearColorFilter()
    val status = "$committedText [Stopped]"
    outputText.text = status
}
|
||||
|
||||
/**
 * Blocking capture/decode loop; runs until [isRecording] is cleared or audio capture fails.
 *
 * Reads 16 kHz mono 16-bit PCM from the microphone, feeds it to the recognizer stream as
 * floats scaled by 1/32768, shows the running hypothesis in [outputText], and on every
 * detected endpoint resets the stream and, when the utterance is non-blank, appends it to
 * [committedText] and forwards it through [sendToPico].
 *
 * If the loop cannot start or dies, the recording flag is cleared and the mic button is
 * restored so the next tap starts a new session instead of trying to stop a dead one.
 */
private fun processAudioLoop() {
    val sampleRate = 16000

    // Puts the activity back into the idle state and shows why recording ended.
    fun abort(reason: String) {
        isRecording = false
        val status = "$committedText [$reason]"
        runOnUiThread {
            micButton.clearColorFilter()
            outputText.text = status
        }
    }

    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
        abort("Microphone permission not granted")
        return
    }

    // Capture the nullable properties once so the loop works on stable, non-null references.
    val activeStream = stream
    val activeRecognizer = recognizer
    if (activeStream == null || activeRecognizer == null) {
        abort("Engine not ready")
        return
    }

    // getMinBufferSize reports failures as negative values; those must never reach ShortArray().
    val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
    if (bufferSize <= 0) {
        abort("Audio format not supported")
        return
    }

    val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
    try {
        if (record.state != AudioRecord.STATE_INITIALIZED) {
            abort("Microphone unavailable")
            return
        }
        record.startRecording()

        val buffer = ShortArray(bufferSize)
        while (isRecording) {
            val ret = record.read(buffer, 0, buffer.size)
            if (ret < 0) {
                // Negative values are AudioRecord error codes; retrying would only spin.
                abort("Audio read error $ret")
                break
            }
            if (ret == 0) continue

            val samples = FloatArray(ret) { buffer[it] / 32768.0f }
            activeStream.acceptWaveform(samples, sampleRate)
            while (activeRecognizer.isReady(activeStream)) {
                activeRecognizer.decode(activeStream)
            }

            val cleanText = activeRecognizer.getResult(activeStream).text.lowercase()
            val endpointReached = activeRecognizer.isEndpoint(activeStream)

            if (cleanText.isNotEmpty()) {
                // Build the string here: committedText may change before the UI lambda runs,
                // which would otherwise show the just-committed sentence twice.
                val display = "$committedText $cleanText"
                runOnUiThread { outputText.text = display }
            }

            if (endpointReached) {
                // Always reset on an endpoint, even for silence, so the next utterance
                // starts from a clean stream.
                activeRecognizer.reset(activeStream)
                if (cleanText.isNotBlank()) {
                    committedText += "$cleanText "
                    sendToPico("$cleanText ")
                }
            }
        }
    } catch (e: Exception) {
        Log.e("Sherpa", "Audio loop failed", e)
        abort("Audio error: ${e.message}")
    } finally {
        // stop() is only legal while recording; release() must always run.
        if (record.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
            record.stop()
        }
        record.release()
    }
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// 3. USB LOGIC (Unchanged from before)
|
||||
// ----------------------------------------------------------------
|
||||
/**
 * Opens a serial link to the first USB serial device the default prober finds.
 *
 * Returns silently when no driver matches or the device cannot be opened.
 * The port is configured for 115200 baud, 8 data bits, 1 stop bit, no parity;
 * the outcome of opening it is appended to [outputText].
 */
private fun connectToPico() {
    val manager = getSystemService(Context.USB_SERVICE) as UsbManager
    val candidates = UsbSerialProber.getDefaultProber().findAllDrivers(manager)

    // Only the first probed driver is considered.
    val picoDriver = candidates.firstOrNull() ?: return
    val link = manager.openDevice(picoDriver.device) ?: return

    usbPort = picoDriver.ports[0]
    try {
        usbPort?.open(link)
        usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
        outputText.append("\n> USB Connected")
    } catch (e: Exception) {
        outputText.append("\n> USB Error: ${e.message}")
    }
}
|
||||
|
||||
//Whisper/Sherpa implementation functions here....
|
||||
/**
 * Builds the streaming recognizer from transducer model files located directly
 * under [getFilesDir] and creates the stream used for decoding.
 *
 * Success or the failure message is written to [outputText].
 */
private fun initSherpaModel() {
    // Paths of the three transducer parts inside the app's private files directory.
    val transducer = OnlineTransducerModelConfig(
        encoder = "$filesDir/encoder-epoch-99-avg-1.onnx",
        decoder = "$filesDir/decoder-epoch-99-avg-1.onnx",
        joiner = "$filesDir/joiner-epoch-99-avg-1.onnx",
    )
    val features = FeatureConfig(sampleRate = 16000.0f, featureDim = 80)

    val config = OnlineRecognizerConfig(
        featConfig = features,
        transducerModelConfig = transducer,
        enableEndpoint = true, // lets the engine flag the end of an utterance
        ruleFsts = "",
        decodingMethod = "greedy_search",
        maxActivePaths = 4
    )

    try {
        val engine = OnlineRecognizer(assetManager = assets, config = config)
        recognizer = engine
        stream = recognizer?.createStream()
        outputText.text = "Whisper/Sherpa Ready!"
    } catch (e: Exception) {
        outputText.text = "Error: ${e.message}"
    }
}
|
||||
|
||||
/**
 * Starts microphone capture at 16 kHz mono PCM-16 and spawns a thread that
 * feeds the audio to the recognizer while [isListening] stays true.
 *
 * Each non-empty hypothesis is shown lower-cased in [outputText]; when the
 * recognizer reports an endpoint the text is sent with [sendToPico] and the
 * stream is reset. The recorder is stopped and released when the loop ends.
 */
private fun startRecordingLoop() {
    val sampleRate = 16000
    val minBytes = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
    val mic = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, minBytes)

    mic.startRecording()
    isListening = true

    val worker = Thread {
        val pcm = ShortArray(minBytes / 2)
        while (isListening) {
            val count = mic.read(pcm, 0, pcm.size)
            if (count <= 0) continue

            // Scale 16-bit samples to floats before handing them to the engine.
            val floatSamples = FloatArray(count) { i -> pcm[i] / 32768.0f }
            stream?.acceptWaveform(floatSamples, sampleRate)

            while (recognizer?.isReady(stream) == true) {
                recognizer?.decode(stream)
            }

            val text = recognizer?.getResult(stream)?.text ?: ""
            if (text.isEmpty()) continue

            runOnUiThread {
                // The hypothesis replaces the view content rather than being appended.
                val cleanText = text.lowercase()
                outputText.text = cleanText

                val sentenceFinished = recognizer?.isEndpoint(stream) == true
                if (sentenceFinished) {
                    sendToPico(cleanText)
                    recognizer?.reset(stream)
                }
            }
        }
        mic.stop()
        mic.release()
    }
    worker.start()
}
|
||||
|
||||
|
||||
/* ---- removed as part of the whisper migration
|
||||
private fun initModel() {
|
||||
// We look for the folder inside private storage (same logic as MainActivity)
|
||||
val modelPath = File(filesDir, "vosk-model")
|
||||
|
||||
if (!modelPath.exists()) {
|
||||
outputText.text = "Error: Model not found. Please go back and load a model first."
|
||||
micButton.isEnabled = false
|
||||
return
|
||||
}
|
||||
|
||||
Thread {
|
||||
try {
|
||||
// Find the actual model folder inside
|
||||
val actualModelDir = modelPath.listFiles()?.firstOrNull { it.isDirectory } ?: modelPath
|
||||
model = Model(actualModelDir.absolutePath)
|
||||
|
||||
runOnUiThread {
|
||||
outputText.append("\n\n> Model Loaded. Ready.")
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
runOnUiThread {
|
||||
outputText.text = "Error loading model: ${e.message}"
|
||||
}
|
||||
}
|
||||
}.start()
|
||||
}
|
||||
*/
|
||||
private fun sendToPico(text: String) {
|
||||
if (usbPort == null) return // Safety check
|
||||
|
||||
if (usbPort == null) return
|
||||
try {
|
||||
// Convert text to bytes and send
|
||||
val data = text.toByteArray(Charsets.UTF_8)
|
||||
usbPort?.write(data, 1000) // 1000ms timeout
|
||||
usbPort?.write(text.toByteArray(Charsets.UTF_8), 500)
|
||||
} catch (e: Exception) {
|
||||
outputText.append("\n[Send Failed: ${e.message}]")
|
||||
// Log error
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Switches recognition on or off depending on [isListening].
 * Shows a toast and does nothing else while no model is loaded.
 */
private fun toggleListening() {
    val modelMissing = model == null
    if (modelMissing) {
        Toast.makeText(this, "Model not loaded yet", Toast.LENGTH_SHORT).show()
        return
    }

    when {
        isListening -> stopRecognition()
        else -> startRecognition()
    }
}
|
||||
|
||||
/**
 * Creates a 16 kHz recognizer and speech service for the loaded model, starts
 * listening with this activity as the listener, and switches the UI to the
 * listening state. Any exception is reported in [outputText].
 */
private fun startRecognition() {
    try {
        val voskRecognizer = Recognizer(model, 16000.0f)
        speechService = SpeechService(voskRecognizer, 16000.0f)
        // The listener is passed to startListening directly; addListener was removed
        // because it produced an error.
        speechService?.startListening(this)

        isListening = true
        micButton.setColorFilter(android.graphics.Color.RED)

        // Wipe the previous transcript before printing the status line.
        outputText.text = ""
        outputText.append("> Listening...\n")
    } catch (e: Exception) {
        outputText.append("\nError starting mic: ${e.message}")
    }
}
|
||||
|
||||
/** Stops and drops the speech service, then returns the UI to its idle state. */
private fun stopRecognition() {
    speechService?.let { service -> service.stop() }
    speechService = null
    isListening = false

    // Remove the red tint applied while listening.
    micButton.clearColorFilter()
    outputText.append("\n> Stopped.")
}
|
||||
|
||||
// --- Vosk Listener Callbacks ---
|
||||
/* removed as part of migration to whisper
|
||||
|
||||
override fun onResult(hypothesis: String?) {
|
||||
hypothesis?.let {
|
||||
val text = parseVoskResult(it)
|
||||
if (text.isNotEmpty()) {
|
||||
// 1. Update the UI History
|
||||
// Add the new sentence to our history
|
||||
committedText += "$text. "
|
||||
// Update screen
|
||||
outputText.text = "$committedText"
|
||||
|
||||
// 2. SEND TO PICO
|
||||
// We append a space because speech engines strip trailing spaces,
|
||||
// and you don't want "helloworld" typed into your computer.
|
||||
sendToPico("$text ")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override fun onPartialResult(hypothesis: String?) {
|
||||
// Optional: Shows words as they are being spoken (streaming)
|
||||
// You can enable this if you want to see "typing" effect
|
||||
hypothesis?.let {
|
||||
// Parse the "partial" JSON key
|
||||
val partial = JSONObject(it).optString("partial", "")
|
||||
|
||||
if (partial.isNotEmpty()) {
|
||||
// Display: [History] + [Current Streaming Guess]
|
||||
outputText.text = "$committedText $partial..."
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
 * Appends the parsed final hypothesis, followed by a newline, to [outputText].
 * Null hypotheses and empty parse results are ignored.
 */
override fun onFinalResult(hypothesis: String?) {
    val raw = hypothesis ?: return
    val text = parseVoskResult(raw)
    if (text.isEmpty()) return
    outputText.append("$text\n")
}
|
||||
/* Whisper migration removals
|
||||
override fun onError(exception: Exception?) {
|
||||
outputText.append("\nError: ${exception?.message}")
|
||||
}
|
||||
|
||||
override fun onTimeout() {
|
||||
outputText.append("\nTimeout.")
|
||||
}
|
||||
|
||||
*/
|
||||
// Permission Helper
|
||||
// ----------------------------------------------------------------
|
||||
// 4. CLEANUP
|
||||
// ----------------------------------------------------------------
|
||||
/** Requests the RECORD_AUDIO permission (request code 1) unless it is already granted. */
private fun checkAudioPermission() {
    val micPermission = Manifest.permission.RECORD_AUDIO
    val alreadyGranted =
        ContextCompat.checkSelfPermission(this, micPermission) == PackageManager.PERMISSION_GRANTED
    if (alreadyGranted) return

    ActivityCompat.requestPermissions(this, arrayOf(micPermission), 1)
}
|
||||
|
||||
// Cleanup on exit
|
||||
override fun onDestroy() {
|
||||
super.onDestroy()
|
||||
speechService?.shutdown()
|
||||
|
||||
// Close USB
|
||||
try {
|
||||
usbPort?.close()
|
||||
} catch (e: Exception) {
|
||||
// Ignore errors on close
|
||||
}
|
||||
isRecording = false
|
||||
stream?.release()
|
||||
recognizer?.release()
|
||||
try { usbPort?.close() } catch (e: Exception) {}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user