package net.mmanningau.speechtokeyboard

import android.Manifest
import android.content.Context
import android.content.pm.PackageManager
import android.hardware.usb.UsbManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.os.Bundle
import android.util.Log
import android.widget.ImageButton
import android.widget.TextView
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import androidx.core.content.ContextCompat
import com.hoho.android.usbserial.driver.UsbSerialPort
import com.hoho.android.usbserial.driver.UsbSerialProber
import com.hoho.android.usbserial.util.SerialInputOutputManager
import com.k2fsa.sherpa.onnx.EndpointConfig
import com.k2fsa.sherpa.onnx.EndpointRule
import com.k2fsa.sherpa.onnx.FeatureConfig
import com.k2fsa.sherpa.onnx.OnlineModelConfig
import com.k2fsa.sherpa.onnx.OnlineRecognizer
import com.k2fsa.sherpa.onnx.OnlineRecognizerConfig
import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.OnlineTransducerModelConfig
import java.io.File

/**
 * Test screen that streams microphone audio through a sherpa-onnx
 * streaming recognizer and forwards each finalized utterance over
 * USB-serial to a Raspberry Pi Pico.
 *
 * NOTE(review): despite the "Whisper" wording carried over from earlier
 * revisions, the configured engine is a streaming zipformer *transducer*
 * (encoder/decoder/joiner + tokens.txt, modelType = "zipformer"); Whisper
 * is an offline model in sherpa-onnx and cannot drive OnlineRecognizer.
 */
class TestModelActivity : AppCompatActivity() {

    // UI components
    private lateinit var outputText: TextView
    private lateinit var micButton: ImageButton

    // sherpa-onnx streaming engine, created once in initSherpaModel()
    private var recognizer: OnlineRecognizer? = null
    private var stream: OnlineStream? = null

    // Written on the UI thread, polled by the audio worker loop.
    // Fix: @Volatile guarantees the worker actually observes stop requests.
    @Volatile
    private var isRecording = false
    private var recordingThread: Thread? = null

    // USB serial link to the Pico (null until connectToPico() succeeds)
    private var usbPort: UsbSerialPort? = null

    // Finalized sentences accumulated across detected endpoints
    private var committedText = ""

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        // NOTE(review): the layout id sits in context lines elided from the
        // patch hunks — confirm this matches the original file.
        setContentView(R.layout.activity_test_model)

        outputText = findViewById(R.id.text_output_log)
        micButton = findViewById(R.id.btn_mic_toggle)

        checkAudioPermission()
        connectToPico() // best-effort auto-connect to the Pico on start

        // Initialize the recognition engine
        initSherpaModel()

        micButton.setOnClickListener { toggleRecording() }
    }

    // ----------------------------------------------------------------
    // 1. ENGINE INITIALIZATION
    // ----------------------------------------------------------------

    /**
     * Loads the streaming transducer model from filesDir/sherpa-model and
     * builds the recognizer plus a persistent decoding stream. Failures are
     * reported on [outputText]; [recognizer] stays null so startRecording()
     * refuses to run.
     */
    private fun initSherpaModel() {
        val modelDir = File(filesDir, "sherpa-model")

        // Fix: verify every file the config references, not just the encoder.
        val required = listOf("encoder.onnx", "decoder.onnx", "joiner.onnx", "tokens.txt")
        val missing = required.filterNot { File(modelDir, it).exists() }
        if (missing.isNotEmpty()) {
            outputText.text = "Error: Sherpa Model files missing in /sherpa-model/: $missing"
            return
        }

        try {
            // Paths to the three streaming-transducer parts
            val transducerConfig = OnlineTransducerModelConfig(
                encoder = File(modelDir, "encoder.onnx").absolutePath,
                decoder = File(modelDir, "decoder.onnx").absolutePath,
                joiner = File(modelDir, "joiner.onnx").absolutePath,
            )

            val modelConfig = OnlineModelConfig(
                transducer = transducerConfig,
                tokens = File(modelDir, "tokens.txt").absolutePath,
                numThreads = 1,
                debug = false,
                modelType = "zipformer",
            )

            // Endpointing: rule1 fires after 2.4 s of trailing silence,
            // finalizing the current utterance.
            val silenceRule = EndpointRule(
                mustContainNonSilence = false,
                minTrailingSilence = 2.4f,
                minUtteranceLength = 0.0f,
            )

            val config = OnlineRecognizerConfig(
                featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80),
                modelConfig = modelConfig,
                endpointConfig = EndpointConfig(rule1 = silenceRule),
                enableEndpoint = true,
                decodingMethod = "greedy_search",
                maxActivePaths = 4,
            )

            // NOTE(review): the paths above are absolute filesystem paths,
            // yet an AssetManager is passed — sherpa-onnx reads from APK
            // assets when assetManager is non-null. Confirm this actually
            // loads from filesDir; otherwise use the constructor without
            // assetManager.
            recognizer = OnlineRecognizer(assetManager = assets, config = config)
            stream = recognizer?.createStream()

            outputText.text = "Engine Loaded. Ready to Stream."
        } catch (e: Exception) {
            Log.e("Sherpa", "Init Error", e)
            outputText.text = "Init Error: ${e.message}"
        }
    }

    // ----------------------------------------------------------------
    // 2. AUDIO LOOP
    // ----------------------------------------------------------------

    private fun toggleRecording() {
        if (isRecording) stopRecording() else startRecording()
    }

    /** Flags the session active, paints the mic red and spawns the worker. */
    private fun startRecording() {
        if (recognizer == null) {
            Toast.makeText(this, "Engine not ready", Toast.LENGTH_SHORT).show()
            return
        }

        isRecording = true
        micButton.setColorFilter(android.graphics.Color.RED)
        outputText.text = "$committedText [Listening...]"

        recordingThread = Thread { processAudioLoop() }
        recordingThread?.start()
    }

    /** Signals the worker to stop and restores the idle UI. */
    private fun stopRecording() {
        isRecording = false
        // Fix: bounded join — the worker wakes within one AudioRecord.read
        // buffer, but an unbounded join on the UI thread risks an ANR.
        recordingThread?.join(2000)
        recordingThread = null
        micButton.clearColorFilter()
        outputText.text = "$committedText [Stopped]"
    }

    /**
     * Worker-thread loop: reads 16 kHz mono PCM from the mic, feeds it to the
     * recognizer, mirrors the live partial on screen and, at each endpoint,
     * commits the utterance to [committedText] and sends it to the Pico.
     */
    private fun processAudioLoop() {
        val sampleRate = 16000
        val bufferSize = AudioRecord.getMinBufferSize(
            sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT
        )

        if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED
        ) {
            // Fix: previously returned silently, leaving the UI stuck on
            // "[Listening...]" with the red mic and isRecording == true.
            abortRecording("Mic permission missing")
            return
        }

        // Snapshot the engine refs into non-null locals; bail if init failed.
        val activeStream = stream ?: return
        val activeRecognizer = recognizer ?: return

        val record = AudioRecord(
            MediaRecorder.AudioSource.MIC, sampleRate,
            AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize
        )
        // Fix: AudioRecord construction can fail quietly; calling
        // startRecording() on an uninitialized instance throws.
        if (record.state != AudioRecord.STATE_INITIALIZED) {
            record.release()
            abortRecording("Mic init failed")
            return
        }

        record.startRecording()
        val buffer = ShortArray(bufferSize)

        try {
            while (isRecording) {
                val read = record.read(buffer, 0, buffer.size)
                if (read <= 0) continue

                // 16-bit PCM -> [-1, 1) floats expected by sherpa-onnx
                val samples = FloatArray(read) { buffer[it] / 32768.0f }
                activeStream.acceptWaveform(samples, sampleRate)

                while (activeRecognizer.isReady(activeStream)) {
                    activeRecognizer.decode(activeStream)
                }

                val text = activeRecognizer.getResult(activeStream).text
                val cleanText = text.lowercase()

                if (cleanText.isNotEmpty()) {
                    // Live view: history + current in-flight guess
                    runOnUiThread { outputText.text = "$committedText $cleanText" }
                }

                // Fix: endpoint handling was nested inside the non-empty-text
                // branch and only reset when the text was non-blank, so an
                // endpoint with empty text left the stream un-reset. Always
                // reset on endpoint; only commit/send non-blank text.
                if (activeRecognizer.isEndpoint(activeStream)) {
                    if (cleanText.isNotBlank()) {
                        committedText += "$cleanText "
                        // Trailing space so words don't run together on the host
                        sendToPico("$cleanText ")
                    }
                    activeRecognizer.reset(activeStream)
                }
            }
        } finally {
            // Fix: guarantee the mic is released even if decoding throws.
            record.stop()
            record.release()
        }
    }

    /** Resets flags/UI when the worker cannot record (permission or mic failure). */
    private fun abortRecording(reason: String) {
        isRecording = false
        runOnUiThread {
            micButton.clearColorFilter()
            outputText.text = "$committedText [$reason]"
        }
    }

    // ----------------------------------------------------------------
    // 3. USB LOGIC
    // ----------------------------------------------------------------

    /** Probes for the first USB-serial device and opens it at 115200 8N1. */
    private fun connectToPico() {
        val usbManager = getSystemService(Context.USB_SERVICE) as UsbManager
        val availableDrivers = UsbSerialProber.getDefaultProber().findAllDrivers(usbManager)
        if (availableDrivers.isEmpty()) return

        val driver = availableDrivers[0]
        val connection = usbManager.openDevice(driver.device) ?: return

        usbPort = driver.ports[0]
        try {
            usbPort?.open(connection)
            // Must match the Pico firmware's serial settings: 115200 8N1
            usbPort?.setParameters(115200, 8, UsbSerialPort.STOPBITS_1, UsbSerialPort.PARITY_NONE)
            outputText.append("\n> USB Connected")
        } catch (e: Exception) {
            outputText.append("\n> USB Error: ${e.message}")
        }
    }

    /** Writes [text] to the Pico with a 500 ms timeout; no-op when disconnected. */
    private fun sendToPico(text: String) {
        val port = usbPort ?: return
        try {
            port.write(text.toByteArray(Charsets.UTF_8), 500)
        } catch (e: Exception) {
            // Fix: failures were silently swallowed; at least log them.
            Log.e("USB", "Send failed", e)
        }
    }

    // ----------------------------------------------------------------
    // 4. PERMISSIONS & CLEANUP
    // ----------------------------------------------------------------

    /** Requests RECORD_AUDIO at startup if it is not already granted. */
    private fun checkAudioPermission() {
        if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED
        ) {
            ActivityCompat.requestPermissions(this, arrayOf(Manifest.permission.RECORD_AUDIO), 1)
        }
    }

    override fun onDestroy() {
        super.onDestroy()
        // Fix: join the audio worker BEFORE releasing the native stream and
        // recognizer — releasing while the loop is still feeding them is a
        // use-after-free on the native side.
        isRecording = false
        try {
            recordingThread?.join(2000)
        } catch (e: InterruptedException) {
            Thread.currentThread().interrupt()
        }
        stream?.release()
        recognizer?.release()
        try {
            usbPort?.close()
        } catch (e: Exception) {
            // best-effort close on teardown
        }
    }
}