From 404bc55ed32c0ce791037775a926834c671eb3ed Mon Sep 17 00:00:00 2001 From: mmanningau Date: Thu, 22 Jan 2026 19:49:48 +1100 Subject: [PATCH] Initial steps to migrate to the whisper/sherpa setup.. Still missing major changes to the TestModelActivity.kt file which are coming next... --- app/build.gradle.kts | 7 +- .../speechtokeyboard/MainActivity.kt | 17 +-- .../speechtokeyboard/TestModelActivity.kt | 118 +++++++++++++++--- gradle/libs.versions.toml | 1 + 4 files changed, 115 insertions(+), 28 deletions(-) diff --git a/app/build.gradle.kts b/app/build.gradle.kts index 376c919..5e5a2cb 100644 --- a/app/build.gradle.kts +++ b/app/build.gradle.kts @@ -47,12 +47,17 @@ dependencies { androidTestImplementation(libs.androidx.espresso.core) // 1. The "Brain": Vosk Offline Speech Recognition - implementation("com.alphacephei:vosk-android:0.3.47") + //implementation("com.alphacephei:vosk-android:0.3.47") - removed as part of the migration to whisper + // (Optional) Helper for memory management if needed later // Removed the following as it was listed as optional and it did cause errors - // so to avoid a whole list of duplicate class found errors - this is already required via the VOSK libraries // implementation("net.java.dev.jna:jna:5.13.0") + // New Whisper include... + // implementation("com.k2fsa.sherpa.onnx:sherpa-onnx:1.12.23") // The engine + implementation("com.github.k2-fsa:sherpa-onnx:v1.12.23") + // 2.
The "Mouth": USB Serial Driver for Android implementation("com.github.mik3y:usb-serial-for-android:3.7.0") } \ No newline at end of file diff --git a/app/src/main/java/net/mmanningau/speechtokeyboard/MainActivity.kt b/app/src/main/java/net/mmanningau/speechtokeyboard/MainActivity.kt index d010159..2d75a2c 100644 --- a/app/src/main/java/net/mmanningau/speechtokeyboard/MainActivity.kt +++ b/app/src/main/java/net/mmanningau/speechtokeyboard/MainActivity.kt @@ -9,8 +9,8 @@ import android.widget.TextView import android.widget.Toast import androidx.activity.result.contract.ActivityResultContracts import androidx.appcompat.app.AppCompatActivity -import org.vosk.Model -import org.vosk.android.SpeechService +// import org.vosk.Model +// import org.vosk.android.SpeechService --- removed as part of migration to whisper.cpp import java.io.File import java.util.zip.ZipInputStream @@ -19,9 +19,9 @@ class MainActivity : AppCompatActivity() { // UI Components private lateinit var statusText: TextView - // Vosk Components - private var model: Model? = null - private var speechService: SpeechService? = null + // Vosk Components - removed as part of whisper migration + // private var model: Model? = null + // private var speechService: SpeechService? = null // 1. THE FILE PICKER REGISTRY // This handles the result when the user picks a ZIP file @@ -43,7 +43,7 @@ class MainActivity : AppCompatActivity() { // ADD THIS LINE AT THE BOTTOM: // This attempts to load the model immediately if files exist - initVoskModel() + // initVoskModel() - removed as part of whisper migration } // 2. SETUP THE MENU @@ -113,7 +113,7 @@ class MainActivity : AppCompatActivity() { // Back to UI Thread to say success runOnUiThread { statusText.text = "Model Installed! Initializing..." - initVoskModel() + // initVoskModel() - removed as part of the whisper migration } } catch (e: Exception) { @@ -127,7 +127,9 @@ class MainActivity : AppCompatActivity() { // 6.
INITIALIZE VOSK "BRAIN" // Replace your existing initVoskModel with this updated version + /* private fun initVoskModel() { + val modelPath = File(filesDir, "vosk-model") // Check if the directory exists before trying to load @@ -146,4 +148,5 @@ class MainActivity : AppCompatActivity() { statusText.text = "Error loading saved model: ${e.message}" } } +*/ } \ No newline at end of file diff --git a/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt b/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt index 0f82aca..8911040 100644 --- a/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt +++ b/app/src/main/java/net/mmanningau/speechtokeyboard/TestModelActivity.kt @@ -10,10 +10,14 @@ import androidx.appcompat.app.AppCompatActivity import androidx.core.app.ActivityCompat import androidx.core.content.ContextCompat import org.json.JSONObject -import org.vosk.Model -import org.vosk.Recognizer -import org.vosk.android.RecognitionListener -import org.vosk.android.SpeechService + +import com.k2fsa.sherpa.onnx.* // import for whisper sherpa wrapper + +// import org.vosk.Model --- migration to whisper removals +// import org.vosk.Recognizer +// import org.vosk.android.RecognitionListener +// import org.vosk.android.SpeechService + import java.io.File import android.content.Context @@ -22,14 +26,20 @@ import com.hoho.android.usbserial.driver.UsbSerialPort import com.hoho.android.usbserial.driver.UsbSerialProber import com.hoho.android.usbserial.util.SerialInputOutputManager -class TestModelActivity : AppCompatActivity(), RecognitionListener { +// class TestModelActivity : AppCompatActivity(), RecognitionListener { +class TestModelActivity : AppCompatActivity() { private lateinit var outputText: TextView private lateinit var micButton: ImageButton - // Vosk Components - private var model: Model? = null - private var speechService: SpeechService?
= null + // Whisper/Sherpa wrapper setup variables here + private var audioRecorder: AudioRecorder? = null // NOTE(review): AudioRecorder is not declared in this patch; a recorder helper still needs to be written (TODO confirm) + private var recognizer: OnlineRecognizer? = null + private var stream: OnlineStream? = null + + // Vosk Components - now removed as whisper migration + // private var model: Model? = null + // private var speechService: SpeechService? = null private var isListening = false // USB Components @@ -94,8 +104,82 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { } } + //Whisper/Sherpa implementation functions here.... + private fun initSherpaModel() { + // Sherpa requires specific configuration. NOTE(review): verify these parameter names and types against the sherpa-onnx Kotlin API before relying on this + val config = OnlineRecognizerConfig( + featConfig = FeatureConfig(sampleRate = 16000.0f, featureDim = 80), + transducerModelConfig = OnlineTransducerModelConfig( + encoder = "$filesDir/encoder-epoch-99-avg-1.onnx", // Example path - TODO replace with the real model file name + decoder = "$filesDir/decoder-epoch-99-avg-1.onnx", + joiner = "$filesDir/joiner-epoch-99-avg-1.onnx", + ), + enableEndpoint = true, // Detects when you stop speaking + ruleFsts = "", + decodingMethod = "greedy_search", + maxActivePaths = 4 + ) - private fun initModel() { + try { + recognizer = OnlineRecognizer(assetManager = assets, config = config) + stream = recognizer?.createStream() + outputText.text = "Whisper/Sherpa Ready!" + } catch (e: Exception) { + outputText.text = "Error: ${e.message}" + } + } + + private fun startRecordingLoop() { + val sampleRate = 16000 + val bufferSize = AudioRecord.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT) + val record = AudioRecord(MediaRecorder.AudioSource.MIC, sampleRate, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize) + + record.startRecording() + isListening = true + + Thread { + val buffer = ShortArray(bufferSize / 2) + while (isListening) { + val read = record.read(buffer, 0, buffer.size) + if (read > 0) { + // 1.
Feed audio to engine + val floatSamples = FloatArray(read) { buffer[it] / 32768.0f } // Normalize 16-bit PCM samples to [-1, 1) + stream?.acceptWaveform(floatSamples, sampleRate) + + // 2. Decode + while (recognizer?.isReady(stream) == true) { + recognizer?.decode(stream) + } + + // 3. Get Result + val result = recognizer?.getResult(stream) + val text = result?.text ?: "" + + if (text.isNotEmpty()) { + // Update UI + runOnUiThread { + // Sherpa returns the FULL string so far, not just chunks + // So we just overwrite the "Current" view + val cleanText = text.lowercase() + outputText.text = cleanText + + // Check if sentence is "Final" (Endpoint detected) - NOTE(review): this check and the reset below run inside runOnUiThread while the recording thread keeps using the same stream; confirm that is safe + if (recognizer?.isEndpoint(stream) == true) { + sendToPico(cleanText) + recognizer?.reset(stream) // Clear buffer for next sentence + } + } + } + } + } + record.stop() + record.release() + }.start() + } + + + /* ---- removed as part of the whisper migration + private fun initModel() { // We look for the folder inside private storage (same logic as MainActivity) val modelPath = File(filesDir, "vosk-model") @@ -121,7 +205,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { } }.start() } - +*/ private fun sendToPico(text: String) { if (usbPort == null) return // Safety check @@ -173,6 +257,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { } // --- Vosk Listener Callbacks --- +/* removed as part of migration to whisper [NOTE(review): onFinalResult is left outside this comment block and still says override although the class no longer lists RecognitionListener; confirm it compiles] override fun onResult(hypothesis: String?) { hypothesis?.let { @@ -205,6 +290,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { } } } +*/ override fun onFinalResult(hypothesis: String?) { // Final flush when stopping @@ -215,7 +301,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { } } } - +/* Whisper migration removals override fun onError(exception: Exception?)
{ outputText.append("\nError: ${exception?.message}") } @@ -224,15 +310,7 @@ class TestModelActivity : AppCompatActivity(), RecognitionListener { outputText.append("\nTimeout.") } - // Helper to clean JSON: {"text": "hello world"} -> "hello world" - private fun parseVoskResult(json: String): String { - return try { - JSONObject(json).optString("text", "") - } catch (e: Exception) { - "" - } - } - +*/ // Permission Helper private fun checkAudioPermission() { if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) { diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c2ca9e9..3d58c34 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -10,6 +10,7 @@ material = "1.13.0" activity = "1.12.2" constraintlayout = "2.2.1" + [libraries] androidx-core-ktx = { group = "androidx.core", name = "core-ktx", version.ref = "coreKtx" } junit = { group = "junit", name = "junit", version.ref = "junit" }