feat: Integrate Google Cloud Speech-to-Text for voice transcription and enhance response handling
This commit is contained in:
parent 36f3f80487
commit 445be20991
@@ -14,10 +14,10 @@ from gurt import config as GurtConfig
 # Attempt to import STT and VAD libraries
 try:
-    import whisper
+    from google.cloud import speech
 except ImportError:
-    print("Whisper library not found. Please install with 'pip install openai-whisper'")
-    whisper = None
+    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
+    speech = None
 
 try:
     import webrtcvad
@@ -246,17 +246,16 @@ class VoiceGatewayCog(commands.Cog):
         self.bot = bot
         self.active_sinks = {} # guild_id: VoiceAudioSink
         self.dedicated_voice_text_channels: dict[int, int] = {} # guild_id: channel_id
-        self.whisper_model = None
-        if whisper:
+        self.speech_client = None
+        if speech:
             try:
-                # Load a smaller model initially, can be made configurable
-                self.whisper_model = whisper.load_model("base")
-                print("Whisper model 'base' loaded successfully.")
+                self.speech_client = speech.SpeechClient()
+                print("Google Cloud Speech client initialized successfully.")
             except Exception as e:
-                print(f"Error loading Whisper model: {e}. STT will not be available.")
-                self.whisper_model = None
+                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
+                self.speech_client = None
         else:
-            print("Whisper library not available. STT functionality will be disabled.")
+            print("Google Cloud Speech library not available. STT functionality will be disabled.")
 
     async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
         if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
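
Note on this hunk: with the google-cloud-speech library, SpeechClient() resolves Application Default Credentials at construction time, so a missing or misconfigured GOOGLE_APPLICATION_CREDENTIALS is caught by the except block above rather than surfacing on the first recognize call. A minimal standalone check, assuming ADC is configured in the environment:

    from google.cloud import speech

    # Raises google.auth.exceptions.DefaultCredentialsError when no credentials can be found.
    client = speech.SpeechClient()
    print("Speech client ready")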
@@ -533,43 +532,41 @@ class VoiceGatewayCog(commands.Cog):
     # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.
 
     async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
-        """Processes a segment of audio data using Whisper."""
-        if not self.whisper_model or not audio_data: # also check if audio_data is empty
+        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
+        if not self.speech_client or not audio_data:
            if not audio_data: print(f"process_audio_segment called for user {user_id} with empty audio_data.")
            return
 
-        # Save audio_data (PCM) to a temporary WAV file
-        # Whisper expects a file path or a NumPy array.
-        # Using a temporary file is straightforward.
        try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-                wav_file_path = tmp_wav.name
-                wf = wave.open(tmp_wav, 'wb')
-                wf.setnchannels(CHANNELS)
-                wf.setsampwidth(SAMPLE_WIDTH)
-                wf.setframerate(SAMPLE_RATE)
-                wf.writeframes(audio_data)
-                wf.close()
-
-            # Transcribe using Whisper (this can be blocking, run in executor)
-            # Use functools.partial to pass keyword arguments to the transcribe method
-            transcribe_func = functools.partial(self.whisper_model.transcribe, wav_file_path, fp16=False)
-            result = await self.bot.loop.run_in_executor(
-                None, # Default ThreadPoolExecutor
-                transcribe_func
+            recognition_config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=SAMPLE_RATE, # Defined as 16000
+                language_code="en-US",
+                enable_automatic_punctuation=True,
+                # model="telephony" # Consider uncommenting if default isn't ideal for voice chat
            )
-            transcribed_text = result["text"].strip()
+            recognition_audio = speech.RecognitionAudio(content=audio_data)
 
-            if transcribed_text: # Only dispatch if there's actual text
+            # Run in executor as it's a network call that can be blocking
+            response = await self.bot.loop.run_in_executor(
+                None, # Default ThreadPoolExecutor
+                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio)
+            )
+
+            transcribed_text = ""
+            for result in response.results:
+                if result.alternatives:
+                    transcribed_text += result.alternatives[0].transcript + " "
+
+            transcribed_text = transcribed_text.strip()
+
+            if transcribed_text:
                user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
-                print(f"Transcription for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
+                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)
 
        except Exception as e:
-            print(f"Error processing audio segment for user {user_id}: {e}")
-        finally:
-            if 'wav_file_path' in locals() and os.path.exists(wav_file_path):
-                os.remove(wav_file_path)
+            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")
 
 
 async def setup(bot: commands.Bot):
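
Reviewer note: the synchronous recognize call used above only supports short audio (roughly one minute when the bytes are sent inline), which suits VAD-segmented chunks; longer segments would need long_running_recognize or the streaming API. LINEAR16 here also assumes 16-bit mono PCM at SAMPLE_RATE, so if the sink ever captures stereo, RecognitionConfig.audio_channel_count would need to be set. As a sketch of an alternative that avoids the run_in_executor hop, newer google-cloud-speech releases also ship an async client (transcribe_segment is a hypothetical helper name, not part of this commit):

    from google.cloud import speech

    async def transcribe_segment(audio_bytes: bytes, sample_rate: int = 16000) -> str:
        """Transcribe one PCM segment without blocking the event loop."""
        client = speech.SpeechAsyncClient()
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code="en-US",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audio_bytes)
        # Native awaitable RPC, so no ThreadPoolExecutor is needed.
        response = await client.recognize(config=config, audio=audio)
        return " ".join(
            r.alternatives[0].transcript for r in response.results if r.alternatives
        ).strip()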
@@ -731,7 +731,7 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     """Listener for transcribed voice messages."""
     from .api import get_ai_response # For processing the text
     from .utils import format_message, simulate_human_typing # For creating pseudo-message and sending response
-    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED # Import new config
+    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL # Import new config
 
     print(f"Voice transcription received from {user.name} ({user.id}) in {guild.name}: '{text}'")
@@ -914,27 +914,46 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
             # If not, and there's text, we could make it speak here as a fallback,
             # but it's better if the AI decides to use the speak_in_voice_channel tool.
 
-            # If there's also a text component to send to the text_channel:
-            if response_text: # Only send if there's actual text content
-                # This part is simplified; a more robust solution would reuse the
-                # send_response_content helper from on_message_listener if possible,
-                # or adapt its logic here.
+            if response_text:
+                # Force speak the response if it's from a voice transcription context
+                speak_tool_func = cog.TOOL_MAPPING.get("speak_in_voice_channel")
+                if speak_tool_func:
+                    print(f"Forcing voice response for transcription: '{response_text[:50]}...'")
+                    speak_result = await speak_tool_func(cog, text_to_speak=response_text)
+
+                    if speak_result.get("status") == "success":
+                        print(f"Successfully forced voice response. Text log handled by speak_in_voice_channel tool if enabled.")
+                        # The speak_in_voice_channel tool will log to the dedicated text channel
+                        # if VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is true.
+                        # No need to send separately from here if that config is true.
+                        # If VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is false, no text log of GURT's speech will appear.
+                    else:
+                        print(f"Forced speak_in_voice_channel failed: {speak_result.get('error')}")
+                        # Fallback: if speaking failed, send it as text to the dedicated channel
+                        # so the user at least gets a response.
                        try:
-                    # Simulate typing if sending to text channel
-                    async with text_channel.typing():
-                        await simulate_human_typing(cog, text_channel, response_text)
-
-                    sent_text_msg = await text_channel.send(response_text)
-                    print(f"Sent text response to {text_channel.name} for voice transcription: '{response_text[:50]}...'")
-
-                    # Cache GURT's text response
-                    bot_response_cache_entry = format_message(cog, sent_text_msg)
+                            fallback_msg = await text_channel.send(f"(Voice output failed) GURT: {response_text}")
+                            print(f"Sent fallback text response to {text_channel.name} for voice transcription failure.")
+                            # Cache this fallback text response
+                            bot_response_cache_entry = format_message(cog, fallback_msg)
                            cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
                            cog.message_cache['global_recent'].append(bot_response_cache_entry)
                            cog.bot_last_spoke[text_channel.id] = time.time()
 
-                except Exception as send_err:
-                    print(f"Error sending text response for voice transcription: {send_err}")
+                        except Exception as send_fallback_err:
+                            print(f"Error sending fallback text for voice failure: {send_fallback_err}")
+                else:
+                    print("speak_in_voice_channel tool not found. Sending text response as fallback.")
+                    try:
+                        # Fallback to text if tool is missing
+                        fallback_msg = await text_channel.send(f"(Voice tool missing) GURT: {response_text}")
+                        print(f"Sent fallback text response to {text_channel.name} due to missing voice tool.")
+                        # Cache this fallback text response
+                        bot_response_cache_entry = format_message(cog, fallback_msg)
+                        cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                        cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                        cog.bot_last_spoke[text_channel.id] = time.time()
+                    except Exception as send_fallback_err3:
+                        print(f"Error sending fallback text for missing voice tool: {send_fallback_err3}")
 
            # Handle reactions if any (similar to on_message)
            emoji_to_react = final_response_data.get("react_with_emoji")
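
Reviewer note: the speak-then-fallback flow added in this hunk repeats the send-and-cache steps in both failure branches. A hypothetical helper distilling that pattern is sketched below; the name respond_with_voice_fallback is illustrative only, while cog.TOOL_MAPPING, format_message (from .utils), and the cache fields are as used in the diff:

    import time

    async def respond_with_voice_fallback(cog, text_channel, response_text: str) -> None:
        """Try the voice tool first; on failure or absence, fall back to a text reply."""
        speak = cog.TOOL_MAPPING.get("speak_in_voice_channel")
        if speak:
            result = await speak(cog, text_to_speak=response_text)
            if result.get("status") == "success":
                return  # dedicated-channel logging is handled by the tool itself
            prefix = "(Voice output failed) GURT: "
        else:
            prefix = "(Voice tool missing) GURT: "
        try:
            msg = await text_channel.send(prefix + response_text)
            # Cache the fallback text response the same way the diff does.
            entry = format_message(cog, msg)
            cog.message_cache['by_channel'][text_channel.id].append(entry)
            cog.message_cache['global_recent'].append(entry)
            cog.bot_last_spoke[text_channel.id] = time.time()
        except Exception as send_err:
            print(f"Error sending fallback text: {send_err}")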