diff --git a/cogs/VoiceGatewayCog.py b/cogs/VoiceGatewayCog.py
index b01e638..ca8e020 100644
--- a/cogs/VoiceGatewayCog.py
+++ b/cogs/VoiceGatewayCog.py
@@ -14,10 +14,10 @@ from gurt import config as GurtConfig

 # Attempt to import STT and VAD libraries
 try:
-    import whisper
+    from google.cloud import speech
 except ImportError:
-    print("Whisper library not found. Please install with 'pip install openai-whisper'")
-    whisper = None
+    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
+    speech = None

 try:
     import webrtcvad
@@ -246,17 +246,16 @@ class VoiceGatewayCog(commands.Cog):
         self.bot = bot
         self.active_sinks = {} # guild_id: VoiceAudioSink
         self.dedicated_voice_text_channels: dict[int, int] = {} # guild_id: channel_id
-        self.whisper_model = None
-        if whisper:
+        self.speech_client = None
+        if speech:
             try:
-                # Load a smaller model initially, can be made configurable
-                self.whisper_model = whisper.load_model("base")
-                print("Whisper model 'base' loaded successfully.")
+                self.speech_client = speech.SpeechClient()
+                print("Google Cloud Speech client initialized successfully.")
             except Exception as e:
-                print(f"Error loading Whisper model: {e}. STT will not be available.")
-                self.whisper_model = None
+                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
+                self.speech_client = None
         else:
-            print("Whisper library not available. STT functionality will be disabled.")
+            print("Google Cloud Speech library not available. STT functionality will be disabled.")

     async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
         if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
@@ -533,43 +532,41 @@ class VoiceGatewayCog(commands.Cog):
     # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.

     async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
-        """Processes a segment of audio data using Whisper."""
-        if not self.whisper_model or not audio_data: # also check if audio_data is empty
+        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
+        if not self.speech_client or not audio_data:
             if not audio_data:
                 print(f"process_audio_segment called for user {user_id} with empty audio_data.")
             return

-        # Save audio_data (PCM) to a temporary WAV file
-        # Whisper expects a file path or a NumPy array.
-        # Using a temporary file is straightforward.
         try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-                wav_file_path = tmp_wav.name
-                wf = wave.open(tmp_wav, 'wb')
-                wf.setnchannels(CHANNELS)
-                wf.setsampwidth(SAMPLE_WIDTH)
-                wf.setframerate(SAMPLE_RATE)
-                wf.writeframes(audio_data)
-                wf.close()
-
-            # Transcribe using Whisper (this can be blocking, run in executor)
-            # Use functools.partial to pass keyword arguments to the transcribe method
-            transcribe_func = functools.partial(self.whisper_model.transcribe, wav_file_path, fp16=False)
-            result = await self.bot.loop.run_in_executor(
-                None, # Default ThreadPoolExecutor
-                transcribe_func
+            recognition_config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=SAMPLE_RATE, # Defined as 16000
+                language_code="en-US",
+                enable_automatic_punctuation=True,
+                # model="telephony" # Consider uncommenting if default isn't ideal for voice chat
             )
-            transcribed_text = result["text"].strip()
+            recognition_audio = speech.RecognitionAudio(content=audio_data)

-            if transcribed_text: # Only dispatch if there's actual text
+            # Run in executor as it's a network call that can be blocking
+            response = await self.bot.loop.run_in_executor(
+                None, # Default ThreadPoolExecutor
+                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio)
+            )
+
+            transcribed_text = ""
+            for result in response.results:
+                if result.alternatives:
+                    transcribed_text += result.alternatives[0].transcript + " "
+
+            transcribed_text = transcribed_text.strip()
+
+            if transcribed_text:
                 user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
-                print(f"Transcription for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
+                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                 self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)
         except Exception as e:
-            print(f"Error processing audio segment for user {user_id}: {e}")
-        finally:
-            if 'wav_file_path' in locals() and os.path.exists(wav_file_path):
-                os.remove(wav_file_path)
+            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")


 async def setup(bot: commands.Bot):
diff --git a/gurt/listeners.py b/gurt/listeners.py
index 5f74248..01e128a 100644
--- a/gurt/listeners.py
+++ b/gurt/listeners.py
@@ -731,7 +731,7 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     """Listener for transcribed voice messages."""
     from .api import get_ai_response # For processing the text
    from .utils import format_message, simulate_human_typing # For creating pseudo-message and sending response
-    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED # Import new config
+    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL # Import new config

     print(f"Voice transcription received from {user.name} ({user.id}) in {guild.name}: '{text}'")

@@ -914,27 +914,46 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
         # If not, and there's text, we could make it speak here as a fallback,
         # but it's better if the AI decides to use the speak_in_voice_channel tool.

-        # If there's also a text component to send to the text_channel:
-        if response_text: # Only send if there's actual text content
-            # This part is simplified; a more robust solution would reuse the
-            # send_response_content helper from on_message_listener if possible,
-            # or adapt its logic here.
-            try:
-                # Simulate typing if sending to text channel
-                async with text_channel.typing():
-                    await simulate_human_typing(cog, text_channel, response_text)
+        if response_text:
+            # Force speak the response if it's from a voice transcription context
+            speak_tool_func = cog.TOOL_MAPPING.get("speak_in_voice_channel")
+            if speak_tool_func:
+                print(f"Forcing voice response for transcription: '{response_text[:50]}...'")
+                speak_result = await speak_tool_func(cog, text_to_speak=response_text)

-                sent_text_msg = await text_channel.send(response_text)
-                print(f"Sent text response to {text_channel.name} for voice transcription: '{response_text[:50]}...'")
-
-                # Cache GURT's text response
-                bot_response_cache_entry = format_message(cog, sent_text_msg)
-                cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
-                cog.message_cache['global_recent'].append(bot_response_cache_entry)
-                cog.bot_last_spoke[text_channel.id] = time.time()
-
-            except Exception as send_err:
-                print(f"Error sending text response for voice transcription: {send_err}")
+                if speak_result.get("status") == "success":
+                    print(f"Successfully forced voice response. Text log handled by speak_in_voice_channel tool if enabled.")
+                    # The speak_in_voice_channel tool will log to the dedicated text channel
+                    # if VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is true.
+                    # No need to send separately from here if that config is true.
+                    # If VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is false, no text log of GURT's speech will appear.
+                else:
+                    print(f"Forced speak_in_voice_channel failed: {speak_result.get('error')}")
+                    # Fallback: if speaking failed, send it as text to the dedicated channel
+                    # so the user at least gets a response.
+                    try:
+                        fallback_msg = await text_channel.send(f"(Voice output failed) GURT: {response_text}")
+                        print(f"Sent fallback text response to {text_channel.name} for voice transcription failure.")
+                        # Cache this fallback text response
+                        bot_response_cache_entry = format_message(cog, fallback_msg)
+                        cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                        cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                        cog.bot_last_spoke[text_channel.id] = time.time()
+                    except Exception as send_fallback_err:
+                        print(f"Error sending fallback text for voice failure: {send_fallback_err}")
+            else:
+                print("speak_in_voice_channel tool not found. Sending text response as fallback.")
+                try:
+                    # Fallback to text if tool is missing
+                    fallback_msg = await text_channel.send(f"(Voice tool missing) GURT: {response_text}")
+                    print(f"Sent fallback text response to {text_channel.name} due to missing voice tool.")
+                    # Cache this fallback text response
+                    bot_response_cache_entry = format_message(cog, fallback_msg)
+                    cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                    cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                    cog.bot_last_spoke[text_channel.id] = time.time()
+                except Exception as send_fallback_err3:
+                    print(f"Error sending fallback text for missing voice tool: {send_fallback_err3}")

         # Handle reactions if any (similar to on_message)
         emoji_to_react = final_response_data.get("react_with_emoji")
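
For context, below is a minimal standalone sketch of the recognition pattern the new process_audio_segment adopts: raw LINEAR16 PCM wrapped in a RecognitionConfig/RecognitionAudio pair, with the blocking recognize() call handed to the default executor so the Discord event loop is never stalled. It is not part of the patch; the helper name transcribe_pcm is hypothetical, the hard-coded 16000 Hz stands in for the cog's SAMPLE_RATE constant, and the snippet assumes the google-cloud-speech package is installed and Google application credentials are configured.

import asyncio
import functools

from google.cloud import speech  # pip install google-cloud-speech

SAMPLE_RATE = 16000  # assumed to match the PCM rate the cog feeds to STT


async def transcribe_pcm(client: speech.SpeechClient, pcm_bytes: bytes) -> str:
    """Transcribe one raw LINEAR16 PCM segment without blocking the event loop."""
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    audio = speech.RecognitionAudio(content=pcm_bytes)
    loop = asyncio.get_running_loop()
    # recognize() is a blocking gRPC call, so run it in the default thread pool,
    # mirroring the run_in_executor call in process_audio_segment.
    response = await loop.run_in_executor(
        None, functools.partial(client.recognize, config=config, audio=audio)
    )
    # Join the top alternative of each result, as the cog does.
    return " ".join(
        r.alternatives[0].transcript for r in response.results if r.alternatives
    ).strip()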