feat: Integrate Google Cloud Speech-to-Text for voice transcription and enhance response handling

Slipstream 2025-05-30 22:49:22 -06:00
parent 36f3f80487
commit 445be20991
Signed by: slipstream
GPG Key ID: 13E498CE010AC6FD
2 changed files with 75 additions and 59 deletions


@@ -14,10 +14,10 @@ from gurt import config as GurtConfig

 # Attempt to import STT and VAD libraries
 try:
-    import whisper
+    from google.cloud import speech
 except ImportError:
-    print("Whisper library not found. Please install with 'pip install openai-whisper'")
-    whisper = None
+    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
+    speech = None

 try:
     import webrtcvad
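
Note: the new dependency is the synchronous Google Cloud Speech-to-Text v1 client, which also needs credentials at runtime (typically GOOGLE_APPLICATION_CREDENTIALS pointing at a service-account key). A minimal sketch of the call pattern this commit adopts; the file name here is a placeholder and the audio is assumed to be raw little-endian 16-bit PCM:

from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)
# "segment.raw" is a placeholder for a buffer of raw 16-bit PCM bytes.
with open("segment.raw", "rb") as f:
    audio = speech.RecognitionAudio(content=f.read())

response = client.recognize(config=config, audio=audio)
for result in response.results:
    print(result.alternatives[0].transcript)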
@@ -246,17 +246,16 @@ class VoiceGatewayCog(commands.Cog):
         self.bot = bot
         self.active_sinks = {}  # guild_id: VoiceAudioSink
         self.dedicated_voice_text_channels: dict[int, int] = {}  # guild_id: channel_id
-        self.whisper_model = None
-        if whisper:
+        self.speech_client = None
+        if speech:
             try:
-                # Load a smaller model initially, can be made configurable
-                self.whisper_model = whisper.load_model("base")
-                print("Whisper model 'base' loaded successfully.")
+                self.speech_client = speech.SpeechClient()
+                print("Google Cloud Speech client initialized successfully.")
             except Exception as e:
-                print(f"Error loading Whisper model: {e}. STT will not be available.")
-                self.whisper_model = None
+                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
+                self.speech_client = None
         else:
-            print("Whisper library not available. STT functionality will be disabled.")
+            print("Google Cloud Speech library not available. STT functionality will be disabled.")

     async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
         if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
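
Note: speech.SpeechClient() resolves Application Default Credentials at construction, so the except branch above is what fires (typically with a DefaultCredentialsError) when no credentials are configured. If ambient ADC is not an option, the client can also be built from an explicit key file; a sketch, with "gcp-key.json" as a placeholder path:

from google.cloud import speech

# from_service_account_file is the standard constructor for explicit
# credentials on generated Google Cloud clients; the path is a placeholder.
client = speech.SpeechClient.from_service_account_file("gcp-key.json")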
@@ -533,43 +532,41 @@ class VoiceGatewayCog(commands.Cog):
     # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.

     async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
-        """Processes a segment of audio data using Whisper."""
-        if not self.whisper_model or not audio_data:  # also check if audio_data is empty
+        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
+        if not self.speech_client or not audio_data:
             if not audio_data: print(f"process_audio_segment called for user {user_id} with empty audio_data.")
             return

-        # Save audio_data (PCM) to a temporary WAV file
-        # Whisper expects a file path or a NumPy array.
-        # Using a temporary file is straightforward.
         try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-                wav_file_path = tmp_wav.name
-                wf = wave.open(tmp_wav, 'wb')
-                wf.setnchannels(CHANNELS)
-                wf.setsampwidth(SAMPLE_WIDTH)
-                wf.setframerate(SAMPLE_RATE)
-                wf.writeframes(audio_data)
-                wf.close()
-
-            # Transcribe using Whisper (this can be blocking, run in executor)
-            # Use functools.partial to pass keyword arguments to the transcribe method
-            transcribe_func = functools.partial(self.whisper_model.transcribe, wav_file_path, fp16=False)
-            result = await self.bot.loop.run_in_executor(
+            recognition_config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=SAMPLE_RATE,  # Defined as 16000
+                language_code="en-US",
+                enable_automatic_punctuation=True,
+                # model="telephony"  # Consider uncommenting if default isn't ideal for voice chat
+            )
+            recognition_audio = speech.RecognitionAudio(content=audio_data)
+
+            # Run in executor as it's a network call that can be blocking
+            response = await self.bot.loop.run_in_executor(
                 None,  # Default ThreadPoolExecutor
-                transcribe_func
+                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio)
             )
-            transcribed_text = result["text"].strip()

-            if transcribed_text:  # Only dispatch if there's actual text
+            transcribed_text = ""
+            for result in response.results:
+                if result.alternatives:
+                    transcribed_text += result.alternatives[0].transcript + " "
+            transcribed_text = transcribed_text.strip()
+
+            if transcribed_text:
                 user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
-                print(f"Transcription for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
+                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                 self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)

         except Exception as e:
-            print(f"Error processing audio segment for user {user_id}: {e}")
-        finally:
-            if 'wav_file_path' in locals() and os.path.exists(wav_file_path):
-                os.remove(wav_file_path)
+            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")

 async def setup(bot: commands.Bot):
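
Note: the config declares LINEAR16 at SAMPLE_RATE (16 kHz per the comment), while Discord's voice receive path delivers 48 kHz stereo 16-bit PCM. Presumably the sink already downmixes and resamples before calling process_audio_segment; if not, a stdlib-only conversion along these lines would be needed first (audioop is deprecated since Python 3.11 and removed in 3.13). Also note the synchronous recognize() call accepts roughly one minute of audio at most; longer segments would need long_running_recognize or the streaming API.

import audioop

def to_16k_mono(pcm_48k_stereo: bytes) -> bytes:
    """Downmix 48 kHz stereo 16-bit PCM to 16 kHz mono for LINEAR16 STT."""
    mono = audioop.tomono(pcm_48k_stereo, 2, 0.5, 0.5)             # stereo -> mono
    resampled, _ = audioop.ratecv(mono, 2, 1, 48000, 16000, None)  # 48 kHz -> 16 kHz
    return resampled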


@@ -731,7 +731,7 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     """Listener for transcribed voice messages."""
     from .api import get_ai_response  # For processing the text
     from .utils import format_message, simulate_human_typing  # For creating pseudo-message and sending response
-    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED  # Import new config
+    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL  # Import new config

     print(f"Voice transcription received from {user.name} ({user.id}) in {guild.name}: '{text}'")
@@ -914,27 +914,46 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     # If not, and there's text, we could make it speak here as a fallback,
     # but it's better if the AI decides to use the speak_in_voice_channel tool.

-    # If there's also a text component to send to the text_channel:
-    if response_text:  # Only send if there's actual text content
-        # This part is simplified; a more robust solution would reuse the
-        # send_response_content helper from on_message_listener if possible,
-        # or adapt its logic here.
-        try:
-            # Simulate typing if sending to text channel
-            async with text_channel.typing():
-                await simulate_human_typing(cog, text_channel, response_text)
-            sent_text_msg = await text_channel.send(response_text)
-            print(f"Sent text response to {text_channel.name} for voice transcription: '{response_text[:50]}...'")
-
-            # Cache GURT's text response
-            bot_response_cache_entry = format_message(cog, sent_text_msg)
-            cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
-            cog.message_cache['global_recent'].append(bot_response_cache_entry)
-            cog.bot_last_spoke[text_channel.id] = time.time()
-
-        except Exception as send_err:
-            print(f"Error sending text response for voice transcription: {send_err}")
+    if response_text:
+        # Force speak the response if it's from a voice transcription context
+        speak_tool_func = cog.TOOL_MAPPING.get("speak_in_voice_channel")
+        if speak_tool_func:
+            print(f"Forcing voice response for transcription: '{response_text[:50]}...'")
+            speak_result = await speak_tool_func(cog, text_to_speak=response_text)
+
+            if speak_result.get("status") == "success":
+                print(f"Successfully forced voice response. Text log handled by speak_in_voice_channel tool if enabled.")
+                # The speak_in_voice_channel tool will log to the dedicated text channel
+                # if VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is true.
+                # No need to send separately from here if that config is true.
+                # If VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is false, no text log of GURT's speech will appear.
+            else:
+                print(f"Forced speak_in_voice_channel failed: {speak_result.get('error')}")
+                # Fallback: if speaking failed, send it as text to the dedicated channel
+                # so the user at least gets a response.
+                try:
+                    fallback_msg = await text_channel.send(f"(Voice output failed) GURT: {response_text}")
+                    print(f"Sent fallback text response to {text_channel.name} for voice transcription failure.")
+                    # Cache this fallback text response
+                    bot_response_cache_entry = format_message(cog, fallback_msg)
+                    cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                    cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                    cog.bot_last_spoke[text_channel.id] = time.time()
+                except Exception as send_fallback_err:
+                    print(f"Error sending fallback text for voice failure: {send_fallback_err}")
+        else:
+            print("speak_in_voice_channel tool not found. Sending text response as fallback.")
+            try:
+                # Fallback to text if tool is missing
+                fallback_msg = await text_channel.send(f"(Voice tool missing) GURT: {response_text}")
+                print(f"Sent fallback text response to {text_channel.name} due to missing voice tool.")
+                # Cache this fallback text response
+                bot_response_cache_entry = format_message(cog, fallback_msg)
+                cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                cog.bot_last_spoke[text_channel.id] = time.time()
+            except Exception as send_fallback_err3:
+                print(f"Error sending fallback text for missing voice tool: {send_fallback_err3}")

     # Handle reactions if any (similar to on_message)
     emoji_to_react = final_response_data.get("react_with_emoji")
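
Note: the fallback chain above relies on a specific return shape from the speak_in_voice_channel tool: a dict carrying "status" and, on failure, "error". The real implementation lives behind cog.TOOL_MAPPING and is not shown in this commit; a stub illustrating only the contract the listener depends on (the body is a placeholder):

async def speak_in_voice_channel_stub(cog, text_to_speak: str) -> dict:
    """Mirrors the return contract on_voice_transcription_received_listener assumes."""
    try:
        # ... synthesize text_to_speak and play it in the active voice client ...
        return {"status": "success"}
    except Exception as e:
        return {"status": "error", "error": str(e)}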