feat: Integrate Google Cloud Speech-to-Text for voice transcription and enhance response handling
This commit is contained in:
parent 36f3f80487
commit 445be20991
@@ -14,10 +14,10 @@ from gurt import config as GurtConfig
 # Attempt to import STT and VAD libraries
 try:
-    import whisper
+    from google.cloud import speech
 except ImportError:
-    print("Whisper library not found. Please install with 'pip install openai-whisper'")
-    whisper = None
+    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
+    speech = None
 
 try:
     import webrtcvad
@@ -246,17 +246,16 @@ class VoiceGatewayCog(commands.Cog):
         self.bot = bot
         self.active_sinks = {} # guild_id: VoiceAudioSink
         self.dedicated_voice_text_channels: dict[int, int] = {} # guild_id: channel_id
-        self.whisper_model = None
-        if whisper:
+        self.speech_client = None
+        if speech:
             try:
-                # Load a smaller model initially, can be made configurable
-                self.whisper_model = whisper.load_model("base")
-                print("Whisper model 'base' loaded successfully.")
+                self.speech_client = speech.SpeechClient()
+                print("Google Cloud Speech client initialized successfully.")
             except Exception as e:
-                print(f"Error loading Whisper model: {e}. STT will not be available.")
-                self.whisper_model = None
+                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
+                self.speech_client = None
         else:
-            print("Whisper library not available. STT functionality will be disabled.")
+            print("Google Cloud Speech library not available. STT functionality will be disabled.")
 
     async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
         if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
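
Note on this hunk: with the google-cloud-speech library, SpeechClient() resolves Application Default Credentials at construction time, so a missing or misconfigured GOOGLE_APPLICATION_CREDENTIALS is caught by the except block above rather than surfacing on the first recognize call. A minimal standalone check, assuming ADC is configured in the environment:

    from google.cloud import speech

    # Raises google.auth.exceptions.DefaultCredentialsError when no credentials can be found.
    client = speech.SpeechClient()
    print("Speech client ready")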
@@ -533,43 +532,41 @@ class VoiceGatewayCog(commands.Cog):
     # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.
 
     async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
-        """Processes a segment of audio data using Whisper."""
-        if not self.whisper_model or not audio_data: # also check if audio_data is empty
+        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
+        if not self.speech_client or not audio_data:
            if not audio_data: print(f"process_audio_segment called for user {user_id} with empty audio_data.")
            return
 
-        # Save audio_data (PCM) to a temporary WAV file
-        # Whisper expects a file path or a NumPy array.
-        # Using a temporary file is straightforward.
        try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-                wav_file_path = tmp_wav.name
-                wf = wave.open(tmp_wav, 'wb')
-                wf.setnchannels(CHANNELS)
-                wf.setsampwidth(SAMPLE_WIDTH)
-                wf.setframerate(SAMPLE_RATE)
-                wf.writeframes(audio_data)
-                wf.close()
-
-            # Transcribe using Whisper (this can be blocking, run in executor)
-            # Use functools.partial to pass keyword arguments to the transcribe method
-            transcribe_func = functools.partial(self.whisper_model.transcribe, wav_file_path, fp16=False)
-            result = await self.bot.loop.run_in_executor(
-                None, # Default ThreadPoolExecutor
-                transcribe_func
+            recognition_config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=SAMPLE_RATE, # Defined as 16000
+                language_code="en-US",
+                enable_automatic_punctuation=True,
+                # model="telephony" # Consider uncommenting if default isn't ideal for voice chat
            )
-            transcribed_text = result["text"].strip()
+            recognition_audio = speech.RecognitionAudio(content=audio_data)
 
-            if transcribed_text: # Only dispatch if there's actual text
+            # Run in executor as it's a network call that can be blocking
+            response = await self.bot.loop.run_in_executor(
+                None, # Default ThreadPoolExecutor
+                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio)
+            )
+
+            transcribed_text = ""
+            for result in response.results:
+                if result.alternatives:
+                    transcribed_text += result.alternatives[0].transcript + " "
+
+            transcribed_text = transcribed_text.strip()
+
+            if transcribed_text:
                user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
-                print(f"Transcription for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
+                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)
 
        except Exception as e:
-            print(f"Error processing audio segment for user {user_id}: {e}")
-        finally:
-            if 'wav_file_path' in locals() and os.path.exists(wav_file_path):
-                os.remove(wav_file_path)
+            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")
 
 
 async def setup(bot: commands.Bot):
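
Reviewer note: the synchronous recognize call used above only supports short audio (roughly one minute when the bytes are sent inline), which suits VAD-segmented chunks; longer segments would need long_running_recognize or the streaming API. LINEAR16 here also assumes 16-bit mono PCM at SAMPLE_RATE, so if the sink ever captures stereo, RecognitionConfig.audio_channel_count would need to be set. As a sketch of an alternative that avoids the run_in_executor hop, newer google-cloud-speech releases also ship an async client (transcribe_segment is a hypothetical helper name, not part of this commit):

    from google.cloud import speech

    async def transcribe_segment(audio_bytes: bytes, sample_rate: int = 16000) -> str:
        """Transcribe one PCM segment without blocking the event loop."""
        client = speech.SpeechAsyncClient()
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code="en-US",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audio_bytes)
        # Native awaitable RPC, so no ThreadPoolExecutor is needed.
        response = await client.recognize(config=config, audio=audio)
        return " ".join(
            r.alternatives[0].transcript for r in response.results if r.alternatives
        ).strip()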
@@ -731,7 +731,7 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     """Listener for transcribed voice messages."""
     from .api import get_ai_response # For processing the text
     from .utils import format_message, simulate_human_typing # For creating pseudo-message and sending response
-    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED # Import new config
+    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL # Import new config
 
     print(f"Voice transcription received from {user.name} ({user.id}) in {guild.name}: '{text}'")
@@ -914,27 +914,46 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
             # If not, and there's text, we could make it speak here as a fallback,
             # but it's better if the AI decides to use the speak_in_voice_channel tool.
 
-            # If there's also a text component to send to the text_channel:
-            if response_text: # Only send if there's actual text content
-                # This part is simplified; a more robust solution would reuse the
-                # send_response_content helper from on_message_listener if possible,
-                # or adapt its logic here.
+            if response_text:
+                # Force speak the response if it's from a voice transcription context
+                speak_tool_func = cog.TOOL_MAPPING.get("speak_in_voice_channel")
+                if speak_tool_func:
+                    print(f"Forcing voice response for transcription: '{response_text[:50]}...'")
+                    speak_result = await speak_tool_func(cog, text_to_speak=response_text)
+
+                    if speak_result.get("status") == "success":
+                        print(f"Successfully forced voice response. Text log handled by speak_in_voice_channel tool if enabled.")
+                        # The speak_in_voice_channel tool will log to the dedicated text channel
+                        # if VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is true.
+                        # No need to send separately from here if that config is true.
+                        # If VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is false, no text log of GURT's speech will appear.
+                    else:
+                        print(f"Forced speak_in_voice_channel failed: {speak_result.get('error')}")
+                        # Fallback: if speaking failed, send it as text to the dedicated channel
+                        # so the user at least gets a response.
                        try:
-                    # Simulate typing if sending to text channel
-                    async with text_channel.typing():
-                        await simulate_human_typing(cog, text_channel, response_text)
-
-                    sent_text_msg = await text_channel.send(response_text)
-                    print(f"Sent text response to {text_channel.name} for voice transcription: '{response_text[:50]}...'")
-
-                    # Cache GURT's text response
-                    bot_response_cache_entry = format_message(cog, sent_text_msg)
+                            fallback_msg = await text_channel.send(f"(Voice output failed) GURT: {response_text}")
+                            print(f"Sent fallback text response to {text_channel.name} for voice transcription failure.")
+                            # Cache this fallback text response
+                            bot_response_cache_entry = format_message(cog, fallback_msg)
                            cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
                            cog.message_cache['global_recent'].append(bot_response_cache_entry)
                            cog.bot_last_spoke[text_channel.id] = time.time()
 
-                except Exception as send_err:
-                    print(f"Error sending text response for voice transcription: {send_err}")
+                        except Exception as send_fallback_err:
+                            print(f"Error sending fallback text for voice failure: {send_fallback_err}")
+                else:
+                    print("speak_in_voice_channel tool not found. Sending text response as fallback.")
+                    try:
+                        # Fallback to text if tool is missing
+                        fallback_msg = await text_channel.send(f"(Voice tool missing) GURT: {response_text}")
+                        print(f"Sent fallback text response to {text_channel.name} due to missing voice tool.")
+                        # Cache this fallback text response
+                        bot_response_cache_entry = format_message(cog, fallback_msg)
+                        cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                        cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                        cog.bot_last_spoke[text_channel.id] = time.time()
+                    except Exception as send_fallback_err3:
+                        print(f"Error sending fallback text for missing voice tool: {send_fallback_err3}")
 
            # Handle reactions if any (similar to on_message)
            emoji_to_react = final_response_data.get("react_with_emoji")
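
Reviewer note: the speak-then-fallback flow added in this hunk repeats the send-and-cache steps in both failure branches. A hypothetical helper distilling that pattern is sketched below; the name respond_with_voice_fallback is illustrative only, while cog.TOOL_MAPPING, format_message (from .utils), and the cache fields are as used in the diff:

    import time

    async def respond_with_voice_fallback(cog, text_channel, response_text: str) -> None:
        """Try the voice tool first; on failure or absence, fall back to a text reply."""
        speak = cog.TOOL_MAPPING.get("speak_in_voice_channel")
        if speak:
            result = await speak(cog, text_to_speak=response_text)
            if result.get("status") == "success":
                return  # dedicated-channel logging is handled by the tool itself
            prefix = "(Voice output failed) GURT: "
        else:
            prefix = "(Voice tool missing) GURT: "
        try:
            msg = await text_channel.send(prefix + response_text)
            # Cache the fallback text response the same way the diff does.
            entry = format_message(cog, msg)
            cog.message_cache['by_channel'][text_channel.id].append(entry)
            cog.message_cache['global_recent'].append(entry)
            cog.bot_last_spoke[text_channel.id] = time.time()
        except Exception as send_err:
            print(f"Error sending fallback text: {send_err}")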