feat: Integrate Google Cloud Speech-to-Text for voice transcription and enhance response handling
commit 445be20991
parent 36f3f80487
@@ -14,10 +14,10 @@ from gurt import config as GurtConfig
 # Attempt to import STT and VAD libraries
 try:
-    import whisper
+    from google.cloud import speech
 except ImportError:
-    print("Whisper library not found. Please install with 'pip install openai-whisper'")
-    whisper = None
+    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
+    speech = None

 try:
     import webrtcvad
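The guarded import keeps the cog importable even when the optional dependency is missing; a minimal sketch of the same pattern (the helper function is illustrative, not part of this commit):

try:
    from google.cloud import speech  # pip install google-cloud-speech
except ImportError:
    speech = None

def stt_available() -> bool:
    # The cog inlines this check as "if speech:" / "if self.speech_client:".
    return speech is not None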
@@ -246,17 +246,16 @@ class VoiceGatewayCog(commands.Cog):
         self.bot = bot
         self.active_sinks = {} # guild_id: VoiceAudioSink
         self.dedicated_voice_text_channels: dict[int, int] = {} # guild_id: channel_id
-        self.whisper_model = None
-        if whisper:
+        self.speech_client = None
+        if speech:
             try:
-                # Load a smaller model initially, can be made configurable
-                self.whisper_model = whisper.load_model("base")
-                print("Whisper model 'base' loaded successfully.")
+                self.speech_client = speech.SpeechClient()
+                print("Google Cloud Speech client initialized successfully.")
             except Exception as e:
-                print(f"Error loading Whisper model: {e}. STT will not be available.")
-                self.whisper_model = None
+                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
+                self.speech_client = None
         else:
-            print("Whisper library not available. STT functionality will be disabled.")
+            print("Google Cloud Speech library not available. STT functionality will be disabled.")

     async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
         if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
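For reference, a minimal sketch of what the SpeechClient() construction above relies on: google-cloud-speech resolves Application Default Credentials, commonly supplied via the GOOGLE_APPLICATION_CREDENTIALS environment variable (the path below is illustrative):

import os
from google.cloud import speech

# Illustrative path to a service account key with the Speech-to-Text API enabled.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/path/to/service-account.json")

# May raise if no credentials can be resolved, hence the try/except in the cog's __init__.
client = speech.SpeechClient()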
@@ -533,43 +532,41 @@ class VoiceGatewayCog(commands.Cog):
     # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.

     async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
-        """Processes a segment of audio data using Whisper."""
-        if not self.whisper_model or not audio_data: # also check if audio_data is empty
+        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
+        if not self.speech_client or not audio_data:
             if not audio_data: print(f"process_audio_segment called for user {user_id} with empty audio_data.")
             return

-        # Save audio_data (PCM) to a temporary WAV file
-        # Whisper expects a file path or a NumPy array.
-        # Using a temporary file is straightforward.
         try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-                wav_file_path = tmp_wav.name
-                wf = wave.open(tmp_wav, 'wb')
-                wf.setnchannels(CHANNELS)
-                wf.setsampwidth(SAMPLE_WIDTH)
-                wf.setframerate(SAMPLE_RATE)
-                wf.writeframes(audio_data)
-                wf.close()
-
-            # Transcribe using Whisper (this can be blocking, run in executor)
-            # Use functools.partial to pass keyword arguments to the transcribe method
-            transcribe_func = functools.partial(self.whisper_model.transcribe, wav_file_path, fp16=False)
-            result = await self.bot.loop.run_in_executor(
-                None, # Default ThreadPoolExecutor
-                transcribe_func
-            )
-            transcribed_text = result["text"].strip()
-
-            if transcribed_text: # Only dispatch if there's actual text
+            recognition_config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=SAMPLE_RATE, # Defined as 16000
+                language_code="en-US",
+                enable_automatic_punctuation=True,
+                # model="telephony" # Consider uncommenting if default isn't ideal for voice chat
+            )
+            recognition_audio = speech.RecognitionAudio(content=audio_data)
+
+            # Run in executor as it's a network call that can be blocking
+            response = await self.bot.loop.run_in_executor(
+                None, # Default ThreadPoolExecutor
+                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio)
+            )
+
+            transcribed_text = ""
+            for result in response.results:
+                if result.alternatives:
+                    transcribed_text += result.alternatives[0].transcript + " "
+
+            transcribed_text = transcribed_text.strip()
+
+            if transcribed_text:
                 user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
-                print(f"Transcription for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
+                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                 self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)

         except Exception as e:
-            print(f"Error processing audio segment for user {user_id}: {e}")
-        finally:
-            if 'wav_file_path' in locals() and os.path.exists(wav_file_path):
-                os.remove(wav_file_path)
+            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")


 async def setup(bot: commands.Bot):
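For context, a standalone sketch of the same recognize flow, assuming 16 kHz mono LINEAR16 PCM segments (names such as transcribe_pcm and pcm_bytes are illustrative, not from this commit). The synchronous recognize endpoint is intended for short audio, which suits per-utterance segments produced by VAD:

import asyncio
import functools
from google.cloud import speech

async def transcribe_pcm(client: speech.SpeechClient, pcm_bytes: bytes, sample_rate: int = 16000) -> str:
    """Transcribe a short mono LINEAR16 PCM segment without blocking the event loop."""
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    audio = speech.RecognitionAudio(content=pcm_bytes)
    loop = asyncio.get_running_loop()
    # client.recognize is a blocking call, so hand it to the default thread pool executor.
    response = await loop.run_in_executor(
        None, functools.partial(client.recognize, config=config, audio=audio)
    )
    # Each result covers a consecutive portion of the audio; take the top alternative of each.
    return " ".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ).strip()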
@@ -731,7 +731,7 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     """Listener for transcribed voice messages."""
     from .api import get_ai_response # For processing the text
     from .utils import format_message, simulate_human_typing # For creating pseudo-message and sending response
-    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED # Import new config
+    from .config import IGNORED_CHANNEL_IDS, VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL # Import new config

     print(f"Voice transcription received from {user.name} ({user.id}) in {guild.name}: '{text}'")

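The newly imported VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is a boolean flag from gurt's config module; a hypothetical sketch of how such flags might be defined there (the env-var derivation is an assumption, not taken from this commit):

import os

# Hypothetical definitions; the real gurt/config module may derive these differently.
VOICE_DEDICATED_TEXT_CHANNEL_ENABLED = os.getenv("VOICE_DEDICATED_TEXT_CHANNEL_ENABLED", "true").lower() == "true"
VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL = os.getenv("VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL", "true").lower() == "true"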
@@ -914,27 +914,46 @@ async def on_voice_transcription_received_listener(cog: 'GurtCog', guild: discor
     # If not, and there's text, we could make it speak here as a fallback,
     # but it's better if the AI decides to use the speak_in_voice_channel tool.

-    # If there's also a text component to send to the text_channel:
-    if response_text: # Only send if there's actual text content
-        # This part is simplified; a more robust solution would reuse the
-        # send_response_content helper from on_message_listener if possible,
-        # or adapt its logic here.
-        try:
-            # Simulate typing if sending to text channel
-            async with text_channel.typing():
-                await simulate_human_typing(cog, text_channel, response_text)
-
-            sent_text_msg = await text_channel.send(response_text)
-            print(f"Sent text response to {text_channel.name} for voice transcription: '{response_text[:50]}...'")
-
-            # Cache GURT's text response
-            bot_response_cache_entry = format_message(cog, sent_text_msg)
-            cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
-            cog.message_cache['global_recent'].append(bot_response_cache_entry)
-            cog.bot_last_spoke[text_channel.id] = time.time()
-
-        except Exception as send_err:
-            print(f"Error sending text response for voice transcription: {send_err}")
+    if response_text:
+        # Force speak the response if it's from a voice transcription context
+        speak_tool_func = cog.TOOL_MAPPING.get("speak_in_voice_channel")
+        if speak_tool_func:
+            print(f"Forcing voice response for transcription: '{response_text[:50]}...'")
+            speak_result = await speak_tool_func(cog, text_to_speak=response_text)
+            if speak_result.get("status") == "success":
+                print(f"Successfully forced voice response. Text log handled by speak_in_voice_channel tool if enabled.")
+                # The speak_in_voice_channel tool will log to the dedicated text channel
+                # if VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is true.
+                # No need to send separately from here if that config is true.
+                # If VOICE_LOG_SPEECH_TO_DEDICATED_CHANNEL is false, no text log of GURT's speech will appear.
+            else:
+                print(f"Forced speak_in_voice_channel failed: {speak_result.get('error')}")
+                # Fallback: if speaking failed, send it as text to the dedicated channel
+                # so the user at least gets a response.
+                try:
+                    fallback_msg = await text_channel.send(f"(Voice output failed) GURT: {response_text}")
+                    print(f"Sent fallback text response to {text_channel.name} for voice transcription failure.")
+                    # Cache this fallback text response
+                    bot_response_cache_entry = format_message(cog, fallback_msg)
+                    cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                    cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                    cog.bot_last_spoke[text_channel.id] = time.time()
+                except Exception as send_fallback_err:
+                    print(f"Error sending fallback text for voice failure: {send_fallback_err}")
+        else:
+            print("speak_in_voice_channel tool not found. Sending text response as fallback.")
+            try:
+                # Fallback to text if tool is missing
+                fallback_msg = await text_channel.send(f"(Voice tool missing) GURT: {response_text}")
+                print(f"Sent fallback text response to {text_channel.name} due to missing voice tool.")
+                # Cache this fallback text response
+                bot_response_cache_entry = format_message(cog, fallback_msg)
+                cog.message_cache['by_channel'][text_channel.id].append(bot_response_cache_entry)
+                cog.message_cache['global_recent'].append(bot_response_cache_entry)
+                cog.bot_last_spoke[text_channel.id] = time.time()
+            except Exception as send_fallback_err3:
+                print(f"Error sending fallback text for missing voice tool: {send_fallback_err3}")

     # Handle reactions if any (similar to on_message)
     emoji_to_react = final_response_data.get("react_with_emoji")
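For context, the custom event wiring behind self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text): discord.py routes a dispatched event name to listeners named with an on_ prefix. A minimal sketch of a consumer (the cog class name is illustrative):

import discord
from discord.ext import commands

class TranscriptionConsumerCog(commands.Cog):
    def __init__(self, bot: commands.Bot):
        self.bot = bot

    @commands.Cog.listener()
    async def on_voice_transcription_received(self, guild: discord.Guild, user: discord.User, text: str):
        # bot.dispatch("voice_transcription_received", guild, user, text) ends up here.
        print(f"Transcription from {user} in {guild.name}: {text}")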