# discordbot/cogs/VoiceGatewayCog.py
#
# 595 lines
# 31 KiB
# Python

import discord
from discord.ext import commands
import asyncio
import os
import tempfile
import wave # For saving audio data
import functools # Added for partial
import subprocess # For audio conversion
from discord.ext import voice_recv # For receiving voice
from typing import Optional # For type hinting
# Gurt specific imports
from gurt import config as GurtConfig
# Attempt to import STT and VAD libraries
# Optional dependencies: each name falls back to None when the import fails so
# the module still loads and later code can feature-test with `if speech:` /
# `if webrtcvad:`.
try:
    from google.cloud import speech
except ImportError:
    print("Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'")
    speech = None
try:
    import webrtcvad
except ImportError:
    # The distribution that provides the `webrtcvad` module is published on
    # PyPI under the same name, so that is what the hint must point at.
    print("webrtcvad library not found. Please install with 'pip install webrtcvad'")
    webrtcvad = None
# OpusDecoder is no longer needed as discord-ext-voice-recv provides PCM.
# Extra FFmpeg arguments used for playback. Only '-vn' is passed; the
# '-reconnect*' before_options were removed because they are meant for network
# streams and might cause issues with local files.
FFMPEG_OPTIONS = {'options': '-vn'}

# Audio format used for VAD and speech-to-text.
SAMPLE_RATE = 16000  # Hz; passed to the recognizer as sample_rate_hertz
CHANNELS = 1  # Mono
SAMPLE_WIDTH = 2  # Bytes per sample (16-bit audio)
VAD_MODE = 3  # VAD aggressiveness (0-3, 3 is most aggressive)
FRAME_DURATION_MS = 30  # Duration of one VAD frame in ms (10, 20, or 30)
BYTES_PER_FRAME = CHANNELS * SAMPLE_WIDTH * FRAME_DURATION_MS * (SAMPLE_RATE // 1000)

# Silence detection parameters.
SILENCE_THRESHOLD_FRAMES = 25  # Consecutive silent VAD frames that end a segment (25 * 30ms = 750ms)
MAX_SPEECH_DURATION_S = 15  # Longest single speech segment to process, in seconds
MAX_SPEECH_FRAMES = MAX_SPEECH_DURATION_S * 1000 // FRAME_DURATION_MS
# Helper function for audio conversion
def _convert_audio_to_16khz_mono(raw_pcm_data_48k_stereo: bytes) -> bytes:
"""
Converts raw 48kHz stereo PCM data to 16kHz mono PCM data using FFmpeg.
"""
input_temp_file = None
output_temp_file = None
converted_audio_data = b""
try:
with tempfile.NamedTemporaryFile(suffix=".raw", delete=False) as tmp_in:
input_temp_file = tmp_in.name
tmp_in.write(raw_pcm_data_48k_stereo)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_out:
output_temp_file = tmp_out.name
command = [
'ffmpeg',
'-f', 's16le', # Input format: signed 16-bit little-endian PCM
'-ac', '2', # Input channels: stereo
'-ar', '48000', # Input sample rate: 48kHz
'-i', input_temp_file,
'-ac', str(CHANNELS), # Output channels (e.g., 1 for mono)
'-ar', str(SAMPLE_RATE), # Output sample rate (e.g., 16000)
'-sample_fmt', 's16',# Output sample format
'-y', # Overwrite output file if it exists
output_temp_file
]
process = subprocess.run(command, capture_output=True, check=False)
if process.returncode != 0:
print(f"FFmpeg error during audio conversion. Return code: {process.returncode}")
print(f"FFmpeg stdout: {process.stdout.decode(errors='ignore')}")
print(f"FFmpeg stderr: {process.stderr.decode(errors='ignore')}")
return b""
with open(output_temp_file, 'rb') as f_out:
with wave.open(f_out, 'rb') as wf:
if wf.getnchannels() == CHANNELS and \
wf.getframerate() == SAMPLE_RATE and \
wf.getsampwidth() == SAMPLE_WIDTH:
converted_audio_data = wf.readframes(wf.getnframes())
else:
print(f"Warning: Converted WAV file format mismatch. Expected {CHANNELS}ch, {SAMPLE_RATE}Hz, {SAMPLE_WIDTH}bytes/sample.")
print(f"Got: {wf.getnchannels()}ch, {wf.getframerate()}Hz, {wf.getsampwidth()}bytes/sample.")
return b""
except FileNotFoundError:
print("FFmpeg command not found. Please ensure FFmpeg is installed and in your system's PATH.")
return b""
except Exception as e:
print(f"Error during audio conversion: {e}")
return b""
finally:
if input_temp_file and os.path.exists(input_temp_file):
os.remove(input_temp_file)
if output_temp_file and os.path.exists(output_temp_file):
os.remove(output_temp_file)
return converted_audio_data
class VoiceAudioSink(voice_recv.AudioSink):
    """
    PCM sink that splits each user's audio into speech segments using VAD.

    Incoming packets are converted to SAMPLE_RATE/CHANNELS PCM, re-framed into
    exact FRAME_DURATION_MS frames (webrtcvad only accepts 10/20/30 ms frames),
    and classified frame by frame. A segment is handed to
    `cog.process_audio_segment` when SILENCE_THRESHOLD_FRAMES silent frames
    follow speech, or when MAX_SPEECH_FRAMES speech frames have accumulated.

    Because every counted frame is exactly FRAME_DURATION_MS long, the
    frame-count thresholds correspond to the durations they were derived from.
    """

    def __init__(self, cog_instance):
        """
        Args:
            cog_instance: Object exposing `bot.loop` and the coroutine
                `process_audio_segment(user_id, audio_data, guild)`.
        """
        super().__init__()
        self.cog = cog_instance
        # self.voice_client is provided by the base class once listen() is called.
        # {user_id: {'buffer': bytearray,   # audio of the segment in progress
        #            'pending': bytearray,  # converted audio not yet forming a full VAD frame
        #            'speaking': bool, 'silent_frames': int, 'speech_frames': int,
        #            'vad': webrtcvad.Vad}}
        self.user_audio_data = {}
        if not webrtcvad:
            print("VAD library not loaded. STT might be less efficient or not work as intended.")

    def wants_opus(self) -> bool:
        """
        Indicates whether the sink wants Opus-encoded audio (True) or PCM audio (False).
        Our sink processes PCM data, so we return False.
        """
        return False

    def _schedule_segment(self, user_id, audio_data, guild):
        """Queue `cog.process_audio_segment` on the bot's event loop.

        Sink callbacks are not guaranteed to run on the event-loop thread
        (voice_recv drives `write()` from its receive thread), and
        `loop.create_task` is not thread-safe. `run_coroutine_threadsafe` is
        safe to call from any thread, including the loop thread itself.
        """
        coro = self.cog.process_audio_segment(user_id, audio_data, guild)
        try:
            asyncio.run_coroutine_threadsafe(coro, self.cog.bot.loop)
        except RuntimeError as e:
            # Raised when the loop is already closed (e.g. during shutdown).
            # Close the coroutine so it is not reported as "never awaited".
            coro.close()
            print(f"Could not schedule audio processing for User ID {user_id}: {e}")

    def _emit_segment(self, user_id, entry):
        """Send the buffered segment for processing and reset the user's state."""
        self._schedule_segment(user_id, bytes(entry['buffer']), self.voice_client.guild)
        entry['buffer'].clear()
        entry['speaking'] = False
        entry['speech_frames'] = 0
        entry['silent_frames'] = 0

    def _process_vad_frame(self, user_id, entry, frame):
        """Classify one exact-length VAD frame and advance the segment state machine."""
        try:
            is_speech = entry['vad'].is_speech(frame, SAMPLE_RATE)
        except Exception:
            # Best effort: if VAD rejects the frame, keep the audio rather
            # than risk dropping speech.
            is_speech = True

        if is_speech:
            entry['buffer'].extend(frame)
            entry['speaking'] = True
            entry['silent_frames'] = 0
            entry['speech_frames'] += 1
            if entry['speech_frames'] >= MAX_SPEECH_FRAMES:
                # Cap the segment length so a long monologue is still processed.
                self._emit_segment(user_id, entry)
        elif entry['speaking']:
            # Was speaking, now silence: keep the silent frame for context.
            entry['buffer'].extend(frame)
            entry['silent_frames'] += 1
            if entry['silent_frames'] >= SILENCE_THRESHOLD_FRAMES:
                self._emit_segment(user_id, entry)
        # Silence while not speaking is ignored.

    def write(self, user: discord.User, voice_data_packet: voice_recv.VoiceData):
        """
        Receive one PCM packet for `user` and feed it through VAD.

        Does nothing when webrtcvad is unavailable, the sink has no voice
        client, `user` is falsy, or the audio conversion yields no data.
        """
        if not webrtcvad or not self.voice_client or not user:
            return
        user_id = user.id
        entry = self.user_audio_data.get(user_id)
        if entry is None:
            entry = {
                'buffer': bytearray(),
                'pending': bytearray(),
                'speaking': False,
                'silent_frames': 0,
                'speech_frames': 0,
                'vad': webrtcvad.Vad(VAD_MODE),
            }
            self.user_audio_data[user_id] = entry

        # Convert the packet's 48kHz stereo PCM to 16kHz mono PCM.
        pcm_data = _convert_audio_to_16khz_mono(voice_data_packet.pcm)
        if not pcm_data:  # Conversion failed or returned empty bytes
            return

        # The converted chunk is not guaranteed to be a valid VAD frame
        # length, so accumulate it and peel off exact BYTES_PER_FRAME frames;
        # any remainder waits for the next packet.
        pending = entry['pending']
        pending.extend(pcm_data)
        while len(pending) >= BYTES_PER_FRAME:
            frame = bytes(pending[:BYTES_PER_FRAME])
            del pending[:BYTES_PER_FRAME]
            self._process_vad_frame(user_id, entry, frame)

    def cleanup(self):
        """
        Flush every user's unfinished segment for processing and drop all state.

        Audio that has not yet formed a full VAD frame (less than
        FRAME_DURATION_MS) is discarded.
        """
        print("VoiceAudioSink cleanup called.")
        # Iterate over a copy so concurrent additions do not break iteration.
        for user_id, data_entry in list(self.user_audio_data.items()):
            if not data_entry['buffer']:
                continue
            # process_audio_segment needs the guild, which comes from the voice client.
            if self.voice_client and self.voice_client.guild:
                guild = self.voice_client.guild
                print(f"Processing remaining audio for User ID {user_id} on cleanup.")
                self._schedule_segment(user_id, bytes(data_entry['buffer']), guild)
            else:
                print(f"Cannot process remaining audio for User ID {user_id}: voice_client or guild not available.")
        self.user_audio_data.clear()
class VoiceGatewayCog(commands.Cog):
    """
    Cog that joins voice channels, transcribes received speech, and plays audio.

    Attributes:
        bot: The bot this cog is attached to.
        active_sinks: guild_id -> VoiceAudioSink registered for that guild.
        dedicated_voice_text_channels: guild_id -> id of the text channel
            paired with the bot's voice session in that guild.
        speech_client: Google Cloud Speech client, or None when STT is
            unavailable (library missing or client initialization failed).
    """

    def __init__(self, bot):
        self.bot = bot
        self.active_sinks = {}  # guild_id: VoiceAudioSink
        self.dedicated_voice_text_channels: dict[int, int] = {}  # guild_id: channel_id
        self.speech_client = None
        if speech:
            try:
                self.speech_client = speech.SpeechClient()
                print("Google Cloud Speech client initialized successfully.")
            except Exception as e:
                print(f"Error initializing Google Cloud Speech client: {e}. STT will not be available.")
                self.speech_client = None
        else:
            print("Google Cloud Speech library not available. STT functionality will be disabled.")

    # ------------------------------------------------------------------
    # Dedicated text channel management
    # ------------------------------------------------------------------

    async def _refresh_reused_channel(self, channel: discord.TextChannel, channel_name: str) -> None:
        """Bring a reused dedicated channel up to date (topic and initial message).

        Failures are logged and swallowed: the channel is still usable even if
        it could not be refreshed.
        """
        try:
            if channel.topic != GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC:
                await channel.edit(topic=GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC)
            initial_message = GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE
            # Same guard as for newly created channels: an empty initial
            # message means "send nothing".
            if initial_message:
                # Send it if the channel is empty or its last message is not
                # already the initial one.
                last_message = None
                async for last_message in channel.history(limit=1):
                    break  # Only need the very last message
                if last_message is None or last_message.content != initial_message:
                    await channel.send(initial_message)
        except discord.Forbidden:
            print(f"Missing permissions to update reused dedicated channel {channel_name}")
        except Exception as e_reuse:
            print(f"Error updating reused dedicated channel {channel_name}: {e_reuse}")

    async def _ensure_dedicated_voice_text_channel(self, guild: discord.Guild, voice_channel: discord.VoiceChannel) -> Optional[discord.TextChannel]:
        """
        Return the guild's dedicated voice text channel, creating it if needed.

        Lookup order: the channel id remembered for the guild, then an existing
        text channel whose name matches the configured template, then a newly
        created private channel.

        Returns:
            The text channel, or None when the feature is disabled or the
            channel could not be created.
        """
        if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
            return None

        existing_channel_id = self.dedicated_voice_text_channels.get(guild.id)
        if existing_channel_id:
            channel = guild.get_channel(existing_channel_id)
            if channel and isinstance(channel, discord.TextChannel):
                print(f"Found existing dedicated voice text channel: {channel.name} ({channel.id})")
                return channel
            print(f"Dedicated voice text channel ID {existing_channel_id} for guild {guild.id} is invalid or not found. Will create a new one.")
            del self.dedicated_voice_text_channels[guild.id]  # Remove invalid ID

        # Build the channel name from the template.
        channel_name = GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_NAME_TEMPLATE.format(
            voice_channel_name=voice_channel.name,
            guild_name=guild.name,
        )
        # Sanitize channel name (Discord has restrictions).
        channel_name = "".join(c for c in channel_name if c.isalnum() or c in ['-', '_', ' ']).strip()
        channel_name = channel_name.replace(' ', '-').lower()
        if not channel_name:  # Fallback if template results in empty string
            channel_name = "gurt-voice-chat"

        # Reuse a channel with this name if one exists (avoids duplicates when
        # the bot restarted without cleaning up).
        for existing_guild_channel in guild.text_channels:
            if existing_guild_channel.name == channel_name:
                print(f"Found existing channel by name '{channel_name}' ({existing_guild_channel.id}). Reusing.")
                self.dedicated_voice_text_channels[guild.id] = existing_guild_channel.id
                await self._refresh_reused_channel(existing_guild_channel, channel_name)
                return existing_guild_channel

        full_access = dict(read_messages=True, send_messages=True, manage_channels=True, manage_messages=True)
        overwrites = {
            # The bot needs to read, post, and manage messages in the channel.
            guild.me: discord.PermissionOverwrite(read_messages=True, send_messages=True, manage_messages=True),
            # Private by default.
            guild.default_role: discord.PermissionOverwrite(read_messages=False, send_messages=False),
        }
        # Give the owner and administrator roles full access to the channel.
        if guild.owner:
            overwrites[guild.owner] = discord.PermissionOverwrite(**full_access)
        for role in guild.roles:
            if role.permissions.administrator and not role.is_default():
                overwrites[role] = discord.PermissionOverwrite(**full_access)

        try:
            print(f"Creating new dedicated voice text channel: {channel_name}")
            new_channel = await guild.create_text_channel(
                name=channel_name,
                overwrites=overwrites,
                topic=GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC,
                reason="GURT Dedicated Voice Chat Channel",
            )
            self.dedicated_voice_text_channels[guild.id] = new_channel.id
            if GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE:
                await new_channel.send(GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE)
            print(f"Created dedicated voice text channel: {new_channel.name} ({new_channel.id})")
            return new_channel
        except discord.Forbidden:
            print(f"Forbidden: Could not create dedicated voice text channel '{channel_name}' in guild {guild.name}.")
            return None
        except Exception as e:
            print(f"Error creating dedicated voice text channel '{channel_name}': {e}")
            return None

    def get_dedicated_text_channel_for_guild(self, guild_id: int) -> Optional[discord.TextChannel]:
        """Return the remembered dedicated text channel for `guild_id`, or None."""
        channel_id = self.dedicated_voice_text_channels.get(guild_id)
        if channel_id:
            guild = self.bot.get_guild(guild_id)
            if guild:
                channel = guild.get_channel(channel_id)
                if isinstance(channel, discord.TextChannel):
                    return channel
        return None

    async def _delete_dedicated_text_channel(self, guild: discord.Guild, *, reason: str, unloading: bool = False) -> None:
        """Delete the guild's dedicated text channel if cleanup-on-leave is enabled.

        The remembered channel id is forgotten whether or not the deletion
        succeeded. `unloading` only selects the wording of the log messages.
        """
        if not (GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED and GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_CLEANUP_ON_LEAVE):
            return
        dedicated_channel_id = self.dedicated_voice_text_channels.get(guild.id)
        if not dedicated_channel_id:
            return
        deleting_suffix = " during cog unload" if unloading else ""
        suffix = " during unload" if unloading else ""
        try:
            channel_to_delete = guild.get_channel(dedicated_channel_id) or await self.bot.fetch_channel(dedicated_channel_id)
            if channel_to_delete:
                print(f"Deleting dedicated voice text channel {channel_to_delete.name} ({channel_to_delete.id}){deleting_suffix}.")
                await channel_to_delete.delete(reason=reason)
        except discord.NotFound:
            print(f"Dedicated voice text channel {dedicated_channel_id} not found for deletion{suffix}.")
        except discord.Forbidden:
            print(f"Forbidden: Could not delete dedicated voice text channel {dedicated_channel_id}{suffix}.")
        except Exception as e:
            print(f"Error deleting dedicated voice text channel {dedicated_channel_id}{suffix}: {e}")
        self.dedicated_voice_text_channels.pop(guild.id, None)

    # ------------------------------------------------------------------
    # Cog lifecycle
    # ------------------------------------------------------------------

    async def cog_load(self):
        print("VoiceGatewayCog loaded!")

    async def cog_unload(self):
        """Stop listening, flush sinks, clean up channels, and leave all voice channels."""
        print("Unloading VoiceGatewayCog...")
        for vc in list(self.bot.voice_clients):  # Iterate over a copy
            guild_id = vc.guild.id
            if guild_id in self.active_sinks:
                if vc.is_connected():
                    self._stop_listening_if_active(vc)
                self.active_sinks[guild_id].cleanup()
                del self.active_sinks[guild_id]
            await self._delete_dedicated_text_channel(vc.guild, reason="GURT VoiceGatewayCog unload", unloading=True)
            if vc.is_connected():
                await vc.disconnect(force=True)
        print("VoiceGatewayCog unloaded and disconnected from voice channels.")

    # ------------------------------------------------------------------
    # Connecting / listening
    # ------------------------------------------------------------------

    @staticmethod
    def _stop_listening_if_active(voice_client) -> None:
        """Stop voice receive on `voice_client` if it supports it and is listening.

        The guild's voice client may be a plain discord.VoiceClient (e.g. one
        created elsewhere), which has no is_listening()/stop_listening().
        """
        is_listening = getattr(voice_client, 'is_listening', None)
        stop_listening = getattr(voice_client, 'stop_listening', None)
        if callable(is_listening) and callable(stop_listening) and is_listening():
            stop_listening()

    async def _connect_and_listen(self, channel: discord.VoiceChannel, *, success_log: str, timeout_error: str, failure_prefix: str):
        """Connect with VoiceRecvClient, start listening, and ensure the text channel.

        Returns:
            (voice_client, None) on success, or (None, error_message) on failure.
        """
        try:
            voice_client = await channel.connect(cls=voice_recv.VoiceRecvClient, timeout=10.0)
            print(success_log)
            self.start_listening_for_vc(voice_client)
            await self._ensure_dedicated_voice_text_channel(channel.guild, channel)
        except asyncio.TimeoutError:
            return None, timeout_error
        except Exception as e:
            return None, f"{failure_prefix}: {str(e)}"
        return voice_client, None

    async def connect_to_voice(self, channel: discord.VoiceChannel):
        """Connects the bot to a specified voice channel and starts listening.

        Returns:
            (voice_client, message) on success, or (None, error_message) on failure.
        """
        if not channel:
            return None, "Channel not provided."
        guild = channel.guild
        voice_client = guild.voice_client

        if voice_client and voice_client.is_connected():
            if voice_client.channel == channel:
                print(f"Already connected to {channel.name} in {guild.name}.")
                if isinstance(voice_client, voice_recv.VoiceRecvClient):
                    if guild.id not in self.active_sinks or not voice_client.is_listening():
                        self.start_listening_for_vc(voice_client)
                    # Ensure dedicated channel is set up even if already connected
                    await self._ensure_dedicated_voice_text_channel(guild, channel)
                else:
                    # Connected with a client that cannot receive audio: swap it.
                    print(f"Reconnecting with VoiceRecvClient to {channel.name}.")
                    await voice_client.disconnect(force=True)
                    voice_client, error = await self._connect_and_listen(
                        channel,
                        success_log=f"Reconnected to {channel.name} in {guild.name} with VoiceRecvClient.",
                        timeout_error=f"Timeout trying to reconnect to {channel.name} with VoiceRecvClient.",
                        failure_prefix=f"Error reconnecting to {channel.name} with VoiceRecvClient",
                    )
                    if error is not None:
                        return None, error
                return voice_client, "Already connected to this channel."

            print(f"Moving to {channel.name} in {guild.name}. Reconnecting with VoiceRecvClient.")
            # Any sink still registered for this guild is flushed and replaced
            # by start_listening_for_vc after the new connection is made.
            await voice_client.disconnect(force=True)
            voice_client, error = await self._connect_and_listen(
                channel,
                success_log=f"Moved and reconnected to {channel.name} in {guild.name} with VoiceRecvClient.",
                timeout_error=f"Timeout trying to move and connect to {channel.name}.",
                failure_prefix=f"Error moving and connecting to {channel.name}",
            )
            if error is not None:
                return None, error
        else:
            voice_client, error = await self._connect_and_listen(
                channel,
                success_log=f"Connected to {channel.name} in {guild.name} with VoiceRecvClient.",
                timeout_error=f"Timeout trying to connect to {channel.name}.",
                failure_prefix=f"Error connecting to {channel.name}",
            )
            if error is not None:
                return None, error

        if not voice_client:
            return None, "Failed to establish voice client after connection."
        return voice_client, f"Successfully connected and listening in {channel.name}."

    def start_listening_for_vc(self, voice_client: discord.VoiceClient):
        """Starts or restarts listening for a given voice client.

        Always installs a fresh VoiceAudioSink for the guild. Listening is
        stopped first whenever it is active, so the sink recorded in
        `active_sinks` is the one actually attached to the voice client.
        """
        guild_id = voice_client.guild.id
        if voice_client.is_listening():
            voice_client.stop_listening()  # Detach whatever sink was attached
        old_sink = self.active_sinks.pop(guild_id, None)
        if old_sink is not None:
            old_sink.cleanup()  # Flush and drop the old sink's state
        self.active_sinks[guild_id] = VoiceAudioSink(self)

        if not voice_client.is_listening():
            voice_client.listen(self.active_sinks[guild_id])
            print(f"Started listening in {voice_client.channel.name} for guild {guild_id}")
        else:
            print(f"Already listening in {voice_client.channel.name} for guild {guild_id}")

    async def disconnect_from_voice(self, guild: discord.Guild):
        """Disconnects the bot from the voice channel in the given guild.

        Returns:
            (True, message) when a disconnect happened, else (False, message).
        """
        voice_client = guild.voice_client
        if not (voice_client and voice_client.is_connected()):
            return False, "Not connected to voice in this guild."

        self._stop_listening_if_active(voice_client)
        guild_id = guild.id
        if guild_id in self.active_sinks:
            self.active_sinks[guild_id].cleanup()
            del self.active_sinks[guild_id]
        await self._delete_dedicated_text_channel(guild, reason="GURT disconnected from voice channel")
        await voice_client.disconnect(force=True)
        print(f"Disconnected from voice in {guild.name}.")
        return True, f"Disconnected from voice in {guild.name}."

    # ------------------------------------------------------------------
    # Playback
    # ------------------------------------------------------------------

    async def play_audio_file(self, voice_client: discord.VoiceClient, audio_file_path: str):
        """Plays an audio file in the voice channel.

        Any audio already playing is stopped first.

        Returns:
            (True, message) when playback started, else (False, error_message).
        """
        if not voice_client or not voice_client.is_connected():
            print("Error: Voice client not connected.")
            return False, "Voice client not connected."
        if not os.path.exists(audio_file_path):
            print(f"Error: Audio file not found at {audio_file_path}")
            return False, "Audio file not found."
        if voice_client.is_playing():
            voice_client.stop()  # Stop current audio if any
        try:
            audio_source = discord.FFmpegPCMAudio(audio_file_path, **FFMPEG_OPTIONS)
            voice_client.play(audio_source, after=lambda e: self.after_audio_playback(e, audio_file_path))
            print(f"Playing audio: {audio_file_path}")
            return True, f"Playing {os.path.basename(audio_file_path)}"
        except Exception as e:
            print(f"Error creating/playing FFmpegPCMAudio source for {audio_file_path}: {e}")
            return False, f"Error playing audio: {str(e)}"

    def after_audio_playback(self, error, audio_file_path):
        """Playback-finished callback: logs the outcome; the file is not deleted here."""
        if error:
            print(f"Error during audio playback for {audio_file_path}: {error}")
        else:
            print(f"Finished playing {audio_file_path}")
        # TTSProviderCog's cleanup will handle deleting the file.

    # ------------------------------------------------------------------
    # Speech-to-text
    # ------------------------------------------------------------------

    async def process_audio_segment(self, user_id: int, audio_data: bytes, guild: discord.Guild):
        """Processes a segment of audio data using Google Cloud Speech-to-Text.

        On a non-empty transcription, dispatches the bot event
        "voice_transcription_received" with (guild, user, transcribed_text).
        Errors are logged, never raised.

        Args:
            user_id: Discord id of the speaker.
            audio_data: LINEAR16 PCM at SAMPLE_RATE Hz.
            guild: Guild the audio was received in.
        """
        if not audio_data:
            print(f"process_audio_segment called for user {user_id} with empty audio_data.")
            return
        if not self.speech_client:
            return
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=SAMPLE_RATE,  # Defined as 16000
                language_code="en-US",
                enable_automatic_punctuation=True,
                model="telephony",  # Explicit model choice instead of the API default
            )
            recognition_audio = speech.RecognitionAudio(content=audio_data)
            # recognize() is a blocking network call; run it in the default
            # thread pool so the event loop stays responsive.
            response = await self.bot.loop.run_in_executor(
                None,
                functools.partial(self.speech_client.recognize, config=recognition_config, audio=recognition_audio),
            )
            # Take the top alternative of every result.
            transcribed_text = " ".join(
                result.alternatives[0].transcript
                for result in response.results
                if result.alternatives
            ).strip()
            if transcribed_text:
                user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
                print(f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}")
                self.bot.dispatch("voice_transcription_received", guild, user, transcribed_text)
        except Exception as e:
            print(f"Error processing audio segment with Google STT for user {user_id}: {e}")
async def setup(bot: commands.Bot):
    """
    Extension entry point: add VoiceGatewayCog to `bot` if FFmpeg is usable.

    Runs `ffmpeg -version` and only registers the cog when it exits with
    status 0. Every failure is logged and swallowed, so this function never
    raises; the cog is simply not loaded.
    """
    try:
        # Execute the binary directly rather than through a shell: a missing
        # executable then raises FileNotFoundError (a shell would instead
        # report it via a non-zero exit status), and no shell parsing is involved.
        process = await asyncio.create_subprocess_exec(
            "ffmpeg", "-version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await process.communicate()
        if process.returncode == 0:
            print("FFmpeg found. VoiceGatewayCog can be loaded.")
            await bot.add_cog(VoiceGatewayCog(bot))
            print("VoiceGatewayCog loaded successfully!")
        else:
            print("FFmpeg not found or not working correctly. VoiceGatewayCog will not be loaded.")
            print(f"FFmpeg check stdout: {stdout.decode(errors='ignore')}")
            print(f"FFmpeg check stderr: {stderr.decode(errors='ignore')}")
    except FileNotFoundError:
        print("FFmpeg command not found. VoiceGatewayCog will not be loaded. Please install FFmpeg and ensure it's in your system's PATH.")
    except Exception as e:
        print(f"An error occurred while checking for FFmpeg: {e}. VoiceGatewayCog will not be loaded.")