import discord
from discord.ext import commands
import asyncio
import os
import tempfile
import wave  # For saving audio data
import functools  # Added for partial
import subprocess  # For audio conversion
from discord.ext import voice_recv  # For receiving voice
from typing import Optional  # For type hinting

# Gurt specific imports
from gurt import config as GurtConfig

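# Settings read from gurt.config in this cog (all used by the dedicated-channel helpers below):
# VOICE_DEDICATED_TEXT_CHANNEL_ENABLED, VOICE_DEDICATED_TEXT_CHANNEL_NAME_TEMPLATE,
# VOICE_DEDICATED_TEXT_CHANNEL_TOPIC, VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE,
# VOICE_DEDICATED_TEXT_CHANNEL_CLEANUP_ON_LEAVE.
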
# Attempt to import STT and VAD libraries
try:
    from google.cloud import speech
except ImportError:
    print(
        "Google Cloud Speech library not found. Please install with 'pip install google-cloud-speech'"
    )
    speech = None

try:
    import webrtcvad
except ImportError:
    print(
        "webrtcvad library not found. Please install with 'pip install webrtcvad'"
    )
    webrtcvad = None

# OpusDecoder is no longer needed as discord-ext-voice-recv provides PCM.

FFMPEG_OPTIONS = {
    # 'before_options': '-reconnect 1 -reconnect_streamed 1 -reconnect_delay_max 5', # Removed as these are for network streams and might cause issues with local files
    "options": "-vn"
}

# Constants for audio processing
SAMPLE_RATE = 16000  # 16kHz mono is what we send to Google Cloud Speech-to-Text (see process_audio_segment)
CHANNELS = 1  # Mono
SAMPLE_WIDTH = 2  # 16-bit audio (2 bytes per sample)
VAD_MODE = 3  # VAD aggressiveness (0-3, 3 is most aggressive)
FRAME_DURATION_MS = 30  # Duration of a frame in ms for VAD (10, 20, or 30)
BYTES_PER_FRAME = (SAMPLE_RATE // 1000) * FRAME_DURATION_MS * CHANNELS * SAMPLE_WIDTH
# OPUS constants removed as Opus decoding is no longer handled here.

# Silence detection parameters
SILENCE_THRESHOLD_FRAMES = 25  # Number of consecutive silent VAD frames to consider end of speech (e.g., 25 * 30ms = 750ms)
MAX_SPEECH_DURATION_S = 15  # Max duration of a single speech segment to process
MAX_SPEECH_FRAMES = (MAX_SPEECH_DURATION_S * 1000) // FRAME_DURATION_MS
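# For reference, with the values above: one 30ms VAD frame at 16kHz mono 16-bit PCM is
# (16000 // 1000) * 30 * 1 * 2 = 960 bytes, the silence threshold corresponds to
# 25 * 30ms = 750ms of quiet, and MAX_SPEECH_FRAMES = (15 * 1000) // 30 = 500 frames.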


# Helper function for audio conversion
def _convert_audio_to_16khz_mono(raw_pcm_data_48k_stereo: bytes) -> bytes:
    """
    Converts raw 48kHz stereo PCM data to 16kHz mono PCM data using FFmpeg.
    """
    input_temp_file = None
    output_temp_file = None
    converted_audio_data = b""

    try:
        with tempfile.NamedTemporaryFile(suffix=".raw", delete=False) as tmp_in:
            input_temp_file = tmp_in.name
            tmp_in.write(raw_pcm_data_48k_stereo)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_out:
            output_temp_file = tmp_out.name

        command = [
            "ffmpeg",
            "-f",
            "s16le",  # Input format: signed 16-bit little-endian PCM
            "-ac",
            "2",  # Input channels: stereo
            "-ar",
            "48000",  # Input sample rate: 48kHz
            "-i",
            input_temp_file,
            "-ac",
            str(CHANNELS),  # Output channels (e.g., 1 for mono)
            "-ar",
            str(SAMPLE_RATE),  # Output sample rate (e.g., 16000)
            "-sample_fmt",
            "s16",  # Output sample format
            "-y",  # Overwrite output file if it exists
            output_temp_file,
        ]

        process = subprocess.run(command, capture_output=True, check=False)

        if process.returncode != 0:
            print(
                f"FFmpeg error during audio conversion. Return code: {process.returncode}"
            )
            print(f"FFmpeg stdout: {process.stdout.decode(errors='ignore')}")
            print(f"FFmpeg stderr: {process.stderr.decode(errors='ignore')}")
            return b""

        with open(output_temp_file, "rb") as f_out:
            with wave.open(f_out, "rb") as wf:
                if (
                    wf.getnchannels() == CHANNELS
                    and wf.getframerate() == SAMPLE_RATE
                    and wf.getsampwidth() == SAMPLE_WIDTH
                ):
                    converted_audio_data = wf.readframes(wf.getnframes())
                else:
                    print(
                        f"Warning: Converted WAV file format mismatch. Expected {CHANNELS}ch, {SAMPLE_RATE}Hz, {SAMPLE_WIDTH}bytes/sample."
                    )
                    print(
                        f"Got: {wf.getnchannels()}ch, {wf.getframerate()}Hz, {wf.getsampwidth()}bytes/sample."
                    )
                    return b""
    except FileNotFoundError:
        print(
            "FFmpeg command not found. Please ensure FFmpeg is installed and in your system's PATH."
        )
        return b""
    except Exception as e:
        print(f"Error during audio conversion: {e}")
        return b""
    finally:
        if input_temp_file and os.path.exists(input_temp_file):
            os.remove(input_temp_file)
        if output_temp_file and os.path.exists(output_temp_file):
            os.remove(output_temp_file)

    return converted_audio_data
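
# Note: the FFmpeg round-trip above spawns a subprocess for every incoming packet. A
# lighter in-process alternative (sketch only, not wired in; the stdlib `audioop`
# module is deprecated since Python 3.11 and removed in 3.13) could look like:
#
#   import audioop
#
#   def _convert_with_audioop(pcm_48k_stereo: bytes, state=None):
#       mono = audioop.tomono(pcm_48k_stereo, SAMPLE_WIDTH, 0.5, 0.5)
#       return audioop.ratecv(mono, SAMPLE_WIDTH, 1, 48000, SAMPLE_RATE, state)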


class VoiceAudioSink(voice_recv.AudioSink):  # Inherit from voice_recv.AudioSink
    def __init__(self, cog_instance):  # Removed voice_client parameter
        super().__init__()
        self.cog = cog_instance
        # self.voice_client is set by the library when listen() is called
        # user_audio_data now keyed by user_id, 'decoder' removed
        self.user_audio_data = (
            {}
        )  # {user_id: {'buffer': bytearray, 'speaking': False, 'silent_frames': 0, 'speech_frames': 0, 'vad': VAD_instance}}

        # OpusDecoder check removed
        if not webrtcvad:
            print(
                "VAD library not loaded. STT might be less efficient or not work as intended."
            )

    def wants_opus(self) -> bool:
        """
        Indicates whether the sink wants Opus-encoded audio (True) or PCM audio (False).
        Our sink processes PCM data, so we return False.
        """
        return False

    # Signature changed: user object directly, data is VoiceData
    def write(self, user: discord.User, voice_data_packet: voice_recv.VoiceData):
        if (
            not webrtcvad or not self.voice_client or not user
        ):  # OpusDecoder check removed, user check added
            return

        user_id = user.id  # Get user_id from the user object

        if user_id not in self.user_audio_data:
            self.user_audio_data[user_id] = {
                "buffer": bytearray(),
                "speaking": False,
                "silent_frames": 0,
                "speech_frames": 0,
                # 'decoder' removed
                "vad": webrtcvad.Vad(VAD_MODE) if webrtcvad else None,
            }

        entry = self.user_audio_data[user_id]

        # Extract PCM data from VoiceData packet
        raw_pcm_data_48k_stereo = voice_data_packet.pcm

        # Convert incoming 48kHz stereo PCM to 16kHz mono PCM
        pcm_data = _convert_audio_to_16khz_mono(raw_pcm_data_48k_stereo)
        if not pcm_data:  # Conversion failed or returned empty bytes
            # print(f"Audio conversion failed for user {user_id}. Skipping frame.")
            return

        # VAD processing expects frames of 10, 20, or 30 ms.
        # pcm_data is now 16kHz mono, hopefully in appropriate chunks from conversion.
        # We need to ensure it's split into VAD-compatible frame lengths if not already.
        # If pcm_data (now 16kHz mono) is a 20ms chunk, its length is 640 bytes.
        # A 10ms frame at 16kHz is 320 bytes. A 30ms frame is 960 bytes.

        # Ensure frame_length for VAD is correct (e.g. 20ms at 16kHz mono = 640 bytes)
        # This constant could be defined at class or module level.
        # For a 20ms frame, which is typical for voice packets:
        frame_length_for_vad_20ms = (SAMPLE_RATE // 1000) * 20 * CHANNELS * SAMPLE_WIDTH

        if (
            len(pcm_data) % frame_length_for_vad_20ms != 0 and len(pcm_data) > 0
        ):  # Check if it's a multiple, or handle if not.
            # This might happen if the converted chunk size isn't exactly what VAD expects per call.
            # For now, we'll try to process it. A more robust solution might buffer/segment pcm_data
            # into exact 10, 20, or 30ms chunks for VAD.
            # print(f"Warning: PCM data length {len(pcm_data)} after conversion is not an exact multiple of VAD frame size {frame_length_for_vad_20ms} for User {user_id}. Trying to process.")
            pass  # Continue, VAD might handle it or error.

        # Process VAD in chunks if pcm_data is longer than one VAD frame.
        # For simplicity, assume pcm_data is one processable chunk for now.
        # If pcm_data can be multiple VAD frames, iterate through it.
        # Current VAD logic processes the whole pcm_data chunk at once.
        # This is okay if pcm_data is already a single VAD frame (e.g. 20ms).

        if entry["vad"]:
            try:
                # Ensure pcm_data is a valid frame for VAD (e.g. 10, 20, 30 ms)
                # If pcm_data is, for example, 640 bytes (20ms at 16kHz mono), it's fine.
                if len(pcm_data) == frame_length_for_vad_20ms:  # Common case
                    is_speech = entry["vad"].is_speech(pcm_data, SAMPLE_RATE)
                elif (
                    len(pcm_data) > 0
                ):  # If not standard, but has data, try (might error)
                    # print(f"VAD processing for User {user_id} with non-standard PCM length {len(pcm_data)}. May error.")
                    # This path is risky if VAD is strict. For now, we assume it's handled or errors.
                    # A robust way: segment pcm_data into valid VAD frames.
                    # For now, assume the chunk from conversion is one such frame.
                    is_speech = entry["vad"].is_speech(
                        pcm_data, SAMPLE_RATE
                    )  # This might fail if len is not 10/20/30ms worth
                else:  # No data
                    is_speech = False

            except Exception as e:  # webrtcvad can raise errors on invalid frame length
                # print(f"VAD error for User {user_id} with PCM length {len(pcm_data)}: {e}. Defaulting to speech=True for this frame.")
                is_speech = True  # Fallback: if VAD fails, assume it's speech
        else:  # No VAD
            is_speech = True

        if is_speech:
            entry["buffer"].extend(pcm_data)
            entry["speaking"] = True
            entry["silent_frames"] = 0
            entry["speech_frames"] += 1
            if entry["speech_frames"] >= MAX_SPEECH_FRAMES:
                # print(f"Max speech frames reached for User {user_id}. Processing segment.")
                self.cog.bot.loop.create_task(
                    self.cog.process_audio_segment(
                        user_id, bytes(entry["buffer"]), self.voice_client.guild
                    )
                )
                entry["buffer"].clear()
                entry["speaking"] = False
                entry["speech_frames"] = 0
        elif entry["speaking"]:  # Was speaking, now silence
            entry["buffer"].extend(pcm_data)  # Add this last silent frame for context
            entry["silent_frames"] += 1
            if entry["silent_frames"] >= SILENCE_THRESHOLD_FRAMES:
                # print(f"Silence threshold reached for User {user_id}. Processing segment.")
                self.cog.bot.loop.create_task(
                    self.cog.process_audio_segment(
                        user_id, bytes(entry["buffer"]), self.voice_client.guild
                    )
                )
                entry["buffer"].clear()
                entry["speaking"] = False
                entry["speech_frames"] = 0
                entry["silent_frames"] = 0
        # If not is_speech and not entry['speaking'], do nothing (ignore silence)

    def cleanup(self):
        print("VoiceAudioSink cleanup called.")
        # Iterate over a copy of items if modifications occur, or handle user_id directly
        for user_id, data_entry in list(self.user_audio_data.items()):
            if data_entry["buffer"]:
                # user object is not directly available here, but process_audio_segment takes user_id
                # We need the guild, which should be available from self.voice_client
                if self.voice_client and self.voice_client.guild:
                    guild = self.voice_client.guild
                    print(
                        f"Processing remaining audio for User ID {user_id} on cleanup."
                    )
                    self.cog.bot.loop.create_task(
                        self.cog.process_audio_segment(
                            user_id, bytes(data_entry["buffer"]), guild
                        )
                    )
                else:
                    print(
                        f"Cannot process remaining audio for User ID {user_id}: voice_client or guild not available."
                    )
        self.user_audio_data.clear()


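# Sketch (assumption, not wired into the sink above): webrtcvad only accepts frames of
# exactly 10, 20, or 30ms, so converted chunks of other sizes would need re-framing
# before calling is_speech(), for example:
def _split_into_vad_frames(pcm: bytes, frame_bytes: int = BYTES_PER_FRAME):
    """Yield consecutive VAD-sized frames from pcm, dropping any trailing partial frame."""
    usable = len(pcm) - (len(pcm) % frame_bytes)
    for offset in range(0, usable, frame_bytes):
        yield pcm[offset : offset + frame_bytes]

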
class VoiceGatewayCog(commands.Cog):
    def __init__(self, bot):
        self.bot = bot
        self.active_sinks = {}  # guild_id: VoiceAudioSink
        self.dedicated_voice_text_channels: dict[int, int] = {}  # guild_id: channel_id
        self.speech_client = None
        if speech:
            try:
                self.speech_client = speech.SpeechClient()
                print("Google Cloud Speech client initialized successfully.")
            except Exception as e:
                print(
                    f"Error initializing Google Cloud Speech client: {e}. STT will not be available."
                )
                self.speech_client = None
        else:
            print(
                "Google Cloud Speech library not available. STT functionality will be disabled."
            )

    async def _ensure_dedicated_voice_text_channel(
        self, guild: discord.Guild, voice_channel: discord.VoiceChannel
    ) -> Optional[discord.TextChannel]:
        if not GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED:
            return None

        existing_channel_id = self.dedicated_voice_text_channels.get(guild.id)
        if existing_channel_id:
            channel = guild.get_channel(existing_channel_id)
            if channel and isinstance(channel, discord.TextChannel):
                print(
                    f"Found existing dedicated voice text channel: {channel.name} ({channel.id})"
                )
                return channel
            else:
                print(
                    f"Dedicated voice text channel ID {existing_channel_id} for guild {guild.id} is invalid or not found. Will create a new one."
                )
                del self.dedicated_voice_text_channels[guild.id]  # Remove invalid ID

        # Create new channel
        channel_name = GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_NAME_TEMPLATE.format(
            voice_channel_name=voice_channel.name,
            guild_name=guild.name,
            # Add more placeholders if needed
        )
        # Sanitize channel name (Discord has restrictions)
        channel_name = "".join(
            c for c in channel_name if c.isalnum() or c in ["-", "_", " "]
        ).strip()
        channel_name = channel_name.replace(" ", "-").lower()
        if not channel_name:  # Fallback if template results in empty string
            channel_name = "gurt-voice-chat"

        # Check if a channel with this name already exists (to avoid duplicates if bot restarted without proper cleanup)
        for existing_guild_channel in guild.text_channels:
            if existing_guild_channel.name == channel_name:
                print(
                    f"Found existing channel by name '{channel_name}' ({existing_guild_channel.id}). Reusing."
                )
                self.dedicated_voice_text_channels[guild.id] = existing_guild_channel.id
                # Optionally update topic and permissions if needed
                try:
                    if (
                        existing_guild_channel.topic
                        != GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC
                    ):
                        await existing_guild_channel.edit(
                            topic=GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC
                        )
                    # Send initial message if channel is empty or last message isn't the initial one
                    async for last_message in existing_guild_channel.history(limit=1):
                        if (
                            last_message.content
                            != GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE
                        ):
                            await existing_guild_channel.send(
                                GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE
                            )
                        break  # Only need the very last message
                    else:  # No messages in channel
                        await existing_guild_channel.send(
                            GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE
                        )

                except discord.Forbidden:
                    print(
                        f"Missing permissions to update reused dedicated channel {channel_name}"
                    )
                except Exception as e_reuse:
                    print(
                        f"Error updating reused dedicated channel {channel_name}: {e_reuse}"
                    )
                return existing_guild_channel

        overwrites = {
            guild.me: discord.PermissionOverwrite(
                read_messages=True, send_messages=True, manage_messages=True
            ),  # GURT needs to manage
            guild.default_role: discord.PermissionOverwrite(
                read_messages=False, send_messages=False
            ),  # Private by default
            # Consider adding server admins/mods with read/send permissions
        }
        # Add owner and admins with full perms to the channel
        if guild.owner:
            overwrites[guild.owner] = discord.PermissionOverwrite(
                read_messages=True,
                send_messages=True,
                manage_channels=True,
                manage_messages=True,
            )
        for role in guild.roles:
            if (
                role.permissions.administrator and not role.is_default()
            ):  # Check for admin roles
                overwrites[role] = discord.PermissionOverwrite(
                    read_messages=True,
                    send_messages=True,
                    manage_channels=True,
                    manage_messages=True,
                )

        try:
            print(f"Creating new dedicated voice text channel: {channel_name}")
            new_channel = await guild.create_text_channel(
                name=channel_name,
                overwrites=overwrites,
                topic=GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_TOPIC,
                reason="GURT Dedicated Voice Chat Channel",
            )
            self.dedicated_voice_text_channels[guild.id] = new_channel.id
            if GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE:
                await new_channel.send(
                    GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_INITIAL_MESSAGE
                )
            print(
                f"Created dedicated voice text channel: {new_channel.name} ({new_channel.id})"
            )
            return new_channel
        except discord.Forbidden:
            print(
                f"Forbidden: Could not create dedicated voice text channel '{channel_name}' in guild {guild.name}."
            )
            return None
        except Exception as e:
            print(f"Error creating dedicated voice text channel '{channel_name}': {e}")
            return None

    def get_dedicated_text_channel_for_guild(
        self, guild_id: int
    ) -> Optional[discord.TextChannel]:
        channel_id = self.dedicated_voice_text_channels.get(guild_id)
        if channel_id:
            guild = self.bot.get_guild(guild_id)
            if guild:
                channel = guild.get_channel(channel_id)
                if isinstance(channel, discord.TextChannel):
                    return channel
        return None

    async def cog_load(self):
        print("VoiceGatewayCog loaded!")

    async def cog_unload(self):
        print("Unloading VoiceGatewayCog...")
        # Disconnect from all voice channels and clean up sinks
        for vc in list(self.bot.voice_clients):  # Iterate over a copy
            guild_id = vc.guild.id
            if guild_id in self.active_sinks:
                if (
                    vc.is_connected()
                    and hasattr(vc, "is_listening")
                    and vc.is_listening()
                ):
                    if hasattr(vc, "stop_listening"):
                        vc.stop_listening()
                    else:  # Or equivalent for VoiceRecvClient
                        pass
                self.active_sinks[guild_id].cleanup()
                del self.active_sinks[guild_id]

            # Handle dedicated text channel cleanup on cog unload
            if (
                GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED
                and GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_CLEANUP_ON_LEAVE
            ):
                dedicated_channel_id = self.dedicated_voice_text_channels.get(guild_id)
                if dedicated_channel_id:
                    try:
                        channel_to_delete = vc.guild.get_channel(
                            dedicated_channel_id
                        ) or await self.bot.fetch_channel(dedicated_channel_id)
                        if channel_to_delete:
                            print(
                                f"Deleting dedicated voice text channel {channel_to_delete.name} ({channel_to_delete.id}) during cog unload."
                            )
                            await channel_to_delete.delete(
                                reason="GURT VoiceGatewayCog unload"
                            )
                    except discord.NotFound:
                        print(
                            f"Dedicated voice text channel {dedicated_channel_id} not found for deletion during unload."
                        )
                    except discord.Forbidden:
                        print(
                            f"Forbidden: Could not delete dedicated voice text channel {dedicated_channel_id} during unload."
                        )
                    except Exception as e:
                        print(
                            f"Error deleting dedicated voice text channel {dedicated_channel_id} during unload: {e}"
                        )
                    if guild_id in self.dedicated_voice_text_channels:
                        del self.dedicated_voice_text_channels[guild_id]

            if vc.is_connected():
                await vc.disconnect(force=True)
        print("VoiceGatewayCog unloaded and disconnected from voice channels.")

    async def connect_to_voice(self, channel: discord.VoiceChannel):
        """Connects the bot to a specified voice channel and starts listening."""
        if not channel:
            return None, "Channel not provided."

        guild = channel.guild
        voice_client = guild.voice_client

        if voice_client and voice_client.is_connected():
            if voice_client.channel == channel:
                print(f"Already connected to {channel.name} in {guild.name}.")
                if isinstance(voice_client, voice_recv.VoiceRecvClient):
                    if (
                        guild.id not in self.active_sinks
                        or not voice_client.is_listening()
                    ):
                        self.start_listening_for_vc(voice_client)
                    # Ensure dedicated channel is set up even if already connected
                    await self._ensure_dedicated_voice_text_channel(guild, channel)
                else:
                    print(f"Reconnecting with VoiceRecvClient to {channel.name}.")
                    await voice_client.disconnect(force=True)
                    try:
                        voice_client = await channel.connect(
                            cls=voice_recv.VoiceRecvClient, timeout=10.0
                        )
                        print(
                            f"Reconnected to {channel.name} in {guild.name} with VoiceRecvClient."
                        )
                        self.start_listening_for_vc(voice_client)
                        await self._ensure_dedicated_voice_text_channel(guild, channel)
                    except asyncio.TimeoutError:
                        return (
                            None,
                            f"Timeout trying to reconnect to {channel.name} with VoiceRecvClient.",
                        )
                    except Exception as e:
                        return (
                            None,
                            f"Error reconnecting to {channel.name} with VoiceRecvClient: {str(e)}",
                        )
                return voice_client, "Already connected to this channel."
            else:
                print(
                    f"Moving to {channel.name} in {guild.name}. Reconnecting with VoiceRecvClient."
                )
                await voice_client.disconnect(
                    force=True
                )  # This will trigger cleanup for old channel's dedicated text channel if configured
                try:
                    voice_client = await channel.connect(
                        cls=voice_recv.VoiceRecvClient, timeout=10.0
                    )
                    print(
                        f"Moved and reconnected to {channel.name} in {guild.name} with VoiceRecvClient."
                    )
                    self.start_listening_for_vc(voice_client)
                    await self._ensure_dedicated_voice_text_channel(guild, channel)
                except asyncio.TimeoutError:
                    return (
                        None,
                        f"Timeout trying to move and connect to {channel.name}.",
                    )
                except Exception as e:
                    return (
                        None,
                        f"Error moving and connecting to {channel.name}: {str(e)}",
                    )
        else:
            try:
                voice_client = await channel.connect(
                    cls=voice_recv.VoiceRecvClient, timeout=10.0
                )
                print(
                    f"Connected to {channel.name} in {guild.name} with VoiceRecvClient."
                )
                self.start_listening_for_vc(voice_client)
                await self._ensure_dedicated_voice_text_channel(guild, channel)
            except asyncio.TimeoutError:
                return None, f"Timeout trying to connect to {channel.name}."
            except Exception as e:
                return None, f"Error connecting to {channel.name}: {str(e)}"

        if not voice_client:
            return None, "Failed to establish voice client after connection."

        return voice_client, f"Successfully connected and listening in {channel.name}."
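
    # Illustrative usage only (assumption, not part of this cog): another cog or command
    # could hand control to connect_to_voice roughly like this:
    #
    #   @commands.command(name="joinvc")
    #   async def joinvc(self, ctx: commands.Context):
    #       gateway = ctx.bot.get_cog("VoiceGatewayCog")
    #       if gateway and ctx.author.voice and ctx.author.voice.channel:
    #           vc, msg = await gateway.connect_to_voice(ctx.author.voice.channel)
    #           await ctx.send(msg)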

    def start_listening_for_vc(self, voice_client: voice_recv.VoiceRecvClient):
        """Starts or restarts listening for a given voice client."""
        guild_id = voice_client.guild.id
        if guild_id in self.active_sinks:
            # If sink exists, ensure it's clean and listening is (re)started
            if voice_client.is_listening():
                voice_client.stop_listening()  # Stop previous listening if any
            self.active_sinks[guild_id].cleanup()  # Clean old state
            # Re-initialize or ensure the sink is fresh for the current VC
            self.active_sinks[guild_id] = VoiceAudioSink(self)
        else:
            self.active_sinks[guild_id] = VoiceAudioSink(self)

        if not voice_client.is_listening():
            voice_client.listen(self.active_sinks[guild_id])
            print(
                f"Started listening in {voice_client.channel.name} for guild {guild_id}"
            )
        else:
            print(
                f"Already listening in {voice_client.channel.name} for guild {guild_id}"
            )

    async def disconnect_from_voice(self, guild: discord.Guild):
        """Disconnects the bot from the voice channel in the given guild."""
        voice_client = guild.voice_client
        if voice_client and voice_client.is_connected():
            if hasattr(voice_client, "is_listening") and voice_client.is_listening():
                voice_client.stop_listening()

            guild_id = guild.id
            if guild_id in self.active_sinks:
                self.active_sinks[guild_id].cleanup()
                del self.active_sinks[guild_id]

            # Handle dedicated text channel cleanup
            if (
                GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_ENABLED
                and GurtConfig.VOICE_DEDICATED_TEXT_CHANNEL_CLEANUP_ON_LEAVE
            ):
                dedicated_channel_id = self.dedicated_voice_text_channels.get(guild_id)
                if dedicated_channel_id:
                    try:
                        channel_to_delete = guild.get_channel(
                            dedicated_channel_id
                        ) or await self.bot.fetch_channel(dedicated_channel_id)
                        if channel_to_delete:
                            print(
                                f"Deleting dedicated voice text channel {channel_to_delete.name} ({channel_to_delete.id})."
                            )
                            await channel_to_delete.delete(
                                reason="GURT disconnected from voice channel"
                            )
                    except discord.NotFound:
                        print(
                            f"Dedicated voice text channel {dedicated_channel_id} not found for deletion."
                        )
                    except discord.Forbidden:
                        print(
                            f"Forbidden: Could not delete dedicated voice text channel {dedicated_channel_id}."
                        )
                    except Exception as e:
                        print(
                            f"Error deleting dedicated voice text channel {dedicated_channel_id}: {e}"
                        )
                    if guild_id in self.dedicated_voice_text_channels:
                        del self.dedicated_voice_text_channels[guild_id]

            await voice_client.disconnect(force=True)
            print(f"Disconnected from voice in {guild.name}.")
            return True, f"Disconnected from voice in {guild.name}."
        return False, "Not connected to voice in this guild."

    async def play_audio_file(
        self, voice_client: discord.VoiceClient, audio_file_path: str
    ):
        """Plays an audio file in the voice channel."""
        if not voice_client or not voice_client.is_connected():
            print("Error: Voice client not connected.")
            return False, "Voice client not connected."

        if not os.path.exists(audio_file_path):
            print(f"Error: Audio file not found at {audio_file_path}")
            return False, "Audio file not found."

        if voice_client.is_playing():
            voice_client.stop()  # Stop current audio if any

        try:
            audio_source = discord.FFmpegPCMAudio(audio_file_path, **FFMPEG_OPTIONS)
            voice_client.play(
                audio_source,
                after=lambda e: self.after_audio_playback(e, audio_file_path),
            )
            print(f"Playing audio: {audio_file_path}")
            return True, f"Playing {os.path.basename(audio_file_path)}"
        except Exception as e:
            print(
                f"Error creating/playing FFmpegPCMAudio source for {audio_file_path}: {e}"
            )
            return False, f"Error playing audio: {str(e)}"

    def after_audio_playback(self, error, audio_file_path):
        if error:
            print(f"Error during audio playback for {audio_file_path}: {error}")
        else:
            print(f"Finished playing {audio_file_path}")
        # TTSProviderCog's cleanup will handle deleting the file.

    # Removed start_listening_pipeline as the sink now handles more logic directly or via tasks.

    async def process_audio_segment(
        self, user_id: int, audio_data: bytes, guild: discord.Guild
    ):
        """Processes a segment of audio data using Google Cloud Speech-to-Text."""
        if not self.speech_client or not audio_data:
            if not audio_data:
                print(
                    f"process_audio_segment called for user {user_id} with empty audio_data."
                )
            return

        try:
            recognition_config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=SAMPLE_RATE,  # Defined as 16000
                language_code="en-US",
                enable_automatic_punctuation=True,
                model="telephony",  # Telephony model; remove this line to fall back to the default model
            )
            recognition_audio = speech.RecognitionAudio(content=audio_data)

            # Run in executor as it's a network call that can be blocking
            response = await self.bot.loop.run_in_executor(
                None,  # Default ThreadPoolExecutor
                functools.partial(
                    self.speech_client.recognize,
                    config=recognition_config,
                    audio=recognition_audio,
                ),
            )

            transcribed_text = ""
            for result in response.results:
                if result.alternatives:
                    transcribed_text += result.alternatives[0].transcript + " "

            transcribed_text = transcribed_text.strip()

            if transcribed_text:
                user = guild.get_member(user_id) or await self.bot.fetch_user(user_id)
                print(
                    f"Google STT for {user.name} ({user_id}) in {guild.name}: {transcribed_text}"
                )
                self.bot.dispatch(
                    "voice_transcription_received", guild, user, transcribed_text
                )

        except Exception as e:
            print(
                f"Error processing audio segment with Google STT for user {user_id}: {e}"
            )
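
    # Illustrative only (assumption): consumers of the "voice_transcription_received"
    # event dispatched above would listen for it in their own cog, e.g.:
    #
    #   @commands.Cog.listener()
    #   async def on_voice_transcription_received(self, guild, user, text):
    #       gateway = self.bot.get_cog("VoiceGatewayCog")
    #       channel = gateway.get_dedicated_text_channel_for_guild(guild.id) if gateway else None
    #       if channel:
    #           await channel.send(f"{user.display_name}: {text}")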


async def setup(bot: commands.Bot):
    # Check for FFmpeg before adding cog
    try:
        # Try running ffmpeg -version to check if it's installed and in PATH
        process = await asyncio.create_subprocess_shell(
            "ffmpeg -version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await process.communicate()
        if process.returncode == 0:
            print("FFmpeg found. VoiceGatewayCog can be loaded.")
            await bot.add_cog(VoiceGatewayCog(bot))
            print("VoiceGatewayCog loaded successfully!")
        else:
            print(
                "FFmpeg not found or not working correctly. VoiceGatewayCog will not be loaded."
            )
            print(f"FFmpeg check stdout: {stdout.decode(errors='ignore')}")
            print(f"FFmpeg check stderr: {stderr.decode(errors='ignore')}")

    except FileNotFoundError:
        print(
            "FFmpeg command not found. VoiceGatewayCog will not be loaded. Please install FFmpeg and ensure it's in your system's PATH."
        )
    except Exception as e:
        print(
            f"An error occurred while checking for FFmpeg: {e}. VoiceGatewayCog will not be loaded."
        )