feat(aimod): support full video and gif input

This commit is contained in:
Codex 2025-06-06 22:11:21 +00:00 committed by Slipstream
parent a527346ae6
commit ae25f85cb5
Signed by: slipstream
GPG Key ID: 13E498CE010AC6FD

View File

@ -96,7 +96,7 @@ class AIModerationCog(commands.Cog):
] # Added heic/heif for Vertex
# Supported animated file extensions
self.gif_extensions = [".gif"]
# Supported video file extensions (Vertex AI typically processes first frame of videos as image)
# Supported video file extensions (Vertex AI can process short video clips directly)
self.video_extensions = [
".mp4",
".webm",
@ -393,33 +393,11 @@ class AIModerationCog(commands.Cog):
return None, None
async def process_gif(self, attachment: discord.Attachment) -> tuple[str, bytes]:
    """Return the raw bytes for a GIF attachment.

    The GIF is forwarded unmodified (no first-frame extraction or JPEG
    re-encoding), so the complete animation is available for analysis.

    Args:
        attachment: The Discord attachment containing the GIF.

    Returns:
        Tuple of (mime_type, gif_bytes). ``mime_type`` is the attachment's
        reported content type, or "image/gif" when none is reported.
        If downloading fails, the error is printed and (None, None) is
        returned instead of raising.
    """
    try:
        gif_bytes = await attachment.read()
        # content_type may be missing on an attachment; fall back to the
        # GIF MIME type so the returned pair always carries one.
        mime_type = attachment.content_type or "image/gif"
        return mime_type, gif_bytes
    except Exception as e:
        # Best-effort: a failed download must not abort the caller.
        print(f"Error processing GIF: {e}")
        return None, None
@ -459,54 +437,11 @@ class AIModerationCog(commands.Cog):
return None, None, None
async def process_video(self, attachment: discord.Attachment) -> tuple[str, bytes]:
    """Return the raw bytes for a video attachment.

    The video is forwarded unmodified: no temporary file is written and no
    frame is decoded, so the full clip is available for analysis.

    Args:
        attachment: The Discord attachment containing the video.

    Returns:
        Tuple of (mime_type, video_bytes). ``mime_type`` is the attachment's
        reported content type, or "video/mp4" when none is reported.
        If downloading fails, the error is printed and (None, None) is
        returned instead of raising.
    """
    try:
        video_bytes = await attachment.read()
        # content_type may be missing on an attachment; fall back to MP4
        # so the returned pair always carries a MIME type.
        mime_type = attachment.content_type or "video/mp4"
        return mime_type, video_bytes
    except Exception as e:
        # Best-effort: a failed download must not abort the caller.
        print(f"Error processing video: {e}")
        return None, None
@ -1102,20 +1037,20 @@ Server Rules:
---
Context Provided:
You will receive the following information to aid your analysis:
You will receive the following information to aid your analysis:
- User's Server Role: (e.g., "Server Owner", "Admin", "Moderator", "Member").
- Channel Category: The name of the category the channel belongs to.
- Channel Age-Restricted/NSFW (Discord Setting): Boolean (true/false).
- Replied-to Message: If the current message is a reply, the content of the original message will be provided. This is crucial for understanding direct interactions.
- Recent Channel History: The last few messages in the channel to understand the flow of conversation.
- Attached Media: If the message contains image, GIF, or video attachments, they will be provided as image_url objects in the content array. For GIFs and videos, only the first frame is extracted.
- Attached Media: If the message contains image, GIF, or video attachments, they will be provided directly in the content array for analysis.
Instructions:
1. Review the "Message Content" and any attached media against EACH rule, considering ALL provided context (User Role, Channel Info, Replied-to Message, Recent Channel History).
- The "Channel Age-Restricted/NSFW (Discord Setting)" is the definitive indicator for NSFW content by Discord.
- The "Channel Category" provides general context.
- **"Replied-to Message" and "Recent Channel History" are vital for understanding banter, jokes, and ongoing discussions. A statement that seems offensive in isolation might be acceptable within the flow of conversation or as a direct reply.**
- If images, GIFs, or videos are attached, analyze ALL of them for rule violations. For GIFs and videos, only the first frame is provided.
- If images, GIFs, or videos are attached, analyze ALL of them for rule violations.
- Pay special attention to images that may contain NSFW content, pornography, gore, or other prohibited visual content.
- If multiple attachments are present, a violation in ANY of them should be flagged.
2. Determine if ANY rule is violated. When evaluating, consider the server's culture where **extremely edgy, dark, and sexual humor, including potentially offensive jokes (e.g., rape jokes, saying you want to be raped), are common and generally permissible IF THEY ARE CLEARLY JOKES, part of an established banter, or a direct non-malicious reply, and not targeted harassment or explicit rule violations.**
@ -1416,11 +1351,8 @@ CRITICAL: Do NOT output anything other than the required JSON response.
if image_data_list:
for mime_type, image_bytes, attachment_type, filename in image_data_list:
try:
# Vertex AI directly supports these mime types for images.
# For video, it processes the first frame.
# Ensure mime_type is one of the supported ones by Vertex, e.g., image/png, image/jpeg, etc.
# Common image types are generally fine.
# For video, the extracted frame is JPEG.
# Vertex AI directly supports common image and video MIME types.
# Ensure mime_type is one of the supported ones by Vertex, e.g., image/png or video/mp4.
supported_image_mimes = [
"image/png",
"image/jpeg",
@ -1429,26 +1361,30 @@ CRITICAL: Do NOT output anything other than the required JSON response.
"image/heif",
"image/gif",
]
supported_video_mimes = [
"video/mp4",
"video/webm",
"video/quicktime",
"video/x-msvideo",
"video/x-matroska",
"video/x-flv",
]
clean_mime_type = mime_type.split(";")[0].lower()
if (
clean_mime_type in supported_image_mimes
or attachment_type == "video"
): # Video frame is jpeg
or clean_mime_type in supported_video_mimes
):
vertex_parts.append(
types.Part(
inline_data=types.Blob(
data=image_bytes,
mime_type=(
clean_mime_type
if clean_mime_type in supported_image_mimes
else "image/jpeg"
),
mime_type=clean_mime_type,
)
)
)
print(
f"Added attachment {filename} ({attachment_type} as {clean_mime_type if clean_mime_type in supported_image_mimes else 'image/jpeg'}) to Vertex prompt"
f"Added attachment {filename} ({attachment_type}) with MIME {clean_mime_type} to Vertex prompt"
)
else:
print(