feat(aimod): support full video and gif input

2025-06-06 22:11:21 +00:00 · 2025-06-06 22:11:21 +00:00 · ae25f85cb5
commit ae25f85cb5
parent a527346ae6
1 changed file with 24 additions and 88 deletions
--- a/cogs/aimod_cog.py
+++ b/cogs/aimod_cog.py
@@ -96,7 +96,7 @@ class AIModerationCog(commands.Cog):
        ]  # Added heic/heif for Vertex
        # Supported animated file extensions
        self.gif_extensions = [".gif"]
-        # Supported video file extensions (Vertex AI typically processes first frame of videos as image)
+        # Supported video file extensions (Vertex AI can process short video clips directly)
        self.video_extensions = [
            ".mp4",
            ".webm",
@@ -393,33 +393,11 @@ class AIModerationCog(commands.Cog):
            return None, None

    async def process_gif(self, attachment: discord.Attachment) -> tuple[str, bytes]:
-        """
-        Process a GIF attachment and extract the first frame.
-
-        Args:
-            attachment: The Discord attachment containing the GIF
-
-        Returns:
-            Tuple of (mime_type, image_bytes) of the first frame
-        """
+        """Return (mime_type, raw_bytes) for a GIF attachment, or (None, None) on error."""
        try:
-            # Download the GIF
            gif_bytes = await attachment.read()
-
-            # Open the GIF using PIL
-            with Image.open(io.BytesIO(gif_bytes)) as gif:
-                # Convert to RGB if needed
-                if gif.mode != "RGB":
-                    first_frame = gif.convert("RGB")
-                else:
-                    first_frame = gif
-
-                # Save the first frame to a bytes buffer
-                output = io.BytesIO()
-                first_frame.save(output, format="JPEG")
-                output.seek(0)
-
-                return "image/jpeg", output.getvalue()
+            mime_type = attachment.content_type or "image/gif"
+            return mime_type, gif_bytes
        except Exception as e:
            print(f"Error processing GIF: {e}")
            return None, None
@@ -459,54 +437,11 @@ class AIModerationCog(commands.Cog):
            return None, None, None

    async def process_video(self, attachment: discord.Attachment) -> tuple[str, bytes]:
-        """
-        Process a video attachment and extract the first frame.
-
-        Args:
-            attachment: The Discord attachment containing the video
-
-        Returns:
-            Tuple of (mime_type, image_bytes) of the first frame
-        """
+        """Return (mime_type, raw_bytes) for a video attachment, or (None, None) on error."""
        try:
-            # Download the video to a temporary file
            video_bytes = await attachment.read()
-            with tempfile.NamedTemporaryFile(
-                suffix=os.path.splitext(attachment.filename)[1], delete=False
-            ) as temp_file:
-                temp_file_path = temp_file.name
-                temp_file.write(video_bytes)
-
-            try:
-                # Open the video with OpenCV
-                cap = cv2.VideoCapture(temp_file_path)
-                ret, frame = cap.read()
-
-                if not ret:
-                    print(f"Failed to read frame from video: {attachment.filename}")
-                    return None, None
-
-                # Convert BGR to RGB (OpenCV uses BGR by default)
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-
-                # Convert to PIL Image
-                pil_image = Image.fromarray(frame_rgb)
-
-                # Save to bytes buffer
-                output = io.BytesIO()
-                pil_image.save(output, format="JPEG")
-                output.seek(0)
-
-                # Clean up
-                cap.release()
-
-                return "image/jpeg", output.getvalue()
-            finally:
-                # Clean up the temporary file
-                try:
-                    os.unlink(temp_file_path)
-                except Exception as e:
-                    print(f"Error removing temporary file: {e}")
+            mime_type = attachment.content_type or "video/mp4"
+            return mime_type, video_bytes
        except Exception as e:
            print(f"Error processing video: {e}")
            return None, None
@@ -1102,20 +1037,20 @@ Server Rules:
 ---

 Context Provided:
-You will receive the following information to aid your analysis:
+You will receive the following information to aid your analysis:
 - User's Server Role: (e.g., "Server Owner", "Admin", "Moderator", "Member").
 - Channel Category: The name of the category the channel belongs to.
 - Channel Age-Restricted/NSFW (Discord Setting): Boolean (true/false).
 - Replied-to Message: If the current message is a reply, the content of the original message will be provided. This is crucial for understanding direct interactions.
 - Recent Channel History: The last few messages in the channel to understand the flow of conversation.
- Attached Media: If the message contains image, GIF, or video attachments, they will be provided as image_url objects in the content array. For GIFs and videos, only the first frame is extracted.
+- Attached Media: If the message contains image, GIF, or video attachments, they will be provided directly in the content array for analysis.

 Instructions:
 1. Review the "Message Content" and any attached media against EACH rule, considering ALL provided context (User Role, Channel Info, Replied-to Message, Recent Channel History).
   - The "Channel Age-Restricted/NSFW (Discord Setting)" is the definitive indicator for NSFW content by Discord.
   - The "Channel Category" provides general context.
   - **"Replied-to Message" and "Recent Channel History" are vital for understanding banter, jokes, and ongoing discussions. A statement that seems offensive in isolation might be acceptable within the flow of conversation or as a direct reply.**
-   - If images, GIFs, or videos are attached, analyze ALL of them for rule violations. For GIFs and videos, only the first frame is provided.
+   - If images, GIFs, or videos are attached, analyze ALL of them for rule violations.
   - Pay special attention to images that may contain NSFW content, pornography, gore, or other prohibited visual content.
   - If multiple attachments are present, a violation in ANY of them should be flagged.
 2. Determine if ANY rule is violated. When evaluating, consider the server's culture where **extremely edgy, dark, and sexual humor, including potentially offensive jokes (e.g., rape jokes, saying you want to be raped), are common and generally permissible IF THEY ARE CLEARLY JOKES, part of an established banter, or a direct non-malicious reply, and not targeted harassment or explicit rule violations.**
@@ -1416,11 +1351,8 @@ CRITICAL: Do NOT output anything other than the required JSON response.
        if image_data_list:
            for mime_type, image_bytes, attachment_type, filename in image_data_list:
                try:
-                    # Vertex AI directly supports these mime types for images.
-                    # For video, it processes the first frame.
-                    # Ensure mime_type is one of the supported ones by Vertex, e.g., image/png, image/jpeg, etc.
-                    # Common image types are generally fine.
-                    # For video, the extracted frame is JPEG.
+                    # Vertex AI directly supports common image and video MIME types.
+                    # Only forward a mime_type that Vertex supports, e.g., image/png or video/mp4.
                    supported_image_mimes = [
                        "image/png",
                        "image/jpeg",
@@ -1429,26 +1361,30 @@ CRITICAL: Do NOT output anything other than the required JSON response.
                        "image/heif",
                        "image/gif",
                    ]
+                    supported_video_mimes = [
+                        "video/mp4",
+                        "video/webm",
+                        "video/quicktime",
+                        "video/x-msvideo",
+                        "video/x-matroska",
+                        "video/x-flv",
+                    ]
                    clean_mime_type = mime_type.split(";")[0].lower()

                    if (
                        clean_mime_type in supported_image_mimes
-                        or attachment_type == "video"
-                    ):  # Video frame is jpeg
+                        or clean_mime_type in supported_video_mimes
+                    ):
                        vertex_parts.append(
                            types.Part(
                                inline_data=types.Blob(
                                    data=image_bytes,
-                                    mime_type=(
-                                        clean_mime_type
-                                        if clean_mime_type in supported_image_mimes
-                                        else "image/jpeg"
-                                    ),
+                                    mime_type=clean_mime_type,
                                )
                            )
                        )
                        print(
-                            f"Added attachment {filename} ({attachment_type} as {clean_mime_type if clean_mime_type in supported_image_mimes else 'image/jpeg'}) to Vertex prompt"
+                            f"Added attachment {filename} ({attachment_type}) with MIME {clean_mime_type} to Vertex prompt"
                        )
                    else:
                        print(