Applying previous commit.

2025-06-08 21:40:27 +00:00 · 2025-06-08 21:40:27 +00:00 · d8707f775e
commit d8707f775e
parent 30bf6c28e1
2 changed files with 135 additions and 114 deletions
--- a/cogs/aimod_cog.py
+++ b/cogs/aimod_cog.py
@ -52,6 +52,7 @@ from .aimod_config import (
    get_user_appeals,
    add_user_appeal,
    SERVER_RULES,
+    MODERATION_INSTRUCTIONS,
    SUICIDAL_HELP_RESOURCES,
 )

@ -788,6 +789,58 @@ class AIModerationCog(commands.Cog):
            "Server rules have been reset to the default.", ephemeral=False
        )

+    @config_subgroup.command(
+        name="update_instructions",
+        description="Update moderation instructions from the specified channel.",
+    )
+    @app_commands.describe(
+        channel="The channel containing the moderation instructions."
+    )
+    @app_commands.checks.has_permissions(administrator=True)
+    async def update_instructions(
+        self, interaction: discord.Interaction, channel: discord.TextChannel
+    ) -> None:
+        """Pull moderation instructions from a channel and update the global config.
+
+        Reads the channel oldest-first, collects the text of every message
+        that has content, joins them with blank lines and assigns the result
+        to ``aimod_config_module.MODERATION_INSTRUCTIONS``. This method only
+        rebinds that module attribute; it makes no call to persist the value.
+
+        Args:
+            interaction: The slash-command interaction to respond to.
+            channel: The text channel whose messages hold the instructions.
+        """
+        messages = []
+        # Fetch one message more than the cap so that exceeding the cap can be
+        # detected without reading the whole channel.
+        async for msg in channel.history(
+            limit=MAX_RULE_MESSAGES + 1, oldest_first=True
+        ):
+            # Messages without text content are skipped and do not count
+            # towards the cap.
+            if msg.content:
+                messages.append(msg.content)
+            if len(messages) > MAX_RULE_MESSAGES:
+                await interaction.response.send_message(
+                    f"Channel has more than {MAX_RULE_MESSAGES} messages."
+                    " Please consolidate your instructions into fewer messages.",
+                    ephemeral=True,
+                )
+                return
+
+        # Reached when the channel is empty or when none of the fetched
+        # messages had text content.
+        if not messages:
+            await interaction.response.send_message(
+                "No messages found in that channel.", ephemeral=True
+            )
+            return
+
+        # NOTE(review): the history fetch above runs before the interaction is
+        # acknowledged; if it can be slow, a deferred response may be needed
+        # - confirm against the other config commands in this cog.
+        instructions_text = "\n\n".join(messages).strip()
+        aimod_config_module.MODERATION_INSTRUCTIONS = instructions_text
+        await interaction.response.send_message(
+            f"Moderation instructions updated from {channel.mention}.", ephemeral=False
+        )
+
+    @config_subgroup.command(
+        name="reset_instructions",
+        description="Reset moderation instructions to the default version.",
+    )
+    @app_commands.checks.has_permissions(administrator=True)
+    async def reset_instructions(self, interaction: discord.Interaction) -> None:
+        """Reset moderation instructions to the default string.
+
+        Rebinds ``aimod_config_module.MODERATION_INSTRUCTIONS`` to
+        ``aimod_config_module.DEFAULT_MODERATION_INSTRUCTIONS`` and confirms
+        with a non-ephemeral message.
+
+        Args:
+            interaction: The slash-command interaction to respond to.
+        """
+        # Assign on the module object (not a name imported into this file) so
+        # that later reads of aimod_config_module.MODERATION_INSTRUCTIONS see
+        # the new value.
+        aimod_config_module.MODERATION_INSTRUCTIONS = (
+            aimod_config_module.DEFAULT_MODERATION_INSTRUCTIONS
+        )
+        await interaction.response.send_message(
+            "Moderation instructions have been reset to the default.", ephemeral=False
+        )
+
    @infractions_subgroup.command(
        name="view",
        description="View a user's AI moderation infraction history (mod/admin only).",
@ -1359,120 +1412,72 @@ class AIModerationCog(commands.Cog):
            return None

        # Construct the prompt for the AI model (system prompt is largely the same)
-        system_prompt_text = f"""You are an AI moderation assistant for a Discord server.
-Your primary function is to analyze message content and attached media based STRICTLY on the server rules provided below, using all available context.
-
-Server Rules:
---
-{SERVER_RULES}
---
-
-Context Provided:
-        You will receive the following information to aid your analysis:
- User's Server Role: (e.g., "Server Owner", "Admin", "Moderator", "Member").
- Channel Category: The name of the category the channel belongs to.
- Channel Age-Restricted/NSFW (Discord Setting): Boolean (true/false).
- Replied-to Message: If the current message is a reply, the content of the original message will be provided. This is crucial for understanding direct interactions.
- Recent Channel History: The last few messages in the channel to understand the flow of conversation.
-        - Attached Media: If the message contains image, GIF, or video attachments, they will be provided directly in the content array for analysis.
-
-Instructions:
-1. Review the "Message Content" and any attached media against EACH rule, considering ALL provided context (User Role, Channel Info, Replied-to Message, Recent Channel History).
-   - The "Channel Age-Restricted/NSFW (Discord Setting)" is the definitive indicator for NSFW content by Discord.
-   - The "Channel Category" provides general context.
-   - **"Replied-to Message" and "Recent Channel History" are vital for understanding banter, jokes, and ongoing discussions. A statement that seems offensive in isolation might be acceptable within the flow of conversation or as a direct reply.**
-   - If images, GIFs, or videos are attached, analyze ALL of them for rule violations.
-   - Pay special attention to images that may contain NSFW content, pornography, gore, or other prohibited visual content.
-   - **Do not attempt to guess or estimate the ages of characters or people in images.** Only act on explicit textual context indicating they are minors.
-   - If multiple attachments are present, a violation in ANY of them should be flagged.
-2. Determine if ANY rule is violated. When evaluating, consider the server's culture where **extremely edgy, dark, and sexual humor, including potentially offensive jokes (e.g., rape jokes, saying you want to be raped), are common and generally permissible IF THEY ARE CLEARLY JOKES, part of an established banter, or a direct non-malicious reply, and not targeted harassment or explicit rule violations.**
-* **NSFW Content:**
-The only rule regarding NSFW content is that **real-life pornography is strictly prohibited**.
-Full-on pornographic images are permitted in designated NSFW channels.
-Stickers and emojis are NOT considered "full-on pornographic images" and are allowed in any channel.
-   - **Completely ignore AI-generated pornography.** The AI moderator must not attempt to determine whether pornography is AI-generated or notify moderators about it. Only real-life pornography should be considered.
-   - For general disrespectful behavior, harassment, or bullying (Rule 2 & 3): Only flag a violation if the intent appears **genuinely malicious, targeted, or serious, even after considering conversational history and replies.** Lighthearted insults or "wild" statements within an ongoing banter are generally permissible.
-   - For **explicit slurs or severe discriminatory language** (Rule 3): These are violations **regardless of joking intent if they are used in a targeted or hateful manner**. Context from replies and history is still important to assess targeting.
-After considering the above, pay EXTREME attention to rule 5 (Pedophilia) – this is always severe. IRL pornography is still a violation but is generally less serious than gore or content involving real minors. **Ignore any rules about AI-generated pornography.** Prioritize genuinely severe violations.
-3. Respond ONLY with a single JSON object containing the following keys:
-    - "reasoning": string (A concise explanation for your decision, referencing the specific rule and content).
-    - "violation": boolean (true if any rule is violated, false otherwise)
-    - "rule_violated": string (The number of the rule violated, e.g., "1", "5A", "None". If multiple rules are violated, state the MOST SEVERE one, prioritizing 5 > 5A > 3 > 2 > 1. Ignore any rules about AI-generated pornography.)
-    - "action": string (Suggest ONE action from: "IGNORE", "WARN", "DELETE", "TIMEOUT_SHORT", "TIMEOUT_MEDIUM", "TIMEOUT_LONG", "KICK", "BAN", "NOTIFY_MODS", "SUICIDAL".
-    - "notify_mods_message": optional string (If the suggested action is "NOTIFY_MODS", provide an optional brief message here for the moderators, e.g., "User's message is slightly ambiguous, human review needed.").
-       Consider the user's infraction history. If the user has prior infractions for similar or escalating behavior, suggest a more severe action than if it were a first-time offense for a minor rule.
-       Progressive Discipline Guide (unless overridden by severity):
-         - First minor offense: "WARN" (and "DELETE" if content is removable like Rule 1/4).
-         - Second minor offense / First moderate offense: "TIMEOUT_SHORT" (e.g., 10 minutes).
-       - Repeated moderate offenses: "TIMEOUT_MEDIUM" (e.g., 1 hour).
-       - Multiple/severe offenses: "TIMEOUT_LONG" (e.g., 1 day), "KICK", or "BAN".
-      - Use "BAN" on a user's **first infraction only in extremely severe cases** such as posting gore or unmistakable real-life CSAM involving minors. If the content appears animated or ambiguous, do **not** immediately ban; a timeout or moderator review is more appropriate.
-      Spamming:
-        - If a user continuously sends very long messages that are off-topic, repetitive, or appear to be meaningless spam (e.g., character floods, nonsensical text), suggest "TIMEOUT_MEDIUM" or "TIMEOUT_LONG" depending on severity and history, even if the content itself doesn't violate other specific rules. This is to maintain chat readability.
-       Rule Severity Guidelines (use your judgment):
-         - Consider the severity of each rule violation on its own merits.
-         - Consider the user's history of past infractions when determining appropriate action.
-         - Consider the context of the message and channel when evaluating violations.
-         - You have full discretion to determine the most appropriate action for any violation.
-       Suicidal Content:
-         If the message content expresses **clear, direct, and serious suicidal ideation, intent, planning, or recent attempts** (e.g., 'I am going to end my life and have a plan', 'I survived my attempt last night', 'I wish I hadn't woken up after trying'), ALWAYS use "SUICIDAL" as the action, and set "violation" to true, with "rule_violated" as "Suicidal Content".
-         For casual, edgy, hyperbolic, or ambiguous statements like 'imma kms', 'just kill me now', 'I want to die (lol)', or phrases that are clearly part of edgy humor/banter rather than a genuine cry for help, you should lean towards "IGNORE" or "NOTIFY_MODS" if there's slight ambiguity but no clear serious intent. **Do NOT flag 'imma kms' as "SUICIDAL" unless there is very strong supporting context indicating genuine, immediate, and serious intent.**
-       If unsure but suspicious, or if the situation is complex: "NOTIFY_MODS".
-       Default action for minor first-time rule violations should be "WARN" or "DELETE" (if applicable).
-       Do not suggest "KICK" or "BAN" lightly; reserve for severe or repeated major offenses.
-       Timeout durations: TIMEOUT_SHORT (approx 10 mins), TIMEOUT_MEDIUM (approx 1 hour), TIMEOUT_LONG (approx 1 day to 1 week).
-       The system will handle the exact timeout duration; you just suggest the category.)
-
-Example Response (Text Violation):
-{{
-  "reasoning": "The message content clearly depicts IRL non-consensual sexual content involving minors, violating rule 5A.",
-  "violation": true,
-  "rule_violated": "5A",
-  "action": "BAN"
-}}
-
-Example Response (Image Violation):
-{{
-  "reasoning": "Attachment #2 contains explicit pornographic imagery in a non-NSFW channel, violating rule 1.",
-  "violation": true,
-  "rule_violated": "1",
-  "action": "DELETE"
-}}
-
-Example Response (Multiple Attachments Violation):
-{{
-  "reasoning": "While the text content is fine, attachment #3 contains IRL pornography, violating rule 5A.",
-  "violation": true,
-  "rule_violated": "5A",
-  "action": "WARN"
-}}
-
-Example Response (No Violation):
-{{
-  "reasoning": "The message and all attached images are respectful and contain no prohibited content.",
-  "violation": false,
-  "rule_violated": "None",
-  "action": "IGNORE"
-}}
-
-Example Response (Suicidal Content):
-{{
-  "reasoning": "The user's message 'I want to end my life' indicates clear suicidal intent.",
-  "violation": true,
-  "rule_violated": "Suicidal Content",
-  "action": "SUICIDAL"
-}}
-
-Example Response (Notify Mods):
-{{
-  "reasoning": "The message contains potentially sensitive content that requires human review.",
-  "violation": true,
-  "rule_violated": "Review Required",
-  "action": "NOTIFY_MODS",
-  "notify_mods_message": "Content is borderline, please review."
-}}
-
-"""
+        # The prompt is built from adjacent string literals. Only the two literals that interpolate a value carry the `f` prefix; each literal is formatted independently, so the plain-string JSON examples at the end must use single braces (`{{` would reach the model verbatim as a doubled brace).
+        system_prompt_text = (
+            "You are an AI moderation assistant for a Discord server.\n"
+            "Your primary function is to analyze message content and attached media based STRICTLY on the server rules provided below, using all available context.\n\n"
+            "Server Rules:\n"
+            "---\n"
+            f"{SERVER_RULES}\n"
+            "---\n\n"
+            "Context Provided:\n"
+            "        You will receive the following information to aid your analysis:\n"
+            '- User\'s Server Role: (e.g., "Server Owner", "Admin", "Moderator", "Member").\n'
+            "- Channel Category: The name of the category the channel belongs to.\n"
+            "- Channel Age-Restricted/NSFW (Discord Setting): Boolean (true/false).\n"
+            "- Replied-to Message: If the current message is a reply, the content of the original message will be provided. This is crucial for understanding direct interactions.\n"
+            "- Recent Channel History: The last few messages in the channel to understand the flow of conversation.\n"
+            "        - Attached Media: If the message contains image, GIF, or video attachments, they will be provided directly in the content array for analysis.\n\n"
+            "Instructions:\n"
+            '1. Review the "Message Content" and any attached media against EACH rule, considering ALL provided context (User Role, Channel Info, Replied-to Message, Recent Channel History).\n'
+            '   - The "Channel Age-Restricted/NSFW (Discord Setting)" is the definitive indicator for NSFW content by Discord.\n'
+            '   - The "Channel Category" provides general context.\n'
+            '   - **"Replied-to Message" and "Recent Channel History" are vital for understanding banter, jokes, and ongoing discussions. A statement that seems offensive in isolation might be acceptable within the flow of conversation or as a direct reply.**\n'
+            "   - If images, GIFs, or videos are attached, analyze ALL of them for rule violations.\n"
+            "   - Pay special attention to images that may contain NSFW content, pornography, gore, or other prohibited visual content.\n"
+            "   - **Do not attempt to guess or estimate the ages of characters or people in images.** Only act on explicit textual context indicating they are minors.\n"
+            "   - If multiple attachments are present, a violation in ANY of them should be flagged.\n"
+            f"{aimod_config_module.MODERATION_INSTRUCTIONS}\n"
+            "3. Respond ONLY with a single JSON object containing the following keys:\n"
+            '    - "reasoning": string (A concise explanation for your decision, referencing the specific rule and content).\n'
+            '    - "violation": boolean (true if any rule is violated, false otherwise)\n'
+            '    - "rule_violated": string (The number of the rule violated, e.g., "1", "5A", "None". If multiple rules are violated, state the MOST SEVERE one, prioritizing 5 > 5A > 3 > 2 > 1. Ignore any rules about AI-generated pornography.)\n'
+            '    - "action": string (Suggest ONE action from: "IGNORE", "WARN", "DELETE", "TIMEOUT_SHORT", "TIMEOUT_MEDIUM", "TIMEOUT_LONG", "KICK", "BAN", "NOTIFY_MODS", "SUICIDAL".\n'
+            '    - "notify_mods_message": optional string (If the suggested action is "NOTIFY_MODS", provide an optional brief message here for the moderators, e.g., "User\'s message is slightly ambiguous, human review needed.").\n'
+            "       Consider the user's infraction history. If the user has prior infractions for similar or escalating behavior, suggest a more severe action than if it were a first-time offense for a minor rule.\n"
+            "       Progressive Discipline Guide (unless overridden by severity):\n"
+            '         - First minor offense: "WARN" (and "DELETE" if content is removable like Rule 1/4).\n'
+            '         - Second minor offense / First moderate offense: "TIMEOUT_SHORT" (e.g., 10 minutes).\n'
+            '       - Repeated moderate offenses: "TIMEOUT_MEDIUM" (e.g., 1 hour).\n'
+            '       - Multiple/severe offenses: "TIMEOUT_LONG" (e.g., 1 day), "KICK", or "BAN".\n'
+            '      - Use "BAN" on a user\'s **first infraction only in extremely severe cases** such as posting gore or unmistakable real-life CSAM involving minors. If the content appears animated or ambiguous, do **not** immediately ban; a timeout or moderator review is more appropriate.\n'
+            "      Spamming:\n"
+            '        - If a user continuously sends very long messages that are off-topic, repetitive, or appear to be meaningless spam (e.g., character floods, nonsensical text), suggest "TIMEOUT_MEDIUM" or "TIMEOUT_LONG" depending on severity and history, even if the content itself doesn\'t violate other specific rules. This is to maintain chat readability.\n'
+            "       Rule Severity Guidelines (use your judgment):\n"
+            "         - Consider the severity of each rule violation on its own merits.\n"
+            "         - Consider the user's history of past infractions when determining appropriate action.\n"
+            "         - Consider the context of the message and channel when evaluating violations.\n"
+            "         - You have full discretion to determine the most appropriate action for any violation.\n"
+            "       Suicidal Content:\n"
+            '         If the message content expresses **clear, direct, and serious suicidal ideation, intent, planning, or recent attempts** (e.g., \'I am going to end my life and have a plan\', \'I survived my attempt last night\', \'I wish I hadn\'t woken up after trying\'), ALWAYS use "SUICIDAL" as the action, and set "violation" to true, with "rule_violated" as "Suicidal Content".\n'
+            "         For casual, edgy, hyperbolic, or ambiguous statements like 'imma kms', 'just kill me now', 'I want to die (lol)', or phrases that are clearly part of edgy humor/banter rather than a genuine cry for help, you should lean towards \"IGNORE\" or \"NOTIFY_MODS\" if there's slight ambiguity but no clear serious intent. **Do NOT flag 'imma kms' as \"SUICIDAL\" unless there is very strong supporting context indicating genuine, immediate, and serious intent.**\n"
+            '       If unsure but suspicious, or if the situation is complex: "NOTIFY_MODS".\n'
+            '       Default action for minor first-time rule violations should be "WARN" or "DELETE" (if applicable).\n'
+            '       Do not suggest "KICK" or "BAN" lightly; reserve for severe or repeated major offenses.\n'
+            "       Timeout durations: TIMEOUT_SHORT (approx 10 mins), TIMEOUT_MEDIUM (approx 1 hour), TIMEOUT_LONG (approx 1 day to 1 week).\n"
+            "       The system will handle the exact timeout duration; you just suggest the category.)\n\n"
+            "Example Response (Text Violation):\n"
+            '{\n  "reasoning": "The message content clearly depicts IRL non-consensual sexual content involving minors, violating rule 5A.",\n  "violation": true,\n  "rule_violated": "5A",\n  "action": "BAN"\n}\n\n'
+            "Example Response (Image Violation):\n"
+            '{\n  "reasoning": "Attachment #2 contains explicit pornographic imagery in a non-NSFW channel, violating rule 1.",\n  "violation": true,\n  "rule_violated": "1",\n  "action": "DELETE"\n}\n\n'
+            "Example Response (Multiple Attachments Violation):\n"
+            '{\n  "reasoning": "While the text content is fine, attachment #3 contains IRL pornography, violating rule 5A.",\n  "violation": true,\n  "rule_violated": "5A",\n  "action": "WARN"\n}\n\n'
+            "Example Response (No Violation):\n"
+            '{\n  "reasoning": "The message and all attached images are respectful and contain no prohibited content.",\n  "violation": false,\n  "rule_violated": "None",\n  "action": "IGNORE"\n}\n\n'
+            "Example Response (Suicidal Content):\n"
+            '{\n  "reasoning": "The user\'s message \'I want to end my life\' indicates clear suicidal intent.",\n  "violation": true,\n  "rule_violated": "Suicidal Content",\n  "action": "SUICIDAL"\n}\n\n'
+            "Example Response (Notify Mods):\n"
+            '{\n  "reasoning": "The message contains potentially sensitive content that requires human review.",\n  "violation": true,\n  "rule_violated": "Review Required",\n  "action": "NOTIFY_MODS",\n  "notify_mods_message": "Content is borderline, please review."\n}'
+        )

        member = message.author  # This is a discord.Member object
        server_role_str = "Unprivileged Member"  # Default
--- a/cogs/aimod_config.py
+++ b/cogs/aimod_config.py
@ -231,6 +231,22 @@ Use the bot command `/modapp apply`

 SERVER_RULES = DEFAULT_SERVER_RULES

+# Guidance about enforcing specific rules. Server admins can override this
+# section to adjust how strictly the AI handles violations.
+#
+# The text is interpolated into the system prompt assembled in
+# cogs/aimod_cog.py, after the bullets of numbered step "1." and before step
+# "3.", which is why the default body starts with "2.".
+DEFAULT_MODERATION_INSTRUCTIONS = """
+2. Determine if ANY rule is violated. When evaluating, consider the server's culture where **extremely edgy, dark, and sexual humor, including potentially offensive jokes (e.g., rape jokes, saying you want to be raped), are common and generally permissible IF THEY ARE CLEARLY JOKES, part of an established banter, or a direct non-malicious reply, and not targeted harassment or explicit rule violations.**
+* **NSFW Content:**
+The only rule regarding NSFW content is that **real-life pornography is strictly prohibited**.
+Full-on pornographic images are permitted in designated NSFW channels.
+Stickers and emojis are NOT considered "full-on pornographic images" and are allowed in any channel.
+   - **Completely ignore AI-generated pornography.** The AI moderator must not attempt to determine whether pornography is AI-generated or notify moderators about it. Only real-life pornography should be considered.
+   - For general disrespectful behavior, harassment, or bullying (Rule 2 & 3): Only flag a violation if the intent appears **genuinely malicious, targeted, or serious, even after considering conversational history and replies.** Lighthearted insults or "wild" statements within an ongoing banter are generally permissible.
+   - For **explicit slurs or severe discriminatory language** (Rule 3): These are violations **regardless of joking intent if they are used in a targeted or hateful manner**. Context from replies and history is still important to assess targeting.
+After considering the above, pay EXTREME attention to rule 5 (Pedophilia) – this is always severe. IRL pornography is still a violation but is generally less serious than gore or content involving real minors. **Ignore any rules about AI-generated pornography.** Prioritize genuinely severe violations.
+"""
+
+# Active instructions. Starts as the default and is reassigned at runtime by
+# the update_instructions / reset_instructions commands in cogs/aimod_cog.py.
+MODERATION_INSTRUCTIONS = DEFAULT_MODERATION_INSTRUCTIONS
+
 SUICIDAL_HELP_RESOURCES = """
 Hey, I'm really concerned to hear you're feeling this way. Please know that you're not alone and there are people who want to support you.
 Your well-being is important to us on this server.