Skip to content

Commit 60c3bd3

Browse files
committed
Refactor 'crosspost_cmp()' to use better message content comparison function, split 'message_length_threshold' into same and cross-channel cases
1 parent 748bdcf commit 60c3bd3

File tree

1 file changed

+57
-12
lines changed

1 file changed

+57
-12
lines changed

pcbot/exts/anti_crosspost.py

+57-12
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,22 @@
2020
fetched_attachments: dict[int, bytes] = {}
2121

2222

23+
def hamming_distance_padded(str1, str2):
    """Return the Hamming distance between two strings of unequal length.

    The shorter string is treated as if it were right-padded with spaces
    up to the length of the longer one, so a trailing space in the longer
    string does not count as a difference.
    """
    shorter, longer = sorted((str1, str2), key=len)

    # Positions present in both strings: count plain character mismatches.
    overlap_mismatches = sum(1 for a, b in zip(shorter, longer) if a != b)

    # Positions only present in the longer string are compared against the
    # implicit space padding, so only non-space characters differ.
    tail_mismatches = sum(1 for ch in longer[len(shorter):] if ch != " ")

    return overlap_mismatches + tail_mismatches
30+
31+
32+
def hamming_similarity_score(str1: str, str2: str) -> float:
    """Return a similarity ratio between two strings based on Hamming distance.

    The ratio is ``(max_len - distance) / max_len``, where ``max_len`` is the
    length of the longer string and ``distance`` is the value returned by
    ``hamming_distance_padded``.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The fraction of positions that match, or 1.0 when both strings are
        empty.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Two empty strings are identical; returning here also avoids a
        # ZeroDivisionError in the ratio below.
        return 1.0

    distance = hamming_distance_padded(str1, str2)
    return (max_len - distance) / max_len
37+
38+
2339
async def fetch_attachment(attachment: discord.Attachment, cache: bool = True) -> bytes:
2440
if cache and attachment.id in fetched_attachments:
2541
logger.debug(f"Fetched attachment from cache: {attachment.id}")
@@ -55,10 +71,7 @@ async def crosspost_cmp(message: discord.Message, other: discord.Message) -> boo
5571
)
5672

5773
if have_content:
58-
hamming_score = sum(
59-
x != y for x, y in zip(message.content, other.content)
60-
) / max(len(message.content), len(other.content))
61-
similarity_score = min(max(0, 1 - hamming_score), 1)
74+
similarity_score = hamming_similarity_score(message.content, other.content)
6275
logger.debug(f"Computed similarity score for content: {similarity_score}")
6376
else:
6477
similarity_score = 0
@@ -115,7 +128,8 @@ def __init__(
115128
bot: BotT,
116129
channel_ids: Collection[int],
117130
crosspost_timedelta_threshold: int,
118-
message_length_threshold: int,
131+
same_channel_message_length_threshold: int,
132+
cross_channel_message_length_threshold: int,
119133
max_tracked_users: int,
120134
max_tracked_message_groups_per_user: int,
121135
theme_color: int | discord.Color = 0,
@@ -127,7 +141,10 @@ def __init__(
127141
bot (BotT): The bot instance.
128142
channel_ids (Collection[int]): Collection of channel IDs to monitor.
129143
crosspost_timedelta_threshold (int): Minimum time difference between messages to not be considered crossposts.
130-
message_length_threshold (int): Minimum length of a text-only message to be considered.
144+
same_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
145+
if the messages are in the same channel.
146+
cross_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
147+
if the messages are in different channels.
131148
max_tracked_users (int): Maximum number of users to track.
132149
max_tracked_message_groups_per_user (int): Maximum number of message
133150
groups to track per user.
@@ -140,7 +157,12 @@ def __init__(
140157
self.crosspost_timedelta_threshold = crosspost_timedelta_threshold
141158
self.max_tracked_users = max_tracked_users
142159
self.max_tracked_message_groups_per_user = max_tracked_message_groups_per_user
143-
self.message_length_threshold = message_length_threshold
160+
self.same_channel_message_length_threshold = (
161+
same_channel_message_length_threshold
162+
)
163+
self.cross_channel_message_length_threshold = (
164+
cross_channel_message_length_threshold
165+
)
144166

145167
@commands.Cog.listener()
146168
async def on_message(self, message: discord.Message):
@@ -157,12 +179,18 @@ async def on_message(self, message: discord.Message):
157179
or (
158180
message.content
159181
and not message.attachments
160-
and len(message.content) < self.message_length_threshold
182+
and (
183+
len(message.content)
184+
< min(
185+
self.same_channel_message_length_threshold,
186+
self.cross_channel_message_length_threshold,
187+
)
188+
)
161189
)
162190
):
163191
return
164192

165-
logger.debug(f"Received message from {message.author.name}: {message.jump_url}")
193+
logger.debug(f"Received noteworthy message from {message.author.name}: {message.jump_url}")
166194

167195
# Attempt to enforce the cache size limit
168196
for user_id in list(self.crossposting_cache.keys()):
@@ -189,6 +217,18 @@ async def on_message(self, message: discord.Message):
189217
for messages in user_cache["message_groups"]:
190218
for existing_message in messages:
191219
if (
220+
message.channel.id == existing_message.channel.id
221+
and len(message.content)
222+
< self.same_channel_message_length_threshold
223+
) or (
224+
message.channel.id != existing_message.channel.id
225+
and len(message.content)
226+
< self.cross_channel_message_length_threshold
227+
):
228+
# enforce same-channel and cross-channel message length thresholds in order for them to be considered crossposts
229+
continue
230+
231+
elif (
192232
await crosspost_cmp(message, existing_message)
193233
and message.created_at.timestamp()
194234
- existing_message.created_at.timestamp()
@@ -320,7 +360,8 @@ async def setup(
320360
max_tracked_users: int = 10,
321361
max_tracked_message_groups_per_user: int = 10,
322362
crosspost_timedelta_threshold: int = 86400,
323-
message_length_threshold: int = 64,
363+
same_channel_message_length_threshold: int = 64,
364+
cross_channel_message_length_threshold: int = 16,
324365
theme_color: int | discord.Color = 0,
325366
):
326367
"""
@@ -332,15 +373,19 @@ async def setup(
332373
max_tracked_users (int): Maximum number of users to track.
333374
max_tracked_message_groups_per_user (int): Maximum number of message groups to track per user.
334375
crosspost_timedelta_threshold (int): Minimum time difference between messages to not be considered crossposts.
335-
message_length_threshold (int): Minimum length of a text-only message to be considered.
376+
same_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
377+
if the messages are in the same channel.
378+
cross_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
379+
if the messages are in different channels.
336380
theme_color (int | discord.Color): Theme color for the bot's responses.
337381
"""
338382
await bot.add_cog(
339383
AntiCrosspostCog(
340384
bot,
341385
channel_ids,
342386
crosspost_timedelta_threshold,
343-
message_length_threshold,
387+
same_channel_message_length_threshold,
388+
cross_channel_message_length_threshold,
344389
max_tracked_users,
345390
max_tracked_message_groups_per_user,
346391
theme_color,

0 commit comments

Comments
 (0)