Change spam check to apply to local accounts and add a threshold (#11806)
Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives
This commit is contained in:
parent
577706987d
commit
4f6af87906
4 changed files with 66 additions and 29 deletions
|
@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
|
||||||
end
|
end
|
||||||
|
|
||||||
def check_for_spam
|
def check_for_spam
|
||||||
spam_check = SpamCheck.new(@status)
|
SpamCheck.perform(@status)
|
||||||
|
|
||||||
return if spam_check.skip?
|
|
||||||
|
|
||||||
if spam_check.spam?
|
|
||||||
spam_check.flag!
|
|
||||||
else
|
|
||||||
spam_check.remember!
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def forward_for_reply
|
def forward_for_reply
|
||||||
|
|
|
@ -4,9 +4,25 @@ class SpamCheck
|
||||||
include Redisable
|
include Redisable
|
||||||
include ActionView::Helpers::TextHelper
|
include ActionView::Helpers::TextHelper
|
||||||
|
|
||||||
|
# Threshold over which two Nilsimsa values are considered
|
||||||
|
# to refer to the same text
|
||||||
NILSIMSA_COMPARE_THRESHOLD = 95
|
NILSIMSA_COMPARE_THRESHOLD = 95
|
||||||
NILSIMSA_MIN_SIZE = 10
|
|
||||||
EXPIRE_SET_AFTER = 1.week.seconds
|
# Nilsimsa doesn't work well on small inputs, so below
|
||||||
|
# this size, we check only for exact matches with MD5
|
||||||
|
NILSIMSA_MIN_SIZE = 10
|
||||||
|
|
||||||
|
# How long to keep the trail of digests between updates,
|
||||||
|
# there is no reason to store it forever
|
||||||
|
EXPIRE_SET_AFTER = 1.week.seconds
|
||||||
|
|
||||||
|
# How many digests to keep in an account's trail. If it's
|
||||||
|
# too small, spam could rotate around different message templates
|
||||||
|
MAX_TRAIL_SIZE = 10
|
||||||
|
|
||||||
|
# How many detected duplicates to allow through before
|
||||||
|
# considering the message as spam
|
||||||
|
THRESHOLD = 5
|
||||||
|
|
||||||
def initialize(status)
|
def initialize(status)
|
||||||
@account = status.account
|
@account = status.account
|
||||||
|
@ -21,9 +37,9 @@ class SpamCheck
|
||||||
if insufficient_data?
|
if insufficient_data?
|
||||||
false
|
false
|
||||||
elsif nilsimsa?
|
elsif nilsimsa?
|
||||||
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
|
digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
|
||||||
else
|
else
|
||||||
any_other_digest?('md5') { |_, other_digest| other_digest == digest }
|
digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -38,7 +54,7 @@ class SpamCheck
|
||||||
# get the correct status ID back, we have to save it in the string value
|
# get the correct status ID back, we have to save it in the string value
|
||||||
|
|
||||||
redis.zadd(redis_key, @status.id, digest_with_algorithm)
|
redis.zadd(redis_key, @status.id, digest_with_algorithm)
|
||||||
redis.zremrangebyrank(redis_key, '0', '-10')
|
redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
|
||||||
redis.expire(redis_key, EXPIRE_SET_AFTER)
|
redis.expire(redis_key, EXPIRE_SET_AFTER)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -78,6 +94,20 @@ class SpamCheck
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
class << self
|
||||||
|
def perform(status)
|
||||||
|
spam_check = new(status)
|
||||||
|
|
||||||
|
return if spam_check.skip?
|
||||||
|
|
||||||
|
if spam_check.spam?
|
||||||
|
spam_check.flag!
|
||||||
|
else
|
||||||
|
spam_check.remember!
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def disabled?
|
def disabled?
|
||||||
|
@ -149,14 +179,14 @@ class SpamCheck
|
||||||
redis.zrange(redis_key, 0, -1)
|
redis.zrange(redis_key, 0, -1)
|
||||||
end
|
end
|
||||||
|
|
||||||
def any_other_digest?(filter_algorithm)
|
def digests_over_threshold?(filter_algorithm)
|
||||||
other_digests.any? do |record|
|
other_digests.select do |record|
|
||||||
algorithm, other_digest, status_id = record.split(':')
|
algorithm, other_digest, status_id = record.split(':')
|
||||||
|
|
||||||
next unless algorithm == filter_algorithm
|
next unless algorithm == filter_algorithm
|
||||||
|
|
||||||
yield algorithm, other_digest, status_id
|
yield algorithm, other_digest, status_id
|
||||||
end
|
end.size >= THRESHOLD
|
||||||
end
|
end
|
||||||
|
|
||||||
def matching_status_ids
|
def matching_status_ids
|
||||||
|
|
|
@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService
|
||||||
end
|
end
|
||||||
|
|
||||||
status.save!
|
status.save!
|
||||||
|
check_for_spam(status)
|
||||||
|
|
||||||
mentions.each { |mention| create_notification(mention) }
|
mentions.each { |mention| create_notification(mention) }
|
||||||
end
|
end
|
||||||
|
@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService
|
||||||
def resolve_account_service
|
def resolve_account_service
|
||||||
ResolveAccountService.new
|
ResolveAccountService.new
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def check_for_spam(status)
|
||||||
|
SpamCheck.perform(status)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -86,23 +86,33 @@ RSpec.describe SpamCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'returns true for duplicate statuses to the same recipient' do
|
it 'returns true for duplicate statuses to the same recipient' do
|
||||||
status1 = status_with_html('@alice Hello')
|
described_class::THRESHOLD.times do
|
||||||
described_class.new(status1).remember!
|
status1 = status_with_html('@alice Hello')
|
||||||
|
described_class.new(status1).remember!
|
||||||
|
end
|
||||||
|
|
||||||
status2 = status_with_html('@alice Hello')
|
status2 = status_with_html('@alice Hello')
|
||||||
expect(described_class.new(status2).spam?).to be true
|
expect(described_class.new(status2).spam?).to be true
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'returns true for duplicate statuses to different recipients' do
|
it 'returns true for duplicate statuses to different recipients' do
|
||||||
status1 = status_with_html('@alice Hello')
|
described_class::THRESHOLD.times do
|
||||||
described_class.new(status1).remember!
|
status1 = status_with_html('@alice Hello')
|
||||||
|
described_class.new(status1).remember!
|
||||||
|
end
|
||||||
|
|
||||||
status2 = status_with_html('@bob Hello')
|
status2 = status_with_html('@bob Hello')
|
||||||
expect(described_class.new(status2).spam?).to be true
|
expect(described_class.new(status2).spam?).to be true
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'returns true for nearly identical statuses with random numbers' do
|
it 'returns true for nearly identical statuses with random numbers' do
|
||||||
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
|
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
|
||||||
status1 = status_with_html('@alice ' + source_text + ' 1234')
|
|
||||||
described_class.new(status1).remember!
|
described_class::THRESHOLD.times do
|
||||||
|
status1 = status_with_html('@alice ' + source_text + ' 1234')
|
||||||
|
described_class.new(status1).remember!
|
||||||
|
end
|
||||||
|
|
||||||
status2 = status_with_html('@bob ' + source_text + ' 9568')
|
status2 = status_with_html('@bob ' + source_text + ' 9568')
|
||||||
expect(described_class.new(status2).spam?).to be true
|
expect(described_class.new(status2).spam?).to be true
|
||||||
end
|
end
|
||||||
|
@ -140,9 +150,9 @@ RSpec.describe SpamCheck do
|
||||||
let(:redis_key) { spam_check.send(:redis_key) }
|
let(:redis_key) { spam_check.send(:redis_key) }
|
||||||
|
|
||||||
it 'remembers' do
|
it 'remembers' do
|
||||||
expect do
|
expect(Redis.current.exists(redis_key)).to be true
|
||||||
spam_check.remember!
|
spam_check.remember!
|
||||||
end.to change { Redis.current.exists(redis_key) }.from(false).to(true)
|
expect(Redis.current.exists(redis_key)).to be true
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -156,9 +166,9 @@ RSpec.describe SpamCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'resets' do
|
it 'resets' do
|
||||||
expect do
|
expect(Redis.current.exists(redis_key)).to be true
|
||||||
spam_check.reset!
|
spam_check.reset!
|
||||||
end.to change { Redis.current.exists(redis_key) }.from(true).to(false)
|
expect(Redis.current.exists(redis_key)).to be false
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue