spam_check.rb 5.3 KB

  # frozen_string_literal: true
  # Heuristic duplicate-message detector for a single status.
  #
  # A trail of text digests is kept per account in a Redis sorted set. A new
  # status is treated as spam once enough digests in that trail match its own
  # digest (fuzzy match through Nilsimsa for longer texts, exact match through
  # MD5 for short ones).
  #
  # Typical entry point is +SpamCheck.perform(status)+.
  class SpamCheck
    include Redisable
    include ActionView::Helpers::TextHelper

    # Threshold over which two Nilsimsa values are considered
    # to refer to the same text
    NILSIMSA_COMPARE_THRESHOLD = 95

    # Nilsimsa doesn't work well on small inputs, so at or below
    # this size, we check only for exact matches with MD5
    NILSIMSA_MIN_SIZE = 10

    # How long to keep the trail of digests between updates,
    # there is no reason to store it forever
    EXPIRE_SET_AFTER = 1.week.seconds

    # How many digests to keep in an account's trail. If it's
    # too small, spam could rotate around different message templates
    MAX_TRAIL_SIZE = 10

    # How many detected duplicates to allow through before
    # considering the message as spam
    THRESHOLD = 5

    # @param status [Status] the status to check; its account owns the digest trail
    def initialize(status)
      @account = status.account
      @status  = status
    end

    # Whether the check should not run at all for this status.
    #
    # @return [Boolean] true when the feature is disabled, the account is
    #   already flagged or trusted, no mention is unsolicited, or the status
    #   replies to a thread that mentioned the author
    def skip?
      disabled? || already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
    end

    # Whether at least THRESHOLD digests in the account's trail match this
    # status. Statuses without any hashable text are never spam.
    #
    # @return [Boolean]
    def spam?
      return false if insufficient_data?

      digests_over_threshold?(algorithm) { |_, other_digest| matches_digest?(other_digest) }
    end

    # Files an automated report against the account.
    def flag!
      auto_report_status!
    end

    # Appends this status' digest to the account's trail, trims the trail to
    # the MAX_TRAIL_SIZE highest-scored entries and refreshes its expiry.
    def remember!
      # The scores in sorted sets don't actually have enough bits to hold an exact
      # value of our snowflake IDs, so we use it only for its ordering property. To
      # get the correct status ID back, we have to save it in the string value
      redis.zadd(redis_key, @status.id, digest_with_algorithm)
      redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
      redis.expire(redis_key, EXPIRE_SET_AFTER)
    end

    # Drops the account's whole digest trail.
    def reset!
      redis.del(redis_key)
    end

    # The text that gets hashed: the status text without mentions (and without
    # HTML tags for non-local statuses), prefixed by the spoiler text, then
    # NFKC-normalized, downcased and whitespace-collapsed.
    #
    # The value is assembled in a local and memoized only once complete, so an
    # exception raised part-way through cannot leave a half-processed string
    # cached for later calls.
    #
    # @return [String]
    def hashable_text
      return @hashable_text if defined?(@hashable_text)

      text = remove_mentions(@status.text)
      text = strip_tags(text) unless @status.local?
      text = normalize_unicode(@status.spoiler_text + ' ' + text)

      @hashable_text = remove_whitespace(text)
    end

    # @return [Boolean] true when there is no text left to hash
    def insufficient_data?
      hashable_text.blank?
    end

    # Hex digest of the hashable text, computed with the algorithm selected
    # by the text size (see #nilsimsa?).
    #
    # @return [String]
    def digest
      @digest ||= nilsimsa? ? Nilsimsa.new(hashable_text).hexdigest : Digest::MD5.hexdigest(hashable_text)
    end

    # The value stored in the trail: "<algorithm>:<digest>:<status id>".
    #
    # @return [String]
    def digest_with_algorithm
      [algorithm, digest, @status.id].join(':')
    end

    class << self
      # Runs the complete check for a status: does nothing when the check is
      # skipped, reports the account when the status is spam, and otherwise
      # records the status' digest in the trail.
      #
      # @param status [Status]
      def perform(status)
        spam_check = new(status)

        return if spam_check.skip?

        if spam_check.spam?
          spam_check.flag!
        else
          spam_check.remember!
        end
      end
    end

    private

    def disabled?
      !Setting.spam_check_enabled
    end

    # Name of the digest algorithm used for this status, as stored in the trail.
    def algorithm
      nilsimsa? ? 'nilsimsa' : 'md5'
    end

    # Whether a digest from the trail counts as "the same text" as this status:
    # Nilsimsa digests are compared against NILSIMSA_COMPARE_THRESHOLD, MD5
    # digests must be equal.
    def matches_digest?(other_digest)
      if nilsimsa?
        nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD
      else
        other_digest == digest
      end
    end

    # Local statuses are plain text, so mentions are stripped with a regexp.
    # Other statuses are parsed as an HTML fragment and every link whose href
    # is the URL of a mentioned account is removed.
    def remove_mentions(text)
      return text.gsub(Account::MENTION_RE, '') if @status.local?

      Nokogiri::HTML.fragment(text).tap do |html|
        mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }

        html.traverse do |element|
          element.unlink if element.name == 'a' && mentions.include?(element['href'])
        end
      end.to_s
    end

    def normalize_unicode(text)
      text.unicode_normalize(:nfkc).downcase
    end

    # Collapses every whitespace run into one space and trims both ends.
    def remove_whitespace(text)
      text.gsub(/\s+/, ' ').strip
    end

    # Reports the account on behalf of the representative account. Status IDs
    # (the matching public/unlisted ones from the trail plus the current one)
    # are attached only when the current status is distributable; otherwise
    # +status_ids+ is passed as nil.
    def auto_report_status!
      status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable?
      ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected'))
    end

    def already_flagged?
      # NOTE(review): -99 is presumably the ID of Account.representative, i.e.
      # this looks for an unresolved automated report - confirm
      @account.silenced? || @account.targeted_reports.unresolved.where(account_id: -99).exists?
    end

    def trusted?
      @account.trust_level > Account::TRUST_LEVELS[:untrusted] || (@account.local? && @account.user_staff?)
    end

    # True when every mention is silent, is between two non-local accounts,
    # or targets an account that follows the author (also true without mentions).
    def no_unsolicited_mentions?
      @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
    end

    # True when the status replies to a thread that itself mentioned the author.
    def solicited_reply?
      !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
    end

    # Similarity score of two hex-encoded Nilsimsa digests.
    #
    # Both digests are unpacked to raw bytes and the first 32 bytes are XOR-ed
    # pairwise; Nilsimsa::POPC is presumably a per-byte bit-count table, so
    # +bits+ ends up as the number of differing bits.
    def nilsimsa_compare_value(first, second)
      first  = [first].pack('H*')
      second = [second].pack('H*')
      bits   = 0

      0.upto(31) do |i|
        bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
      end

      128 - bits # -128 <= Nilsimsa Compare Value <= 128
    end

    def nilsimsa?
      hashable_text.size > NILSIMSA_MIN_SIZE
    end

    # Every record of the account's trail, in ascending score order.
    def other_digests
      redis.zrange(redis_key, 0, -1)
    end

    # Counts the trail records of the given algorithm for which the block
    # returns a truthy value and compares that count with THRESHOLD.
    #
    # @yieldparam algorithm [String]
    # @yieldparam other_digest [String]
    # @yieldparam status_id [String, nil]
    def digests_over_threshold?(filter_algorithm)
      duplicates = other_digests.count do |record|
        other_algorithm, other_digest, status_id = record.split(':')
        other_algorithm == filter_algorithm && yield(other_algorithm, other_digest, status_id)
      end

      duplicates >= THRESHOLD
    end

    # Status IDs stored alongside the trail digests that match this status.
    # Records without a status ID part are dropped.
    def matching_status_ids
      records  = other_digests.map { |record| record.split(':') }
      matching = records.select { |other_algorithm, other_digest, _| other_algorithm == algorithm && matches_digest?(other_digest) }

      matching.map { |_, _, status_id| status_id }.compact
    end

    def redis_key
      @redis_key ||= "spam_check:#{@account.id}"
    end
  end