Full-text search for authorized statuses (#6423)

* Add full-text search for authorized statuses

- Search API will return statuses that match the query
- Only for logged in users
- Only if you are author of the status,
- Or you were mentioned in it
- Or you favourited or reblogged it
- Configuration via `ES_ENABLED`, `ES_HOST`, `ES_PORT`, `ES_PREFIX`
- Run `rails chewy:deploy` to create & populate index

Fix #5880
Fix #4293
Fix #1152

* Add commented out docker-compose configuration for ES container

* Optimize index import, filter search results

* Add basic normalization to the index

* Add better stemming and normalization to the index

* Skip webfinger request if search query includes both @ and a space

* Fix code style

* Visually separate search result sections

* Fix code style issues
This commit is contained in:
Eugen Rochko 2018-02-09 23:04:47 +01:00 committed by GitHub
parent 235c14c79d
commit 3ebc0ad4d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 230 additions and 5 deletions

View file

@ -9,6 +9,10 @@ DB_USER=postgres
DB_NAME=postgres DB_NAME=postgres
DB_PASS= DB_PASS=
DB_PORT=5432 DB_PORT=5432
# Optional ElasticSearch configuration
# ES_ENABLED=true
# ES_HOST=localhost
# ES_PORT=9200
# Federation # Federation
# Note: Changing LOCAL_DOMAIN at a later time will cause unwanted side effects, including breaking all existing federation. # Note: Changing LOCAL_DOMAIN at a later time will cause unwanted side effects, including breaking all existing federation.

View file

@ -27,6 +27,7 @@ gem 'bootsnap'
gem 'browser' gem 'browser'
gem 'charlock_holmes', '~> 0.7.5' gem 'charlock_holmes', '~> 0.7.5'
gem 'iso-639' gem 'iso-639'
gem 'chewy', '~> 0.10', git: 'https://github.com/toptal/chewy.git'
gem 'cld3', '~> 3.2.0' gem 'cld3', '~> 3.2.0'
gem 'devise', '~> 4.4' gem 'devise', '~> 4.4'
gem 'devise-two-factor', '~> 3.0' gem 'devise-two-factor', '~> 3.0'

View file

@ -1,3 +1,12 @@
GIT
remote: https://github.com/toptal/chewy.git
revision: a7d21eb4b0bd7415533ef134bb6d31b2df309701
specs:
chewy (0.10.1)
activesupport (>= 4.0)
elasticsearch (>= 2.0.0)
elasticsearch-dsl
GEM GEM
remote: https://rubygems.org/ remote: https://rubygems.org/
specs: specs:
@ -154,6 +163,15 @@ GEM
json json
thread thread
thread_safe thread_safe
elasticsearch (6.0.1)
elasticsearch-api (= 6.0.1)
elasticsearch-transport (= 6.0.1)
elasticsearch-api (6.0.1)
multi_json
elasticsearch-dsl (0.1.5)
elasticsearch-transport (6.0.1)
faraday
multi_json
encryptor (3.0.0) encryptor (3.0.0)
erubi (1.7.0) erubi (1.7.0)
et-orbi (1.0.8) et-orbi (1.0.8)
@ -163,6 +181,8 @@ GEM
fabrication (2.18.0) fabrication (2.18.0)
faker (1.8.4) faker (1.8.4)
i18n (~> 0.5) i18n (~> 0.5)
faraday (0.14.0)
multipart-post (>= 1.2, < 3)
fast_blank (1.0.0) fast_blank (1.0.0)
ffi (1.9.18) ffi (1.9.18)
fog-core (1.45.0) fog-core (1.45.0)
@ -291,6 +311,7 @@ GEM
minitest (5.11.3) minitest (5.11.3)
msgpack (1.1.0) msgpack (1.1.0)
multi_json (1.12.2) multi_json (1.12.2)
multipart-post (2.0.0)
net-scp (1.2.1) net-scp (1.2.1)
net-ssh (>= 2.6.5) net-ssh (>= 2.6.5)
net-ssh (4.2.0) net-ssh (4.2.0)
@ -583,6 +604,7 @@ DEPENDENCIES
capistrano-yarn (~> 2.0) capistrano-yarn (~> 2.0)
capybara (~> 2.15) capybara (~> 2.15)
charlock_holmes (~> 0.7.5) charlock_holmes (~> 0.7.5)
chewy (~> 0.10)!
cld3 (~> 3.2.0) cld3 (~> 3.2.0)
climate_control (~> 0.2) climate_control (~> 0.2)
devise (~> 4.4) devise (~> 4.4)

View file

@ -0,0 +1,61 @@
# frozen_string_literal: true
# Chewy index over statuses that are not reblogs. Each document carries the
# author's account ID, the searchable text (spoiler text plus plain-text body),
# the list of account IDs allowed to find it, and the creation date.
class StatusesIndex < Chewy::Index
  settings index: { refresh_interval: '15m' }, analysis: {
    filter: {
      english_stop: {
        type: 'stop',
        stopwords: '_english_',
      },
      english_stemmer: {
        type: 'stemmer',
        language: 'english',
      },
      english_possessive_stemmer: {
        type: 'stemmer',
        language: 'possessive_english',
      },
    },
    analyzer: {
      content: {
        tokenizer: 'uax_url_email',
        filter: %w(
          english_possessive_stemmer
          lowercase
          asciifolding
          cjk_width
          english_stop
          english_stemmer
        ),
      },
    },
  }

  define_type ::Status.without_reblogs do
    # Each crutch loads one relation for the whole import batch in a single
    # query and returns a hash of status ID => array of account IDs.
    crutch :mentions do |collection|
      pairs = ::Mention.where(status_id: collection.map(&:id)).pluck(:status_id, :account_id)
      pairs.each_with_object({}) { |(status_id, account_id), lookup| (lookup[status_id] ||= []) << account_id }
    end

    crutch :favourites do |collection|
      pairs = ::Favourite.where(status_id: collection.map(&:id)).pluck(:status_id, :account_id)
      pairs.each_with_object({}) { |(status_id, account_id), lookup| (lookup[status_id] ||= []) << account_id }
    end

    crutch :reblogs do |collection|
      pairs = ::Status.where(reblog_of_id: collection.map(&:id)).pluck(:reblog_of_id, :account_id)
      pairs.each_with_object({}) { |(status_id, account_id), lookup| (lookup[status_id] ||= []) << account_id }
    end

    root date_detection: false do
      field :account_id, type: 'long'

      # `text` is indexed as-is; the `stemmed` sub-field runs the same value
      # through the custom `content` analyzer defined in the settings above.
      field :text, type: 'text', value: ->(status) { [status.spoiler_text, Formatter.instance.plaintext(status)].join("\n\n") } do
        field :stemmed, type: 'text', analyzer: 'content'
      end

      # Account IDs permitted to see this status in their search results,
      # computed from the preloaded crutches.
      field :searchable_by, type: 'long', value: ->(status, crutches) { status.searchable_by(crutches) }
      field :created_at, type: 'date'
    end
  end
end

View file

@ -22,6 +22,8 @@ export default class SearchResults extends ImmutablePureComponent {
count += results.get('accounts').size; count += results.get('accounts').size;
accounts = ( accounts = (
<div className='search-results__section'> <div className='search-results__section'>
<h5><FormattedMessage id='search_results.accounts' defaultMessage='People' /></h5>
{results.get('accounts').map(accountId => <AccountContainer key={accountId} id={accountId} />)} {results.get('accounts').map(accountId => <AccountContainer key={accountId} id={accountId} />)}
</div> </div>
); );
@ -31,6 +33,8 @@ export default class SearchResults extends ImmutablePureComponent {
count += results.get('statuses').size; count += results.get('statuses').size;
statuses = ( statuses = (
<div className='search-results__section'> <div className='search-results__section'>
<h5><FormattedMessage id='search_results.statuses' defaultMessage='Toots' /></h5>
{results.get('statuses').map(statusId => <StatusContainer key={statusId} id={statusId} />)} {results.get('statuses').map(statusId => <StatusContainer key={statusId} id={statusId} />)}
</div> </div>
); );
@ -40,6 +44,8 @@ export default class SearchResults extends ImmutablePureComponent {
count += results.get('hashtags').size; count += results.get('hashtags').size;
hashtags = ( hashtags = (
<div className='search-results__section'> <div className='search-results__section'>
<h5><FormattedMessage id='search_results.hashtags' defaultMessage='Hashtags' /></h5>
{results.get('hashtags').map(hashtag => ( {results.get('hashtags').map(hashtag => (
<Link key={hashtag} className='search-results__hashtag' to={`/timelines/tag/${hashtag}`}> <Link key={hashtag} className='search-results__hashtag' to={`/timelines/tag/${hashtag}`}>
#{hashtag} #{hashtag}

View file

@ -1786,7 +1786,7 @@
flex: 1; flex: 1;
min-height: 47px; min-height: 47px;
> img { > img {
display: block; display: block;
object-fit: contain; object-fit: contain;
object-position: bottom left; object-position: bottom left;
@ -3229,6 +3229,43 @@
font-weight: 500; font-weight: 500;
} }
// One group of search results (accounts, statuses or hashtags) with a
// heading that is visually struck through by a horizontal rule.
.search-results__section {
margin-bottom: 20px;
h5 {
position: relative;
// Full-width 1px line positioned at the vertical middle of the heading.
&::before {
content: "";
display: block;
position: absolute;
left: 0;
right: 0;
top: 50%;
width: 100%;
height: 0;
border-top: 1px solid lighten($ui-base-color, 8%);
}
// The label sits above the line (z-index) on a solid background, so the
// line appears to start after the text.
span {
display: inline-block;
background: $ui-base-color;
color: $ui-primary-color;
font-size: 14px;
font-weight: 500;
padding: 10px;
position: relative;
z-index: 1;
cursor: default;
}
}
// Drop the bottom border on the last entry of the section.
.account:last-child,
& > div:last-child .status {
border-bottom: 0;
}
}
.search-results__hashtag { .search-results__hashtag {
display: block; display: block;
padding: 10px; padding: 10px;

View file

@ -9,6 +9,7 @@ class StatusFilter
end end
def filtered? def filtered?
return false if !account.nil? && account.id == status.account_id
blocked_by_policy? || (account_present? && filtered_status?) || silenced_account? blocked_by_policy? || (account_present? && filtered_status?) || silenced_account?
end end

View file

@ -13,6 +13,8 @@
class Favourite < ApplicationRecord class Favourite < ApplicationRecord
include Paginable include Paginable
update_index('statuses#status', :status) if Chewy.enabled?
belongs_to :account, inverse_of: :favourites belongs_to :account, inverse_of: :favourites
belongs_to :status, inverse_of: :favourites, counter_cache: true belongs_to :status, inverse_of: :favourites, counter_cache: true

View file

@ -31,6 +31,8 @@ class Status < ApplicationRecord
include Cacheable include Cacheable
include StatusThreadingConcern include StatusThreadingConcern
update_index('statuses#status', :proper) if Chewy.enabled?
enum visibility: [:public, :unlisted, :private, :direct], _suffix: :visibility enum visibility: [:public, :unlisted, :private, :direct], _suffix: :visibility
belongs_to :application, class_name: 'Doorkeeper::Application', optional: true belongs_to :application, class_name: 'Doorkeeper::Application', optional: true
@ -78,6 +80,22 @@ class Status < ApplicationRecord
delegate :domain, to: :account, prefix: true delegate :domain, to: :account, prefix: true
# Lists the unique account IDs that may find this status through search:
# the author first, followed by accounts that were mentioned in it,
# favourited it, or reblogged it.
#
# @param preloaded [#mentions, #favourites, #reblogs, nil] optional object
#   exposing three hashes of status ID => account IDs; when given, no
#   queries are issued against this record's own associations
# @return [Array] de-duplicated account IDs, author first
def searchable_by(preloaded = nil)
  related_ids =
    if preloaded.nil?
      [mentions, favourites, reblogs].flat_map { |relation| relation.pluck(:account_id) }
    else
      [preloaded.mentions, preloaded.favourites, preloaded.reblogs].flat_map { |lookup| lookup[id] || [] }
    end

  [account_id, *related_ids].uniq
end
def reply? def reply?
!in_reply_to_id.nil? || attributes['reply'] !in_reply_to_id.nil? || attributes['reply']
end end

View file

@ -1,21 +1,43 @@
# frozen_string_literal: true # frozen_string_literal: true
class SearchService < BaseService class SearchService < BaseService
attr_accessor :query attr_accessor :query, :account, :limit, :resolve
def call(query, limit, resolve = false, account = nil) def call(query, limit, resolve = false, account = nil)
@query = query @query = query
@account = account
@limit = limit
@resolve = resolve
default_results.tap do |results| default_results.tap do |results|
if url_query? if url_query?
results.merge!(url_resource_results) unless url_resource.nil? results.merge!(url_resource_results) unless url_resource.nil?
elsif query.present? elsif query.present?
results[:accounts] = AccountSearchService.new.call(query, limit, account, resolve: resolve) results[:accounts] = perform_accounts_search! if account_searchable?
results[:hashtags] = Tag.search_for(query.gsub(/\A#/, ''), limit) unless query.start_with?('@') results[:statuses] = perform_statuses_search! if full_text_searchable?
results[:hashtags] = perform_hashtags_search! if hashtag_searchable?
end end
end end
end end
private
# Delegates the account lookup to AccountSearchService, forwarding the
# current query, limit, searching account and the resolve flag.
def perform_accounts_search!
  service = AccountSearchService.new
  service.call(query, limit, account, resolve: resolve)
end
# Runs the full-text query against StatusesIndex, restricted to documents
# whose `searchable_by` list contains the current account, and returns the
# matching records that StatusFilter does not filter out for that account.
#
# Every word of the query must match (`operator: 'and'`), scored across both
# the raw `text` field and its `text.stemmed` sub-field.
#
# @return [Array] loaded status records visible to the searching account
def perform_statuses_search!
  candidates = StatusesIndex.filter(term: { searchable_by: account.id })
                            .query(multi_match: { type: 'most_fields', query: query, operator: 'and', fields: %w(text text.stemmed) })
                            .limit(limit)
                            .objects
                            .compact # a hit whose database row is gone loads as nil; drop it before filtering

  candidates.reject { |status| StatusFilter.new(status, account).filtered? }
end
# Looks up hashtags matching the query, ignoring a single leading '#'.
def perform_hashtags_search!
  term = query.sub(/\A#/, '')
  Tag.search_for(term, limit)
end
def default_results def default_results
{ accounts: [], hashtags: [], statuses: [] } { accounts: [], hashtags: [], statuses: [] }
end end
@ -35,4 +57,17 @@ class SearchService < BaseService
def url_resource_symbol def url_resource_symbol
url_resource.class.name.downcase.pluralize.to_sym url_resource.class.name.downcase.pluralize.to_sym
end end
# Full-text search runs only when Chewy is enabled, an account is present,
# and the query is not a lone hashtag or handle (a space-free query that
# starts with '#' or contains '@').
def full_text_searchable?
  return false unless Chewy.enabled?
  return false if account.nil?

  single_term = !query.include?(' ')
  tag_or_handle = query.start_with?('#') || query.include?('@')

  !(tag_or_handle && single_term)
end
# Account search is skipped only when the query contains both an '@' and a
# space.
def account_searchable?
  !query.include?('@') || !query.include?(' ')
end
# Hashtag search applies only to queries with no '@' anywhere in them.
def hashtag_searchable?
  query.index('@').nil?
end
end end

View file

@ -0,0 +1,22 @@
# Chewy configuration, read from the ES_* environment variables.
# The integration is on only when ES_ENABLED is exactly "true".
es_enabled = ENV['ES_ENABLED'] == 'true'
es_host    = ENV.fetch('ES_HOST', 'localhost')
es_port    = ENV.fetch('ES_PORT', 9200)

# Without an explicit ES_PREFIX, fall back to REDIS_NAMESPACE (nil when
# neither variable is set).
es_prefix = ENV.fetch('ES_PREFIX') { ENV.fetch('REDIS_NAMESPACE', nil) }

Chewy.settings = {
  host: "#{es_host}:#{es_port}",
  prefix: es_prefix,
  enabled: es_enabled,
  journal: false,
}

# Root strategy is :sidekiq when enabled and :bypass otherwise.
# NOTE(review): Chewy.request_strategy is left at the library default here;
# confirm that is intended for index updates triggered inside web requests.
Chewy.root_strategy = es_enabled ? :sidekiq : :bypass

module Chewy
  # Reports the `enabled` flag stored in Chewy.settings by this initializer.
  def self.enabled?
    settings[:enabled]
  end
end

View file

@ -19,6 +19,17 @@ services:
# volumes: # volumes:
# - ./redis:/data # - ./redis:/data
# es:
# restart: always
# image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.1.3
# environment:
# - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
# networks:
# - internal_network
#### Uncomment to enable ES persistence
## volumes:
## - ./elasticsearch:/usr/share/elasticsearch/data
web: web:
build: . build: .
image: gargron/mastodon image: gargron/mastodon
@ -33,6 +44,7 @@ services:
depends_on: depends_on:
- db - db
- redis - redis
# - es
volumes: volumes:
- ./public/assets:/mastodon/public/assets - ./public/assets:/mastodon/public/assets
- ./public/packs:/mastodon/public/packs - ./public/packs:/mastodon/public/packs

View file

@ -25,6 +25,10 @@ RSpec.configure do |config|
end end
end end
# Select Chewy's :bypass strategy once before the suite runs
# (presumably so specs do not push index updates to Elasticsearch; confirm).
config.before :suite do
Chewy.strategy(:bypass)
end
config.after :suite do config.after :suite do
gc_counter = 0 gc_counter = 0
FileUtils.rm_rf(Dir["#{Rails.root}/spec/test_files/"]) FileUtils.rm_rf(Dir["#{Rails.root}/spec/test_files/"])