link_details_extractor.rb 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. # frozen_string_literal: true
  2. class LinkDetailsExtractor
  3. include ActionView::Helpers::TagHelper
  4. include LanguagesHelper
  # Matches a line that holds nothing but a commented-out CDATA marker.
  # Some publications wrap the JSON-LD inside their <script> tags in such
  # markers, so these lines have to be stripped before the JSON is parsed.
  CDATA_JUNK_PATTERN = %r{^\s*(
    (/\*\s*<!\[CDATA\[\s*\*/) # Block comment style opening
    |
    (//\s*<!\[CDATA\[)        # Single-line comment style opening
    |
    (/\*\s*\]\]>\s*\*/)       # Block comment style closing
    |
    (//\s*\]\]>)              # Single-line comment style closing
  )\s*$}x
  # Read-only accessor object for one JSON-LD document found in a page.
  #
  # The JSON text is parsed lazily, on first attribute access, so a parse
  # error raised by Oj surfaces from whichever reader is called first. The
  # object used is the first one (the root itself, or an entry of a root
  # array) whose `@type` is listed in SUPPORTED_TYPES; when nothing
  # qualifies every reader returns nil and #valid? is false.
  class StructuredData
    SUPPORTED_TYPES = %w(
      NewsArticle
      WebPage
    ).freeze

    # @param data [String] raw JSON-LD text (parsed with Oj on first access)
    def initialize(data)
      @data = data
    end

    def headline
      json['headline']
    end

    def description
      json['description']
    end

    # Language taken from `inLanguage`. When that value is a list only its
    # first entry is considered; when it is an object, `alternateName` is
    # preferred over `name`.
    def language
      lang = json['inLanguage']
      lang = lang.first if lang.is_a?(Array)
      lang.is_a?(Hash) ? (lang['alternateName'] || lang['name']) : lang
    end

    def type
      json['@type']
    end

    # First `image` entry: its `url` when it is an object, otherwise the
    # entry itself.
    def image
      obj = first_of_value(json['image'])
      return obj['url'] if obj.is_a?(Hash)

      obj
    end

    def date_published
      json['datePublished']
    end

    def date_modified
      json['dateModified']
    end

    def author_name
      author['name']
    end

    def author_url
      author['url']
    end

    def publisher_name
      publisher['name']
    end

    # URL of the publisher logo. The logo may be given either as an object
    # with a `url` key or directly as a URL string (mirroring how #image
    # treats its value); any other shape yields nil instead of raising.
    def publisher_logo
      logo = first_of_value(publisher['logo'])

      case logo
      when Hash then logo['url']
      when String then logo
      end
    end

    # @return [Boolean] false when no object of a supported type was found
    def valid?
      json.present?
    end

    private

    def author
      hash_or_empty(first_of_value(json['author']))
    end

    def publisher
      hash_or_empty(first_of_value(json['publisher']))
    end

    # Guards the `['name']` / `['url']` lookups above: a plain string value
    # would otherwise be indexed with String#[], which performs a substring
    # search rather than a key lookup.
    def hash_or_empty(value)
      value.is_a?(Hash) ? value : {}
    end

    def first_of_value(arr)
      arr.is_a?(Array) ? arr.first : arr
    end

    def root_array(root)
      root.is_a?(Array) ? root : [root]
    end

    # Root entries that are not objects (null, numbers, strings) are skipped
    # rather than indexed, so a stray entry cannot raise.
    def json
      @json ||= root_array(Oj.load(@data)).find { |obj| obj.is_a?(Hash) && SUPPORTED_TYPES.include?(obj['@type']) } || {}
    end
  end
  # @param original_url the address of the page; parsed into an
  #   Addressable::URI that other URLs are later resolved against
  # @param html the page markup to extract details from
  # @param html_charset charset hint handed to the encoding detector
  def initialize(original_url, html, html_charset)
    @html_charset = html_charset
    @html         = html
    @original_url = Addressable::URI.parse(original_url)
  end
  # Collects every extracted detail into a single attributes hash.
  # Text-like values fall back to an empty string and the dimensions to 0;
  # the image URL, language and publication date may be nil.
  #
  # @return [Hash{Symbol => Object}]
  def to_preview_card_attributes
    {
      title:             title || '',
      description:       description || '',
      image_remote_url:  image,
      image_description: image_alt || '',
      type:              type,
      link_type:         link_type,
      width:             width || 0,
      height:            height || 0,
      html:              html || '',
      provider_name:     provider_name || '',
      provider_url:      provider_url || '',
      author_name:       author_name || '',
      author_url:        author_url || '',
      embed_url:         embed_url || '',
      language:          language,
      published_at:      published_at.presence,
    }
  end
  # Kind of card: :video when the page declares a usable `twitter:player`
  # URL, :link otherwise.
  def type
    return :video if player_url.present?

    :link
  end
  # :article when either the JSON-LD type is `NewsArticle` or the `og:type`
  # tag says `article`; :unknown in every other case.
  def link_type
    article = structured_data&.type == 'NewsArticle' || opengraph_tag('og:type') == 'article'

    article ? :article : :unknown
  end
  # Markup for an <iframe> pointing at the declared player, sized with the
  # declared player dimensions; nil when the page has no usable player URL.
  def html
    return if player_url.blank?

    content_tag(
      :iframe, nil,
      src: player_url, width: width, height: height,
      allowfullscreen: 'true', allowtransparency: 'true', scrolling: 'no', frameborder: '0'
    )
  end
  # Player width: the content of the first `twitter:player:width` meta tag,
  # returned as found in the markup (no numeric conversion is done here).
  def width
    opengraph_tag('twitter:player:width')
  end
  # Player height: the content of the first `twitter:player:height` meta
  # tag, returned as found in the markup (no numeric conversion is done here).
  def height
    opengraph_tag('twitter:player:height')
  end
  # Page title with HTML entities decoded. Sources, in order of preference:
  # JSON-LD headline, `og:title`, then the first <title> element.
  def title
    raw_title = structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first

    html_entities.decode(raw_title)
  end
  # Page description with HTML entities decoded. Sources, in order of
  # preference: JSON-LD description, `og:description`, then the plain
  # `description` meta tag.
  def description
    raw_description = structured_data&.description || opengraph_tag('og:description') || meta_tag('description')

    html_entities.decode(raw_description)
  end
  # Publication date as given by the page: JSON-LD `datePublished` when
  # available, otherwise the `article:published_time` tag. The value is
  # returned unparsed.
  def published_at
    structured_data&.date_published || opengraph_tag('article:published_time')
  end
  # Absolute URL of the `og:image`, or nil when the tag is missing or does
  # not hold an acceptable URL.
  def image
    valid_url_or_nil(opengraph_tag('og:image'))
  end
  # Alternative text for the preview image, read from `og:image:alt`.
  def image_alt
    opengraph_tag('og:image:alt')
  end
  # Canonical address of the page. The `canonical` link (or, failing that,
  # `og:url`) is only trusted when it points at the same host as the
  # original URL; otherwise the original URL is returned.
  def canonical_url
    declared = link_tag('canonical') || opengraph_tag('og:url')

    valid_url_or_nil(declared, same_origin_only: true) || @original_url.to_s
  end
  # Name of the publishing site with HTML entities decoded: the JSON-LD
  # publisher name when available, otherwise `og:site_name`.
  def provider_name
    raw_name = structured_data&.publisher_name || opengraph_tag('og:site_name')

    html_entities.decode(raw_name)
  end
  # URL of the publishing site, built from the `og:site` tag (a scheme is
  # prepended when the tag holds a bare host); nil when absent or invalid.
  def provider_url
    site = host_to_url(opengraph_tag('og:site'))

    valid_url_or_nil(site)
  end
  # Author name with HTML entities decoded. Sources, in order of
  # preference: JSON-LD author name, `og:author`, `og:author:username`.
  def author_name
    raw_name = structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')

    html_entities.decode(raw_name)
  end
  # Author URL as given in the JSON-LD data; nil when the page has no
  # usable structured data. The value is not validated here.
  def author_url
    structured_data&.author_url
  end
  # Absolute URL from the `twitter:player:stream` tag, or nil when the tag
  # is missing or does not hold an acceptable URL.
  def embed_url
    valid_url_or_nil(opengraph_tag('twitter:player:stream'))
  end
  # Language of the page. Candidates, in order of preference: JSON-LD
  # language, `og:locale`, then the `lang` attribute of the <html> element.
  # The chosen candidate is passed through valid_locale_or_nil, which is not
  # defined in this file (presumably provided by LanguagesHelper).
  def language
    candidate = structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang')

    valid_locale_or_nil(candidate)
  end
  # Absolute URL of an icon for the site, or nil when none is usable.
  # Candidates, in order of preference: the publisher logo from the JSON-LD
  # data, the `apple-touch-icon` link, then the `shortcut icon` link.
  #
  # NOTE(review): this previously called `publisher_icon`, which
  # StructuredData does not define, so it raised NoMethodError whenever
  # structured data was present. `publisher_logo` is the closest existing
  # reader - confirm it is the intended source.
  def icon
    valid_url_or_nil(structured_data&.publisher_logo || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
  end
  165. private
  # Absolute URL from the `twitter:player` tag, or nil when the tag is
  # missing or does not hold an acceptable URL.
  def player_url
    valid_url_or_nil(opengraph_tag('twitter:player'))
  end
  # Turns a bare host into a URL by prefixing `http://`. Values that already
  # start with an http(s) scheme are returned unchanged; blank input gives nil.
  def host_to_url(str)
    return if str.blank?
    return str if str.start_with?(%r{https?://})

    "http://#{str}"
  end
  # Resolves +str+ against the original URL and returns the absolute result
  # as a string, or nil when it is not acceptable.
  #
  # Rejected: blank input, the literal string 'null', anything Addressable
  # cannot parse, URLs without a host, schemes other than http/https and,
  # with +same_origin_only+, hosts that differ from the original URL's host.
  def valid_url_or_nil(str, same_origin_only: false)
    return if str.blank? || str == 'null'

    url = @original_url + Addressable::URI.parse(str)

    return if url.host.blank?
    return unless %w(http https).include?(url.scheme)
    return if same_origin_only && url.host != @original_url.host

    url.to_s
  rescue Addressable::URI::InvalidURIError
    nil
  end
  # `href` of the first <link> element whose `rel` attribute equals +name+.
  def link_tag(name)
    query = "//link[@rel=\"#{name}\"]"

    document.xpath(query).pick('href')
  end
  # `content` of the first <meta> element whose `property` or `name`
  # attribute equals +name+.
  def opengraph_tag(name)
    query = "//meta[@property=\"#{name}\" or @name=\"#{name}\"]"

    document.xpath(query).pick('content')
  end
  # `content` of the first <meta> element whose `name` attribute equals
  # +name+ (the `property` attribute is not consulted).
  def meta_tag(name)
    query = "//meta[@name=\"#{name}\"]"

    document.xpath(query).pick('content')
  end
  # First usable JSON-LD definition of the page, or nil when there is none.
  #
  # Some publications have more than one JSON-LD definition on the page,
  # and some of those definitions aren't valid JSON either, so every
  # <script type="application/ld+json"> element is tried until one is of a
  # supported type and doesn't break.
  #
  # The result is memoized with `defined?` rather than `||=` so that a nil
  # outcome is cached too; otherwise pages without usable JSON-LD would be
  # re-scanned and re-parsed on every call.
  #
  # @return [StructuredData, nil]
  def structured_data
    return @structured_data if defined?(@structured_data)

    @structured_data = document.xpath('//script[@type="application/ld+json"]').filter_map do |element|
      json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')
      next if json_ld.blank?

      candidate = StructuredData.new(html_entities.decode(json_ld))
      next unless candidate.valid?

      candidate
    rescue Oj::ParseError, EncodingError
      Rails.logger.debug { "Invalid JSON-LD in #{@original_url}" }
      next
    end.first
  end
  # Parsed page, built once with Nokogiri using the detected encoding
  # (nil when detection was not confident enough).
  def document
    @document ||= Nokogiri::HTML(@html, nil, encoding)
  end
  # Encoding guessed by the detector from the markup and the charset hint.
  # The guess is only used when its confidence is above 60; otherwise nil.
  def encoding
    @encoding ||= begin
      detection  = detector.detect(@html, @html_charset)
      confidence = detection&.fetch(:confidence, 0).to_i

      detection&.fetch(:encoding, nil) if confidence > 60
    end
  end
  # Memoized CharlockHolmes encoding detector with `strip_tags` enabled.
  def detector
    @detector ||= begin
      instance = CharlockHolmes::EncodingDetector.new
      instance.strip_tags = true
      instance
    end
  end
  # Memoized HTMLEntities instance used to decode entities in extracted text.
  def html_entities
    @html_entities ||= HTMLEntities.new
  end
  223. end