# link_details_extractor_spec.rb (7.9 KB)
  1. # frozen_string_literal: true
  2. require 'rails_helper'
  3. RSpec.describe LinkDetailsExtractor do
  4. subject { described_class.new(original_url, html, nil) }
  5. let(:original_url) { 'https://example.com/dog.html?tracking=123' }
  6. describe '#canonical_url' do
  7. let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }
  8. context 'when canonical URL points to the same host' do
  9. let(:url) { 'https://example.com/dog.html' }
  10. it 'ignores the canonical URLs' do
  11. expect(subject.canonical_url).to eq 'https://example.com/dog.html'
  12. end
  13. end
  14. context 'when canonical URL points to another host' do
  15. let(:url) { 'https://different.example.net/dog.html' }
  16. it 'ignores the canonical URLs' do
  17. expect(subject.canonical_url).to eq original_url
  18. end
  19. end
  20. context 'when canonical URL is set to "null"' do
  21. let(:url) { 'null' }
  22. it 'ignores the canonical URLs' do
  23. expect(subject.canonical_url).to eq original_url
  24. end
  25. end
  26. end
  27. context 'when only basic metadata is present' do
  28. let(:html) { <<~HTML }
  29. <!doctype html>
  30. <html lang="en">
  31. <head>
  32. <title>Man bites dog</title>
  33. <meta name="description" content="A dog&#39;s tale">
  34. </head>
  35. </html>
  36. HTML
  37. describe '#title' do
  38. it 'returns the title from title tag' do
  39. expect(subject.title).to eq 'Man bites dog'
  40. end
  41. end
  42. describe '#description' do
  43. it 'returns the description from meta tag' do
  44. expect(subject.description).to eq "A dog's tale"
  45. end
  46. end
  47. describe '#language' do
  48. it 'returns the language from lang attribute' do
  49. expect(subject.language).to eq 'en'
  50. end
  51. end
  52. end
  53. context 'when structured data is present' do
  54. let(:ld_json) do
  55. {
  56. '@context' => 'https://schema.org',
  57. '@type' => 'NewsArticle',
  58. 'headline' => 'Man bites dog',
  59. 'description' => "A dog's tale",
  60. 'datePublished' => '2022-01-31T19:53:00+00:00',
  61. 'author' => {
  62. '@type' => 'Organization',
  63. 'name' => 'Charlie Brown',
  64. },
  65. 'publisher' => {
  66. '@type' => 'NewsMediaOrganization',
  67. 'name' => 'Pet News',
  68. 'url' => 'https://example.com',
  69. },
  70. }.to_json
  71. end
  72. shared_examples 'structured data' do
  73. describe '#title' do
  74. it 'returns the title from structured data' do
  75. expect(subject.title).to eq 'Man bites dog'
  76. end
  77. end
  78. describe '#description' do
  79. it 'returns the description from structured data' do
  80. expect(subject.description).to eq "A dog's tale"
  81. end
  82. end
  83. describe '#published_at' do
  84. it 'returns the publicaton time from structured data' do
  85. expect(subject.published_at).to eq '2022-01-31T19:53:00+00:00'
  86. end
  87. end
  88. describe '#author_name' do
  89. it 'returns the author name from structured data' do
  90. expect(subject.author_name).to eq 'Charlie Brown'
  91. end
  92. end
  93. describe '#provider_name' do
  94. it 'returns the provider name from structured data' do
  95. expect(subject.provider_name).to eq 'Pet News'
  96. end
  97. end
  98. end
  99. context 'when is wrapped in CDATA tags' do
  100. let(:html) { <<~HTML }
  101. <!doctype html>
  102. <html>
  103. <head>
  104. <script type="application/ld+json">
  105. //<![CDATA[
  106. #{ld_json}
  107. //]]>
  108. </script>
  109. </head>
  110. </html>
  111. HTML
  112. include_examples 'structured data'
  113. end
  114. context 'with the first tag is invalid JSON' do
  115. let(:html) { <<~HTML }
  116. <!doctype html>
  117. <html>
  118. <body>
  119. <script type="application/ld+json">
  120. invalid LD+JSON
  121. </script>
  122. <script type="application/ld+json">
  123. #{ld_json}
  124. </script>
  125. </body>
  126. </html>
  127. HTML
  128. include_examples 'structured data'
  129. end
  130. context 'with preceding block of unsupported LD+JSON' do
  131. let(:html) { <<~HTML }
  132. <!doctype html>
  133. <html>
  134. <body>
  135. <script type="application/ld+json">
  136. [
  137. {
  138. "@context": "https://schema.org",
  139. "@type": "ItemList",
  140. "url": "https://example.com/cat.html",
  141. "name": "Man bites cat",
  142. "description": "A cat's tale"
  143. },
  144. {
  145. "@context": "https://schema.org",
  146. "@type": "BreadcrumbList",
  147. "itemListElement":[
  148. {
  149. "@type": "ListItem",
  150. "position": 1,
  151. "item": {
  152. "@id": "https://www.example.com",
  153. "name": "Cat News"
  154. }
  155. }
  156. ]
  157. }
  158. ]
  159. </script>
  160. <script type="application/ld+json">
  161. #{ld_json}
  162. </script>
  163. </body>
  164. </html>
  165. HTML
  166. include_examples 'structured data'
  167. end
  168. context 'with unsupported in same block LD+JSON' do
  169. let(:html) { <<~HTML }
  170. <!doctype html>
  171. <html>
  172. <body>
  173. <script type="application/ld+json">
  174. [
  175. {
  176. "@context": "https://schema.org",
  177. "@type": "ItemList",
  178. "url": "https://example.com/cat.html",
  179. "name": "Man bites cat",
  180. "description": "A cat's tale"
  181. },
  182. #{ld_json}
  183. ]
  184. </script>
  185. </body>
  186. </html>
  187. HTML
  188. include_examples 'structured data'
  189. end
  190. end
  191. context 'when Open Graph protocol data is present' do
  192. let(:html) { <<~HTML }
  193. <!doctype html>
  194. <html>
  195. <head>
  196. <meta property="og:url" content="https://example.com/dog.html">
  197. <meta property="og:title" content="Man bites dog">
  198. <meta property="og:description" content="A dog's tale">
  199. <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
  200. <meta property="og:author" content="Charlie Brown">
  201. <meta property="og:locale" content="en">
  202. <meta property="og:image" content="https://example.com/snoopy.jpg">
  203. <meta property="og:image:alt" content="A good boy">
  204. <meta property="og:site_name" content="Pet News">
  205. </head>
  206. </html>
  207. HTML
  208. describe '#canonical_url' do
  209. it 'returns the URL from Open Graph protocol data' do
  210. expect(subject.canonical_url).to eq 'https://example.com/dog.html'
  211. end
  212. end
  213. describe '#title' do
  214. it 'returns the title from Open Graph protocol data' do
  215. expect(subject.title).to eq 'Man bites dog'
  216. end
  217. end
  218. describe '#description' do
  219. it 'returns the description from Open Graph protocol data' do
  220. expect(subject.description).to eq "A dog's tale"
  221. end
  222. end
  223. describe '#published_at' do
  224. it 'returns the publicaton time from Open Graph protocol data' do
  225. expect(subject.published_at).to eq '2022-01-31T19:53:00+00:00'
  226. end
  227. end
  228. describe '#author_name' do
  229. it 'returns the author name from Open Graph protocol data' do
  230. expect(subject.author_name).to eq 'Charlie Brown'
  231. end
  232. end
  233. describe '#language' do
  234. it 'returns the language from Open Graph protocol data' do
  235. expect(subject.language).to eq 'en'
  236. end
  237. end
  238. describe '#image' do
  239. it 'returns the image from Open Graph protocol data' do
  240. expect(subject.image).to eq 'https://example.com/snoopy.jpg'
  241. end
  242. end
  243. describe '#image:alt' do
  244. it 'returns the image description from Open Graph protocol data' do
  245. expect(subject.image_alt).to eq 'A good boy'
  246. end
  247. end
  248. describe '#provider_name' do
  249. it 'returns the provider name from Open Graph protocol data' do
  250. expect(subject.provider_name).to eq 'Pet News'
  251. end
  252. end
  253. end
  254. end