link_details_extractor_spec.rb 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. # frozen_string_literal: true
  2. require 'rails_helper'
  3. RSpec.describe LinkDetailsExtractor do
  4. subject { described_class.new(original_url, html, nil) }
  5. let(:original_url) { 'https://example.com/dog.html?tracking=123' }
  6. describe '#canonical_url' do
  7. let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }
  8. context 'when canonical URL points to the same host' do
  9. let(:url) { 'https://example.com/dog.html' }
  10. it 'ignores the canonical URLs' do
  11. expect(subject.canonical_url).to eq 'https://example.com/dog.html'
  12. end
  13. end
  14. context 'when canonical URL points to another host' do
  15. let(:url) { 'https://different.example.net/dog.html' }
  16. it 'ignores the canonical URLs' do
  17. expect(subject.canonical_url).to eq original_url
  18. end
  19. end
  20. context 'when canonical URL is set to "null"' do
  21. let(:url) { 'null' }
  22. it 'ignores the canonical URLs' do
  23. expect(subject.canonical_url).to eq original_url
  24. end
  25. end
  26. end
  27. context 'when only basic metadata is present' do
  28. let(:html) { <<~HTML }
  29. <!doctype html>
  30. <html lang="en">
  31. <head>
  32. <title>Man bites dog</title>
  33. <meta name="description" content="A dog&#39;s tale">
  34. </head>
  35. </html>
  36. HTML
  37. it 'extracts the expected values from html metadata' do
  38. expect(subject)
  39. .to have_attributes(
  40. title: eq('Man bites dog'),
  41. description: eq("A dog's tale"),
  42. language: eq('en')
  43. )
  44. end
  45. end
  46. context 'when structured data is present' do
  47. let(:ld_json) do
  48. {
  49. '@context' => 'https://schema.org',
  50. '@type' => 'NewsArticle',
  51. 'headline' => 'Man bites dog',
  52. 'description' => "A dog's tale",
  53. 'datePublished' => '2022-01-31T19:53:00+00:00',
  54. 'author' => {
  55. '@type' => 'Organization',
  56. 'name' => 'Charlie Brown',
  57. },
  58. 'publisher' => {
  59. '@type' => 'NewsMediaOrganization',
  60. 'name' => 'Pet News',
  61. 'url' => 'https://example.com',
  62. },
  63. 'inLanguage' => {
  64. name: 'English',
  65. alternateName: 'en',
  66. },
  67. }.to_json
  68. end
  69. shared_examples 'structured data' do
  70. it 'extracts the expected values from structured data' do
  71. expect(subject)
  72. .to have_attributes(
  73. title: eq('Man bites dog'),
  74. description: eq("A dog's tale"),
  75. published_at: eq('2022-01-31T19:53:00+00:00'),
  76. author_name: eq('Charlie Brown'),
  77. provider_name: eq('Pet News'),
  78. language: eq('en')
  79. )
  80. end
  81. end
  82. context 'when is wrapped in CDATA tags' do
  83. let(:html) { <<~HTML }
  84. <!doctype html>
  85. <html>
  86. <head>
  87. <script type="application/ld+json">
  88. //<![CDATA[
  89. #{ld_json}
  90. //]]>
  91. </script>
  92. </head>
  93. </html>
  94. HTML
  95. include_examples 'structured data'
  96. end
  97. context 'with the first tag is invalid JSON' do
  98. let(:html) { <<~HTML }
  99. <!doctype html>
  100. <html>
  101. <body>
  102. <script type="application/ld+json">
  103. invalid LD+JSON
  104. </script>
  105. <script type="application/ld+json">
  106. #{ld_json}
  107. </script>
  108. </body>
  109. </html>
  110. HTML
  111. include_examples 'structured data'
  112. end
  113. context 'with preceding block of unsupported LD+JSON' do
  114. let(:html) { <<~HTML }
  115. <!doctype html>
  116. <html>
  117. <body>
  118. <script type="application/ld+json">
  119. [
  120. {
  121. "@context": "https://schema.org",
  122. "@type": "ItemList",
  123. "url": "https://example.com/cat.html",
  124. "name": "Man bites cat",
  125. "description": "A cat's tale"
  126. },
  127. {
  128. "@context": "https://schema.org",
  129. "@type": "BreadcrumbList",
  130. "itemListElement":[
  131. {
  132. "@type": "ListItem",
  133. "position": 1,
  134. "item": {
  135. "@id": "https://www.example.com",
  136. "name": "Cat News"
  137. }
  138. }
  139. ]
  140. }
  141. ]
  142. </script>
  143. <script type="application/ld+json">
  144. #{ld_json}
  145. </script>
  146. </body>
  147. </html>
  148. HTML
  149. include_examples 'structured data'
  150. end
  151. context 'with unsupported in same block LD+JSON' do
  152. let(:html) { <<~HTML }
  153. <!doctype html>
  154. <html>
  155. <body>
  156. <script type="application/ld+json">
  157. [
  158. {
  159. "@context": "https://schema.org",
  160. "@type": "ItemList",
  161. "url": "https://example.com/cat.html",
  162. "name": "Man bites cat",
  163. "description": "A cat's tale"
  164. },
  165. #{ld_json}
  166. ]
  167. </script>
  168. </body>
  169. </html>
  170. HTML
  171. include_examples 'structured data'
  172. end
  173. end
  174. context 'when Open Graph protocol data is present' do
  175. let(:html) { <<~HTML }
  176. <!doctype html>
  177. <html>
  178. <head>
  179. <meta property="og:url" content="https://example.com/dog.html">
  180. <meta property="og:title" content="Man bites dog">
  181. <meta property="og:description" content="A dog's tale">
  182. <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
  183. <meta property="og:author" content="Charlie Brown">
  184. <meta property="og:locale" content="en">
  185. <meta property="og:image" content="https://example.com/snoopy.jpg">
  186. <meta property="og:image:alt" content="A good boy">
  187. <meta property="og:site_name" content="Pet News">
  188. </head>
  189. </html>
  190. HTML
  191. it 'extracts the expected values from open graph data' do
  192. expect(subject)
  193. .to have_attributes(
  194. canonical_url: eq('https://example.com/dog.html'),
  195. title: eq('Man bites dog'),
  196. description: eq("A dog's tale"),
  197. published_at: eq('2022-01-31T19:53:00+00:00'),
  198. author_name: eq('Charlie Brown'),
  199. language: eq('en'),
  200. image: eq('https://example.com/snoopy.jpg'),
  201. image_alt: eq('A good boy'),
  202. provider_name: eq('Pet News')
  203. )
  204. end
  205. end
  206. end