NodeTrait.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. <?php
  2. namespace andreskrey\Readability\Nodes;
  3. use andreskrey\Readability\Nodes\DOM\DOMDocument;
  4. use andreskrey\Readability\Nodes\DOM\DOMElement;
  5. use andreskrey\Readability\Nodes\DOM\DOMNode;
  6. use andreskrey\Readability\Nodes\DOM\DOMText;
  /**
   * Scoring and traversal helpers shared by the Readability DOM node classes.
   *
   * The trait reads DOM members of its host ($nodeName, $parentNode, $childNodes, $attributes, ...) and calls
   * parent::getAttribute(), so the host class is expected to extend one of the DOM node classes.
   *
   * @method \DOMNode removeAttribute($name)
   */
  trait NodeTrait
  {
      /**
       * Content score of the node. Used to determine the value of the content.
       *
       * @var int
       */
      public $contentScore = 0;

      /**
       * Set to true once initializeNode() has computed the starting score.
       *
       * @var bool
       */
      private $initialized = false;

      /**
       * Flag for data tables; read and written through isReadabilityDataTable()/setReadabilityDataTable().
       *
       * @var bool
       */
      private $readabilityDataTable = false;

      /**
       * Tag names treated as block elements by hasSingleChildBlockElement().
       *
       * @var array
       */
      private $divToPElements = [
          'a',
          'blockquote',
          'dl',
          'div',
          'img',
          'ol',
          'p',
          'pre',
          'table',
          'ul',
          'select',
      ];

      /**
       * Tells whether initializeNode() has already run on this node.
       *
       * @return bool
       */
      public function isInitialized()
      {
          return $this->initialized;
      }

      /**
       * Returns the data table flag of this node.
       *
       * @return bool
       */
      public function isReadabilityDataTable()
      {
          return $this->readabilityDataTable;
      }

      /**
       * Sets the data table flag of this node.
       *
       * @param bool $param
       */
      public function setReadabilityDataTable($param)
      {
          $this->readabilityDataTable = $param;
      }

      /**
       * Initializer. Computes the starting content score of the node from its tag name (plus, optionally, its
       * class/id weight) and marks the node as initialized. Calling it again on an initialized node is a no-op.
       *
       * @ TODO: I don't like the weightClasses param. How can we get the config here?
       *
       * @param $weightClasses bool Add the result of getClassWeight() to the score?
       *
       * @return static The node itself, to allow chaining
       */
      public function initializeNode($weightClasses)
      {
          if ($this->isInitialized()) {
              return $this;
          }

          // Starting score per tag name. Tags that are not listed start at 0.
          $baseScores = [
              'div'        => 5,
              'pre'        => 3,
              'td'         => 3,
              'blockquote' => 3,
              'address'    => -3,
              'ol'         => -3,
              'ul'         => -3,
              'dl'         => -3,
              'dd'         => -3,
              'dt'         => -3,
              'li'         => -3,
              'form'       => -3,
              'h1'         => -5,
              'h2'         => -5,
              'h3'         => -5,
              'h4'         => -5,
              'h5'         => -5,
              'h6'         => -5,
              'th'         => -5,
          ];

          $tagName = $this->nodeName;
          $contentScore = isset($baseScores[$tagName]) ? $baseScores[$tagName] : 0;

          $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0);
          $this->initialized = true;

          return $this;
      }

      /**
       * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
       * to check first the existence of the attributes property.
       *
       * The attribute line below is an ordinary comment before PHP 8; on newer versions it marks the missing
       * return type declaration as intentional.
       *
       * @param $attributeName string Attribute to retrieve
       *
       * @return string Attribute value, or an empty string when the node has no attributes property
       */
      #[\ReturnTypeWillChange]
      public function getAttribute($attributeName)
      {
          if (!is_null($this->attributes)) {
              return parent::getAttribute($attributeName);
          }

          return '';
      }

      /**
       * Get the ancestors of the current node, closest first. The walk stops before the owning document.
       *
       * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
       *
       * @return array
       */
      public function getNodeAncestors($maxLevel = 3)
      {
          $ancestors = [];
          $level = 0;

          for ($node = $this->parentNode; $node && !($node instanceof DOMDocument); $node = $node->parentNode) {
              $ancestors[] = $node;

              // Strict comparison: false (or any non matching value) never equals the counter, so no limit applies.
              if (++$level === $maxLevel) {
                  break;
              }
          }

          return $ancestors;
      }

      /**
       * Returns all links (<a> descendants) of the current element.
       *
       * @return array
       */
      public function getAllLinks()
      {
          return iterator_to_array($this->getElementsByTagName('a'));
      }

      /**
       * Get the density of links as a ratio of the content.
       * This is the amount of text that is inside a link divided by the total text in the node, so the value is a
       * fraction (for example 0.5), not a number between 0 and 100.
       *
       * @return int|float 0 when the node has no text
       */
      public function getLinkDensity()
      {
          $textLength = mb_strlen($this->getTextContent(true));
          if (!$textLength) {
              return 0;
          }

          $linkLength = 0;
          /** @var DOMElement $link */
          foreach ($this->getAllLinks() as $link) {
              $linkLength += mb_strlen($link->getTextContent(true));
          }

          return $linkLength / $textLength;
      }

      /**
       * Calculates the weight of the class/id of the current element. Each of the two attributes adds 25 when it
       * matches the 'positive' expression and subtracts 25 when it matches the 'negative' one.
       *
       * @return int
       */
      public function getClassWeight()
      {
          $weight = 0;

          // The class name and the id are scored with exactly the same rules.
          foreach (['class', 'id'] as $attributeName) {
              $value = $this->getAttribute($attributeName);
              if (!trim($value)) {
                  continue;
              }

              if (preg_match(NodeUtility::$regexps['negative'], $value)) {
                  $weight -= 25;
              }

              if (preg_match(NodeUtility::$regexps['positive'], $value)) {
                  $weight += 25;
              }
          }

          return $weight;
      }

      /**
       * Returns the full text of the node.
       *
       * @param bool $normalize Collapse runs of two or more whitespace characters into one space and trim the result?
       *
       * @return string
       */
      public function getTextContent($normalize = false)
      {
          $nodeValue = $this->nodeValue;

          if ($normalize) {
              $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue));
          }

          return $nodeValue;
      }

      /**
       * Returns the children of the current node.
       *
       * @param bool $filterEmptyDOMText Drop text nodes that are empty or contain only whitespace?
       *
       * @return array List of child nodes, indexed from 0 without gaps
       */
      public function getChildren($filterEmptyDOMText = false)
      {
          // Defensive: presumably some node types expose no child list at all; treat that as "no children".
          $childNodes = $this->childNodes;
          $ret = is_null($childNodes) ? [] : iterator_to_array($childNodes);

          if ($filterEmptyDOMText) {
              // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
              $ret = array_values(array_filter($ret, function ($node) {
                  return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue));
              }));
          }

          return $ret;
      }

      /**
       * Return an array indicating how many rows and columns this table has.
       *
       * Every <tr> contributes its rowspan attribute and every <td> its colspan attribute. A missing, non numeric
       * or non positive attribute counts as 1. The column count is the widest row found.
       *
       * @return array ['rows' => int, 'columns' => int]
       */
      public function getRowAndColumnCount()
      {
          $rows = $columns = 0;

          foreach ($this->getElementsByTagName('tr') as $tr) {
              /** @var \DOMElement $tr */
              // "$rowspan || 1" would be a boolean in PHP and always add 1, so use the integer value instead.
              $rows += max(1, (int) $tr->getAttribute('rowspan'));

              // Now look for column-related info
              $columnsInThisRow = 0;
              foreach ($tr->getElementsByTagName('td') as $cell) {
                  /** @var \DOMElement $cell */
                  $columnsInThisRow += max(1, (int) $cell->getAttribute('colspan'));
              }

              $columns = max($columns, $columnsInThisRow);
          }

          return ['rows' => $rows, 'columns' => $columns];
      }

      /**
       * Creates a new node based on the text content of the original node.
       *
       * The text is attached as a text node instead of being passed as the value of createElement(), because that
       * value treats a bare "&" as the start of an entity reference and would drop part of the text.
       *
       * @param $originalNode DOMNode
       * @param $tagName string
       *
       * @return DOMElement
       */
      public function createNode($originalNode, $tagName)
      {
          $document = $originalNode->ownerDocument;
          $text = (string) $originalNode->getTextContent();

          $newNode = $document->createElement($tagName);
          if ($text !== '') {
              $newNode->appendChild($document->createTextNode($text));
          }

          return $newNode;
      }

      /**
       * Check if a given node has one of its ancestor tag name matching the
       * provided one.
       *
       * Note on depth: the limit is tested with "greater than", so up to $maxDepth + 1 ancestors are examined. A
       * $maxDepth of 0 or less disables the limit.
       *
       * @param DOMElement $node
       * @param string $tagName
       * @param int $maxDepth
       *
       * @return bool
       */
      public function hasAncestorTag($node, $tagName, $maxDepth = 3)
      {
          for ($depth = 0; $node->parentNode; $depth++) {
              if ($maxDepth > 0 && $depth > $maxDepth) {
                  return false;
              }

              if ($node->parentNode->nodeName === $tagName) {
                  return true;
              }

              $node = $node->parentNode;
          }

          return false;
      }

      /**
       * Checks if the current node has a single child and if that child is a P node.
       * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
       * tags are, in practice, paragraphs.
       *
       * @return bool
       */
      public function hasSinglePNode()
      {
          // Empty and whitespace-only text nodes are filtered out, so any text with real content shows up as an
          // extra child and makes the count differ from 1.
          $children = $this->getChildren(true);

          return count($children) === 1 && $children[0]->nodeName === 'p';
      }

      /**
       * Check if the current element contains a block element at any depth.
       * Block elements are the ones defined in the divToPElements array.
       *
       * @return bool
       */
      public function hasSingleChildBlockElement()
      {
          if (!$this->hasChildNodes()) {
              return false;
          }

          foreach ($this->getChildren() as $child) {
              /** @var $child DOMElement */
              // A child matches when it is a block element itself or when anything below it is one.
              if (in_array($child->nodeName, $this->divToPElements, true) || $child->hasSingleChildBlockElement()) {
                  return true;
              }
          }

          return false;
      }

      /**
       * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
       *
       * @return bool
       */
      public function isElementWithoutContent()
      {
          if (!($this instanceof DOMElement)) {
              return false;
          }

          // Anything left after removing what the 'onlyWhitespace' expression matches is real content.
          $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
          if (mb_strlen($remainingText) !== 0) {
              return false;
          }

          $childCount = $this->childNodes->length;
          if ($childCount === 0) {
              return true;
          }

          $dividers = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

          /*
           * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
           * If there's an empty tag with a space inside and a BR (for example "<p> <br/></p>") counting only BRs and
           * HRs will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
           * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
           * are dealing with (And at this point we know they are empty or are just whitespace, because of the
           * mb_strlen check above).
           */
          $textNodes = count(array_filter(iterator_to_array($this->childNodes), function ($child) {
              return $child instanceof DOMText;
          }));

          return $childCount === $dividers + $textNodes;
      }
  }