123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434 |
- <?php
- namespace andreskrey\Readability\Nodes;
- use andreskrey\Readability\Nodes\DOM\DOMDocument;
- use andreskrey\Readability\Nodes\DOM\DOMElement;
- use andreskrey\Readability\Nodes\DOM\DOMNode;
- use andreskrey\Readability\Nodes\DOM\DOMText;
/**
 * Scoring and traversal helpers shared by the Readability DOM node classes.
 *
 * The trait expects the class using it to expose the DOM node members it reads (nodeName, nodeValue,
 * textContent, parentNode, childNodes, attributes, ownerDocument) and the DOM methods it calls
 * (getElementsByTagName(), hasChildNodes() and, for element nodes, a parent getAttribute()).
 *
 * @method \DOMNode removeAttribute($name)
 */
trait NodeTrait
{
    /**
     * Content score of the node. Used to determine the value of the content.
     *
     * @var int
     */
    public $contentScore = 0;

    /**
     * Whether initializeNode() has already computed the score of this node.
     *
     * @var bool
     */
    private $initialized = false;

    /**
     * Whether this node has been flagged as a data table.
     *
     * @var bool
     */
    private $readabilityDataTable = false;

    /**
     * Tag names that hasSingleChildBlockElement() treats as block elements.
     *
     * @var array
     */
    private $divToPElements = [
        'a',
        'blockquote',
        'dl',
        'div',
        'img',
        'ol',
        'p',
        'pre',
        'table',
        'ul',
        'select',
    ];

    /**
     * Tells whether initializeNode() has already run for this node.
     *
     * @return bool
     */
    public function isInitialized()
    {
        return $this->initialized;
    }

    /**
     * Tells whether the node has been flagged as a data table.
     *
     * @return bool
     */
    public function isReadabilityDataTable()
    {
        return $this->readabilityDataTable;
    }

    /**
     * Sets the data table flag of the node.
     *
     * @param bool $param New value of the flag
     */
    public function setReadabilityDataTable($param)
    {
        $this->readabilityDataTable = $param;
    }

    /**
     * Initializer. Calculates the base content score of the node from its tag name (plus, optionally, the
     * class/id weight) and marks the node as initialized. Calling it again on an initialized node is a no-op.
     *
     * @ TODO: I don't like the weightClasses param. How can we get the config here?
     *
     * @param $weightClasses bool Add the result of getClassWeight() to the score?
     *
     * @return static The node itself, to allow chaining
     */
    public function initializeNode($weightClasses)
    {
        if ($this->isInitialized()) {
            return $this;
        }

        switch ($this->nodeName) {
            case 'div':
                $score = 5;
                break;

            case 'pre':
            case 'td':
            case 'blockquote':
                $score = 3;
                break;

            case 'address':
            case 'ol':
            case 'ul':
            case 'dl':
            case 'dd':
            case 'dt':
            case 'li':
            case 'form':
                $score = -3;
                break;

            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'th':
                $score = -5;
                break;

            default:
                $score = 0;
        }

        if ($weightClasses) {
            $score += $this->getClassWeight();
        }

        $this->contentScore = $score;
        $this->initialized = true;

        return $this;
    }

    /**
     * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
     * to check first the existence of the attributes property.
     *
     * The ReturnTypeWillChange attribute silences the PHP 8.1+ deprecation raised when overriding the native
     * method without a return type; older PHP versions parse that line as a comment.
     *
     * @param $attributeName string Attribute to retrieve
     *
     * @return string Value returned by the parent implementation, or an empty string for nodes without attributes
     */
    #[\ReturnTypeWillChange]
    public function getAttribute($attributeName)
    {
        if ($this->attributes === null) {
            return '';
        }

        return parent::getAttribute($attributeName);
    }

    /**
     * Get the ancestors of the current node, closest first. The walk stops before the document node.
     *
     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
     *
     * @return array
     */
    public function getNodeAncestors($maxLevel = 3)
    {
        $ancestors = [];
        $node = $this->parentNode;

        while ($node && !($node instanceof DOMDocument)) {
            $ancestors[] = $node;

            // Strict comparison: a $maxLevel of false (or 0) never matches, so every ancestor is collected.
            if (count($ancestors) === $maxLevel) {
                break;
            }

            $node = $node->parentNode;
        }

        return $ancestors;
    }

    /**
     * Returns all the "a" elements found under the current element.
     *
     * @return array
     */
    public function getAllLinks()
    {
        return iterator_to_array($this->getElementsByTagName('a'));
    }

    /**
     * Get the density of links as a ratio of the content.
     * This is the amount of text that is inside a link divided by the total text in the node.
     *
     * @return int|float 0 when the node has no text, otherwise link text length divided by total text length
     */
    public function getLinkDensity()
    {
        $textLength = mb_strlen($this->getTextContent(true));
        if (!$textLength) {
            return 0;
        }

        $linkLength = 0;
        /** @var DOMElement $link */
        foreach ($this->getAllLinks() as $link) {
            $linkLength += mb_strlen($link->getTextContent(true));
        }

        return $linkLength / $textLength;
    }

    /**
     * Calculates the weight of the class/id of the current element: each attribute loses 25 points when it
     * matches the "negative" pattern and gains 25 points when it matches the "positive" one.
     *
     * @return int
     */
    public function getClassWeight()
    {
        $weight = 0;

        // The class and the id attributes are scored with exactly the same rules.
        foreach (['class', 'id'] as $attributeName) {
            $value = $this->getAttribute($attributeName);
            if (!trim($value)) {
                continue;
            }

            if (preg_match(NodeUtility::$regexps['negative'], $value)) {
                $weight -= 25;
            }

            if (preg_match(NodeUtility::$regexps['positive'], $value)) {
                $weight += 25;
            }
        }

        return $weight;
    }

    /**
     * Returns the full text of the node.
     *
     * @param bool $normalize Collapse runs of two or more whitespace characters into one space and trim?
     *
     * @return string
     */
    public function getTextContent($normalize = false)
    {
        $nodeValue = $this->nodeValue;

        if ($normalize) {
            // Cast first: nodeValue may be null, and passing null to preg_replace()/trim() is deprecated.
            $nodeValue = trim(preg_replace('/\s{2,}/', ' ', (string) $nodeValue));
        }

        return $nodeValue;
    }

    /**
     * Returns the children of the current node.
     *
     * @param bool $filterEmptyDOMText Filter out text nodes that are empty or contain only whitespace?
     *
     * @return array Children indexed from 0 without gaps
     */
    public function getChildren($filterEmptyDOMText = false)
    {
        $children = iterator_to_array($this->childNodes);
        if (!$filterEmptyDOMText) {
            return $children;
        }

        // Build a fresh list so the keys stay sequential after dropping the empty text nodes.
        $kept = [];
        foreach ($children as $child) {
            if ($child->nodeName !== '#text' || mb_strlen(trim($child->nodeValue))) {
                $kept[] = $child;
            }
        }

        return $kept;
    }

    /**
     * Return an array indicating how many rows and columns this table has, honouring the rowspan and
     * colspan attributes. A missing, non numeric or non positive span counts as 1.
     *
     * @return array Array with the keys "rows" and "columns"
     */
    public function getRowAndColumnCount()
    {
        $rows = 0;
        $columns = 0;

        /** @var \DOMElement $tr */
        foreach ($this->getElementsByTagName('tr') as $tr) {
            // "||" yields a boolean in PHP, so the span has to be converted to a number explicitly.
            $rows += max(1, (int) $tr->getAttribute('rowspan'));

            // Now look for column-related info
            $columnsInThisRow = 0;
            /** @var \DOMElement $cell */
            foreach ($tr->getElementsByTagName('td') as $cell) {
                $columnsInThisRow += max(1, (int) $cell->getAttribute('colspan'));
            }

            $columns = max($columns, $columnsInThisRow);
        }

        return ['rows' => $rows, 'columns' => $columns];
    }

    /**
     * Creates a new node based on the text content of the original node.
     *
     * The text is attached as a text node instead of being passed as the value of createElement(), because
     * that value is not escaped: an ampersand in the text would raise a warning and truncate the content.
     *
     * @param $originalNode DOMNode
     * @param $tagName string
     *
     * @return DOMElement
     */
    public function createNode($originalNode, $tagName)
    {
        $document = $originalNode->ownerDocument;
        $text = (string) $originalNode->getTextContent();

        $newNode = $document->createElement($tagName);
        if ($text !== '') {
            $newNode->appendChild($document->createTextNode($text));
        }

        return $newNode;
    }

    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     *
     * The closest ancestor is at depth 0 and the search gives up once the depth exceeds $maxDepth, so up to
     * $maxDepth + 1 ancestors are inspected. A $maxDepth of 0 or less removes the limit.
     *
     * @param DOMElement $node
     * @param string $tagName
     * @param int $maxDepth
     *
     * @return bool
     */
    public function hasAncestorTag($node, $tagName, $maxDepth = 3)
    {
        $depth = 0;

        for ($ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
            if ($maxDepth > 0 && $depth > $maxDepth) {
                return false;
            }

            if ($ancestor->nodeName === $tagName) {
                return true;
            }

            $depth++;
        }

        return false;
    }

    /**
     * Checks if the current node has a single child and if that child is a P node.
     * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
     * tags are, in practice, paragraphs.
     *
     * @return bool
     */
    public function hasSinglePNode()
    {
        // Empty text nodes are filtered out, so a text node with real content makes the count differ from 1.
        $children = $this->getChildren(true);

        return count($children) === 1 && $children[0]->nodeName === 'p';
    }

    /**
     * Check if the current element contains a block element.
     * Block elements are the ones defined in the divToPElements array.
     *
     * Despite the name, the check is recursive: it returns true as soon as any descendant is a block element.
     *
     * @return bool
     */
    public function hasSingleChildBlockElement()
    {
        if (!$this->hasChildNodes()) {
            return false;
        }

        /** @var $child DOMElement */
        foreach ($this->getChildren() as $child) {
            if (in_array($child->nodeName, $this->divToPElements, true) || $child->hasSingleChildBlockElement()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
     *
     * @return bool
     */
    public function isElementWithoutContent()
    {
        if (!($this instanceof DOMElement)) {
            return false;
        }

        if (mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) !== 0) {
            return false;
        }

        $childCount = $this->childNodes->length;
        if ($childCount === 0) {
            return true;
        }

        /*
         * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
         * If there's an empty tag with a space inside and a BR (for example "<p> <br/></p>") counting only BRs and
         * HRs will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
         * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
         * are dealing with (and at this point we know they are empty or are just whitespace, because of the
         * mb_strlen check above).
         */
        $textNodeCount = 0;
        foreach ($this->childNodes as $child) {
            if ($child instanceof DOMText) {
                $textNodeCount++;
            }
        }

        $dividerCount = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

        return $childCount === $dividerCount + $textNodeCount;
    }
}
|