Readability.php 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152
  1. <?php
  2. /**
  3. * Arc90's Readability ported to PHP for FiveFilters.org
  4. * Based on readability.js version 1.7.1 (without multi-page support)
  5. * Updated to allow HTML5 parsing with html5lib
  6. * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
  7. * ------------------------------------------------------
  8. * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
  9. * Arc90's project URL: http://lab.arc90.com/experiments/readability/
  10. * JS Source: http://code.google.com/p/arc90labs-readability
  11. * Ported by: Keyvan Minoukadeh, http://www.keyvan.net
  12. * More information: http://fivefilters.org/content-only/
  13. * License: Apache License, Version 2.0
  14. * Requires: PHP5
  15. * Date: 2012-09-19
  16. *
  17. * Differences between the PHP port and the original
  18. * ------------------------------------------------------
  19. * Arc90's Readability is designed to run in the browser. It works on the DOM
  20. * tree (the parsed HTML) after the page's CSS styles have been applied and
  21. * Javascript code executed. This PHP port does not run inside a browser.
  22. * We use PHP's ability to parse HTML to build our DOM tree, but we cannot
  23. * rely on CSS or Javascript support. As such, the results will not always
  24. * match Arc90's Readability. (For example, if a web page contains CSS style
  25. * rules or Javascript code which hide certain HTML elements from display,
  26. * Arc90's Readability will dismiss those from consideration but our PHP port,
  27. * unable to understand CSS or Javascript, will not know any better.)
  28. *
  29. * Another significant difference is that the aim of Arc90's Readability is
  30. * to re-present the main content block of a given web page so users can
  31. * read it more easily in their browsers. Correct identification, clean up,
  32. * and separation of the content block is only a part of this process.
  33. * This PHP port is only concerned with this part, it does not include code
  34. * that relates to presentation in the browser - Arc90 already do
  35. * that extremely well, and for PDF output there's FiveFilters.org's
  36. * PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
  37. *
  38. * Finally, this class contains methods that might be useful for developers
  39. * working on HTML document fragments. So without deviating too much from
  40. * the original code (which I don't want to do because it makes debugging
  41. * and updating more difficult), I've tried to make it a little more
  42. * developer friendly. You should be able to use the methods here on
  43. * existing DOMElement objects without passing an entire HTML document to
  44. * be parsed.
  45. */
  46. // This class allows us to do JavaScript-like assignments to innerHTML
  47. require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
  48. libxml_use_internal_errors(true);
  49. // Alternative usage (for testing only!)
  50. // uncomment the lines below and call Readability.php in your browser
  51. // passing it the URL of the page you'd like content from, e.g.:
  52. // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
  53. /*
  54. if (!isset($_GET['url']) || $_GET['url'] == '') {
  55. die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
  56. }
  57. $url = $_GET['url'];
  58. if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
  59. $html = file_get_contents($url);
  60. $r = new Readability($html, $url);
  61. $r->init();
  62. echo $r->articleContent->innerHTML;
  63. */
  64. class Readability
  65. {
  66. public $version = '1.7.1-without-multi-page';
  67. public $convertLinksToFootnotes = false;
  68. public $revertForcedParagraphElements = true;
  69. public $articleTitle;
  70. public $articleContent;
  71. public $dom;
  72. public $url = null; // optional - URL where HTML was retrieved
  73. public $debug = false;
  74. public $lightClean = true; // preserves more content (experimental) added 2012-09-19
  75. protected $body = null; //
  76. protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
  77. protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
  78. protected $success = false; // indicates whether we were able to extract or not
  79. /**
  80. * All of the regular expressions in use within readability.
  81. * Defined up here so we don't instantiate them repeatedly in loops.
  82. **/
  83. public $regexps = array(
  84. 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
  85. 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
  86. 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
  87. 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
  88. 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
  89. 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
  90. 'replaceFonts' => '/<(\/?)font[^>]*>/i',
  91. // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
  92. 'normalize' => '/\s{2,}/',
  93. 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
  94. 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
  95. 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
  96. );
  97. /* constants */
  98. const FLAG_STRIP_UNLIKELYS = 1;
  99. const FLAG_WEIGHT_CLASSES = 2;
  100. const FLAG_CLEAN_CONDITIONALLY = 4;
  /**
   * Create instance of Readability.
   *
   * The markup is rewritten as a plain string first (runs of <br> become
   * paragraph breaks, <font> tags become <span>) and only then parsed into
   * $this->dom, whose elements are registered as JSLikeHTMLElement.
   *
   * @param string $html   UTF-8 encoded string
   * @param string $url    (optional) URL associated with HTML (used for footnotes)
   * @param string $parser which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
   */
  public function __construct($html, $url=null, $parser='libxml')
  {
      $this->url = $url;
      $html = (string) $html;

      /* Turn all double br's into p's, and font tags into spans. */
      $rewrites = array(
          'replaceBrs'   => '</p><p>',
          'replaceFonts' => '<$1span>',
      );
      foreach ($rewrites as $regexKey => $replacement) {
          $rewritten = preg_replace($this->regexps[$regexKey], $replacement, $html);
          // preg_replace() returns null on a PCRE failure (e.g. backtrack limit);
          // in that case keep the markup we already have instead of discarding it.
          if ($rewritten !== null) {
              $html = $rewritten;
          }
      }

      // Represent every non-ASCII character as a numeric character reference.
      // This is the job previously done with the 'HTML-ENTITIES' pseudo-encoding
      // of mb_convert_encoding(), which mbstring deprecated in PHP 8.2.
      $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

      if (trim($html) == '') {
          $html = '<html></html>';
      }

      // Only try html5lib when it was requested AND is actually loaded;
      // anything else (including a failed html5lib parse) falls back to libxml.
      $this->dom = null;
      if ($parser == 'html5lib' && class_exists('HTML5_Parser')) {
          $this->dom = HTML5_Parser::parse($html);
      }
      if (!$this->dom) {
          $this->dom = new DOMDocument();
          $this->dom->preserveWhiteSpace = false;
          // Real-world markup is rarely valid; parser complaints are deliberately muted.
          @$this->dom->loadHTML($html);
      }

      $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
  }
  /**
   * Accessor for the article title element.
   *
   * @return DOMElement|null the <h1> stored by init(), or null when init() has not stored one yet
   */
  public function getTitle()
  {
      return $this->articleTitle;
  }
  /**
   * Accessor for the article content element.
   *
   * @return DOMElement|null the content element stored by init(), or null when init() has not stored one yet
   */
  public function getContent()
  {
      return $this->articleContent;
  }
  /**
   * Runs readability.
   *
   * Steps, in order:
   *  1. Strip scripts and prepare the document (styles removed, body ensured).
   *  2. Build the wrapper elements (#readOverlay > #readInner).
   *  3. Work out the title and grab the article content from the current tree.
   *  4. Empty the body and put the wrapper (title + content) in its place.
   *  5. Post-process the content and publish title/content on the instance.
   *
   * @return boolean true if we found content, false otherwise (including when
   *                 the document has no root element at all)
   **/
  public function init()
  {
      if (!isset($this->dom->documentElement)) {
          return false;
      }

      $this->removeScripts($this->dom);

      // Start optimistic; cleared further down when no article could be grabbed.
      $this->success = true;

      $bodies = $this->dom->getElementsByTagName('body');
      if ($bodies->length > 0) {
          $firstBody = $bodies->item(0);
          if ($this->bodyCache == null) {
              $this->bodyCache = $firstBody->innerHTML;
          }
          if ($this->body == null) {
              $this->body = $firstBody;
          }
      }

      $this->prepDocument();

      /* Build readability's DOM tree */
      $overlay = $this->dom->createElement('div');
      $overlay->setAttribute('id', 'readOverlay');
      $innerDiv = $this->dom->createElement('div');
      $innerDiv->setAttribute('id', 'readInner');

      // The title is read before grabArticle() starts rewriting the tree.
      $articleTitle = $this->getArticleTitle();
      $articleContent = $this->grabArticle();
      if (!$articleContent) {
          // Nothing usable was found: report failure but still hand back a placeholder node.
          $this->success = false;
          $articleContent = $this->dom->createElement('div');
          $articleContent->setAttribute('id', 'readability-content');
          $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
      }

      /* Glue the structure of our document together. */
      $innerDiv->appendChild($articleTitle);
      $innerDiv->appendChild($articleContent);
      $overlay->appendChild($innerDiv);

      /* Clear the old HTML, insert the new content. */
      $this->body->innerHTML = '';
      $this->body->appendChild($overlay);
      $this->body->removeAttribute('style');

      $this->postProcessContent($articleContent);

      // Expose the results through the public properties / getters.
      $this->articleTitle = $articleTitle;
      $this->articleContent = $articleContent;

      return $this->success;
  }
  /**
   * Print a diagnostic line ("* <message>" followed by a newline) when
   * $this->debug is switched on; otherwise do nothing.
   *
   * @param string $msg text to print
   * @return void
   */
  protected function dbg($msg)
  {
      if (!$this->debug) {
          return;
      }
      echo '* ' . $msg . "\n";
  }
  /**
   * Run any post-process modifications to article content as necessary.
   *
   * Currently this only converts links to footnotes, and only when
   * $convertLinksToFootnotes is enabled and the page URL does not contain
   * "wikipedia.org".
   *
   * @param DOMElement $articleContent extracted article element, modified in place
   * @return void
   */
  public function postProcessContent($articleContent)
  {
      if (!$this->convertLinksToFootnotes) {
          return;
      }

      // $url is optional and defaults to null. Cast it instead of muting the
      // call with "@": passing null as a preg_match() subject is deprecated
      // since PHP 8.1, and an empty string gives the same "no match" result.
      if (preg_match('/wikipedia\.org/', (string) $this->url)) {
          return;
      }

      $this->addFootnotes($articleContent);
  }
  /**
   * Get the article title as an H1.
   *
   * Starts from the text of the document's <title> and tries to strip a
   * site-name prefix/suffix:
   *  - " | " or " - " separators: keep the part before the last separator,
   *    or the part after the first one if that leaves fewer than 3 words;
   *  - ": " separator: keep the part after the last colon, or the part after
   *    the first colon if that leaves fewer than 3 words;
   *  - otherwise, if the title is very long (> 150 chars) or very short
   *    (< 15 chars) and the page has exactly one <h1>, use that <h1>'s text.
   * If the result has 4 words or fewer, the untouched <title> text is used.
   *
   * @return DOMElement a new, unattached <h1> holding the title as plain text
   */
  protected function getArticleTitle()
  {
      $curTitle = '';
      $origTitle = '';

      // A document without <title> yields null here; skip the lookup rather
      // than handing null to getInnerText().
      $titleElem = $this->dom->getElementsByTagName('title')->item(0);
      if ($titleElem !== null) {
          try {
              $curTitle = $origTitle = $this->getInnerText($titleElem);
          } catch (Exception $e) {
              // Best effort: a title we cannot read is treated as an empty title.
          }
      }

      if (preg_match('/ [\|\-] /', $curTitle))
      {
          $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
          if (count(explode(' ', (string) $curTitle)) < 3) {
              $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
          }
      }
      else if (strpos($curTitle, ': ') !== false)
      {
          $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
          if (count(explode(' ', (string) $curTitle)) < 3) {
              $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
          }
      }
      else
      {
          // Count characters, not bytes, so the 15/150 limits mean the same
          // for non-ASCII titles as they do in the JavaScript original.
          $titleLength = mb_strlen($curTitle, 'UTF-8');
          if ($titleLength > 150 || $titleLength < 15)
          {
              $hOnes = $this->dom->getElementsByTagName('h1');
              if ($hOnes->length == 1)
              {
                  $curTitle = $this->getInnerText($hOnes->item(0));
              }
          }
      }

      $curTitle = trim((string) $curTitle);
      if (count(explode(' ', $curTitle)) <= 4) {
          $curTitle = $origTitle;
      }

      // The title is plain text, so attach it as a text node. Assigning it to
      // innerHTML would re-parse it as markup and mangle titles containing
      // "&" or "<".
      $articleTitle = $this->dom->createElement('h1');
      if ((string) $curTitle !== '') {
          $articleTitle->appendChild($this->dom->createTextNode($curTitle));
      }
      return $articleTitle;
  }
  /**
   * Prepare the HTML document for readability to scrape it.
   *
   * Makes sure $this->body exists and is tagged with id "readabilityBody",
   * then removes every <style> element from the document.
   *
   * @return void
   **/
  protected function prepDocument()
  {
      // Badly broken HTML may have produced no body at all; in that case make
      // one and hang it off the root element so later steps have a target.
      if ($this->body == null) {
          $this->body = $this->dom->createElement('body');
          $this->dom->documentElement->appendChild($this->body);
      }
      $this->body->setAttribute('id', 'readabilityBody');

      /* Remove all style tags. The node list is live, so walk it from the end. */
      $styleTags = $this->dom->getElementsByTagName('style');
      $idx = $styleTags->length;
      while (--$idx >= 0) {
          $styleTag = $styleTags->item($idx);
          $styleTag->parentNode->removeChild($styleTag);
      }

      // The JS original also turns double <br>s into paragraphs and <font>
      // into <span> at this point. In this port that rewrite happens in the
      // constructor, on the raw HTML string, before the DOM tree is built.
  }
  /**
   * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
   * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
   *
   * Every <a> inside $articleContent gets a numbered superscript marker right
   * after it, and a matching entry in a "References" list that is appended
   * to $articleContent when at least one link was footnoted. Links whose
   * class contains 'readability-DoNotFootnote', or whose text matches the
   * 'skipFootnoteLink' pattern, are left alone.
   *
   * @param DOMElement $articleContent article element, modified in place
   * @return void
   **/
  public function addFootnotes($articleContent)
  {
      $footnotesWrapper = $this->dom->createElement('div');
      $footnotesWrapper->setAttribute('id', 'readability-footnotes');
      $footnotesWrapper->innerHTML = '<h3>References</h3>';

      $articleFootnotes = $this->dom->createElement('ol');
      $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
      $footnotesWrapper->appendChild($articleFootnotes);

      // Live list: the markers inserted below show up in it too, and are
      // skipped on a later iteration thanks to their DoNotFootnote class.
      $articleLinks = $articleContent->getElementsByTagName('a');
      $linkCount = 0;
      for ($i = 0; $i < $articleLinks->length; $i++)
      {
          $articleLink = $articleLinks->item($i);
          $linkText = $this->getInnerText($articleLink);

          // Decide whether to skip before building any nodes for this link.
          if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
              continue;
          }
          $linkCount++;

          // Copy the link before the original is restyled below.
          $footnoteLink = $articleLink->cloneNode(true);

          $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
          if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

          /** Add a superscript reference after the article link */
          $refLink = $this->dom->createElement('a');
          $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
          $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
          $refLink->setAttribute('class', 'readability-DoNotFootnote');
          $refLink->setAttribute('style', 'color: inherit;');
          // insertBefore() with a null reference node appends, so this one call
          // handles both "link is the last child" and "link has a next sibling".
          // The previous `lastChild == $articleLink` test was a loose object
          // comparison, not a node-identity check, and could misplace the marker.
          $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

          $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
          $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

          /** Build the footnote entry: back-reference, link copy, then the domain */
          $footnote = $this->dom->createElement('li');
          $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

          // The label (title attribute, else link text) is plain text: swap the
          // copy's children for a text node instead of re-parsing it as HTML.
          $label = $footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText;
          while ($footnoteLink->firstChild) {
              $footnoteLink->removeChild($footnoteLink->firstChild);
          }
          if ((string) $label !== '') {
              $footnoteLink->appendChild($this->dom->createTextNode($label));
          }
          $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
          $footnote->appendChild($footnoteLink);

          if ($linkDomain) {
              // Appended as nodes so the host name is never interpreted as markup.
              $domainNote = $this->dom->createElement('small');
              $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
              $footnote->appendChild($domainNote);
          }

          $articleFootnotes->appendChild($footnote);
      }

      if ($linkCount > 0) {
          $articleContent->appendChild($footnotesWrapper);
      }
  }
  /**
   * Replace every <p class="readability-styled"> found below the given
   * element with a plain text node carrying the paragraph's text content.
   *
   * @param DOMElement $articleContent element whose descendants are searched; modified in place
   * @return void
   */
  public function revertReadabilityStyledElements($articleContent)
  {
      $doc = $articleContent->ownerDocument;
      $finder = new DOMXPath($doc);
      $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

      // Work from the last match back to the first.
      $idx = $styled->length;
      while ($idx-- > 0) {
          $paragraph = $styled->item($idx);
          $asText = $doc->createTextNode($paragraph->textContent);
          $paragraph->parentNode->replaceChild($asText, $paragraph);
      }
  }
  /**
   * Prepare the article node for display: clean styles and breaks, optionally
   * revert the forced 'readability-styled' paragraphs, remove or conditionally
   * remove a fixed list of element types, drop empty paragraphs, and finally
   * strip <br> tags that sit directly in front of a <p>.
   *
   * @param DOMElement $articleContent article element, modified in place
   * @return void
   */
  public function prepArticle($articleContent)
  {
      $this->cleanStyles($articleContent);
      $this->killBreaks($articleContent);
      if ($this->revertForcedParagraphElements) {
          $this->revertReadabilityStyledElements($articleContent);
      }

      /* Clean out junk from the article content */
      $this->cleanConditionally($articleContent, 'form');
      $this->clean($articleContent, 'object');
      $this->clean($articleContent, 'h1');

      // A lone h2 is most likely acting as the page header, which we already
      // have, so it goes too - unless lightClean asks us to keep more content.
      if (!$this->lightClean) {
          $h2Count = $articleContent->getElementsByTagName('h2')->length;
          if ($h2Count == 1) {
              $this->clean($articleContent, 'h2');
          }
      }

      $this->clean($articleContent, 'iframe');
      $this->cleanHeaders($articleContent);

      /* These run last because the removals above can change what they see */
      foreach (array('table', 'ul', 'div') as $tagToCheck) {
          $this->cleanConditionally($articleContent, $tagToCheck);
      }

      /* Remove extra paragraphs: no media inside and no text */
      $mediaTags = array('img', 'embed', 'object', 'iframe');
      $articleParagraphs = $articleContent->getElementsByTagName('p');
      for ($idx = $articleParagraphs->length - 1; $idx >= 0; $idx--) {
          $paragraph = $articleParagraphs->item($idx);

          $holdsMedia = false;
          foreach ($mediaTags as $mediaTag) {
              if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                  $holdsMedia = true;
                  break;
              }
          }
          if ($holdsMedia) {
              continue;
          }

          if ($this->getInnerText($paragraph, false) == '') {
              $paragraph->parentNode->removeChild($paragraph);
          }
      }

      try {
          $withoutStrayBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
          $articleContent->innerHTML = $withoutStrayBreaks;
      } catch (Exception $e) {
          $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
      }
  }
  /**
   * Initialize a node with the readability score attribute.
   *
   * Attaches a 'readability' attribute (the content score) to the node,
   * starting at 0, adjusts it by a fixed amount for the node's tag name and
   * then adds whatever getClassWeight() returns for the node.
   *
   * @param DOMElement $node element to initialise
   * @return void
   **/
  protected function initializeNode($node)
  {
      // Per-tag adjustment of the starting score; tags not listed get none.
      static $tagAdjustments = array(
          'DIV'        => 5,
          'PRE'        => 3,
          'TD'         => 3,
          'BLOCKQUOTE' => 3,
          'ADDRESS'    => -3,
          'OL'         => -3,
          'UL'         => -3,
          'DL'         => -3,
          'DD'         => -3,
          'DT'         => -3,
          'LI'         => -3,
          'FORM'       => -3,
          'H1'         => -5,
          'H2'         => -5,
          'H3'         => -5,
          'H4'         => -5,
          'H5'         => -5,
          'H6'         => -5,
          'TH'         => -5,
      );

      $scoreAttr = $this->dom->createAttribute('readability');
      $scoreAttr->value = 0; // this is our contentScore
      $node->setAttributeNode($scoreAttr);

      // Upper-cased defensively so the lookup works whatever case the parser reports.
      $tag = strtoupper($node->tagName);
      if (isset($tagAdjustments[$tag])) {
          $scoreAttr->value += $tagAdjustments[$tag];
      }

      $scoreAttr->value += $this->getClassWeight($node);
  }
  450. /***
  451. * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
  452. * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
  453. *
  454. * @return DOMElement
  455. **/
  456. protected function grabArticle($page=null) {
  457. $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
  458. if (!$page) $page = $this->dom;
  459. $allElements = $page->getElementsByTagName('*');
  460. /**
  461. * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
  462. * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
  463. *
  464. * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
  465. * TODO: Shouldn't this be a reverse traversal?
  466. **/
  467. $node = null;
  468. $nodesToScore = array();
  469. for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
  470. //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
  471. //$node = $targetList->item($nodeIndex);
  472. $tagName = strtoupper($node->tagName);
  473. /* Remove unlikely candidates */
  474. if ($stripUnlikelyCandidates) {
  475. $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
  476. if (
  477. preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
  478. !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
  479. $tagName != 'BODY'
  480. )
  481. {
  482. $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
  483. //$nodesToRemove[] = $node;
  484. $node->parentNode->removeChild($node);
  485. $nodeIndex--;
  486. continue;
  487. }
  488. }
  489. if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
  490. $nodesToScore[] = $node;
  491. }
  492. /* Turn all divs that don't have children block level elements into p's */
  493. if ($tagName == 'DIV') {
  494. if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
  495. //$this->dbg('Altering div to p');
  496. $newNode = $this->dom->createElement('p');
  497. try {
  498. $newNode->innerHTML = $node->innerHTML;
  499. //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
  500. $node->parentNode->replaceChild($newNode, $node);
  501. $nodeIndex--;
  502. $nodesToScore[] = $node; // or $newNode?
  503. }
  504. catch(Exception $e) {
  505. $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
  506. }
  507. }
  508. else
  509. {
  510. /* EXPERIMENTAL */
  511. // TODO: change these p elements back to text nodes after processing
  512. for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
  513. $childNode = $node->childNodes->item($i);
  514. if ($childNode->nodeType == 3) { // XML_TEXT_NODE
  515. //$this->dbg('replacing text node with a p tag with the same content.');
  516. $p = $this->dom->createElement('p');
  517. $p->innerHTML = $childNode->nodeValue;
  518. $p->setAttribute('style', 'display: inline;');
  519. $p->setAttribute('class', 'readability-styled');
  520. $childNode->parentNode->replaceChild($p, $childNode);
  521. }
  522. }
  523. }
  524. }
  525. }
  526. /**
  527. * Loop through all paragraphs, and assign a score to them based on how content-y they look.
  528. * Then add their score to their parent node.
  529. *
  530. * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
  531. **/
  532. $candidates = array();
  533. for ($pt=0; $pt < count($nodesToScore); $pt++) {
  534. $parentNode = $nodesToScore[$pt]->parentNode;
  535. // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
  536. $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
  537. $innerText = $this->getInnerText($nodesToScore[$pt]);
  538. if (!$parentNode || !isset($parentNode->tagName)) {
  539. continue;
  540. }
  541. /* If this paragraph is less than 25 characters, don't even count it. */
  542. if(strlen($innerText) < 25) {
  543. continue;
  544. }
  545. /* Initialize readability data for the parent. */
  546. if (!$parentNode->hasAttribute('readability'))
  547. {
  548. $this->initializeNode($parentNode);
  549. $candidates[] = $parentNode;
  550. }
  551. /* Initialize readability data for the grandparent. */
  552. if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
  553. {
  554. $this->initializeNode($grandParentNode);
  555. $candidates[] = $grandParentNode;
  556. }
  557. $contentScore = 0;
  558. /* Add a point for the paragraph itself as a base. */
  559. $contentScore++;
  560. /* Add points for any commas within this paragraph */
  561. $contentScore += count(explode(',', $innerText));
  562. /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
  563. $contentScore += min(floor(strlen($innerText) / 100), 3);
  564. /* Add the score to the parent. The grandparent gets half. */
  565. $parentNode->getAttributeNode('readability')->value += $contentScore;
  566. if ($grandParentNode) {
  567. $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
  568. }
  569. }
  570. /**
  571. * After we've calculated scores, loop through all of the possible candidate nodes we found
  572. * and find the one with the highest score.
  573. **/
  574. $topCandidate = null;
  575. for ($c=0, $cl=count($candidates); $c < $cl; $c++)
  576. {
  577. /**
  578. * Scale the final candidates score based on link density. Good content should have a
  579. * relatively small link density (5% or less) and be mostly unaffected by this operation.
  580. **/
  581. $readability = $candidates[$c]->getAttributeNode('readability');
  582. $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
  583. $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
  584. if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
  585. $topCandidate = $candidates[$c];
  586. }
  587. }
  588. /**
  589. * If we still have no top candidate, just use the body as a last resort.
  590. * We also have to copy the body node so it is something we can modify.
  591. **/
  592. if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
  593. {
  594. $topCandidate = $this->dom->createElement('div');
  595. if ($page instanceof DOMDocument) {
  596. if (!isset($page->documentElement)) {
  597. // we don't have a body either? what a mess! :)
  598. } else {
  599. $topCandidate->innerHTML = $page->documentElement->innerHTML;
  600. $page->documentElement->innerHTML = '';
  601. $this->reinitBody();
  602. $page->documentElement->appendChild($topCandidate);
  603. }
  604. } else {
  605. $topCandidate->innerHTML = $page->innerHTML;
  606. $page->innerHTML = '';
  607. $page->appendChild($topCandidate);
  608. }
  609. $this->initializeNode($topCandidate);
  610. }
  611. /**
  612. * Now that we have the top candidate, look through its siblings for content that might also be related.
  613. * Things like preambles, content split by ads that we removed, etc.
  614. **/
  615. $articleContent = $this->dom->createElement('div');
  616. $articleContent->setAttribute('id', 'readability-content');
  617. $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
  618. $siblingNodes = @$topCandidate->parentNode->childNodes;
  619. if (!isset($siblingNodes)) {
  620. $siblingNodes = new stdClass;
  621. $siblingNodes->length = 0;
  622. }
  623. for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
  624. {
  625. $siblingNode = $siblingNodes->item($s);
  626. $append = false;
  627. $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
  628. //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
  629. if ($siblingNode === $topCandidate)
  630. // or if ($siblingNode->isSameNode($topCandidate))
  631. {
  632. $append = true;
  633. }
  634. $contentBonus = 0;
  635. /* Give a bonus if sibling nodes and top candidates have the exact same classname */
  636. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
  637. $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
  638. }
  639. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
  640. {
  641. $append = true;
  642. }
  643. if (strtoupper($siblingNode->nodeName) == 'P') {
  644. $linkDensity = $this->getLinkDensity($siblingNode);
  645. $nodeContent = $this->getInnerText($siblingNode);
  646. $nodeLength = strlen($nodeContent);
  647. if ($nodeLength > 80 && $linkDensity < 0.25)
  648. {
  649. $append = true;
  650. }
  651. else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
  652. {
  653. $append = true;
  654. }
  655. }
  656. if ($append)
  657. {
  658. $this->dbg('Appending node: ' . $siblingNode->nodeName);
  659. $nodeToAppend = null;
  660. $sibNodeName = strtoupper($siblingNode->nodeName);
  661. if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
  662. /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
  663. $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
  664. $nodeToAppend = $this->dom->createElement('div');
  665. try {
  666. $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
  667. $nodeToAppend->innerHTML = $siblingNode->innerHTML;
  668. }
  669. catch(Exception $e)
  670. {
  671. $this->dbg('Could not alter siblingNode to div, reverting back to original.');
  672. $nodeToAppend = $siblingNode;
  673. $s--;
  674. $sl--;
  675. }
  676. } else {
  677. $nodeToAppend = $siblingNode;
  678. $s--;
  679. $sl--;
  680. }
  681. /* To ensure a node does not interfere with readability styles, remove its classnames */
  682. $nodeToAppend->removeAttribute('class');
  683. /* Append sibling and subtract from our list because it removes the node when you append to another node */
  684. $articleContent->appendChild($nodeToAppend);
  685. }
  686. }
  687. /**
  688. * So we have all of the content that we need. Now we clean it up for presentation.
  689. **/
  690. $this->prepArticle($articleContent);
  691. /**
  692. * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
  693. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
  694. * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
  695. * finding the -right- content.
  696. **/
  697. if (strlen($this->getInnerText($articleContent, false)) < 250)
  698. {
  699. // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
  700. // in the meantime, we check and create an empty element if it's not there.
  701. $this->reinitBody();
  702. if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
  703. $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
  704. return $this->grabArticle($this->body);
  705. }
  706. else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
  707. $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
  708. return $this->grabArticle($this->body);
  709. }
  710. else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
  711. $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
  712. return $this->grabArticle($this->body);
  713. }
  714. else {
  715. return false;
  716. }
  717. }
  718. return $articleContent;
  719. }
  /**
   * Strip every <script> element found beneath the given node.
   *
   * @param DOMElement $doc node (or document) whose descendant script tags are removed
   * @return void
   */
  public function removeScripts($doc)
  {
      $scripts = $doc->getElementsByTagName('script');
      // The node list is live, so walk it from the end: removing the last
      // remaining entry never shifts the entries still waiting to be visited.
      $index = $scripts->length;
      while ($index-- > 0) {
          $script = $scripts->item($index);
          $script->parentNode->removeChild($script);
      }
  }
  /**
   * Return the text content of a node with leading/trailing whitespace trimmed.
   * When $normalizeSpaces is true, every match of the 'normalize' pattern held
   * in $this->regexps is additionally replaced by a single space.
   *
   * @param DOMElement $e node to read the text from
   * @param boolean $normalizeSpaces (default: true)
   * @return string empty string when the node exposes no (or empty) text content
   **/
  public function getInnerText($e, $normalizeSpaces=true)
  {
      $hasText = isset($e->textContent) && $e->textContent != '';
      if (!$hasText) {
          return '';
      }
      $trimmed = trim($e->textContent);
      return $normalizeSpaces
          ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
          : $trimmed;
  }
  /**
   * Count how often the string $s occurs in the inner text of node $e
   * (the text is obtained through getInnerText() with its default arguments).
   *
   * @param DOMElement $e node whose text is searched
   * @param string $s what to count. Default is ","
   * @return number (integer) number of occurrences
   **/
  public function getCharCount($e, $s=',')
  {
      $text = $this->getInnerText($e);
      return substr_count($text, $s);
  }
  /**
   * Remove the style attribute from $e itself and from every element below it.
   *
   * Anything that is neither a DOMElement nor a DOMDocument is ignored, so the
   * call is a safe no-op for null, scalars and unrelated objects.
   *
   * @param DOMElement|DOMDocument $e root of the subtree to clean
   * @return void
   */
  public function cleanStyles($e)
  {
      $isElement = $e instanceof DOMElement;
      if (!$isElement && !($e instanceof DOMDocument)) {
          return;
      }
      // getElementsByTagName('*') only yields descendants, so the root element
      // has to be handled explicitly (a document node carries no attributes).
      if ($isElement) {
          $e->removeAttribute('style');
      }
      foreach ($e->getElementsByTagName('*') as $elem) {
          $elem->removeAttribute('style');
      }
  }
  /**
   * Get the density of links as a fraction of the content: the length of the
   * text found inside <a> descendants divided by the length of all text in the node.
   * Lengths are measured with strlen() on the result of getInnerText().
   *
   * @param DOMElement $e node to measure
   * @return number the ratio described above; the integer 0 when the node has no text
   */
  public function getLinkDensity($e)
  {
      $textLength = strlen($this->getInnerText($e));
      $linkLength = 0;
      foreach ($e->getElementsByTagName('a') as $anchor) {
          $linkLength += strlen($this->getInnerText($anchor));
      }
      // Nothing to divide by when the node carries no text at all.
      if ($textLength <= 0) {
          return 0;
      }
      return $linkLength / $textLength;
  }
  /**
   * Get an element's class/id weight. The 'negative' and 'positive' patterns
   * from $this->regexps are applied to the class and to the id attribute;
   * each negative match subtracts 25 and each positive match adds 25.
   *
   * Returns 0 straight away when FLAG_WEIGHT_CLASSES is not active.
   *
   * @param DOMElement $e element to weigh
   * @return number (Integer) accumulated weight
   */
  public function getClassWeight($e)
  {
      if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
          return 0;
      }
      $weight = 0;
      /* The class name and the ID are scored with exactly the same rules */
      foreach (array('class', 'id') as $attrName) {
          if (!$e->hasAttribute($attrName)) {
              continue;
          }
          $attrValue = $e->getAttribute($attrName);
          if ($attrValue == '') {
              continue;
          }
          if (preg_match($this->regexps['negative'], $attrValue)) {
              $weight -= 25;
          }
          if (preg_match($this->regexps['positive'], $attrValue)) {
              $weight += 25;
          }
      }
      return $weight;
  }
  /**
   * Remove extraneous break tags from a node: every match of the 'killBreaks'
   * pattern in the node's markup is replaced by a single '<br />'.
   *
   * NOTE(review): relies on $node exposing a read/write innerHTML property,
   * which plain DOMElement does not offer -- presumably supplied by a project class.
   *
   * @param DOMElement $node node whose markup is rewritten in place
   * @return void
   */
  public function killBreaks($node)
  {
      $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
  }
  /**
   * Remove all descendants of $e that have the tag name $tag.
   *
   * For 'iframe', 'object' and 'embed' an element is kept when the 'video'
   * pattern from $this->regexps matches either its attribute values or its
   * inner markup (per the original notes this preserves youtube/vimeo embeds).
   *
   * @param DOMElement $e node to clean
   * @param string $tag tag name of the elements to remove
   * @return void
   */
  public function clean($e, $tag)
  {
      $targetList = $e->getElementsByTagName($tag);
      $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
      /* Walk backwards so removals do not disturb the entries still to be visited */
      for ($index = $targetList->length - 1; $index >= 0; $index--) {
          $target = $targetList->item($index);
          if ($isEmbed) {
              /* Join all attribute values so a single pattern test covers them */
              $attributeValues = '';
              foreach ($target->attributes as $attribute) {
                  $attributeValues .= $attribute->value . '|';
              }
              /* Keep the element if its attributes, or failing that its content, look like a video */
              if (preg_match($this->regexps['video'], $attributeValues)
                  || preg_match($this->regexps['video'], $target->innerHTML)) {
                  continue;
              }
          }
          $target->parentNode->removeChild($target);
      }
  }
  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is judged from content length, class/id weight, link density,
   * and the number of paragraphs, images, list items, inputs, links and video embeds.
   *
   * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. The thresholds
   * depend on $this->lightClean (lenient rules) versus the standard rules.
   *
   * @param DOMElement $e node to clean
   * @param string $tag tag name of the descendants to inspect
   * @return void
   */
  public function cleanConditionally($e, $tag)
  {
      if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
          return;
      }
      $tagsList = $e->getElementsByTagName($tag);
      $isListTag = ($tag == 'ul' || $tag == 'ol');
      /**
       * Traverse backwards so nodes can be removed without affecting the traversal.
       *
       * TODO: Consider taking into account original contentScore here.
       */
      for ($idx = $tagsList->length - 1; $idx >= 0; $idx--) {
          $node = $tagsList->item($idx);
          $weight = $this->getClassWeight($node);
          $hasScore = $node->hasAttribute('readability');
          $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
          $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

          /* A negative combined score is enough to drop the node outright */
          if ($weight + $contentScore < 0) {
              $node->parentNode->removeChild($node);
              continue;
          }
          /* Ten or more commas: treat it as real prose and leave it alone */
          if ($this->getCharCount($node, ',') >= 10) {
              continue;
          }

          /**
           * Not very many commas: gather counts of typical embedded elements
           * and remove the node when they show ominous signs.
           **/
          $p = $node->getElementsByTagName('p')->length;
          $img = $node->getElementsByTagName('img')->length;
          // NOTE(review): the <li> count is offset by -100; the reason is not visible in this part of the file.
          $li = $node->getElementsByTagName('li')->length - 100;
          $input = $node->getElementsByTagName('input')->length;
          $a = $node->getElementsByTagName('a')->length;

          /* Only embeds/iframes whose src matches the video pattern are counted */
          $embedCount = 0;
          foreach (array('embed', 'iframe') as $embedTag) {
              foreach ($node->getElementsByTagName($embedTag) as $embed) {
                  if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                      $embedCount++;
                  }
              }
          }

          $linkDensity = $this->getLinkDensity($node);
          $contentLength = strlen($this->getInnerText($node));

          /* $reason stays null while the node is allowed to remain */
          $reason = null;
          if ($this->lightClean) {
              $this->dbg('Light clean...');
              if ($img > $p && $img > 4) {
                  $reason = ' more than 4 images and more image elements than paragraph elements';
              } elseif ($li > $p && !$isListTag) {
                  $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
              } elseif ($input > floor($p/3)) {
                  $reason = ' too many <input> elements';
              } elseif ($contentLength < 10 && $embedCount === 0 && ($img === 0 || $img > 2)) {
                  $reason = ' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images';
              } elseif ($weight < 25 && $linkDensity > 0.2) {
                  $reason = ' weight smaller than 25 and link density above 0.2';
              } elseif ($a > 2 && $weight >= 25 && $linkDensity > 0.5) {
                  $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
              } elseif ($embedCount > 3) {
                  $reason = ' more than 3 embeds';
              }
          } else {
              $this->dbg('Standard clean...');
              if ($img > $p) {
                  $reason = ' more image elements than paragraph elements';
              } elseif ($li > $p && !$isListTag) {
                  $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
              } elseif ($input > floor($p/3)) {
                  $reason = ' too many <input> elements';
              } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
                  $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
              } elseif ($weight < 25 && $linkDensity > 0.2) {
                  $reason = ' weight smaller than 25 and link density above 0.2';
              } elseif ($weight >= 25 && $linkDensity > 0.5) {
                  $reason = ' weight above 25 but link density greater than 0.5';
              } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                  $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
              }
          }

          if ($reason !== null) {
              $this->dbg($reason);
              $node->parentNode->removeChild($node);
          }
      }
  }
  /**
   * Clean out spurious headers from an element. Only <h1> and <h2> are
   * examined; a header is removed when its class/id weight is negative or
   * its link density exceeds 0.33.
   *
   * @param DOMElement $e node to clean
   * @return void
   */
  public function cleanHeaders($e)
  {
      foreach (array('h1', 'h2') as $headerTag) {
          $headers = $e->getElementsByTagName($headerTag);
          /* Backwards, so removing a header leaves the pending ones in place */
          for ($pos = $headers->length - 1; $pos >= 0; $pos--) {
              $header = $headers->item($pos);
              $spurious = $this->getClassWeight($header) < 0
                  || $this->getLinkDensity($header) > 0.33;
              if ($spurious) {
                  $header->parentNode->removeChild($header);
              }
          }
      }
  }
  /**
   * Tell whether the given flag bit(s) are set in $this->flags.
   *
   * @param int $flag bit mask to test
   * @return boolean true when the bitwise AND of flags and $flag is greater than zero
   */
  public function flagIsActive($flag)
  {
      $matchedBits = $this->flags & $flag;
      return $matchedBits > 0;
  }
  /**
   * Switch the given flag bit(s) on in $this->flags.
   *
   * @param int $flag bit mask to set
   * @return void
   */
  public function addFlag($flag)
  {
      $this->flags |= $flag;
  }
  /**
   * Switch the given flag bit(s) off in $this->flags.
   *
   * @param int $flag bit mask to clear
   * @return void
   */
  public function removeFlag($flag)
  {
      $this->flags &= ~$flag;
  }
  /**
   * Recreate the body property if it has gone missing: when $this->body no
   * longer exposes childNodes, a fresh <body> element is created on $this->dom
   * and filled with the markup kept in $this->bodyCache.
   *
   * @return void
   */
  protected function reinitBody()
  {
      if (isset($this->body->childNodes)) {
          // Body is still usable; nothing to rebuild.
          return;
      }
      $this->body = $this->dom->createElement('body');
      $this->body->innerHTML = $this->bodyCache;
  }
  1020. }
  1021. ?>