htmlfilter.php 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166
  1. <?php
  2. /**
  3. * htmlfilter.inc
  4. * ---------------
  5. * This set of functions allows you to filter html in order to remove
  6. * any malicious tags from it. Useful in cases when you need to filter
  7. * user input for any cross-site-scripting attempts.
  8. *
  9. * Copyright (C) 2002-2004 by Duke University
  10. *
  11. * This library is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * This library is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with this library; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  24. * 02110-1301 USA
  25. *
  26. * @Author Konstantin Riabitsev <icon@linux.duke.edu>
  27. * @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
  28. * @Version 1.1 ($Date$)
  29. */
  /**
   * Builds the textual form of a tag from its name, its attributes and
   * its type. This function is called by tln_sanitize internally.
   *
   * Tag types:
   *  1 - opening tag, e.g. <a href="blah">
   *  2 - closing tag, e.g. </a> (attributes are ignored)
   *  3 - XHTML-style content-less tag, e.g. <img src="blah" />
   *
   * Attribute values are written out verbatim as name=value, so they are
   * expected to already carry their surrounding quotes.
   *
   * @param string $tagname the name of the tag.
   * @param array|boolean $attary attribute name => value pairs; anything
   *                      that is not a non-empty array is ignored.
   * @param integer $tagtype The type of the tag (1, 2 or 3, see above).
   * @return string A string with the final tag representation.
   */
  function tln_tagprint($tagname, $attary, $tagtype)
  {
      if ($tagtype == 2) {
          return '</' . $tagname . '>';
      }
      $fulltag = '<' . $tagname;
      if (is_array($attary) && count($attary) > 0) {
          $atts = array();
          // foreach rather than each(): each() was removed in PHP 8 and
          // silently skipped entries whenever the array's internal pointer
          // had already been advanced by an earlier each() loop.
          foreach ($attary as $attname => $attvalue) {
              $atts[] = "$attname=$attvalue";
          }
          $fulltag .= ' ' . implode(' ', $atts);
      }
      if ($tagtype == 3) {
          $fulltag .= ' /';
      }
      return $fulltag . '>';
  }
  /**
   * Callback for array_walk: lowercases the element it is handed.
   *
   * The element is received by reference and overwritten in place, so
   * nothing is returned.
   *
   * @param string $val the array element to lowercase (by-ref).
   * @return void
   */
  function tln_casenormalize(&$val)
  {
      $lowered = strtolower($val);
      $val = $lowered;
  }
  /**
   * Skips any whitespace starting at the given position and returns the
   * position of the next non-whitespace character.
   *
   * "Whitespace" is whatever PCRE's \s matches. If the character at
   * $offset is not whitespace, or $offset is at/after the end of the
   * string, $offset is returned unchanged.
   *
   * @param string $body the string
   * @param integer $offset the offset within the string where we should start
   *                looking for the next non-whitespace character.
   * @return integer the location within the $body where the next
   *                 non-whitespace char is located.
   */
  function tln_skipspace($body, $offset)
  {
      if ($offset >= strlen($body)) {
          return $offset;
      }
      // \G anchors the match at $offset, so the body does not have to be
      // copied with substr() on every call. The previous implementation
      // also called sizeof() on the matched string, which raises a
      // TypeError on PHP 8.
      if (preg_match('/\G\s+/', $body, $matches, 0, $offset)) {
          $offset += strlen($matches[0]);
      }
      return $offset;
  }
  /**
   * Finds the next occurrence of a needle in a string. A thin wrapper
   * around strpos() that reports "not found" as the length of the string
   * instead of false, so callers can always treat the result as an offset.
   *
   * @param string $body The string to look for needle in.
   * @param integer $offset Start looking from this position.
   * @param string $needle The character/string to look for.
   * @return integer location of the next occurrence of the needle, or
   *                 strlen($body) if needle wasn't found.
   */
  function tln_findnxstr($body, $offset, $needle)
  {
      $found = strpos($body, $needle, $offset);
      return ($found === false) ? strlen($body) : $found;
  }
  /**
   * Takes a PCRE-style regexp fragment and looks for its first match at
   * or after $offset within the string.
   *
   * The fragment is embedded in a pattern delimited by "%", so it must
   * not contain an unescaped "%". A match of zero length is reported as
   * "not found".
   *
   * @param string $body The string to look for needle in.
   * @param integer $offset Start looking from here.
   * @param string $reg A PCRE-style regex fragment to match.
   * @return array|boolean Returns a false if no matches found, or an array
   *         with the following members:
   *         - integer with the location of the match within $body
   *         - string with whatever content between offset and the match
   *         - string with whatever it is we matched
   */
  function tln_findnxreg($body, $offset, $reg)
  {
      $matches = array();
      $preg_rule = '%^(.*?)(' . $reg . ')%s';
      // substr() is kept (rather than an offset + \G) because $reg is
      // caller-supplied and may rely on the subject starting at $offset.
      // The cast covers PHP < 8, where substr() returns false past the end.
      $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);
      // Compare against '' explicitly: a plain truthiness test would
      // wrongly discard a successful match whose text is "0".
      if ($found !== 1 || $matches[0] === '') {
          return false;
      }
      return array(
          $offset + strlen($matches[1]),
          $matches[1],
          $matches[2]
      );
  }
  /**
   * Locates and parses the next tag in $body at or after $offset.
   *
   * Attribute names are lowercased. Attribute values are stored together
   * with their quotes: quoted values keep the quote character they were
   * written with, unquoted values are wrapped in double quotes (with any
   * embedded '"' turned into &quot;), and value-less attributes become
   * "yes".
   *
   * @param string $body String where to look for the next tag.
   * @param integer $offset Start looking from here.
   * @return array|boolean false if no more tags exist in the body, or
   *         an array with the following members:
   *         - string with the name of the tag (lowercased)
   *         - array with attributes and their values, or false when the
   *           tag ended right after its name
   *         - integer with tag type (1, 2, or 3)
   *         - integer where the tag starts (starting "<")
   *         - integer where the tag ends (ending ">"), or strlen($body)
   *           when the tag is never closed
   *         first three members will be false, if the tag is invalid
   *         (this includes comments and SGML declarations).
   */
  function tln_getnxtag($body, $offset)
  {
      $bodylen = strlen($body);
      if ($offset > $bodylen) {
          return false;
      }
      $lt = tln_findnxstr($body, $offset, '<');
      if ($lt == $bodylen) {
          return false;
      }
      /**
       * We are here:
       * blah blah <tag attribute="value">
       * \---------^
       */
      $pos = tln_skipspace($body, $lt + 1);
      if ($pos >= $bodylen) {
          return array(false, false, false, $lt, $bodylen);
      }
      /**
       * There are 3 kinds of tags:
       * 1. Opening tag, e.g.:
       *    <a href="blah">
       * 2. Closing tag, e.g.:
       *    </a>
       * 3. XHTML-style content-less tag, e.g.:
       *    <img src="blah"/>
       */
      switch (substr($body, $pos, 1)) {
          case '/':
              $tagtype = 2;
              $pos++;
              break;
          case '!':
              /**
               * A comment or an SGML declaration. Both are reported as
               * invalid tags spanning up to their terminator.
               */
              if (substr($body, $pos + 1, 2) == '--') {
                  $gt = strpos($body, '-->', $pos);
                  if ($gt === false) {
                      $gt = $bodylen;
                  } else {
                      // point at the ">" of "-->"
                      $gt += 2;
                  }
                  return array(false, false, false, $lt, $gt);
              }
              $gt = tln_findnxstr($body, $pos, '>');
              return array(false, false, false, $lt, $gt);
          default:
              /**
               * Assume tagtype 1 for now. If it's type 3, we'll switch
               * values later.
               */
              $tagtype = 1;
              break;
      }
      /**
       * Look for next [\W-_], which will indicate the end of the tag name.
       */
      $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
      if ($regary === false) {
          return array(false, false, false, $lt, $bodylen);
      }
      list($pos, $tagname, $match) = $regary;
      $tagname = strtolower($tagname);
      /**
       * $match can be either of these:
       * '>'  indicating the end of the tag entirely.
       * '\s' indicating the end of the tag name.
       * '/'  indicating that this is type-3 xhtml tag.
       *
       * Whatever else we find there indicates an invalid tag.
       */
      switch ($match) {
          case '/':
              /**
               * This is an xhtml-style tag with a closing / at the
               * end, like so: <img src="blah"/>. Check if it's followed
               * by the closing bracket. If not, then this tag is invalid
               */
              if (substr($body, $pos, 2) == '/>') {
                  $pos++;
                  $tagtype = 3;
              } else {
                  $gt = tln_findnxstr($body, $pos, '>');
                  return array(false, false, false, $lt, $gt);
              }
              //intentional fall-through
          case '>':
              return array($tagname, false, $tagtype, $lt, $pos);
          default:
              /**
               * Check if it's whitespace
               */
              if (!preg_match('/\s/', $match)) {
                  /**
                   * This is an invalid tag! Look for the next closing ">".
                   */
                  $gt = tln_findnxstr($body, $lt, '>');
                  return array(false, false, false, $lt, $gt);
              }
              break;
      }
      /**
       * At this point we're here:
       * <tagname attribute='blah'>
       * \-------^
       *
       * At this point we loop in order to find all attributes.
       */
      $attary = array();
      while ($pos <= $bodylen) {
          $pos = tln_skipspace($body, $pos);
          if ($pos == $bodylen) {
              /**
               * Non-closed tag.
               */
              return array(false, false, false, $lt, $pos);
          }
          /**
           * See if we arrived at a ">" or "/>", which means that we reached
           * the end of the tag.
           */
          $matches = array();
          if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
              /**
               * Yep. So we did.
               */
              $pos += strlen($matches[1]);
              if ($matches[2] == '/>') {
                  $tagtype = 3;
                  $pos++;
              }
              return array($tagname, $attary, $tagtype, $lt, $pos);
          }
          /**
           * There are several types of attributes, with optional
           * [:space:] between members.
           * Type 1:
           *   attrname[:space:]=[:space:]'CDATA'
           * Type 2:
           *   attrname[:space:]=[:space:]"CDATA"
           * Type 3:
           *   attr[:space:]=[:space:]CDATA
           * Type 4:
           *   attrname
           *
           * We leave types 1 and 2 the same, type 3 we check for
           * '"' and convert to "&quot" if needed, then wrap in
           * double quotes. Type 4 we convert into:
           * attrname="yes".
           */
          $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
          if ($regary === false) {
              /**
               * Looks like body ended before the end of tag.
               */
              return array(false, false, false, $lt, $bodylen);
          }
          list($pos, $attname, $match) = $regary;
          $attname = strtolower($attname);
          /**
           * We arrived at the end of attribute name. Several things possible
           * here:
           * '>'  means the end of the tag and this is attribute type 4
           * '/'  if followed by '>' means the same thing as above
           * '\s' means a lot of things -- look what it's followed by.
           *      anything else means the attribute is invalid.
           */
          switch ($match) {
              case '/':
                  /**
                   * This is an xhtml-style tag with a closing / at the
                   * end, like so: <img src="blah"/>. Check if it's followed
                   * by the closing bracket. If not, then this tag is invalid
                   */
                  if (substr($body, $pos, 2) == '/>') {
                      $pos++;
                      $tagtype = 3;
                  } else {
                      $gt = tln_findnxstr($body, $pos, '>');
                      return array(false, false, false, $lt, $gt);
                  }
                  //intentional fall-through
              case '>':
                  $attary[$attname] = '"yes"';
                  return array($tagname, $attary, $tagtype, $lt, $pos);
              default:
                  /**
                   * Skip whitespace and see what we arrive at.
                   */
                  $pos = tln_skipspace($body, $pos);
                  $char = substr($body, $pos, 1);
                  /**
                   * Two things are valid here:
                   * '=' means this is attribute type 1 2 or 3.
                   * \w means this was attribute type 4.
                   * anything else we ignore and re-loop. End of tag and
                   * invalid stuff will be caught by our checks at the
                   * beginning of the loop.
                   */
                  if ($char == '=') {
                      $pos++;
                      $pos = tln_skipspace($body, $pos);
                      /**
                       * Here are 3 possibilities:
                       * "'" attribute type 1
                       * '"' attribute type 2
                       * everything else is the content of tag type 3
                       */
                      $quot = substr($body, $pos, 1);
                      if ($quot == '\'') {
                          $regary = tln_findnxreg($body, $pos + 1, '\'');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          // step over the closing quote
                          $pos++;
                          $attary[$attname] = '\'' . $attval . '\'';
                      } elseif ($quot == '"') {
                          $regary = tln_findnxreg($body, $pos + 1, '\"');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          // step over the closing quote
                          $pos++;
                          $attary[$attname] = '"' . $attval . '"';
                      } else {
                          /**
                           * These are hateful. Look for \s, or >.
                           */
                          $regary = tln_findnxreg($body, $pos, '[\s>]');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          /**
                           * If it's ">" it will be caught at the top.
                           */
                          $attval = preg_replace('/\"/s', '&quot;', $attval);
                          $attary[$attname] = '"' . $attval . '"';
                      }
                  } elseif (preg_match('|[\w/>]|', $char)) {
                      /**
                       * That was attribute type 4.
                       */
                      $attary[$attname] = '"yes"';
                  } else {
                      /**
                       * An illegal character. Find next '>' and return.
                       */
                      $gt = tln_findnxstr($body, $pos, '>');
                      return array(false, false, false, $lt, $gt);
                  }
                  break;
          }
      }
      /**
       * The fact that we got here indicates that the tag end was never
       * found. Return invalid tag indication so it gets stripped.
       */
      return array(false, false, false, $lt, $bodylen);
  }
  /**
   * Translates numeric entities into literal characters so the value can
   * be checked.
   *
   * Every match of $regex is replaced by the single byte whose code is
   * the number captured in the regex's first group (chr() semantics, i.e.
   * codes above 255 wrap around modulo 256).
   *
   * @param string $attvalue the by-ref value to check; rewritten in place
   *               when at least one entity was found.
   * @param string $regex the regular expression to check against; its
   *               first capture group must hold the numeric code.
   * @param boolean $hex whether the entities are hexadecimal.
   * @return boolean True or False depending on whether there were matches.
   */
  function tln_deent(&$attvalue, $regex, $hex = false)
  {
      $matches = array();
      if (!preg_match_all($regex, $attvalue, $matches)) {
          return false;
      }
      $repl = array();
      foreach ($matches[0] as $i => $entity) {
          $numval = $matches[1][$i];
          if ($hex) {
              $numval = hexdec($numval);
          }
          // Explicit cast: $numval is a digit string, or a float when
          // hexdec() overflows; chr() wants an integer.
          $repl[$entity] = chr((int) $numval);
      }
      $attvalue = strtr($attvalue, $repl);
      return true;
  }
  /**
   * Decodes entity-encoded and backslash-escaped characters inside an
   * attribute value so that later checks see the literal text.
   *
   * Decimal entities (&#NN;), hexadecimal entities (&#xNN;) and
   * backslash-number escapes are decoded via tln_deent() over and over
   * until a pass finds nothing left to decode; finally stripslashes()
   * is applied to the result.
   *
   * NOTE(review): the backslash pattern captures decimal digits only but
   * decodes them as hexadecimal - confirm whether escapes containing a-f
   * are meant to be handled as well.
   *
   * @param string $attvalue A string to run entity check against (by-ref).
   * @return Void, modifies a reference value.
   */
  function tln_defang(&$attvalue)
  {
      /**
       * Nothing to decode unless there is an ampersand or a backslash.
       */
      $hasAmpersand = strpos($attvalue, '&') !== false;
      $hasBackslash = strpos($attvalue, '\\') !== false;
      if (!$hasAmpersand && !$hasBackslash) {
          return;
      }
      do {
          // Short-circuits on purpose: at most one decoder rewrites the
          // value per pass, the next pass picks up whatever is left.
          $decoded = tln_deent($attvalue, '/\&#0*(\d+);*/s')
              || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
              || tln_deent($attvalue, '/\\\\(\d+)/s', true);
      } while ($decoded);
      $attvalue = stripslashes($attvalue);
  }
  /**
   * Strips tabs, carriage returns, newlines, NUL bytes and spaces from an
   * attribute value, so that e.g. "java[tab]script" is seen as
   * "javascript" by the checks that follow.
   *
   * @param string $attvalue The attribute value to clean up (by-ref).
   * @return Void, modifies a reference value.
   */
  function tln_unspace(&$attvalue)
  {
      $unwanted = array("\t", "\r", "\n", "\0", " ");
      // Length of the leading run that contains none of the unwanted
      // characters; if that run is the whole string there is nothing to do.
      $cleanlen = strcspn($attvalue, implode('', $unwanted));
      if ($cleanlen == strlen($attvalue)) {
          return;
      }
      $attvalue = str_replace($unwanted, '', $attvalue);
  }
  /**
   * Runs the attribute checks of the filter against one tag's attributes.
   *
   * For every attribute, in order:
   *  1. it is dropped if $rm_attnames lists it for this tag;
   *  2. its value is decoded (tln_defang) and stripped of whitespace
   *     (tln_unspace) for inspection; a style value that needed decoding
   *     is replaced by "idiocy";
   *  3. the $bad_attvals replacements are applied;
   *  4. style values containing control or 8-bit characters are replaced
   *     by "disallowed character", otherwise any url(...) inside them is
   *     passed through tln_fixurl.
   * Finally the attributes from $add_attr_to_tag are merged in.
   *
   * @param string $tagname String with the name of the tag.
   * @param array $attary Array with all tag attributes.
   * @param array $rm_attnames tag regex => list of attribute-name regexes
   *              to remove. See description for tln_sanitize
   * @param array $bad_attvals tag regex => (attribute regex =>
   *              array(patterns, replacements)). See description for
   *              tln_sanitize
   * @param array $add_attr_to_tag tag regex => attributes to add. See
   *              description for tln_sanitize
   * @param string $trans_image_path URL used in place of rejected URLs
   * @param boolean $block_external_images passed through to tln_fixurl
   * @return Array with modified attributes.
   */
  function tln_fixatts(
      $tagname,
      $attary,
      $rm_attnames,
      $bad_attvals,
      $add_attr_to_tag,
      $trans_image_path,
      $block_external_images
  ) {
      // foreach walks a snapshot, so editing $attary inside the loop is
      // safe (each() is gone in PHP 8).
      foreach ($attary as $attname => $attvalue) {
          // Numeric attribute names come back as integer keys; compare
          // them as strings so that e.g. 0 is never mistaken for 'style'.
          $attname = (string) $attname;
          /**
           * See if this attribute should be removed.
           */
          $remove = false;
          foreach ($rm_attnames as $matchtag => $matchattrs) {
              if (!preg_match($matchtag, $tagname)) {
                  continue;
              }
              foreach ($matchattrs as $matchattr) {
                  if (preg_match($matchattr, $attname)) {
                      $remove = true;
                      break 2;
                  }
              }
          }
          if ($remove) {
              // Skip the remaining checks entirely: they used to run on
              // the removed attribute and could write it back.
              unset($attary[$attname]);
              continue;
          }
          /**
           * Remove any backslashes, entities, or extraneous whitespace.
           */
          $oldattvalue = $attvalue;
          tln_defang($attvalue);
          if ($attname === 'style' && $attvalue !== $oldattvalue) {
              $attvalue = "idiocy";
              $attary[$attname] = $attvalue;
          }
          tln_unspace($attvalue);
          /**
           * Now run the value checks: every matching tag/attribute pair
           * contributes a (patterns, replacements) pair that is applied
           * to the decoded value. The stored attribute is only touched
           * when a replacement actually changed something.
           */
          foreach ($bad_attvals as $matchtag => $matchattrs) {
              if (!preg_match($matchtag, $tagname)) {
                  continue;
              }
              foreach ($matchattrs as $matchattr => $valary) {
                  if (!preg_match($matchattr, $attname)) {
                      continue;
                  }
                  /**
                   * There are two arrays in valary.
                   * First is matches.
                   * Second one is replacements
                   */
                  list($valmatch, $valrepl) = $valary;
                  $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                  if ($newvalue !== $attvalue) {
                      $attary[$attname] = $newvalue;
                      $attvalue = $newvalue;
                  }
              }
          }
          if ($attname === 'style') {
              if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                  $attary[$attname] = '"disallowed character"';
              } else {
                  // Only rewrite URLs when the value was not rejected
                  // above; otherwise the rejection would be overwritten.
                  $aMatch = array();
                  preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
                  if (!empty($aMatch[1])) {
                      foreach ($aMatch[1] as $sMatch) {
                          $urlvalue = $sMatch;
                          tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                          $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                      }
                  }
              }
          }
      }
      /**
       * See if we need to append any attributes to this tag.
       */
      foreach ($add_attr_to_tag as $matchtag => $addattary) {
          if (preg_match($matchtag, $tagname)) {
              $attary = array_merge($attary, $addattary);
          }
      }
      return $attary;
  }
  /**
   * Checks a URL taken from an attribute or a style declaration and
   * rewrites it in place to something safe.
   *
   * Surrounding quotes are stripped first and, whenever the value is
   * replaced or explicitly accepted, put back (double quotes are used if
   * the value had none). Outcomes:
   *  - empty value: the blank image ($trans_image_path);
   *  - control or 8-bit characters: a placeholder URL for href, the blank
   *    image for everything else;
   *  - mailto/http/https/ftp: kept (quoted) for href; for other
   *    attributes replaced by the blank image when external images are
   *    blocked or the URL has no path, otherwise left as is (unquoted);
   *  - outbind/cid: kept (quoted);
   *  - any other scheme: the blank image;
   *  - no scheme: the blank image, unless the value already is
   *    $trans_image_path, in which case it is left as is (unquoted).
   *
   * @param string $attname name of the attribute the URL belongs to
   * @param string $attvalue the URL, possibly quoted (by-ref)
   * @param string $trans_image_path URL of the blank replacement image
   * @param boolean $block_external_images whether remote URLs outside
   *                href are replaced by the blank image
   * @return void since it modifies a by-ref value.
   */
  function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
  {
      $sQuote = '"';
      $attvalue = trim($attvalue);
      if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
          // remember which quote was used and strip it from both ends
          $sQuote = $attvalue[0];
          $attvalue = trim(substr($attvalue, 1, -1));
      }
      $blank = $sQuote . $trans_image_path . $sQuote;
      /**
       * Replace empty src tags with the blank image. src is only used
       * for frames, images, and image inputs. Doing a replace should
       * not affect them working as should be, however it will stop
       * IE from being kicked off when src for img tags are not set
       */
      if ($attvalue === '') {
          $attvalue = $blank;
          return;
      }
      // first, disallow 8 bit characters and control characters
      if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
          if ($attname == 'href') {
              $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
          } else {
              $attvalue = $blank;
          }
          return;
      }
      $aUrl = parse_url($attvalue);
      if (!isset($aUrl['scheme'])) {
          // Scheme-less (relative or unparsable) URL: only the blank image
          // itself may pass. This used to assign to $$attvalue, a variable
          // variable, so the replacement never reached the caller.
          if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
              $attvalue = $blank;
          }
          return;
      }
      switch (strtolower($aUrl['scheme'])) {
          case 'mailto':
          case 'http':
          case 'https':
          case 'ftp':
              if ($attname != 'href') {
                  if ($block_external_images == true || !isset($aUrl['path'])) {
                      $attvalue = $blank;
                  }
              } else {
                  $attvalue = $sQuote . $attvalue . $sQuote;
              }
              break;
          case 'outbind':
          case 'cid':
              $attvalue = $sQuote . $attvalue . $sQuote;
              break;
          default:
              $attvalue = $blank;
              break;
      }
  }
  /**
   * Extracts and sanitizes the content of a <style> block.
   *
   * Scanning starts at $pos (just after the opening tag) and runs up to
   * the matching closing style tag. HTML comments inside the block are
   * skipped, so a closing tag hidden in a comment does not end the block.
   * A "<" that does not start a closing tag is not copied to the output.
   *
   * The collected stylesheet is then cleaned up: "body {...}" rules are
   * renamed to ".bodyclass", @import lines are replaced by a notice,
   * url(...) values go through tln_fixurl, and if - after decoding
   * entities and removing whitespace - any of the blacklisted keywords
   * is present, the decoded and scrubbed text is returned instead.
   *
   * @param string $body the whole HTML document
   * @param integer $pos offset right after the opening style tag
   * @param string $trans_image_path URL of the blank replacement image
   * @param boolean $block_external_images passed through to tln_fixurl
   * @return array two members:
   *         - string with the sanitized stylesheet, or FALSE when no
   *           closing style tag was found
   *         - integer offset right after the closing tag, or
   *           strlen($body) when it was not found
   */
  function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
  {
      // workaround for </style> in between comments
      $content = '';
      $sToken = '';
      $bSucces = false;
      $bEndTag = false;
      for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
          $char = $body[$i];
          switch ($char) {
              case '<':
                  $sToken = $char;
                  break;
              case '/':
                  if ($sToken == '<') {
                      $sToken .= $char;
                      $bEndTag = true;
                  } else {
                      $content .= $char;
                  }
                  break;
              case '>':
                  if ($bEndTag) {
                      $sToken .= $char;
                      if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                          $newpos = $i + 1;
                          $bSucces = true;
                          break 2;
                      } else {
                          $content .= $sToken;
                      }
                      $bEndTag = false;
                  } else {
                      $content .= $char;
                  }
                  break;
              case '!':
                  if ($sToken == '<') {
                      // possible comment
                      if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                          $iEnd = strpos($body, '-->', $i + 3);
                          if ($iEnd === false) { // no end comment
                              $i = strlen($body);
                          } else {
                              // Land on the ">" of "-->" so the loop
                              // resumes after the comment; stopping at
                              // its first "-" leaked "->" into $content.
                              $i = $iEnd + 2;
                          }
                          $sToken = '';
                      }
                  } else {
                      $content .= $char;
                  }
                  break;
              default:
                  if ($bEndTag) {
                      $sToken .= $char;
                  } else {
                      $content .= $char;
                  }
                  break;
          }
      }
      if ($bSucces == false) {
          return array(false, strlen($body));
      }
      /**
       * First look for general BODY style declaration, which would be
       * like so:
       * body {background: blah-blah}
       * and change it to .bodyclass so we can just assign it to a <div>
       */
      $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);
      // first check for 8bit sequences and disallowed control characters
      if (preg_match('/[\16-\37\200-\377]+/', $content)) {
          $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
          return array($content, $newpos);
      }
      // remove @import line
      $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);
      // undo backslash-escaped spellings of "url"
      $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
      /**
       * Fix url('blah') declarations.
       */
      $aMatch = array();
      preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
      if (!empty($aMatch[1])) {
          $aValue = $aReplace = array();
          foreach ($aMatch[1] as $sMatch) {
              // url value
              $urlvalue = $sMatch;
              tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
              $aValue[] = $sMatch;
              $aReplace[] = $urlvalue;
          }
          $content = str_replace($aValue, $aReplace, $content);
      }
      /**
       * Remove any backslashes, entities, and extraneous whitespace.
       * The checks run on a decoded copy; that copy only replaces the
       * stylesheet when one of the patterns actually matched.
       */
      $contentTemp = $content;
      tln_defang($contentTemp);
      tln_unspace($contentTemp);
      $match = array('/\/\*.*\*\//',
                     '/expression/i',
                     '/behaviou*r/i',
                     '/binding/i',
                     '/include-source/i',
                     '/javascript/i',
                     '/script/i',
                     '/position/i');
      $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
      $contentNew = preg_replace($match, $replace, $contentTemp);
      if ($contentNew !== $contentTemp) {
          $content = $contentNew;
      }
      return array($content, $newpos);
  }
  /**
   * Converts the attributes of a <body> tag into attributes for the
   * <div class='bodyclass'> that replaces it.
   *
   * background, bgcolor and text are turned into an inline style; all
   * other attributes are dropped. A background image is always pointed
   * at $trans_image_path. When a background colour is given without a
   * text colour, the text colour is forced to #000000.
   *
   * @param array|boolean $attary attributes of the body tag (name =>
   *                      quoted value); anything that is not a non-empty
   *                      array yields just the class attribute
   * @param string $trans_image_path URL of the blank replacement image
   * @return array attributes for the div (name => quoted value)
   */
  function tln_body2div($attary, $trans_image_path)
  {
      $divattary = array('class' => "'bodyclass'");
      $text = '#000000';
      $has_bgc_stl = $has_txt_stl = false;
      $styledef = '';
      if (is_array($attary) && count($attary) > 0) {
          foreach ($attary as $attname => $attvalue) {
              // Values arrive wrapped in the quote they were written with;
              // strip it. Only do so when the first character really is a
              // quote, otherwise an unquoted value would lose every
              // occurrence of its first character.
              $quotchar = substr($attvalue, 0, 1);
              if ($quotchar === '"' || $quotchar === "'") {
                  $attvalue = str_replace($quotchar, '', $attvalue);
              }
              switch ((string) $attname) {
                  case 'background':
                      $styledef .= "background-image: url('$trans_image_path'); ";
                      break;
                  case 'bgcolor':
                      $has_bgc_stl = true;
                      $styledef .= "background-color: $attvalue; ";
                      break;
                  case 'text':
                      $has_txt_stl = true;
                      $styledef .= "color: $attvalue; ";
                      break;
              }
          }
          // Outlook defines a white bgcolor and no text color. This can lead to
          // white text on a white bg with certain themes.
          if ($has_bgc_stl && !$has_txt_stl) {
              $styledef .= "color: $text; ";
          }
          if (strlen($styledef) > 0) {
              $divattary['style'] = "\"$styledef\"";
          }
      }
      return $divattary;
  }
  /**
   * Walk through an HTML string tag by tag and rebuild it, dropping or
   * rewriting tags and attributes according to the supplied policy.
   *
   * @param string  $body                  The HTML you wish to filter
   * @param array   $tag_list              First element is a boolean flag, the
   *                                       rest are tag names. Flag false: the
   *                                       listed tags are removed. Flag true:
   *                                       only the listed tags are kept.
   * @param array   $rm_tags_with_content  Tags removed together with everything
   *                                       up to their matching closing tag
   * @param array   $self_closing_tags     Tags that are always printed as
   *                                       self-closing (tag type 3)
   * @param boolean $force_tag_closing     When true, closing tags are appended
   *                                       for every tag still open at the end
   * @param array   $rm_attnames           Passed through to tln_fixatts()
   * @param array   $bad_attvals           Passed through to tln_fixatts()
   * @param array   $add_attr_to_tag       Passed through to tln_fixatts()
   * @param string  $trans_image_path      Passed through to tln_fixatts(),
   *                                       tln_fixstyle() and tln_body2div()
   * @param boolean $block_external_images Passed through to tln_fixatts() and
   *                                       tln_fixstyle()
   * @return string Sanitized html safe to show on your pages.
   */
  function tln_sanitize(
      $body,
      $tag_list,
      $rm_tags_with_content,
      $self_closing_tags,
      $force_tag_closing,
      $rm_attnames,
      $bad_attvals,
      $add_attr_to_tag,
      $trans_image_path,
      $block_external_images
  ) {
      /**
       * The first element of $tag_list says whether the list holds tags to
       * remove or tags to allow:
       * false means remove these tags
       * true means allow these tags
       */
      $rm_tags = array_shift($tag_list);
      /**
       * Normalize the tag name lists.
       * NOTE(review): tln_casenormalize is defined elsewhere in this file;
       * presumably it lower-cases each entry in place -- confirm.
       */
      @array_walk($tag_list, 'tln_casenormalize');
      @array_walk($rm_tags_with_content, 'tln_casenormalize');
      @array_walk($self_closing_tags, 'tln_casenormalize');

      $curpos = 0;
      // Number of currently unclosed occurrences, keyed by tag name.
      $open_tags = array();
      $trusted = "<!-- begin tln_sanitized html -->\n";
      // Either false, or the name of the tag whose content is being dropped.
      $skip_content = false;
      /**
       * Take care of netscape's stupid javascript entities like
       * &{alert('boo')};
       */
      $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);

      while (($curtag = tln_getnxtag($body, $curpos)) != false) {
          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
          // Text between the previous position and the start of this tag.
          $free_content = substr($body, $curpos, $lt - $curpos);

          /**
           * Take care of <style>
           */
          if ($tagname == "style" && $tagtype == 1) {
              // Emit the text that precedes the <style> tag before moving on;
              // it used to be overwritten by the style content and lost.
              if ($skip_content === false) {
                  $trusted .= $free_content;
              }
              list($style_content, $curpos) =
                  tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
              if ($style_content != false) {
                  if (!empty($attary)) {
                      $attary = tln_fixatts(
                          $tagname,
                          $attary,
                          $rm_attnames,
                          $bad_attvals,
                          $add_attr_to_tag,
                          $trans_image_path,
                          $block_external_images
                      );
                  }
                  $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                  $trusted .= $style_content;
                  $trusted .= tln_tagprint($tagname, false, 2);
              }
              // $curpos was already advanced by tln_fixstyle().
              continue;
          }

          if ($skip_content === false) {
              $trusted .= $free_content;
          }

          if ($tagname != false) {
              if ($tagtype == 2) {
                  if ($skip_content === $tagname) {
                      /**
                       * Got to the end of tag we needed to remove.
                       */
                      $tagname = false;
                      $skip_content = false;
                  } elseif ($skip_content === false) {
                      // <body> is opened as <div>, so close it as one too.
                      if ($tagname == "body") {
                          $tagname = "div";
                      }
                      // Only print closing tags that match an open tag.
                      if (isset($open_tags[$tagname])
                          && $open_tags[$tagname] > 0
                      ) {
                          $open_tags[$tagname]--;
                      } else {
                          $tagname = false;
                      }
                  }
              } elseif ($skip_content === false) {
                  /**
                   * See if this is a self-closing type and change
                   * tagtype appropriately.
                   */
                  if ($tagtype == 1
                      && in_array($tagname, $self_closing_tags)
                  ) {
                      $tagtype = 3;
                  }
                  /**
                   * See if we should skip this tag and any content
                   * inside it ($rm_tags_with_content).
                   */
                  if ($tagtype == 1
                      && in_array($tagname, $rm_tags_with_content)
                  ) {
                      $skip_content = $tagname;
                  } elseif (($rm_tags == false
                          && in_array($tagname, $tag_list))
                      || ($rm_tags == true
                          && !in_array($tagname, $tag_list))
                  ) {
                      // Tag is not permitted by $tag_list: drop the tag only.
                      $tagname = false;
                  } else {
                      /**
                       * Convert body into div.
                       */
                      if ($tagname == "body") {
                          $tagname = "div";
                          $attary = tln_body2div($attary, $trans_image_path);
                      }
                      if ($tagtype == 1) {
                          if (isset($open_tags[$tagname])) {
                              $open_tags[$tagname]++;
                          } else {
                              $open_tags[$tagname] = 1;
                          }
                      }
                      /**
                       * This is where we run other checks.
                       */
                      if (is_array($attary) && sizeof($attary) > 0) {
                          $attary = tln_fixatts(
                              $tagname,
                              $attary,
                              $rm_attnames,
                              $bad_attvals,
                              $add_attr_to_tag,
                              $trans_image_path,
                              $block_external_images
                          );
                      }
                  }
              }
              if ($tagname != false && $skip_content === false) {
                  $trusted .= tln_tagprint($tagname, $attary, $tagtype);
              }
          }
          $curpos = $gt + 1;
      }

      // Whatever follows the last tag is appended as-is.
      $trusted .= substr($body, $curpos, strlen($body) - $curpos);
      if ($force_tag_closing == true) {
          foreach ($open_tags as $tagname => $opentimes) {
              while ($opentimes > 0) {
                  $trusted .= '</' . $tagname . '>';
                  $opentimes--;
              }
          }
          $trusted .= "\n";
      }
      $trusted .= "<!-- end tln_sanitized html -->\n";
      return $trusted;
  }
  /**
   * Use the nifty htmlfilter library: run tln_sanitize() over $body with a
   * fixed, built-in filtering policy.
   *
   * @param string  $body                  The HTML to filter
   * @param string  $trans_image_path      Value substituted for rejected
   *                                       src/background values (and for
   *                                       external images when blocking)
   * @param boolean $block_external_images When true, http(s) values in
   *                                       src/background attributes and in
   *                                       style url('...') are replaced with
   *                                       $trans_image_path
   * @return string The string returned by tln_sanitize()
   */
  function HTMLFilter($body, $trans_image_path, $block_external_images = false)
  {
      // Leading false: the tags listed here are removed (content is kept).
      $tag_list = array(
          false,
          "object",
          "meta",
          "html",
          "head",
          "base",
          "link",
          "frame",
          "iframe",
          "plaintext",
          "marquee"
      );

      // Tags removed together with their content.
      $rm_tags_with_content = array(
          "script",
          "applet",
          "embed",
          "title",
          "frameset",
          "xmp",
          "xml"
      );

      $self_closing_tags = array(
          "img",
          "br",
          "hr",
          "input",
          "outbind"
      );

      $force_tag_closing = true;

      // Tag-name pattern => attribute-name patterns handed to tln_sanitize().
      $rm_attnames = array(
          "/.*/" =>
              array(
                  // "/target/i",
                  "/^on.*/i",
                  "/^dynsrc/i",
                  "/^data.*/i",
                  "/^lowsrc.*/i"
              )
      );

      // Tag-name pattern => attribute-name pattern => (patterns, replacements).
      // NOTE(review): the "\\1...\\2" replacements are presumably applied
      // with preg_replace() in tln_fixatts(); if so, a $trans_image_path
      // that starts with a digit would be read as part of the backreference
      // number -- confirm against tln_fixatts().
      $bad_attvals = array(
          "/.*/" =>
              array(
                  "/^src|background/i" =>
                      array(
                          array(
                              '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                              '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                              '/^([\'"])\s*about\s*:.*([\'"])/si'
                          ),
                          array(
                              "\\1$trans_image_path\\2",
                              "\\1$trans_image_path\\2",
                              "\\1$trans_image_path\\2"
                          )
                      ),
                  "/^href|action/i" =>
                      array(
                          array(
                              '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                              '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                              '/^([\'"])\s*about\s*:.*([\'"])/si'
                          ),
                          array(
                              "\\1#\\1",
                              "\\1#\\1",
                              "\\1#\\1"
                          )
                      ),
                  "/^style/i" =>
                      array(
                          array(
                              "/\/\*.*\*\//",
                              "/expression/i",
                              "/binding/i",
                              "/behaviou*r/i",
                              "/include-source/i",
                              '/position\s*:/i',
                              '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                              '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                              '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                              '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                              '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                          ),
                          array(
                              "",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "url",
                              "url(\\1#\\1)",
                              "url(\\1#\\1)",
                              "url(\\1#\\1)",
                              "\\1:url(\\2#\\3)"
                          )
                      )
              )
      );

      if ($block_external_images) {
          // Square-bracket offsets: the curly-brace form was removed in PHP 8.
          array_push(
              $bad_attvals['/.*/']['/^src|background/i'][0],
              '/^([\'\"])\s*https*:.*([\'\"])/si'
          );
          array_push(
              $bad_attvals['/.*/']['/^src|background/i'][1],
              "\\1$trans_image_path\\1"
          );
          array_push(
              $bad_attvals['/.*/']['/^style/i'][0],
              '/url\(([\'\"])\s*https*:.*([\'\"])\)/si'
          );
          array_push(
              $bad_attvals['/.*/']['/^style/i'][1],
              "url(\\1$trans_image_path\\1)"
          );
      }

      // Attributes added to matching tags: links get target="_blank".
      $add_attr_to_tag = array(
          "/^a$/i" =>
              array('target' => '"_blank"')
      );

      $trusted = tln_sanitize(
          $body,
          $tag_list,
          $rm_tags_with_content,
          $self_closing_tags,
          $force_tag_closing,
          $rm_attnames,
          $bad_attvals,
          $add_attr_to_tag,
          $trans_image_path,
          $block_external_images
      );
      return $trusted;
  }