Big revision of the “strip” function: now it strips more, so bad html in instances’ extended descriptions is less likely to break results pages

2024-08-25 17:51:20 +02:00 · 2024-08-25 17:51:20 +02:00 · 132ab498b5
commit 132ab498b5
parent a3a27d8082
1 changed files with 15 additions and 13 deletions
--- a/web/site/instances.php
+++ b/web/site/instances.php
@@ -965,21 +965,23 @@ function nullyp($str) {

 function strip($str,$uri) {
 	if (nullemp($str)) return(null);
-	$str=preg_replace('#<style( [^>]*)?>.*</style>#is','',$str);// strip all inline css styles definitions
-	$str=preg_replace('#<a href="(?![a-zA-Z]+://)([^"]+)#i','<a href="https://'.$uri.'$1',$str);// if an href is not preceded by a protocol scheme, it's relative ...
+	// strip(): $str = HTML fragment to clean, $uri = string placed right after "https://" when rewriting relative hrefs; returns the cleaned string, or null when nullemp($str) is true
+	$str=preg_replace(['#^\s*#m','#[\r\n]#'],['',' '],$str);// strip all spaces from empty lines, then all carriage return and new line chars
+	$str=preg_replace('#<br\s*/>#i','<br>',$str);// convert both "<br/>" and "<br />" to "<br>", so the <br> clean-up further down sees every break
+	$str=preg_replace('#<a\s+[^>]*></a>#i','',$str);// drop anchors that have no content
+	$str=preg_replace('#<a\s+[^>]*href="(?![a-zA-Z]+://)([^"]+)#i','<a href="https://'.$uri.'$1',$str);// an href not starting with "scheme://" is treated as relative and prefixed (this includes e.g. "mailto:" hrefs); attributes placed before href are dropped
 	$str=str_ireplace('</a><a','</a> <a',$str);// always put at least one space between links
-	$str=preg_replace(['#<h[1-9][^>]*>#i','#</h[1-9]>#i'],['<p class="exh">','</p>'],$str);
-	$str=preg_replace(['#</p><br>#i','#</li><br>#i','#</ul><br>#i','#<ul><br>#i'],['</p>','</li>','</ul>','<ul>'],$str);
+	//$str=preg_replace(['#</p><br>#i','#</li><br>#i','#</ul><br>#i','#<ul><br>#i'],['</p>','</li>','</ul>','<ul>'],$str);
 	$str=preg_replace(['#<b>#i','#</b>#i','#<i>#i','#</i>#i'],['<strong>','</strong>','<em>','</em>'],$str);
-	$str=preg_replace('#<p>\s*</p>#is','',$str);
-	$str=strip_tags($str,'<a><br><ol><ul><li><p><div><strong><em><small><img>');
-	// all this part below is to try and assign the css "nobott" css class to a possible closing <p>/<ol>/<ul>/<div>,
-	// to avoid the useless and UGLY last bottom-margin :-))
-	$str=preg_replace('#^\s*#m','',$str);// strip all spaces from empty lines
-	$str=preg_replace('#[\r\n]#',' ',$str);// strip all "wrap chars"
-	$str=preg_replace('#(</p>|</ol>|</ul>|</div>)#i','$1'.N,$str);// now add a newline after every </p> and so on
-	$str=rtrim($str);// trim the newline at the end of the whole text block in order for the next preg_replace to match against $ as end of the whole text block
-	$str=preg_replace(['#<p[^>]*>(.*)</p>$#i', '#<ol[^>]*>(.*)</ol>$#i', '#<ul[^>]*>(.*)</ul>$#i', '#<div[^>]*>(.*)</div>$#i'],['<p class="nobott">$1</p>', '<ol class="nobott">$1</ol>', '<ul class="nobott">$1</ul>', '<div class="nobott">$1</div>'],$str);
+	$str=preg_replace('#<p[^>]*>\s*</p>#is','',$str);// drop empty paragraphs
+	$str=preg_replace(['#<p\b[^>]*>(.*?)</p>#i','#<div\b[^>]*>(.*?)</div>#i'],['<br><br>$1<br><br>','<br><br>$1<br><br>'],$str);// replace each <p>/<div> by its content framed with <br><br>; the lazy (.*?) keeps sibling blocks separate instead of gluing them together
+	$str=strip_tags($str,'<a><br><ol><ul><li><strong><em><small><img><h1><h2><h3><h4><h5><h6>');// keep only these tags; every other tag (including any leftover <p>/<div>) is removed
+	do { $str=preg_replace('#<([^>]*)\s(style|class)="[^"]*"([^>]*)>#i','<$1$3>',$str,-1,$n); } while ($n>0);// one pass removes only one style/class attribute per tag, so repeat until none is left
+	$str=preg_replace(['#<h[1-6][^>]*>#i','#</h[1-6]>#i'],['<br><br><span class="exh">','</span><br><br>'],$str);// headings become <span class="exh"> framed with <br><br>
+	$str=preg_replace(['#<([ou])l[^>]*>#i','#</([ou])l>#i'],['<br><br><$1l class="nobott">','</$1l><br><br>'],$str);// every list gets class="nobott" and is framed with <br><br>
+	$str=preg_replace(['#<br>\s+<br>#i','#^(<br>)+#i','#(<br>){3,}#i','#(<br>)+$#i','#</([ou])l>\s*(<br>)+#i','#</li>\s*(<br>)+#i'],['<br><br>','','<br><br>','','</$1l><br>','</li><br>'],$str);// tidy <br> runs: join <br>s separated by whitespace, drop leading/trailing ones, cap runs at two, reduce those following </ol>, </ul> and </li> to one
+	// re-applies class="nobott" to a list that ends the text; NOTE(review): the list rewrite above already sets it on every <ol>/<ul>, so this step looks redundant - confirm and drop
+	$str=preg_replace(['#<ol[^>]*>(.*)</ol>$#i','#<ul[^>]*>(.*)</ul>$#i'],['<ol class="nobott">$1</ol>','<ul class="nobott">$1</ul>'],$str);
 	return $str;
 }