Browse Source

[core] Add urljoin (#756)

Adds php-urljoin from https://github.com/fluffy-critter/php-urljoin to replace the custom implementation of 'defaultLinkTo'
Eugene Molotov 5 years ago
parent
commit
df58f5bbdb
3 changed files with 144 additions and 9 deletions
  1. 11 0
      lib/RssBridge.php
  2. 2 9
      lib/html.php
  3. 131 0
      vendor/php-urljoin/src/urljoin.php

+ 11 - 0
lib/RssBridge.php

@@ -34,6 +34,17 @@ if(!file_exists($vendorLibSimpleHtmlDom)) {
 }
 require_once $vendorLibSimpleHtmlDom;
 
+// Locate the bundled php-urljoin script (it defines the global urljoin() function).
+// The path is built from this file's directory plus the PATH_VENDOR constant.
+$vendorLibPhpUrlJoin = __DIR__ . PATH_VENDOR . '/php-urljoin/src/urljoin.php';
+if(!file_exists($vendorLibPhpUrlJoin)) {
+	// Abort with an HttpException (code 500) telling the operator where to get the file.
+	// The message literal contains an embedded line break.
+	// NOTE(review): substr(PATH_VENDOR, 4) drops the first four characters of PATH_VENDOR,
+	// presumably a leading '/../', so the message shows a project-relative path - confirm.
+	throw new \HttpException('"php-urljoin" library is missing.
+ Get it from https://github.com/fluffy-critter/php-urljoin and place the script "urljoin.php" in '
+		. substr(PATH_VENDOR, 4)
+		. '/php-urljoin/src/',
+	500);
+}
+// The library is present: load it exactly once.
+require_once $vendorLibPhpUrlJoin;
+
+
 /* Example use
 
 	require_once __DIR__ . '/lib/RssBridge.php';

+ 2 - 9
lib/html.php

@@ -42,18 +42,11 @@ function backgroundToImg($htmlContent) {
 
 function defaultLinkTo($content, $server){
 	foreach($content->find('img') as $image) {
-		if(strpos($image->src, 'http') === false
-		&& strpos($image->src, '//') === false
-		&& strpos($image->src, 'data:') === false)
-			$image->src = $server . $image->src;
+		$image->src = urljoin($server, $image->src);
 	}
 
 	foreach($content->find('a') as $anchor) {
-		if(strpos($anchor->href, 'http') === false
-		&& strpos($anchor->href, '//') === false
-		&& strpos($anchor->href, '#') !== 0
-		&& strpos($anchor->href, '?') !== 0)
-			$anchor->href = $server . $anchor->href;
+		$anchor->href = urljoin($server, $anchor->href);
 	}
 
 	return $content;

+ 131 - 0
vendor/php-urljoin/src/urljoin.php

@@ -0,0 +1,131 @@
+<?php
+
+/*
+
+A spiritual port of Python's urlparse.urljoin() function to PHP. Why this isn't in the standard library is anyone's guess.
+
+Author: fluffy, http://beesbuzz.biz/
+Latest version at: https://github.com/plaidfluff/php-urljoin
+
+ */
+
+/**
+ * Resolve a URL reference against a base URL, in the manner of Python's
+ * urlparse.urljoin().
+ *
+ * - An empty $base yields $rel unchanged; an empty $rel yields $base unchanged.
+ * - A reference whose scheme differs from the base's, or whose scheme is not
+ *   in the hierarchical list below, is returned unchanged.
+ * - A reference that cannot be parsed (or a base that cannot be parsed) makes
+ *   the function return $rel unchanged.
+ * - Otherwise scheme, host and path come from the reference where present and
+ *   from the base where not; relative paths are resolved against the directory
+ *   of the base path and '.' / '..' segments are collapsed. '..' segments that
+ *   would climb above the root are kept in the result.
+ *
+ * @param string $base Base URL the reference is resolved against
+ * @param string $rel  Absolute or relative URL reference
+ * @return string The resolved URL
+ */
+function urljoin($base, $rel) {
+	if (!$base) {
+		return $rel;
+	}
+
+	// The string '0' is falsy in PHP but is a valid relative path, so it must
+	// not be mistaken for "no reference given"
+	if (!$rel && $rel !== '0') {
+		return $base;
+	}
+
+	// Schemes whose URLs are hierarchical and may therefore be joined
+	$uses_relative = array('', 'ftp', 'http', 'gopher', 'nntp', 'imap',
+		'wais', 'file', 'https', 'shttp', 'mms',
+		'prospero', 'rtsp', 'rtspu', 'sftp',
+		'svn', 'svn+ssh', 'ws', 'wss');
+
+	$pbase = parse_url($base);
+	$prel = parse_url($rel);
+
+	// parse_url() returns false on seriously malformed input; there is nothing
+	// sensible to merge then, so hand the reference back untouched
+	if ($pbase === false || $prel === false) {
+		return $rel;
+	}
+
+	// A bare root path on the base carries no directory information
+	if (isset($pbase['path']) && $pbase['path'] === '/') {
+		unset($pbase['path']);
+	}
+
+	if (isset($prel['scheme'])) {
+		// The base may have no scheme at all (e.g. '/dir/page'); that counts
+		// as a mismatch rather than an undefined-index access
+		if (!isset($pbase['scheme'])
+		|| $prel['scheme'] !== $pbase['scheme']
+		|| !in_array($prel['scheme'], $uses_relative, true)) {
+			return $rel;
+		}
+	}
+
+	$merged = array_merge($pbase, $prel);
+
+	// A reference that names its own host replaces the base path entirely,
+	// even when the reference itself has no path ('http://other.com')
+	if (isset($prel['host']) && !isset($prel['path'])) {
+		unset($merged['path']);
+	}
+
+	// Handle relative paths:
+	//   'path/to/file.ext'
+	// './path/to/file.ext'
+	if (isset($prel['path']) && substr($prel['path'], 0, 1) !== '/') {
+
+		// Normalize: './path/to/file.ext' => 'path/to/file.ext'
+		if (substr($prel['path'], 0, 2) === './') {
+			$prel['path'] = substr($prel['path'], 2);
+		}
+
+		if (isset($pbase['path'])) {
+			// Drop the last segment of the base path to get its directory
+			$dir = preg_replace('@/[^/]*$@', '', $pbase['path']);
+			$merged['path'] = $dir . '/' . $prel['path'];
+		} else {
+			$merged['path'] = '/' . $prel['path'];
+		}
+
+	}
+
+	if (isset($merged['path'])) {
+		// Get the path components, and remove the initial empty one
+		$pathParts = explode('/', $merged['path']);
+		array_shift($pathParts);
+
+		$path = array();
+		$prevPart = '';
+		foreach ($pathParts as $part) {
+			if ($part === '..' && count($path) > 0) {
+				// Cancel out the parent directory (if there's a parent to cancel)
+				$parent = array_pop($path);
+				// But if it was also a parent directory, leave it in
+				if ($parent === '..') {
+					array_push($path, $parent);
+					array_push($path, $part);
+				}
+			} else if ($prevPart !== '' || ($part !== '.' && $part !== '')) {
+				// Don't include empty or current-directory components
+				if ($part === '.') {
+					$part = '';
+				}
+				array_push($path, $part);
+			}
+			$prevPart = $part;
+		}
+		$merged['path'] = '/' . implode('/', $path);
+	}
+
+	$ret = '';
+	if (isset($merged['scheme'])) {
+		$ret .= $merged['scheme'] . ':';
+	}
+
+	if (isset($merged['scheme']) || isset($merged['host'])) {
+		$ret .= '//';
+	}
+
+	// username, password, and port are associated with the hostname, not merged
+	$hostSource = isset($prel['host']) ? $prel : $pbase;
+
+	if (isset($hostSource['host'])) {
+		if (isset($hostSource['user'])) {
+			$ret .= $hostSource['user'];
+			if (isset($hostSource['pass'])) {
+				$ret .= ':' . $hostSource['pass'];
+			}
+			$ret .= '@';
+		}
+		$ret .= $hostSource['host'];
+		if (isset($hostSource['port'])) {
+			$ret .= ':' . $hostSource['port'];
+		}
+	}
+
+	if (isset($merged['path'])) {
+		$ret .= $merged['path'];
+	}
+
+	if (isset($prel['query'])) {
+		$ret .= '?' . $prel['query'];
+	} else if (!isset($prel['host']) && !isset($prel['path']) && isset($pbase['query'])) {
+		// A reference with neither path nor query (e.g. '#top') still points at
+		// the base document, so the base query is kept (RFC 3986 section 5.2.2)
+		$ret .= '?' . $pbase['query'];
+	}
+
+	if (isset($prel['fragment'])) {
+		$ret .= '#' . $prel['fragment'];
+	}
+
+	return $ret;
+}