Browse Source

[JustETFBridge] Add new bridge

Supports latest news and profiling a given ETF in Englisch, German
or Italian language. Cover images are attached as enclosures and not
as part of the content.

News:

Optionally loads the full article for each news item. Some articles
may include scripts to provide interactive graphs. These scripts are
removed as they would be rendered as pure text and a message is shown
instead: "[Content removed! Visit site to see full contents!]"

Profile:

Optionally includes the ETF strategy and description.
logmanoriginal 5 years ago
parent
commit
2bc8daa101
1 changed files with 353 additions and 0 deletions
  1. 353 0
      bridges/JustETFBridge.php

+ 353 - 0
bridges/JustETFBridge.php

@@ -0,0 +1,353 @@
+<?php
+class JustETFBridge extends BridgeAbstract {
+	const NAME = 'justETF Bridge';
+	const URI = 'https://www.justetf.com';
+	const DESCRIPTION = 'Currently only supports the news feed';
+	const MAINTAINER = 'logmanoriginal';
+	const PARAMETERS = array(
+		'News' => array(
+			'full' => array(
+				'name' => 'Full Article',
+				'type' => 'checkbox',
+				'title' => 'Enable to load full articles'
+			)
+		),
+		'Profile' => array(
+			'isin' => array(
+				'name' => 'ISIN',
+				'type' => 'text',
+				'required' => true,
+				'pattern' => '[a-zA-Z]{2}[a-zA-Z0-9]{10}',
+				'title' => 'ISIN, consisting of 2-letter country code, 9-character identifier, check character'
+			),
+			'strategy' => array(
+				'name' => 'Include Strategy',
+				'type' => 'checkbox',
+				'defaultValue' => 'checked'
+			),
+			'description' => array(
+				'name' => 'Include Description',
+				'type' => 'checkbox',
+				'defaultValue' => 'checked'
+			)
+		),
+		'global' => array(
+			'lang' => array(
+				'name' => 'Language',
+				'required' => true,
+				'type' => 'list',
+				'values' => array(
+					'Englisch' => 'en',
+					'Deutsch'  => 'de',
+					'Italiano' => 'it'
+				),
+				'defaultValue' => 'Englisch'
+			)
+		)
+	);
+
+	public function collectData() {
+		$html = getSimpleHTMLDOM($this->getURI())
+			or returnServerError('Failed loading contents from ' . $this->getURI());
+
+		defaultLinkTo($html, static::URI);
+
+		switch($this->queriedContext) {
+			case 'News':
+				$this->collectNews($html);
+				break;
+			case 'Profile':
+				$this->collectProfile($html);
+				break;
+		}
+	}
+
+	public function getURI() {
+		$uri = static::URI;
+
+		if($this->getInput('lang')) {
+			$uri .= '/' . $this->getInput('lang');
+		}
+
+		switch($this->queriedContext) {
+			case 'News':
+				$uri .= '/news';
+				break;
+			case 'Profile':
+				$uri .= '/etf-profile.html?' . http_build_query(array(
+					'isin' => strtoupper($this->getInput('isin'))
+				));
+				break;
+		}
+
+		return $uri;
+	}
+
+	public function getName() {
+		$name = static::NAME;
+
+		$name .= ($this->queriedContext) ? ' - ' . $this->queriedContext : '';
+
+		switch($this->queriedContext) {
+			case 'News': break;
+			case 'Profile':
+				if($this->getInput('isin')) {
+					$name .= ' ISIN ' . strtoupper($this->getInput('isin'));
+				}
+		}
+
+		if($this->getInput('lang')) {
+			$name .= ' (' . strtoupper($this->getInput('lang')) . ')';
+		}
+
+		return $name;
+	}
+
+	#region Common
+
+	/**
+	 * Fixes dates depending on the choosen language:
+	 *
+	 * de : dd.mm.yy
+	 * en : dd.mm.yy
+	 * it : dd/mm/yy
+	 *
+	 * Basically strtotime doesn't convert dates correctly due to formats
+	 * being hard to interpret. So we use the DateTime object, manually
+	 * fixing dates and times (set to 00:00:00.000).
+	 *
+	 * We don't know the timezone, so just assume +00:00 (or whatever
+	 * DateTime chooses)
+	 */
+	private function fixDate($date) {
+		switch($this->getInput('lang')) {
+			case 'en':
+			case 'de':
+				$df = date_create_from_format('d.m.y', $date);
+				break;
+			case 'it':
+				$df = date_create_from_format('d/m/y', $date);
+				break;
+		}
+
+		date_time_set($df, 0, 0);
+
+		// debugMessage(date_format($df, 'U'));
+
+		return date_format($df, 'U');
+	}
+
+	private function extractImages($article) {
+		// Notice: We can have zero or more images (though it should mostly be 1)
+		$elements = $article->find('img');
+
+		$images = array();
+
+		foreach($elements as $img) {
+			// Skip the logo (mostly provided part of a hidden div)
+			if(substr($img->src, strrpos($img->src, '/') + 1) === 'logo.png')
+				continue;
+
+			$images[] = $img->src;
+		}
+
+		return $images;
+	}
+
+	#endregion
+
+	#region News
+
+	private function collectNews($html) {
+		$articles = $html->find('div.newsTopArticle')
+			or returnServerError('No articles found! Layout might have changed!');
+
+		foreach($articles as $article) {
+
+			$item = array();
+
+			// Common data
+
+			$item['uri'] = $this->extractNewsUri($article);
+			$item['timestamp'] = $this->extractNewsDate($article);
+			$item['title'] = $this->extractNewsTitle($article);
+
+			if($this->getInput('full')) {
+
+				$uri = $this->extractNewsUri($article);
+
+				$html = getSimpleHTMLDOMCached($uri)
+					or returnServerError('Failed loading full article from ' . $uri);
+
+				$fullArticle = $html->find('div.article', 0)
+					or returnServerError('No content found! Layout might have changed!');
+
+				defaultLinkTo($fullArticle, static::URI);
+
+				$item['author'] = $this->extractFullArticleAuthor($fullArticle);
+				$item['content'] = $this->extractFullArticleContent($fullArticle);
+				$item['enclosures'] = $this->extractImages($fullArticle);
+
+			} else {
+
+				$item['content'] = $this->extractNewsDescription($article);
+				$item['enclosures'] = $this->extractImages($article);
+
+			}
+
+			$this->items[] = $item;
+		}
+	}
+
+	private function extractNewsUri($article) {
+		$element = $article->find('a', 0)
+			or returnServerError('Anchor not found!');
+
+		return $element->href;
+	}
+
+	private function extractNewsDate($article) {
+		$element = $article->find('div.subheadline', 0)
+			or returnServerError('Date not found!');
+
+		// debugMessage($element->plaintext);
+
+		$date = trim(explode('|', $element->plaintext)[0]);
+
+		return $this->fixDate($date);
+	}
+
+	private function extractNewsDescription($article) {
+		$element = $article->find('span.newsText', 0)
+			or returnServerError('Description not found!');
+
+		$element->find('a', 0)->onclick = '';
+
+		// debugMessage($element->innertext);
+
+		return $element->innertext;
+	}
+
+	private function extractNewsTitle($article) {
+		$element = $article->find('h3', 0)
+			or returnServerError('Title not found!');
+
+		return $element->plaintext;
+	}
+
+	private function extractFullArticleContent($article) {
+		$element = $article->find('div.article_body', 0)
+			or returnServerError('Article body not found!');
+
+		// Remove teaser image
+		$element->find('img.teaser-img', 0)->outertext = '';
+
+		// Remove self advertisements
+		foreach($element->find('.call-action') as $adv) {
+			$adv->outertext = '';
+		}
+
+		// Remove tips
+		foreach($element->find('.panel-edu') as $tip) {
+			$tip->outertext = '';
+		}
+
+		// Remove inline scripts (used for i.e. interactive graphs) as they are
+		// rendered as a long series of strings
+		foreach($element->find('script') as $script) {
+			$script->outertext = '[Content removed! Visit site to see full contents!]';
+		}
+
+		return $element->innertext;
+	}
+
+	private function extractFullArticleAuthor($article) {
+		$element = $article->find('span[itemprop=name]', 0)
+			or returnServerError('Author not found!');
+
+		return $element->plaintext;
+	}
+
+	#endregion
+
+	#region Profile
+
+	private function collectProfile($html) {
+		$item = array();
+
+		$item['uri'] = $this->getURI();
+		$item['timestamp'] = $this->extractProfileDate($html);
+		$item['title'] = $this->extractProfiletitle($html);
+		$item['author'] = $this->extractProfileAuthor($html);
+		$item['content'] = $this->extractProfileContent($html);
+
+		$this->items[] = $item;
+	}
+
+	private function extractProfileDate($html) {
+		$element = $html->find('div.infobox div.vallabel', 0)
+			or returnServerError('Date not found!');
+
+		// debugMessage($element->plaintext);
+
+		$date = trim(explode("\r\n", $element->plaintext)[1]);
+
+		return $this->fixDate($date);
+	}
+
+	private function extractProfileTitle($html) {
+		$element = $html->find('span.h1', 0)
+			or returnServerError('Title not found!');
+
+		return $element->plaintext;
+	}
+
+	private function extractProfileContent($html) {
+		// There are a few thins we are interested:
+		// - Investment Strategy
+		// - Description
+		// - Quote
+
+		$strategy = $html->find('div.tab-container div.col-sm-6 p', 0)
+			or returnServerError('Investment Strategy not found!');
+
+		// Description requires a bit of cleanup due to lack of propper identification
+
+		$description = $html->find('div.headline', 5)
+			or returnServerError('Description container not found!');
+
+		$description = $description->parent();
+
+		foreach($description->find('div') as $div) {
+			$div->outertext = '';
+		}
+
+		$quote = $html->find('div.infobox div.val', 0)
+			or returnServerError('Quote not found!');
+
+		$quote_html = '<strong>Quote</strong><br><p>' . $quote . '</p>';
+		$strategy_html = '';
+		$description_html = '';
+
+		if($this->getInput('strategy') === true) {
+			$strategy_html = '<strong>Strategy</strong><br><p>' . $strategy . '</p><br>';
+		}
+
+		if($this->getInput('description') === true) {
+			$description_html = '<strong>Description</strong><br><p>' . $description . '</p><br>';
+		}
+
+		return $strategy_html . $description_html . $quote_html;
+	}
+
+	private function extractProfileAuthor($html) {
+		// Use ISIN + WKN as author
+		// Notice: "identfier" is not a typo [sic]!
+		$element = $html->find('span.identfier', 0)
+			or returnServerError('Author not found!');
+
+		return $element->plaintext;
+	}
+
+	#endregion
+}