123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- <?php
- class JustETFBridge extends BridgeAbstract {
- const NAME = 'justETF Bridge';
- const URI = 'https://www.justetf.com';
- const DESCRIPTION = 'Currently only supports the news feed';
- const MAINTAINER = 'logmanoriginal';
- const PARAMETERS = array(
- 'News' => array(
- 'full' => array(
- 'name' => 'Full Article',
- 'type' => 'checkbox',
- 'title' => 'Enable to load full articles'
- )
- ),
- 'Profile' => array(
- 'isin' => array(
- 'name' => 'ISIN',
- 'type' => 'text',
- 'required' => true,
- 'pattern' => '[a-zA-Z]{2}[a-zA-Z0-9]{10}',
- 'title' => 'ISIN, consisting of 2-letter country code, 9-character identifier, check character'
- ),
- 'strategy' => array(
- 'name' => 'Include Strategy',
- 'type' => 'checkbox',
- 'defaultValue' => 'checked'
- ),
- 'description' => array(
- 'name' => 'Include Description',
- 'type' => 'checkbox',
- 'defaultValue' => 'checked'
- )
- ),
- 'global' => array(
- 'lang' => array(
- 'name' => 'Language',
- 'required' => true,
- 'type' => 'list',
- 'values' => array(
- 'Englisch' => 'en',
- 'Deutsch' => 'de',
- 'Italiano' => 'it'
- ),
- 'defaultValue' => 'Englisch'
- )
- )
- );
- public function collectData() {
- $html = getSimpleHTMLDOM($this->getURI())
- or returnServerError('Failed loading contents from ' . $this->getURI());
- defaultLinkTo($html, static::URI);
- switch($this->queriedContext) {
- case 'News':
- $this->collectNews($html);
- break;
- case 'Profile':
- $this->collectProfile($html);
- break;
- }
- }
- public function getURI() {
- $uri = static::URI;
- if($this->getInput('lang')) {
- $uri .= '/' . $this->getInput('lang');
- }
- switch($this->queriedContext) {
- case 'News':
- $uri .= '/news';
- break;
- case 'Profile':
- $uri .= '/etf-profile.html?' . http_build_query(array(
- 'isin' => strtoupper($this->getInput('isin'))
- ));
- break;
- }
- return $uri;
- }
- public function getName() {
- $name = static::NAME;
- $name .= ($this->queriedContext) ? ' - ' . $this->queriedContext : '';
- switch($this->queriedContext) {
- case 'News': break;
- case 'Profile':
- if($this->getInput('isin')) {
- $name .= ' ISIN ' . strtoupper($this->getInput('isin'));
- }
- }
- if($this->getInput('lang')) {
- $name .= ' (' . strtoupper($this->getInput('lang')) . ')';
- }
- return $name;
- }
- #region Common
- /**
- * Fixes dates depending on the choosen language:
- *
- * de : dd.mm.yy
- * en : dd.mm.yy
- * it : dd/mm/yy
- *
- * Basically strtotime doesn't convert dates correctly due to formats
- * being hard to interpret. So we use the DateTime object, manually
- * fixing dates and times (set to 00:00:00.000).
- *
- * We don't know the timezone, so just assume +00:00 (or whatever
- * DateTime chooses)
- */
- private function fixDate($date) {
- switch($this->getInput('lang')) {
- case 'en':
- case 'de':
- $df = date_create_from_format('d.m.y', $date);
- break;
- case 'it':
- $df = date_create_from_format('d/m/y', $date);
- break;
- }
- date_time_set($df, 0, 0);
- // debugMessage(date_format($df, 'U'));
- return date_format($df, 'U');
- }
- private function extractImages($article) {
- // Notice: We can have zero or more images (though it should mostly be 1)
- $elements = $article->find('img');
- $images = array();
- foreach($elements as $img) {
- // Skip the logo (mostly provided part of a hidden div)
- if(substr($img->src, strrpos($img->src, '/') + 1) === 'logo.png')
- continue;
- $images[] = $img->src;
- }
- return $images;
- }
- #endregion
- #region News
- private function collectNews($html) {
- $articles = $html->find('div.newsTopArticle')
- or returnServerError('No articles found! Layout might have changed!');
- foreach($articles as $article) {
- $item = array();
- // Common data
- $item['uri'] = $this->extractNewsUri($article);
- $item['timestamp'] = $this->extractNewsDate($article);
- $item['title'] = $this->extractNewsTitle($article);
- if($this->getInput('full')) {
- $uri = $this->extractNewsUri($article);
- $html = getSimpleHTMLDOMCached($uri)
- or returnServerError('Failed loading full article from ' . $uri);
- $fullArticle = $html->find('div.article', 0)
- or returnServerError('No content found! Layout might have changed!');
- defaultLinkTo($fullArticle, static::URI);
- $item['author'] = $this->extractFullArticleAuthor($fullArticle);
- $item['content'] = $this->extractFullArticleContent($fullArticle);
- $item['enclosures'] = $this->extractImages($fullArticle);
- } else {
- $item['content'] = $this->extractNewsDescription($article);
- $item['enclosures'] = $this->extractImages($article);
- }
- $this->items[] = $item;
- }
- }
- private function extractNewsUri($article) {
- $element = $article->find('a', 0)
- or returnServerError('Anchor not found!');
- return $element->href;
- }
- private function extractNewsDate($article) {
- $element = $article->find('div.subheadline', 0)
- or returnServerError('Date not found!');
- // debugMessage($element->plaintext);
- $date = trim(explode('|', $element->plaintext)[0]);
- return $this->fixDate($date);
- }
- private function extractNewsDescription($article) {
- $element = $article->find('span.newsText', 0)
- or returnServerError('Description not found!');
- $element->find('a', 0)->onclick = '';
- // debugMessage($element->innertext);
- return $element->innertext;
- }
- private function extractNewsTitle($article) {
- $element = $article->find('h3', 0)
- or returnServerError('Title not found!');
- return $element->plaintext;
- }
- private function extractFullArticleContent($article) {
- $element = $article->find('div.article_body', 0)
- or returnServerError('Article body not found!');
- // Remove teaser image
- $element->find('img.teaser-img', 0)->outertext = '';
- // Remove self advertisements
- foreach($element->find('.call-action') as $adv) {
- $adv->outertext = '';
- }
- // Remove tips
- foreach($element->find('.panel-edu') as $tip) {
- $tip->outertext = '';
- }
- // Remove inline scripts (used for i.e. interactive graphs) as they are
- // rendered as a long series of strings
- foreach($element->find('script') as $script) {
- $script->outertext = '[Content removed! Visit site to see full contents!]';
- }
- return $element->innertext;
- }
- private function extractFullArticleAuthor($article) {
- $element = $article->find('span[itemprop=name]', 0)
- or returnServerError('Author not found!');
- return $element->plaintext;
- }
- #endregion
- #region Profile
- private function collectProfile($html) {
- $item = array();
- $item['uri'] = $this->getURI();
- $item['timestamp'] = $this->extractProfileDate($html);
- $item['title'] = $this->extractProfiletitle($html);
- $item['author'] = $this->extractProfileAuthor($html);
- $item['content'] = $this->extractProfileContent($html);
- $this->items[] = $item;
- }
- private function extractProfileDate($html) {
- $element = $html->find('div.infobox div.vallabel', 0)
- or returnServerError('Date not found!');
- // debugMessage($element->plaintext);
- $date = trim(explode("\r\n", $element->plaintext)[1]);
- return $this->fixDate($date);
- }
- private function extractProfileTitle($html) {
- $element = $html->find('span.h1', 0)
- or returnServerError('Title not found!');
- return $element->plaintext;
- }
- private function extractProfileContent($html) {
- // There are a few thins we are interested:
- // - Investment Strategy
- // - Description
- // - Quote
- $strategy = $html->find('div.tab-container div.col-sm-6 p', 0)
- or returnServerError('Investment Strategy not found!');
- // Description requires a bit of cleanup due to lack of propper identification
- $description = $html->find('div.headline', 5)
- or returnServerError('Description container not found!');
- $description = $description->parent();
- foreach($description->find('div') as $div) {
- $div->outertext = '';
- }
- $quote = $html->find('div.infobox div.val', 0)
- or returnServerError('Quote not found!');
- $quote_html = '<strong>Quote</strong><br><p>' . $quote . '</p>';
- $strategy_html = '';
- $description_html = '';
- if($this->getInput('strategy') === true) {
- $strategy_html = '<strong>Strategy</strong><br><p>' . $strategy . '</p><br>';
- }
- if($this->getInput('description') === true) {
- $description_html = '<strong>Description</strong><br><p>' . $description . '</p><br>';
- }
- return $strategy_html . $description_html . $quote_html;
- }
- private function extractProfileAuthor($html) {
- // Use ISIN + WKN as author
- // Notice: "identfier" is not a typo [sic]!
- $element = $html->find('span.identfier', 0)
- or returnServerError('Author not found!');
- return $element->plaintext;
- }
- #endregion
- }
|