diff --git a/bridges/JustETFBridge.php b/bridges/JustETFBridge.php new file mode 100644 index 0000000..f0223e9 --- /dev/null +++ b/bridges/JustETFBridge.php @@ -0,0 +1,353 @@ + array( + 'full' => array( + 'name' => 'Full Article', + 'type' => 'checkbox', + 'title' => 'Enable to load full articles' + ) + ), + 'Profile' => array( + 'isin' => array( + 'name' => 'ISIN', + 'type' => 'text', + 'required' => true, + 'pattern' => '[a-zA-Z]{2}[a-zA-Z0-9]{10}', + 'title' => 'ISIN, consisting of 2-letter country code, 9-character identifier, check character' + ), + 'strategy' => array( + 'name' => 'Include Strategy', + 'type' => 'checkbox', + 'defaultValue' => 'checked' + ), + 'description' => array( + 'name' => 'Include Description', + 'type' => 'checkbox', + 'defaultValue' => 'checked' + ) + ), + 'global' => array( + 'lang' => array( + 'name' => 'Language', + 'required' => true, + 'type' => 'list', + 'values' => array( + 'Englisch' => 'en', + 'Deutsch' => 'de', + 'Italiano' => 'it' + ), + 'defaultValue' => 'Englisch' + ) + ) + ); + + public function collectData() { + $html = getSimpleHTMLDOM($this->getURI()) + or returnServerError('Failed loading contents from ' . $this->getURI()); + + defaultLinkTo($html, static::URI); + + switch($this->queriedContext) { + case 'News': + $this->collectNews($html); + break; + case 'Profile': + $this->collectProfile($html); + break; + } + } + + public function getURI() { + $uri = static::URI; + + if($this->getInput('lang')) { + $uri .= '/' . $this->getInput('lang'); + } + + switch($this->queriedContext) { + case 'News': + $uri .= '/news'; + break; + case 'Profile': + $uri .= '/etf-profile.html?' . http_build_query(array( + 'isin' => strtoupper($this->getInput('isin')) + )); + break; + } + + return $uri; + } + + public function getName() { + $name = static::NAME; + + $name .= ($this->queriedContext) ? ' - ' . $this->queriedContext : ''; + + switch($this->queriedContext) { + case 'News': break; + case 'Profile': + if($this->getInput('isin')) { + $name .= ' ISIN ' . strtoupper($this->getInput('isin')); + } + } + + if($this->getInput('lang')) { + $name .= ' (' . strtoupper($this->getInput('lang')) . ')'; + } + + return $name; + } + + #region Common + + /** + * Fixes dates depending on the choosen language: + * + * de : dd.mm.yy + * en : dd.mm.yy + * it : dd/mm/yy + * + * Basically strtotime doesn't convert dates correctly due to formats + * being hard to interpret. So we use the DateTime object, manually + * fixing dates and times (set to 00:00:00.000). + * + * We don't know the timezone, so just assume +00:00 (or whatever + * DateTime chooses) + */ + private function fixDate($date) { + switch($this->getInput('lang')) { + case 'en': + case 'de': + $df = date_create_from_format('d.m.y', $date); + break; + case 'it': + $df = date_create_from_format('d/m/y', $date); + break; + } + + date_time_set($df, 0, 0); + + // debugMessage(date_format($df, 'U')); + + return date_format($df, 'U'); + } + + private function extractImages($article) { + // Notice: We can have zero or more images (though it should mostly be 1) + $elements = $article->find('img'); + + $images = array(); + + foreach($elements as $img) { + // Skip the logo (mostly provided part of a hidden div) + if(substr($img->src, strrpos($img->src, '/') + 1) === 'logo.png') + continue; + + $images[] = $img->src; + } + + return $images; + } + + #endregion + + #region News + + private function collectNews($html) { + $articles = $html->find('div.newsTopArticle') + or returnServerError('No articles found! Layout might have changed!'); + + foreach($articles as $article) { + + $item = array(); + + // Common data + + $item['uri'] = $this->extractNewsUri($article); + $item['timestamp'] = $this->extractNewsDate($article); + $item['title'] = $this->extractNewsTitle($article); + + if($this->getInput('full')) { + + $uri = $this->extractNewsUri($article); + + $html = getSimpleHTMLDOMCached($uri) + or returnServerError('Failed loading full article from ' . $uri); + + $fullArticle = $html->find('div.article', 0) + or returnServerError('No content found! Layout might have changed!'); + + defaultLinkTo($fullArticle, static::URI); + + $item['author'] = $this->extractFullArticleAuthor($fullArticle); + $item['content'] = $this->extractFullArticleContent($fullArticle); + $item['enclosures'] = $this->extractImages($fullArticle); + + } else { + + $item['content'] = $this->extractNewsDescription($article); + $item['enclosures'] = $this->extractImages($article); + + } + + $this->items[] = $item; + } + } + + private function extractNewsUri($article) { + $element = $article->find('a', 0) + or returnServerError('Anchor not found!'); + + return $element->href; + } + + private function extractNewsDate($article) { + $element = $article->find('div.subheadline', 0) + or returnServerError('Date not found!'); + + // debugMessage($element->plaintext); + + $date = trim(explode('|', $element->plaintext)[0]); + + return $this->fixDate($date); + } + + private function extractNewsDescription($article) { + $element = $article->find('span.newsText', 0) + or returnServerError('Description not found!'); + + $element->find('a', 0)->onclick = ''; + + // debugMessage($element->innertext); + + return $element->innertext; + } + + private function extractNewsTitle($article) { + $element = $article->find('h3', 0) + or returnServerError('Title not found!'); + + return $element->plaintext; + } + + private function extractFullArticleContent($article) { + $element = $article->find('div.article_body', 0) + or returnServerError('Article body not found!'); + + // Remove teaser image + $element->find('img.teaser-img', 0)->outertext = ''; + + // Remove self advertisements + foreach($element->find('.call-action') as $adv) { + $adv->outertext = ''; + } + + // Remove tips + foreach($element->find('.panel-edu') as $tip) { + $tip->outertext = ''; + } + + // Remove inline scripts (used for i.e. interactive graphs) as they are + // rendered as a long series of strings + foreach($element->find('script') as $script) { + $script->outertext = '[Content removed! Visit site to see full contents!]'; + } + + return $element->innertext; + } + + private function extractFullArticleAuthor($article) { + $element = $article->find('span[itemprop=name]', 0) + or returnServerError('Author not found!'); + + return $element->plaintext; + } + + #endregion + + #region Profile + + private function collectProfile($html) { + $item = array(); + + $item['uri'] = $this->getURI(); + $item['timestamp'] = $this->extractProfileDate($html); + $item['title'] = $this->extractProfiletitle($html); + $item['author'] = $this->extractProfileAuthor($html); + $item['content'] = $this->extractProfileContent($html); + + $this->items[] = $item; + } + + private function extractProfileDate($html) { + $element = $html->find('div.infobox div.vallabel', 0) + or returnServerError('Date not found!'); + + // debugMessage($element->plaintext); + + $date = trim(explode("\r\n", $element->plaintext)[1]); + + return $this->fixDate($date); + } + + private function extractProfileTitle($html) { + $element = $html->find('span.h1', 0) + or returnServerError('Title not found!'); + + return $element->plaintext; + } + + private function extractProfileContent($html) { + // There are a few thins we are interested: + // - Investment Strategy + // - Description + // - Quote + + $strategy = $html->find('div.tab-container div.col-sm-6 p', 0) + or returnServerError('Investment Strategy not found!'); + + // Description requires a bit of cleanup due to lack of propper identification + + $description = $html->find('div.headline', 5) + or returnServerError('Description container not found!'); + + $description = $description->parent(); + + foreach($description->find('div') as $div) { + $div->outertext = ''; + } + + $quote = $html->find('div.infobox div.val', 0) + or returnServerError('Quote not found!'); + + $quote_html = 'Quote

' . $quote . '

'; + $strategy_html = ''; + $description_html = ''; + + if($this->getInput('strategy') === true) { + $strategy_html = 'Strategy

' . $strategy . '


'; + } + + if($this->getInput('description') === true) { + $description_html = 'Description

' . $description . '


'; + } + + return $strategy_html . $description_html . $quote_html; + } + + private function extractProfileAuthor($html) { + // Use ISIN + WKN as author + // Notice: "identfier" is not a typo [sic]! + $element = $html->find('span.identfier', 0) + or returnServerError('Author not found!'); + + return $element->plaintext; + } + + #endregion +}