forked from blallo/rss-bridge
5432cabef5
Some bridges used getName() and getURI() to put information into the metadata. Instead, the metadata should be initialized with data and (not yet done) returned by default via getName() and getURI().
86 lines
No EOL
3.7 KiB
PHP
86 lines
No EOL
3.7 KiB
PHP
<?php
|
|
/**
 * Bridge that scrapes the front page of The Hacker News and turns the most
 * recent articles into feed items, fetching each article page for its body.
 */
class TheHackerNewsBridge extends BridgeAbstract {

    /** Maximum number of front-page articles converted into items. */
    const ARTICLE_LIMIT = 5;

    /**
     * Initialise the static metadata describing this bridge.
     */
    public function loadMetadatas() {
        $this->maintainer = 'ORelio';
        $this->name = 'The Hacker News Bridge';
        $this->uri = 'https://thehackernews.com/';
        $this->description = 'Cyber Security, Hacking, Technology News.';
        $this->update = '2016-08-06';
    }

    /**
     * Fetch the front page, then each listed article, and append one item per
     * article to $this->items (at most ARTICLE_LIMIT items).
     *
     * @param array $param Bridge parameters; not read by this bridge.
     */
    public function collectData(array $param) {
        // NOTE(review): returnError() is assumed to abort execution (it is
        // defined outside this file) - confirm against BridgeAbstract.
        $html = $this->file_get_html($this->getURI())
            or $this->returnError('Could not request TheHackerNews: '.$this->getURI(), 500);

        $limit = 0;
        foreach ($html->find('article') as $element) {
            // Stop as soon as the cap is reached instead of walking the
            // remaining articles for nothing.
            if ($limit >= self::ARTICLE_LIMIT) {
                break;
            }

            // find(..., 0) yields a non-object when nothing matches; without
            // a title link there is no URL to fetch, so skip the entry.
            $title_link = $element->find('a.entry-title', 0);
            if (!is_object($title_link)) {
                continue;
            }
            $article_url = $title_link->href;
            $article_title = $title_link->plaintext;

            $author_element = $element->find('span.vcard', 0);
            $article_author = is_object($author_element) ? trim($author_element->plaintext) : '';

            // false is also what strtotime() returns for an unparsable date.
            $updated_element = $element->find('span.updated', 0);
            $article_timestamp = is_object($updated_element) ? strtotime($updated_element->plaintext) : false;

            $thumbnail_element = $element->find('img', 0);
            $article_thumbnail = is_object($thumbnail_element) ? $thumbnail_element->src : '';

            $article = $this->file_get_html($article_url)
                or $this->returnError('Could not request TheHackerNews: '.$article_url, 500);

            // An article page without the expected body container cannot
            // produce meaningful content; skip it rather than crash.
            $body_element = $article->find('div.articlebodyonly', 0);
            if (!is_object($body_element)) {
                continue;
            }

            $contents = $body_element->innertext;
            $contents = self::stripRecursiveHtmlSection($contents, 'div', '<div class=\'clear\'');
            $contents = self::stripWithDelimiters($contents, '<script', '</script>');

            $item = new \Item();
            $item->uri = $article_url;
            $item->title = $article_title;
            $item->author = $article_author;
            $item->thumbnailUri = $article_thumbnail;
            $item->timestamp = $article_timestamp;
            $item->content = trim($contents);
            $this->items[] = $item;
            $limit++;
        }
    }

    /**
     * Return the bridge name set in loadMetadatas().
     *
     * @return string
     */
    public function getName() {
        return $this->name;
    }

    /**
     * Return the site URI set in loadMetadatas().
     *
     * @return string
     */
    public function getURI() {
        return $this->uri;
    }

    /**
     * Remove every section that begins with $start and runs through the next
     * occurrence of $end (both delimiters included).
     *
     * Implemented as a method rather than a named function declared inside
     * collectData(): such nested functions become global when the enclosing
     * method runs, so a second call would fail with "Cannot redeclare".
     *
     * @param string $string Text to clean.
     * @param string $start  Opening delimiter.
     * @param string $end    Closing delimiter.
     * @return string Cleaned text. If an opening delimiter has no matching
     *                closing delimiter, the remainder is left untouched.
     */
    private static function stripWithDelimiters($string, $start, $end) {
        // Empty delimiters cannot delimit anything and would never shrink
        // the string.
        if ($start === '' || $end === '') {
            return $string;
        }

        while (($start_pos = strpos($string, $start)) !== false) {
            $end_pos = strpos($string, $end, $start_pos);
            if ($end_pos === false) {
                // Unterminated section: there is no well-defined span to cut.
                break;
            }
            $string = substr_replace($string, '', $start_pos, $end_pos + strlen($end) - $start_pos);
        }

        return $string;
    }

    /**
     * Remove every element that begins with $tag_start, including nested
     * elements of the same tag name, up to its balancing closing tag.
     *
     * @param string $string    HTML to clean.
     * @param string $tag_name  Tag name, e.g. 'div'.
     * @param string $tag_start Exact beginning of the opening tag to look for;
     *                          must itself start with '<' . $tag_name,
     *                          otherwise $string is returned unchanged.
     * @return string Cleaned HTML. If a matched opening tag has no closing
     *                tag after it, the remainder is left untouched.
     */
    private static function stripRecursiveHtmlSection($string, $tag_name, $tag_start) {
        $open_tag = '<'.$tag_name;
        $close_tag = '</'.$tag_name.'>';
        $close_tag_length = strlen($close_tag);

        if (strpos($tag_start, $open_tag) !== 0) {
            return $string;
        }

        while (($section_start = strpos($string, $tag_start)) !== false) {
            // Bounds how many closing tags are tried for a single section.
            $max_recursion = 100;
            $search_offset = $section_start;
            $section_length = 0;

            do {
                $max_recursion--;
                $section_end = strpos($string, $close_tag, $search_offset);
                if ($section_end === false) {
                    // Unbalanced markup: without this exit the section could
                    // come out empty and the outer loop would never end.
                    break 2;
                }
                $search_offset = $section_end + $close_tag_length;
                $section_length = $search_offset - $section_start;
                $section = substr($string, $section_start, $section_length);
                // Keep extending to the next closing tag until the opening
                // and closing tags inside the candidate section balance out.
                $open_tag_count = substr_count($section, $open_tag);
                $close_tag_count = substr_count($section, $close_tag);
            } while ($open_tag_count > $close_tag_count && $max_recursion > 0);

            $string = substr_replace($string, '', $section_start, $section_length);
        }

        return $string;
    }
}