From b37877bc746fc2bd793585600413692cf0412faa Mon Sep 17 00:00:00 2001 From: prysme01 Date: Mon, 1 Aug 2016 15:18:32 +0200 Subject: [PATCH 1/2] very basic support of ArsTechnica --- bridges/ArstechnicaBridge.php | 81 +++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 bridges/ArstechnicaBridge.php diff --git a/bridges/ArstechnicaBridge.php b/bridges/ArstechnicaBridge.php new file mode 100644 index 0000000..80c8086 --- /dev/null +++ b/bridges/ArstechnicaBridge.php @@ -0,0 +1,81 @@ +maintainer = "prysme"; + $this->name = "ArstechnicaBridge"; + $this->uri = "http://arstechnica.com"; + $this->description = "The PC enthusiast's resource. Power users and the tools they love, without computing religion"; + $this->update = "01/08/2016"; + + } + + public function collectData(array $param) { + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + function StripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } + + function ExtractContent($url) { + #echo $url; + $html2 = file_get_html($url); + + $text = $html2->find("section[id='article-guts']", 0); + $text = StripWithDelimiters($text->innertext,''); + $text = StripWithDelimiters($text,'
','
'); + $text = StripWithDelimiters($text,''); + $text = StripWithDelimiters($text,'
','
'); + $text = StripWithDelimiters($text,'
'); + $text = StripWithDelimiters($text,'
  • '); + //$text = strip_tags($text->innertext, '

    '); + #print_r("ICI"); + #print_r($text); + #print_r("FIN"); + return $text; + } + + $html = $this->file_get_html('http://feeds.arstechnica.com/arstechnica/index') or $this->returnError('Could not request NextInpact.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 5) { + $item = new \Item(); + $item->title = StripCDATA($element->find('title', 0)->innertext); + $item->uri = StripCDATA($element->find('guid', 0)->plaintext); + $item->thumbnailUri = StripCDATA($element->find('enclosure', 0)->url); + $item->author = StripCDATA($element->find('author', 0)->innertext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = ExtractContent($item->uri); + //$item->content = $item->uri; + $this->items[] = $item; + $limit++; + } + } + +} + + + public function getName() { + return 'ArsTechnica'; + } + + public function getCacheDuration() { + return 0; // 2h + } + + public function getURI() { + return "http://arstechnica.com"; + } + +} From e3cf486ac56452efc989d475ab299499e94ac80c Mon Sep 17 00:00:00 2001 From: prysme01 Date: Mon, 1 Aug 2016 16:16:18 +0200 Subject: [PATCH 2/2] Better tag handling --- bridges/ArstechnicaBridge.php | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bridges/ArstechnicaBridge.php b/bridges/ArstechnicaBridge.php index 80c8086..f9a7398 100644 --- a/bridges/ArstechnicaBridge.php +++ b/bridges/ArstechnicaBridge.php @@ -32,16 +32,14 @@ class ArstechnicaBridge extends BridgeAbstract { $html2 = file_get_html($url); $text = $html2->find("section[id='article-guts']", 0); + /*foreach ($text->find('