From 0e27a6ebabf7c2d39ea0849bad5b8adf573f4297 Mon Sep 17 00:00:00 2001 From: ORelio Date: Tue, 19 Jul 2016 19:35:43 +0200 Subject: [PATCH 1/4] [Numerama] Coding style: reindent code --- bridges/NumeramaBridge.php | 88 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 4099178..f5b24ea 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -1,60 +1,60 @@ maintainer = "mitsukarenai"; - $this->name = "Numerama"; - $this->uri = "http://www.numerama.com/"; - $this->description = "Returns the 5 newest posts from Numerama (full text)"; - $this->update = "2015-10-12"; - - } - - public function collectData(array $param){ - - function NumeramaStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } - - function NumeramaExtractContent($url) - { - $html2 = file_get_html($url); - $text = $html2->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block - $text = ''; // add post picture - $text = $text.$html2->find('article[class=post-content]', 0)->innertext; // extract the post - return $text; - } - - $html = $this->file_get_html('http://www.numerama.com/feed/') or $this->returnError('Could not request Numerama.', 404); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = new \Item(); - $item->title = html_entity_decode(NumeramaStripCDATA($element->find('title', 0)->innertext)); - $item->author = NumeramaStripCDATA($element->find('dc:creator', 0)->innertext); - $item->uri = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = NumeramaExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } + $this->maintainer = 'mitsukarenai'; + $this->name = 'Numerama'; + $this->uri = 'http://www.numerama.com/'; + $this->description = 'Returns the 5 newest posts from Numerama (full text)'; + $this->update = '2015-10-12'; } - public function getName(){ + public function collectData(array $param) { + + function NumeramaStripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } + + function NumeramaExtractContent($url) + { + $html2 = file_get_html($url); + $text = $html2->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block + $text = ''; // add post picture + $text = $text.$html2->find('article[class=post-content]', 0)->innertext; // extract the post + return $text; + } + + $html = $this->file_get_html('http://www.numerama.com/feed/') or $this->returnError('Could not request Numerama.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 5) { + $item = new \Item(); + $item->title = html_entity_decode(NumeramaStripCDATA($element->find('title', 0)->innertext)); + $item->author = NumeramaStripCDATA($element->find('dc:creator', 0)->innertext); + $item->uri = NumeramaStripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = NumeramaExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName() { return 'Numerama'; } - public function getURI(){ + public function getURI() { return 'http://www.numerama.com/'; } - public function getCacheDuration(){ + public function getCacheDuration() { return 1800; // 30min } } From 2f3bddb2915012de132a1b512dcf16105bff8180 Mon Sep 17 00:00:00 2001 From: ORelio Date: Tue, 19 Jul 2016 19:37:06 +0200 Subject: [PATCH 2/4] [Numerama] Use proxied file_get_html --- bridges/NumeramaBridge.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index f5b24ea..e1829c3 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -7,7 +7,7 @@ class NumeramaBridge extends BridgeAbstract{ $this->name = 'Numerama'; $this->uri = 'http://www.numerama.com/'; $this->description = 'Returns the 5 newest posts from Numerama (full text)'; - $this->update = '2015-10-12'; + $this->update = '2016-07-19'; } @@ -19,9 +19,9 @@ class NumeramaBridge extends BridgeAbstract{ return $string; } - function NumeramaExtractContent($url) + function NumeramaExtractContent($bridge, $url) { - $html2 = file_get_html($url); + $html2 = $bridge->file_get_html($url); $text = $html2->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block $text = ''; // add post picture $text = $text.$html2->find('article[class=post-content]', 0)->innertext; // extract the post @@ -38,7 +38,7 @@ class NumeramaBridge extends BridgeAbstract{ $item->author = NumeramaStripCDATA($element->find('dc:creator', 0)->innertext); $item->uri = NumeramaStripCDATA($element->find('guid', 0)->plaintext); $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = NumeramaExtractContent($item->uri); + $item->content = NumeramaExtractContent($this, $item->uri); $this->items[] = $item; $limit++; } From d5eb53ecd21784489a202e57ca786e8050443d20 Mon Sep 17 00:00:00 2001 From: ORelio Date: Tue, 19 Jul 2016 19:37:33 +0200 Subject: [PATCH 3/4] [WeLiveSecurity] New bridge, Security News. --- bridges/WeLiveSecurityBridge.php | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 bridges/WeLiveSecurityBridge.php diff --git a/bridges/WeLiveSecurityBridge.php b/bridges/WeLiveSecurityBridge.php new file mode 100644 index 0000000..cdb04bd --- /dev/null +++ b/bridges/WeLiveSecurityBridge.php @@ -0,0 +1,75 @@ +maintainer = 'ORelio'; + $this->name = $this->getName(); + $this->uri = $this->getURI(); + $this->description = 'Returns the newest articles.'; + $this->update = '2016-07-19'; + } + + public function collectData(array $param) { + + function ExtractFromDelimiters($string, $start, $end) { + if (strpos($string, $start) !== false) { + $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); + $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); + return $section_retrieved; + } return false; + } + + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + $feed = $this->getURI().'feed/'; + $html = $this->file_get_html($feed) or $this->returnError('Could not request '.$this->getName().': '.$feed, 500); + $limit = 0; + + foreach ($html->find('item') as $element) { + if ($limit < 5) { + + $article_image = $element->find('image', 0)->plaintext; + $article_url = ExtractFromDelimiters($element->innertext, '', ''); + $article_summary = ExtractFromDelimiters($element->innertext, '', '

'); + $article_html = file_get_contents($article_url) or $this->returnError('Could not request '.$this->getName().': '.$article_url, 500); + if (substr($article_html, 0, 2) == "\x1f\x8b") //http://www.gzip.org/zlib/rfc-gzip.html#header-trailer -> GZip ID1 + $article_html = gzdecode($article_html); //Response is GZipped even if we didn't accept GZip!? Let's decompress... + $article_html = str_get_html($article_html); //Now we have our HTML data. But still, that's an important HTTP violation... + $article_content = $article_html->find('div.wlistingsingletext', 0)->innertext; + $article_content = StripWithDelimiters($article_content, ''); + $article_content = '

' + .'

'.$article_summary.'

' + .trim($article_content); + + $item = new \Item(); + $item->uri = $article_url; + $item->thumbnailUri = $article_image; + $item->title = $element->find('title', 0)->plaintext; + $item->author = $article_html->find('a[rel=author]', 0)->plaintext; + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = $article_content; + $this->items[] = $item; + $limit++; + + } + } + } + + public function getName() { + return 'We Live Security'; + } + + public function getURI() { + return 'http://www.welivesecurity.com/'; + } + + public function getCacheDuration() { + return 3600; //1 hour + } +} \ No newline at end of file From c6190514c46cad68cac6bdf26ec9befbaecd9d97 Mon Sep 17 00:00:00 2001 From: ORelio Date: Tue, 19 Jul 2016 19:38:42 +0200 Subject: [PATCH 4/4] [ZDNet] New bridge, Tech News. --- bridges/ZDNetBridge.php | 308 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100644 bridges/ZDNetBridge.php diff --git a/bridges/ZDNetBridge.php b/bridges/ZDNetBridge.php new file mode 100644 index 0000000..4267b98 --- /dev/null +++ b/bridges/ZDNetBridge.php @@ -0,0 +1,308 @@ +maintainer = 'ORelio'; + $this->name = $this->getName(); + $this->uri = $this->getURI(); + $this->description = 'Returns the newest articles.'; + $this->update = '2016-07-18'; + + $this->parameters[] = + // http://www.zdnet.com/zdnet.opml + '[ + { + "name" : "Feed", + "type" : "list", + "identifier" : "feed", + "values" : + [ + { "name" : "---- Select ----", "value" : "" }, + + { "name" : "", "value" : "" }, + { "name" : "Subscribe to ZDNet RSS Feeds", "value" : "" }, + + { "name" : "    All Blogs", "value" : "blog" }, + { "name" : "    Just News", "value" : "news" }, + { "name" : "    All Reviews", "value" : "topic/reviews" }, + { "name" : "    Latest Downloads", "value" : "downloads!recent" }, + { "name" : "    Latest Articles", "value" : "/" }, + { "name" : "    Latest Australia Articles", "value" : "au" }, + { "name" : "    Latest UK Articles", "value" : "uk" }, + { "name" : "    Latest US Articles", "value" : "us" }, + { "name" : "    Latest Asia Articles", "value" : "as" }, + + { "name" : "", "value" : "" }, + { "name" : "Keep up with ZDNet Blogs RSS:", "value" : "" }, + + { "name" : "    Transforming the Datacenter", "value" : "blog/transforming-datacenter" }, + { "name" : "    SMB India", "value" : "blog/smb-india" }, + { "name" : "    Indonesia BizTech", "value" : "blog/indonesia-biztech" }, + { "name" : "    Hong Kong Techie", "value" : "blog/hong-kong-techie" }, + { "name" : "    Tech Taiwan", "value" : "blog/tech-taiwan" }, + { "name" : "    Startup India", "value" : "blog/startup-india" }, + { "name" : "    Starting Up Asia", "value" : "blog/starting-up-asia" }, + { "name" : "    Next-Gen Partner", "value" : "blog/partner" }, + { "name" : "    Post-PC Developments", "value" : "blog/post-pc" }, + { "name" : "    Benelux", "value" : "blog/benelux" }, + { "name" : "    Heat Sink", "value" : "blog/heat-sink" }, + { "name" : "    Italy's got tech", "value" : "blog/italy" }, + { "name" : "    African Enterprise", "value" : "blog/african-enterprise" }, + { "name" : "    New Tech for Old India", "value" : "blog/new-india" }, + { "name" : "    Estonia Uncovered", "value" : "blog/estonia" }, + { "name" : "    IT Iberia", "value" : "blog/iberia" }, + { "name" : "    Brazil Tech", "value" : "blog/brazil" }, + { "name" : "    500 words into the future", "value" : "blog/500-words-into-the-future" }, + { "name" : "    ÜberTech", "value" : "blog/ubertech" }, + { "name" : "    All About Microsoft", "value" : "blog/microsoft" }, + { "name" : "    Back office", "value" : "blog/back-office" }, + { "name" : "    Barker Bites Back", "value" : "blog/barker-bites-back" }, + { "name" : "    Between the Lines", "value" : "blog/btl" }, + { "name" : "    Big on Data", "value" : "blog/big-data" }, + { "name" : "    bootstrappr", "value" : "blog/bootstrappr" }, + { "name" : "    By The Way", "value" : "blog/by-the-way" }, + { "name" : "    Central European Processing", "value" : "blog/central-europe" }, + { "name" : "    Cloud Builders", "value" : "blog/cloud-builders" }, + { "name" : "    Communication Breakdown", "value" : "blog/communication-breakdown" }, + { "name" : "    Collaboration 2.0", "value" : "blog/collaboration" }, + { "name" : "    Constellation Research", "value" : "blog/constellation" }, + { "name" : "    Consumerization: BYOD", "value" : "blog/consumerization" }, + { "name" : "    DIY-IT", "value" : "blog/diy-it" }, + { "name" : "    Enterprise Web 2.0", "value" : "blog/hinchcliffe" }, + { "name" : "    Five Nines: The Next Gen Datacenter", "value" : "blog/datacenter" }, + { "name" : "    Forrester Research", "value" : "blog/forrester" }, + { "name" : "    Full Duplex", "value" : "blog/full-duplex" }, + { "name" : "    Gen Why?", "value" : "blog/gen-why" }, + { "name" : "    Hardware 2.0", "value" : "blog/hardware" }, + { "name" : "    Identity Matters", "value" : "blog/identity" }, + { "name" : "    iGeneration", "value" : "blog/igeneration" }, + { "name" : "    Internet of Everything", "value" : "blog/cisco" }, + { "name" : "    Beyond IT Failure", "value" : "blog/projectfailures" }, + { "name" : "    Jamie's Mostly Linux Stuff", "value" : "blog/jamies-mostly-linux-stuff" }, + { "name" : "    Jack's Blog", "value" : "blog/jacks-blog" }, + { "name" : "    Laptops & Desktops", "value" : "blog/computers" }, + { "name" : "    Linux and Open Source", "value" : "blog/open-source" }, + { "name" : "    London Calling", "value" : "blog/london" }, + { "name" : "    Mapping Babel", "value" : "blog/mapping-babel" }, + { "name" : "    Mixed Signals", "value" : "blog/mixed-signals" }, + { "name" : "    Mobile India", "value" : "blog/mobile-india" }, + { "name" : "    Mobile News", "value" : "blog/mobile-news" }, + { "name" : "    Networking", "value" : "blog/networking" }, + { "name" : "    Norse Code", "value" : "blog/norse-code" }, + { "name" : "    Null Pointer", "value" : "blog/null-pointer" }, + { "name" : "    The Full Tilt", "value" : "blog/the-full-tilt" }, + { "name" : "    Pinoy Post", "value" : "blog/pinoy-post" }, + { "name" : "    Practically Tech", "value" : "blog/practically-tech" }, + { "name" : "    Product Central", "value" : "blog/product-central" }, + { "name" : "    Pulp Tech", "value" : "blog/violetblue" }, + { "name" : "    Qubits and Pieces", "value" : "blog/qubits-and-pieces" }, + { "name" : "    Securify This!", "value" : "blog/securify-this" }, + { "name" : "    Service Oriented", "value" : "blog/service-oriented" }, + { "name" : "    Small Talk", "value" : "blog/small-talk" }, + { "name" : "    Small Business Matters", "value" : "blog/small-business-matters" }, + { "name" : "    Smartphones and Cell Phones", "value" : "blog/cell-phones" }, + { "name" : "    Social Business", "value" : "blog/feeds" }, + { "name" : "    Social CRM: The Conversation", "value" : "blog/crm" }, + { "name" : "    Software & Services Safari", "value" : "blog/sommer" }, + { "name" : "    Storage Bits", "value" : "blog/storage" }, + { "name" : "    Stacking up Open Clouds", "value" : "blog/apac-redhat" }, + { "name" : "    Techie Isles", "value" : "blog/techie-isles" }, + { "name" : "    Technolatte", "value" : "blog/technolatte" }, + { "name" : "    Tech Podium", "value" : "blog/tech-podium" }, + { "name" : "    Tel Aviv Tech", "value" : "blog/tel-aviv" }, + { "name" : "    Tech Broiler", "value" : "blog/perlow" }, + { "name" : "    The SANMAN", "value" : "blog/the-sanman" }, + { "name" : "    The open source revolution", "value" : "blog/the-open-source-revolution" }, + { "name" : "    The German View", "value" : "blog/german" }, + { "name" : "    The Ed Bott Report", "value" : "blog/bott" }, + { "name" : "    The Mobile Gadgeteer", "value" : "blog/mobile-gadgeteer" }, + { "name" : "    The Apple Core", "value" : "blog/apple" }, + { "name" : "    Tom Foremski: IMHO", "value" : "blog/foremski" }, + { "name" : "    Twisted Wire", "value" : "blog/twisted-wire" }, + { "name" : "    Vive la tech", "value" : "blog/france" }, + { "name" : "    Virtually Speaking", "value" : "blog/virtualization" }, + { "name" : "    View from China", "value" : "blog/china" }, + { "name" : "    Web design & Free Software", "value" : "blog/web-design-and-free-software" }, + { "name" : "    ZDNet Government", "value" : "blog/government" }, + { "name" : "    ZDNet UK Book Reviews", "value" : "blog/zdnet-uk-book-reviews" }, + { "name" : "    ZDNet UK First Take", "value" : "blog/zdnet-uk-first-take" }, + { "name" : "    Zero Day", "value" : "blog/security" }, + + { "name" : "", "value" : "" }, + { "name" : "ZDNet Hot Topics RSS:", "value" : "" }, + + { "name" : "    Apple", "value" : "topic/apple" }, + { "name" : "    Collaboration", "value" : "topic/collaboration" }, + { "name" : "    Enterprise Software", "value" : "topic/enterprise-software" }, + { "name" : "    Google", "value" : "topic/google" }, + { "name" : "    Great debate", "value" : "topic/great-debate" }, + { "name" : "    Hardware", "value" : "topic/hardware" }, + { "name" : "    IBM", "value" : "topic/ibm" }, + { "name" : "    iOS", "value" : "topic/ios" }, + { "name" : "    iPhone", "value" : "topic/iphone" }, + { "name" : "    iPad", "value" : "topic/ipad" }, + { "name" : "    IT Priorities", "value" : "topic/it-priorities" }, + { "name" : "    Laptops", "value" : "topic/laptops" }, + { "name" : "    Legal", "value" : "topic/legal" }, + { "name" : "    Linux", "value" : "topic/linux" }, + { "name" : "    Microsoft", "value" : "topic/microsoft" }, + { "name" : "    Mobile OS", "value" : "topic/mobile-os" }, + { "name" : "    Mobility", "value" : "topic/mobility" }, + { "name" : "    Networking", "value" : "topic/networking" }, + { "name" : "    Oracle", "value" : "topic/oracle" }, + { "name" : "    Processors", "value" : "topic/processors" }, + { "name" : "    Samsung", "value" : "topic/samsung" }, + { "name" : "    Security", "value" : "topic/security" }, + { "name" : "    Small business: going big on mobility", "value" : "topic/small-business-going-big-on-mobility" }, + + { "name" : "", "value" : "" }, + { "name" : "Product Blogs:", "value" : "" }, + + { "name" : "    Digital Cameras & Camcorders", "value" : "blog/digitalcameras" }, + { "name" : "    Home Theater", "value" : "blog/home-theater" }, + { "name" : "    Laptops and Desktops", "value" : "blog/computers" }, + { "name" : "    The Mobile Gadgeteer", "value" : "blog/mobile-gadgeteer" }, + { "name" : "    Smartphones and Cell Phones", "value" : "blog/cell-phones" }, + { "name" : "    The ToyBox", "value" : "blog/gadgetreviews" }, + + { "name" : "", "value" : "" }, + { "name" : "Vertical Blogs:", "value" : "" }, + + { "name" : "    ZDNet Education", "value" : "blog/education" }, + { "name" : "    ZDNet Healthcare", "value" : "blog/healthcare" }, + { "name" : "    ZDNet Government", "value" : "blog/government" } + ] + } + ]'; + + } + + public function collectData(array $param) { + + function StripCDATA($string) { + $string = str_replace('', '', $string); + return trim($string); + } + + function ExtractFromDelimiters($string, $start, $end) { + if (strpos($string, $start) !== false) { + $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); + $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); + return $section_retrieved; + } return false; + } + + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { + $open_tag = '<'.$tag_name; + $close_tag = ''; + $close_tag_length = strlen($close_tag); + if (strpos($tag_start, $open_tag) === 0) { + while (strpos($string, $tag_start) !== false) { + $max_recursion = 100; + $section_to_remove = null; + $section_start = strpos($string, $tag_start); + $search_offset = $section_start; + do { + $max_recursion--; + $section_end = strpos($string, $close_tag, $search_offset); + $search_offset = $section_end + $close_tag_length; + $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); + $open_tag_count = substr_count($section_to_remove, $open_tag); + $close_tag_count = substr_count($section_to_remove, $close_tag); + } while ($open_tag_count > $close_tag_count && $max_recursion > 0); + $string = str_replace($section_to_remove, '', $string); + } + } + return $string; + } + + $baseUri = $this->getURI(); + $feed = $param['feed']; + if (empty($feed)) + $this->returnError('Please select a feed to display.', 400); + if (strpos($feed, 'downloads!') !== false) { + $feed = str_replace('downloads!', '', $feed); + $baseUri = str_replace('www.', 'downloads.', $baseUri); + } + if ($feed !== preg_replace('/[^a-zA-Z0-9-\/]+/', '', $feed) || substr_count($feed, '/') > 1 || strlen($feed > 64)) + $this->returnError('Invalid "feed" parameter.', 400); + $url = $baseUri.trim($feed, '/').'/rss.xml'; + $html = $this->file_get_html($url) or $this->returnError('Could not request ZDNet: '.$url, 500); + $limit = 0; + + foreach ($html->find('item') as $element) { + if ($limit < 10) { + $article_url = preg_replace('/([^#]+)#ftag=.*/', '$1', StripCDATA(ExtractFromDelimiters($element->innertext, '', ''))); + $article_author = StripCDATA(ExtractFromDelimiters($element->innertext, 'role="author">', '<')); + $article_title = StripCDATA($element->find('title', 0)->plaintext); + $article_subtitle = StripCDATA($element->find('description', 0)->plaintext); + $article_timestamp = strtotime(StripCDATA($element->find('pubDate', 0)->plaintext)); + $article = $this->file_get_html($article_url) or $this->returnError('Could not request ZDNet: '.$article_url, 500); + + if (!empty($article_author)) + $author = $article_author; + else { + $author = $article->find('meta[name=author]', 0); + if (is_object($author)) + $author = $author->content; + else $author = 'ZDNet'; + } + + $thumbnail = $article->find('meta[itemprop=image]', 0); + if (is_object($thumbnail)) + $thumbnail = $thumbnail->content; + else $thumbnail = 'http://zdnet1.cbsistatic.com/fly/bundles/zdnetcss/images/logos/logo-192x192.png'; + + $contents = $article->find('article', 0)->innertext; + foreach (array( + '
'); + $contents = StripWithDelimiters($contents, ''); + $contents = StripWithDelimiters($contents, '