|
@@ -3,97 +3,87 @@ class SexactuBridge extends BridgeAbstract {
|
|
|
|
|
|
const MAINTAINER = 'Riduidel';
|
|
|
const NAME = 'Sexactu';
|
|
|
- const URI = 'https://www.gqmagazine.fr';
|
|
|
+ const AUTHOR = 'Maïa Mazaurette';
|
|
|
+ const DOMAIN = 'http://www.gqmagazine.fr';
|
|
|
const CACHE_TIMEOUT = 7200; // 2h
|
|
|
const DESCRIPTION = 'Sexactu via rss-bridge';
|
|
|
|
|
|
- public function collectData(){
|
|
|
- $find = array(
|
|
|
- 'janvier',
|
|
|
- 'février',
|
|
|
- 'mars',
|
|
|
- 'avril',
|
|
|
- 'mai',
|
|
|
- 'juin',
|
|
|
- 'juillet',
|
|
|
- 'août',
|
|
|
- 'septembre',
|
|
|
- 'novembre',
|
|
|
- 'décembre'
|
|
|
- );
|
|
|
+ const REPLACED_ATTRIBUTES = array(
|
|
|
+ 'href' => 'href',
|
|
|
+ 'src' => 'src',
|
|
|
+ 'data-original' => 'src'
|
|
|
+ );
|
|
|
+
|
|
|
|
|
|
- $replace = array(
|
|
|
- 'January',
|
|
|
- 'February',
|
|
|
- 'March',
|
|
|
- 'April',
|
|
|
- 'May',
|
|
|
- 'June',
|
|
|
- 'July',
|
|
|
- 'August',
|
|
|
- 'September',
|
|
|
- 'October',
|
|
|
- 'November',
|
|
|
- 'December'
|
|
|
- );
|
|
|
+ public function getURI(){
|
|
|
+ return self::DOMAIN . '/sexactu';
|
|
|
+ }
|
|
|
|
|
|
+ public function collectData(){
|
|
|
$html = getSimpleHTMLDOM($this->getURI())
|
|
|
or returnServerError('Could not request ' . $this->getURI());
|
|
|
|
|
|
- foreach($html->find('.content-holder') as $contentHolder){
|
|
|
+ $sexactu = $html->find('.container_sexactu', 0);
|
|
|
+ $rowList = $sexactu->find('.row');
|
|
|
+ foreach($rowList as $row){
|
|
|
// only use first list as second one only contains pages numbers
|
|
|
- $articles = $contentHolder->find('ul', 0);
|
|
|
- foreach($articles->find('li') as $element){
|
|
|
- // if you ask about that method_exists, there seems to be a bug in simple html dom
|
|
|
- // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619
|
|
|
- if(is_object($element)){
|
|
|
- $item = array();
|
|
|
- // various metadata
|
|
|
- $titleBlock = $element->find('.title-holder', 0);
|
|
|
- if(is_object($titleBlock)){
|
|
|
- $titleDetails = $titleBlock->find('.article-title', 0);
|
|
|
- $titleData = $titleDetails->find('h2', 0)->find('a', 0);
|
|
|
- $titleTimestamp = $titleDetails->find('h4', 0);
|
|
|
- $item['title'] = $this->correctCase(trim($titleData->innertext));
|
|
|
- $item['uri'] = self::URI . $titleData->href;
|
|
|
-
|
|
|
- // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension
|
|
|
- $dateText = $titleTimestamp->innertext;
|
|
|
- $dateText = substr($dateText, strpos($dateText, ',') + 1);
|
|
|
- $dateText = str_replace($find, $replace, strtolower($dateText));
|
|
|
- $date = strtotime($dateText);
|
|
|
- $item['timestamp'] = $date;
|
|
|
|
|
|
- $item['author'] = 'Maïa Mazaurette';
|
|
|
- $elementText = $element->find('.text-container', 0);
|
|
|
- // don't forget to replace images server url with gq one
|
|
|
- foreach($elementText->find('img') as $image){
|
|
|
- $image->src = self::URI . $image->src;
|
|
|
- }
|
|
|
- $item['content'] = $elementText->innertext;
|
|
|
- $this->items[] = $item;
|
|
|
- }
|
|
|
+ $title = $row->find('.title', 0);
|
|
|
+ if($title){
|
|
|
+ $item = array();
|
|
|
+ $item['author'] = self::AUTHOR;
|
|
|
+ $item['title'] = $title->plaintext;
|
|
|
+ $urlAttribute = "data-href";
|
|
|
+ $uri = $title->$urlAttribute;
|
|
|
+ if($uri === false)
|
|
|
+ continue;
|
|
|
+ if(substr($uri, 0, 1) === 'h'){ // absolute uri
|
|
|
+ $item['uri'] = $uri;
|
|
|
+ } else if(substr($uri, 0, 1) === '/'){ // domain relative url
|
|
|
+ $item['uri'] = self::DOMAIN . $uri;
|
|
|
+ } else {
|
|
|
+ $item['uri'] = $this->getURI() . $uri;
|
|
|
}
|
|
|
+ $article = $this->loadFullArticle($item['uri']);
|
|
|
+ $item['content'] = $this->replaceUriInHtmlElement($article->find('.article_content', 0));
|
|
|
+
|
|
|
+ $publicationDate = $article->find('time[itemprop=datePublished]', 0);
|
|
|
+ $short_date = $publicationDate->datetime;
|
|
|
+ $item['timestamp'] = date_parse($short_date);
|
|
|
+ } else {
|
|
|
+ // Sometimes we get rubbish, ignore.
|
|
|
+ continue;
|
|
|
}
|
|
|
+ $this->items[] = $item;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- public function getURI(){
|
|
|
- return self::URI . '/sexactu';
|
|
|
- }
|
|
|
+ /**
|
|
|
+ * Loads the full article and returns the contents
|
|
|
+ * @param $uri The article URI
|
|
|
+ * @return The article content
|
|
|
+ */
|
|
|
+ private function loadFullArticle($uri){
|
|
|
+ $html = getSimpleHTMLDOMCached($uri);
|
|
|
|
|
|
- private function correctCase($str){
|
|
|
- $sentences = explode('.', mb_strtolower($str, 'UTF-8'));
|
|
|
- $str = '';
|
|
|
- $sep = '';
|
|
|
- foreach ($sentences as $sentence){
|
|
|
- //upper case first char
|
|
|
- $sentence = ucfirst(trim($sentence));
|
|
|
+ $content = $html->find('#article', 0);
|
|
|
+ if($content){
|
|
|
+ return $content;
|
|
|
+ }
|
|
|
+
|
|
|
+ return null;
|
|
|
+ }
|
|
|
|
|
|
- //append sentence to output
|
|
|
- $str = $str . $sep . $sentence;
|
|
|
- $sep = '. ';
|
|
|
+ /**
|
|
|
+ * Replaces all relative URIs with absolute ones
|
|
|
+ * @param $element A simplehtmldom element
|
|
|
+ * @return The $element->innertext with all URIs replaced
|
|
|
+ */
|
|
|
+ private function replaceUriInHtmlElement($element){
|
|
|
+ $returned = $element->innertext;
|
|
|
+ foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
|
|
|
+ $returned = str_replace($initial.'="/', $final.'="' . self::DOMAIN . '/', $returned);
|
|
|
}
|
|
|
- return $str;
|
|
|
+ return $returned;
|
|
|
}
|
|
|
}
|