2016-08-10 23:18:35 +02:00
|
|
|
<?php
|
2016-08-28 20:07:56 +02:00
|
|
|
/**
 * Bridge that collects the latest reviews of one company from kununu.com.
 *
 * The site (country edition) and the company are taken from the request
 * input; every review found on the company's review page becomes one entry
 * in $this->items with the keys author, timestamp, title, uri and content.
 *
 * NOTE(review): all guard clauses in this class assume that
 * returnClientError() / returnServerError() abort execution (e.g. by
 * throwing) -- confirm against the base class.
 */
class KununuBridge extends HttpCachingBridgeAbstract {

    /** Scheme and host used to build request URIs and to absolutise links. */
    const BASE_URI = 'https://www.kununu.com';

    public $maintainer = "logmanoriginal";
    public $name = "Kununu Bridge"; /* This will be replaced later! */
    public $uri = self::BASE_URI; /* This will be replaced later! */
    public $description = "Returns the latest reviews for a company and site of your choice.";

    public $parameters = array(
        'global' => array(
            'site' => array(
                'name' => 'Site',
                'type' => 'list',
                'required' => true,
                'exampleValue' => 'United States',
                'title' => 'Select your site',
                'values' => array(
                    'Austria' => 'at',
                    'Germany' => 'de',
                    'Switzerland' => 'ch',
                    'United States' => 'us'
                )
            ),
            'full' => array(
                'name' => 'Load full article',
                'type' => 'checkbox',
                'required' => false,
                'exampleValue' => 'checked',
                'title' => 'Activate to load full article'
            )
        ),
        array(
            'company' => array(
                'name' => 'Company',
                'required' => true,
                'exampleValue' => 'kununu-us',
                'title' => 'Insert company name (i.e. Kununu US) or URI path (i.e. kununu-us)'
            )
        )
    );

    /**
     * Loads the review page for the requested site/company and appends one
     * item per review to $this->items.
     *
     * Side effects: extends $this->uri with the review page path and
     * prefixes $this->name with the company name found on the page.
     */
    public function collectData(){
        // Site code, normalised and checked against the configured list.
        $site = strtolower(trim((string) $this->getInput('site')));
        if(!$this->site_is_valid($site)){
            $this->returnClientError('You must specify a valid site (&site=...)!');
        }

        // Company slug: spaces become dashes and umlauts are transliterated
        // BEFORE lowercasing, so that e.g. "Ä" ends up as "ae" and not "Ae".
        $company = str_replace(' ', '-', trim((string) $this->getInput('company')));
        $company = strtolower($this->encode_umlauts($company));
        if($company === ''){
            $this->returnClientError('You must specify a company (&company=...)!');
        }

        // Checkbox: any non-empty, non-"0" value requests the full article.
        $full = (bool) trim((string) $this->getInput('full'));

        // Name of the reviews section in the URI path (depends on the site).
        $section = $this->get_review_section($site);

        // Update URI for the content; the company is user input, so it is
        // escaped to keep it a single path segment.
        $this->uri .= "/{$site}/" . rawurlencode($company) . "/{$section}";

        // Load page
        $html = $this->getSimpleHTMLDOM($this->uri);
        if(!$html){
            $this->returnServerError('Unable to receive data from ' . $this->uri . '!');
        }

        // Update name for this request
        $this->name = $this->extract_company_name($html) . ' - ' . $this->name;

        // Find the section with all the panels (reviews)
        $panels = $this->find_or_fail(
            $html,
            'section.kununu-scroll-element',
            'Unable to find panel section!'
        );

        // Find all articles (within the panels)
        $articles = $panels->find('article');
        if(empty($articles)){
            $this->returnServerError('Unable to find articles!');
        }

        // Go through all articles
        foreach($articles as $article){
            $item = array();

            $item['author'] = $this->extract_article_author_position($article);
            $item['timestamp'] = $this->extract_article_date($article);
            $item['title'] = $this->extract_article_rating($article)
                . ' : '
                . $this->extract_article_summary($article);
            $item['uri'] = $this->extract_article_uri($article);
            $item['content'] = $full
                ? $this->extract_full_description($item['uri'])
                : $this->extract_article_description($article);

            $this->items[] = $item;
        }
    }

    /**
     * Returns the cache duration: 86400, i.e. one day in seconds.
     */
    public function getCacheDuration(){
        return 86400; // 1 day
    }

    /**
     * Returns true if the given site is one of the values of the 'site'
     * entry in the parameters list.
     */
    private function site_is_valid($site){
        $sites = $this->parameters['global']['site']['values'];

        // Strict comparison: only the exact string codes are accepted.
        return in_array($site, $sites, true);
    }

    /**
     * Returns the URI path segment of the reviews section for a site code.
     *
     * Reports a server error (and returns an empty string) for a site that
     * has no section defined.
     */
    private function get_review_section($site){
        $sections = array(
            'at' => 'kommentare',
            'de' => 'kommentare',
            'ch' => 'kommentare',
            'us' => 'reviews'
        );

        if(!isset($sections[$site])){
            $this->returnServerError('The reviews section is not defined for your selection!');
            return '';
        }

        return $sections[$site];
    }

    /**
     * Returns the first element below $node that matches $selector.
     *
     * Reports $message as a server error when nothing matches. The check is
     * a falsy test because Simple HTML DOM's find() with an index yields
     * null (not false) for "no match".
     */
    private function find_or_fail($node, $selector, $message){
        $match = $node->find($selector, 0);
        if(!$match){
            $this->returnServerError($message);
        }

        return $match;
    }

    /**
     * Fixes relative URLs in the given text.
     *
     * Every href that starts with a single slash is prefixed with BASE_URI.
     * The original quote character is kept and protocol-relative links
     * ("//host/...") are left untouched.
     */
    private function fix_url($text){
        return preg_replace(
            '/href=([\'"])\/(?!\/)/i',
            'href=${1}' . self::BASE_URI . '/',
            $text
        );
    }

    /**
     * Encodes umlauts in the given text (ä -> ae, ö -> oe, ü -> ue,
     * Ä -> Ae, Ö -> Oe, Ü -> Ue, ß -> ss).
     */
    private function encode_umlauts($text){
        return strtr($text, array(
            'ä' => 'ae',
            'ö' => 'oe',
            'ü' => 'ue',
            'Ä' => 'Ae',
            'Ö' => 'Oe',
            'Ü' => 'Ue',
            'ß' => 'ss'
        ));
    }

    /**
     * Returns the company name from the review html: the plain text of the
     * first h1 inside the first div.panel.
     */
    private function extract_company_name($html){
        $panel = $this->find_or_fail($html, 'div.panel', 'Cannot find panel for company name!');
        $company_name = $this->find_or_fail($panel, 'h1', 'Cannot find company name!');

        return $company_name->plaintext;
    }

    /**
     * Returns the date from a given article as the result of strtotime()
     * on the review's time element.
     */
    private function extract_article_date($article){
        // They conveniently provide a time attribute for us :)
        $date = $this->find_or_fail(
            $article,
            'time[itemprop=dtreviewed]',
            'Cannot find article date!'
        );

        return strtotime($date->datetime);
    }

    /**
     * Returns the rating from a given article (the aria-label attribute of
     * the rating span).
     */
    private function extract_article_rating($article){
        $rating = $this->find_or_fail($article, 'span.rating', 'Cannot find article rating!');

        return $rating->getAttribute('aria-label');
    }

    /**
     * Returns the summary from a given article with all markup stripped.
     */
    private function extract_article_summary($article){
        $summary = $this->find_or_fail(
            $article,
            '[itemprop=summary]',
            'Cannot find article summary!'
        );

        return strip_tags($summary->innertext);
    }

    /**
     * Returns the absolute URI of a given article, taken from the first
     * anchor inside the article summary.
     */
    private function extract_article_uri($article){
        $summary = $this->find_or_fail(
            $article,
            '[itemprop=summary]',
            'Cannot find article summary!'
        );
        $anchor = $this->find_or_fail($summary, 'a', 'Cannot find article URI!');

        $href = (string) $anchor->href;

        // Already absolute: nothing to prepend.
        if(preg_match('#^[a-z][a-z0-9+.-]*://#i', $href)){
            return $href;
        }

        // Protocol-relative: only the scheme is missing.
        if(strpos($href, '//') === 0){
            return 'https:' . $href;
        }

        return self::BASE_URI . '/' . ltrim($href, '/');
    }

    /**
     * Returns the position of the author from a given article, or
     * 'Unknown' when the article does not state one.
     */
    private function extract_article_author_position($article){
        // We need to parse the aside manually
        $aside = $this->find_or_fail($article, 'aside', 'Cannot find article author information!');

        // The value is the element right after the h2 whose text contains
        // "position" (this works for at, ch, de, us).
        $author_position = 'Unknown';
        foreach($aside->find('h2') as $subject){
            if(stripos($subject->plaintext, 'position') === false){
                continue;
            }

            $value = $subject->next_sibling();
            if($value){
                $author_position = $value->plaintext;
            }
            break;
        }

        return $author_position;
    }

    /**
     * Returns the description (inner HTML with relative links fixed) from
     * a given article.
     */
    private function extract_article_description($article){
        $description = $this->find_or_fail(
            $article,
            'div[itemprop=description]',
            'Cannot find article description!'
        );

        return $this->fix_url($description->innertext);
    }

    /**
     * Returns the full description from a given uri.
     *
     * The page is read through the cache; an entry whose cached time is 24
     * hours or more in the past is removed first.
     */
    private function extract_full_description($uri){
        // Load full article
        if($this->get_cached_time($uri) <= strtotime('-24 hours')){
            $this->remove_from_cache($uri);
        }

        $html = $this->get_cached($uri);
        if(!$html){
            $this->returnServerError('Could not load full description!');
        }

        // Find the article
        $article = $this->find_or_fail($html, 'article', 'Cannot find article!');

        // Luckily they use the same layout for the review overview and full article pages :)
        return $this->extract_article_description($article);
    }
}
|