1
0
Fork 0
forked from blallo/rss-bridge

[AmazonPriceTrackerBridge] Improve Amazon scraper logic (#761)

- Now works on all websites, and even with products
  with multiple prices
- Closes #750
This commit is contained in:
Nemo 2018-08-01 01:14:37 +05:30 committed by LogMANOriginal
parent 53bdfa3bf0
commit 8f9a385b4d

View file

@ -92,6 +92,14 @@ class AmazonPriceTrackerBridge extends BridgeAbstract {
} }
} }
/**
 * Returns the first key of the JSON object stored in an image attribute.
 *
 * The raw attribute value is HTML-entity-decoded and then parsed as JSON
 * (objects become associative arrays). The first key of the decoded array
 * is returned; presumably the keys are image URLs, since the caller uses
 * the result as an image source.
 *
 * @param string|false|null $attribute Raw attribute value; a missing
 *                                     attribute is treated as an empty string
 * @return string|int|null First key of the decoded JSON, or null when the
 *                         value is not valid JSON, is not an object/array,
 *                         or is empty
 */
private function parseDynamicImage($attribute) {
    $json = json_decode(html_entity_decode((string)$attribute), true);

    // json_decode() yields a scalar for input such as "5" or "true";
    // calling count() on that is an error, so require a non-empty array.
    if (!is_array($json) || count($json) === 0) {
        return null;
    }

    return array_keys($json)[0];
}
/** /**
* Returns a generated image tag for the product * Returns a generated image tag for the product
*/ */
@ -99,11 +107,15 @@ class AmazonPriceTrackerBridge extends BridgeAbstract {
$imageSrc = $html->find('#main-image-container img', 0); $imageSrc = $html->find('#main-image-container img', 0);
if ($imageSrc) { if ($imageSrc) {
$imageSrc = $imageSrc ? $imageSrc->getAttribute('data-old-hires') : ''; $hiresImage = $imageSrc->getAttribute('data-old-hires');
return <<<EOT $dynamicImageAttribute = $imageSrc->getAttribute('data-a-dynamic-image');
<img width="300" style="max-width:300;max-height:300" src="$imageSrc" alt="{$this->title}" /> $image = $hiresImage ?: $this->parseDynamicImage($dynamicImageAttribute);
EOT;
} }
$image = $image ?: 'https://placekitten.com/200/300';
return <<<EOT
<img width="300" style="max-width:300;max-height:300" src="$image" alt="{$this->title}" />
EOT;
} }
/** /**
@ -116,6 +128,39 @@ EOT;
return getSimpleHTMLDOM($uri) ?: returnServerError('Could not request Amazon.'); return getSimpleHTMLDOM($uri) ?: returnServerError('Could not request Amazon.');
} }
/**
 * Reads price data from the hidden `#cerberus-data-metrics` element.
 *
 * Example of the element this method looks for:
 *
 *   <div id="cerberus-data-metrics" style="display: none;"
 *     data-asin="B00WTHJ5SU" data-asin-price="14.99" data-asin-shipping="0"
 *     data-asin-currency-code="USD" data-substitute-count="-1" ... />
 *
 * @param object $html Parsed page; must provide find($selector, $index)
 * @return array|false Array with 'price', 'currency' and 'shipping' taken
 *                     from the element's data attributes, or false when the
 *                     element is absent or carries no price
 */
private function scrapePriceFromMetrics($html) {
    $asinData = $html->find('#cerberus-data-metrics', 0);

    if (!$asinData) {
        return false;
    }

    $price = $asinData->getAttribute('data-asin-price');

    // An element without a usable price must not be reported as a result,
    // otherwise the caller's `?:` fallback to another scraper never runs.
    if (trim((string)$price) === '') {
        return false;
    }

    return [
        'price' => $price,
        'currency' => $asinData->getAttribute('data-asin-currency-code'),
        'shipping' => $asinData->getAttribute('data-asin-shipping')
    ];
}
/**
 * Extracts a price from the visible price element of the page.
 *
 * Looks for `span.offer-price` first and `.a-color-price` second, then
 * expects the element text to be a currency marker (a three-letter
 * upper-case code, `£` or `$`) followed by an amount made of digits,
 * dots and commas.
 *
 * @param object $html Parsed page; must provide find($selector, $index)
 * @return array|false Array with 'price', 'currency' and 'shipping'
 *                     (shipping is always '0' here), or false when no price
 *                     element exists or its text does not match
 */
private function scrapePriceGeneric($html) {
    $priceDiv = $html->find('span.offer-price', 0) ?: $html->find('.a-color-price', 0);

    // Neither selector matched: there is nothing to read the text from.
    if (!$priceDiv) {
        return false;
    }

    $matched = preg_match('/^\s*([A-Z]{3}|£|\$)\s?([\d.,]+)\s*$/', $priceDiv->plaintext, $matches);

    // Both capture groups are mandatory, so a successful match always
    // provides the currency in group 1 and the amount in group 2.
    if ($matched !== 1) {
        return false;
    }

    return [
        'price' => $matches[2],
        'currency' => $matches[1],
        'shipping' => '0'
    ];
}
/** /**
* Scrape method for Amazon product page * Scrape method for Amazon product page
* @return [type] [description] * @return [type] [description]
@ -125,23 +170,16 @@ EOT;
$this->title = $this->getTitle($html); $this->title = $this->getTitle($html);
$imageTag = $this->getImage($html); $imageTag = $this->getImage($html);
$asinData = $html->find('#cerberus-data-metrics', 0); $data = $this->scrapePriceFromMetrics($html) ?: $this->scrapePriceGeneric($html);
// <div id="cerberus-data-metrics" style="display: none;"
// data-asin="B00WTHJ5SU" data-asin-price="14.99" data-asin-shipping="0"
// data-asin-currency-code="USD" data-substitute-count="-1" ... />
$currency = $asinData->getAttribute('data-asin-currency-code');
$shipping = $asinData->getAttribute('data-asin-shipping');
$price = $asinData->getAttribute('data-asin-price');
$item = array( $item = array(
'title' => $this->title, 'title' => $this->title,
'uri' => $this->getURI(), 'uri' => $this->getURI(),
'content' => "$imageTag<br/>Price: $price $currency", 'content' => "$imageTag<br/>Price: {$data['price']} {$data['currency']}",
); );
if ($shipping !== '0') { if ($data['shipping'] !== '0') {
$item['content'] .= "<br>Shipping: $shipping $currency</br>"; $item['content'] .= "<br>Shipping: {$data['shipping']} {$data['currency']}</br>";
} }
$this->items[] = $item; $this->items[] = $item;