I gave a presentation entitled “The SEO’s Guide to Scraping Everything” on May 10th at the SEOmoz and SEER Interactive Meetup in Philadelphia, PA. Since I only had 8 minutes to present, I figured I’d augment my presentation by providing a simple PHP scraper class that people can use (and extend) to get started with scraping.
Update: The fine folks at SEER Interactive were kind enough to post a video of my presentation.
You can download the scraper class here.
There’s a quick sample of how to use the scraper class below my slide deck from the meetup:
The SEO’s Guide to Scraping Everything
View more presentations from eppievojt.
Using the Scraper:
<?php
// Pull in the scraper class and create an instance of it.
require_once 'Scraper.php';

$pageScraper = new Eppie_Service_Scraper();

/*
 * Proxies are optional. To use them, collect the addresses in an array and
 * hand that array to setProxies(). The addresses below are placeholders,
 * not real proxies.
 */
$proxyList = array(
    '123.123.123.123',
    '111.111.111.111',
);
$pageScraper->setProxies($proxyList);

// Fetch and parse the page, then dump everything the scraper collected.
$pageScraper->scrape('http://www.cnn.com');
var_dump($pageScraper);
?>
And here’s the actual scraper class:
/**
 * Minimal on-page scraper: downloads a URL with cURL (optionally through a
 * randomly chosen proxy), parses the HTML into a DOMDocument and extracts the
 * title, meta description/keywords, headings, image alt texts and bold text.
 *
 * The class has no getters; results are read straight from the properties
 * below (e.g. via var_dump). They are declared public because the original
 * implementation created them dynamically, which made them public as well.
 */
class Eppie_Service_Scraper
{
    /** @var array Proxy addresses to choose from; empty means "no proxy". */
    public $_proxies = array();

    /** @var string|null URL passed to the most recent scrape() call. */
    public $_url;

    /** @var mixed Value stored by setKeyword(); not used by the scraper itself. */
    public $_keyword;

    /** @var string|false|null Raw response body, or false when the request failed. */
    public $_curl_result;

    /** @var string|null cURL error message of the last request, null on success. */
    public $_curl_error;

    /** @var DOMDocument|null Parsed document of the most recent scrape. */
    public $_dom;

    /** @var string|null Text of the first //head/title node, null when absent. */
    public $_title;

    /** @var string|null Content of the first description meta tag, null when absent. */
    public $_meta_desc;

    /** @var string|null Content of the first keywords meta tag, null when absent. */
    public $_meta_kw;

    /** @var array Text of every h1 element. */
    public $_h1 = array();

    /** @var array Text of every h2 element. */
    public $_h2 = array();

    /** @var array Text of every h3 element. */
    public $_h3 = array();

    /** @var array Text of every h4 element. */
    public $_h4 = array();

    /** @var array Text of every h5 element. */
    public $_h5 = array();

    /** @var array Text of every h6 element. */
    public $_h6 = array();

    /** @var array Every non-empty img alt attribute value. */
    public $_img_alt = array();

    /** @var int|float Percentage of img elements with a non-empty alt; 0 when the page has no images. */
    public $_img_alt_pct = 0;

    /** @var array Text of every strong and b element. */
    public $_strong = array();

    public function __construct()
    {
        // set proxies -- you can add your own here or use the setProxies method
        $this->_proxies = array();
    }

    /**
     * Downloads $url and extracts the on-page elements into the public
     * properties. Results of a previous scrape are replaced, not appended to.
     *
     * A failed request does not throw: _curl_result is false, _curl_error
     * holds the cURL message and all extracted fields are empty.
     *
     * @param string $url Address of the page to fetch.
     * @return void
     */
    public function scrape($url)
    {
        $this->_url = $url;

        $body = $this->_fetch($url);
        $this->_curl_result = $body;

        $dom = new DOMDocument();
        // loadHTML() rejects empty input (ValueError on PHP 8), so only parse
        // when the request actually returned something.
        if (is_string($body) && $body !== '') {
            // Real-world markup is rarely valid; collect libxml's complaints
            // internally instead of emitting warnings, then discard them.
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($body);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->_dom = $dom;
        $this->_parseDOM();
    }

    /**
     * Sets the proxies scrape() may route requests through.
     *
     * @param array|string $proxies List of proxy addresses; a single address
     *                              given as a string is wrapped into a list.
     * @return void
     */
    public function setProxies($proxies)
    {
        $this->_proxies = (array) $proxies;
    }

    /**
     * Stores a keyword on the instance. Nothing in this class reads it.
     *
     * @param mixed $keyword
     * @return void
     */
    public function setKeyword($keyword)
    {
        $this->_keyword = $keyword;
    }

    /**
     * Performs the HTTP request for scrape().
     *
     * @param string $url
     * @return string|false Response body, or false on failure (the reason is
     *                      stored in _curl_error).
     */
    private function _fetch($url)
    {
        $this->_curl_error = null;

        $ch = curl_init();
        if ($ch === false) {
            $this->_curl_error = 'Unable to initialise cURL';
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, "");
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19");

        $proxy = $this->_pickProxy();
        if ($proxy) {
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
            curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 1);
        }

        $body = curl_exec($ch);
        if ($body === false) {
            // Must be read before the handle is closed.
            $this->_curl_error = curl_error($ch);
        }
        curl_close($ch);

        return $body;
    }

    /**
     * Picks one of the configured proxies at random.
     *
     * @return mixed The chosen proxy, or false when none are configured.
     */
    private function _pickProxy()
    {
        if (count($this->_proxies) === 0) {
            return false;
        }

        // array_rand() yields an existing key, so this also works when the
        // list is associative or has gaps in its numeric keys.
        return $this->_proxies[array_rand($this->_proxies)];
    }

    /**
     * Fills every result property from $this->_dom. Each property is assigned
     * unconditionally so nothing from an earlier page survives.
     *
     * @return void
     */
    private function _parseDOM()
    {
        $xpath = new DOMXPath($this->_dom);

        $this->_title     = $this->_firstNodeValue($xpath->query("//head/title"));
        $this->_meta_desc = $this->_firstNodeValue($xpath->query("//head/meta[@name='description']/@content"));
        $this->_meta_kw   = $this->_firstNodeValue($xpath->query("//head/meta[@name='keywords']/@content"));

        foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $tag) {
            $this->{'_' . $tag} = $this->_nodeValues($xpath->query('//' . $tag));
        }

        $this->_img_alt = $this->_nodeValues($xpath->query("//img[@alt!='']/@alt"));
        $this->_strong  = $this->_nodeValues($xpath->query("//strong | //b"));

        // Share of images carrying a non-empty alt text. A page without any
        // images would divide by zero, so it is reported as 0.
        $imgCount = count($this->_nodeValues($xpath->query("//img")));
        $this->_img_alt_pct = $imgCount > 0
            ? (count($this->_img_alt) / $imgCount) * 100
            : 0;
    }

    /**
     * Returns the nodeValue of the first node in a query result.
     *
     * @param DOMNodeList|false $nodes Result of DOMXPath::query().
     * @return string|null Null when the result is empty or the query failed.
     */
    private function _firstNodeValue($nodes)
    {
        if ($nodes instanceof DOMNodeList && $nodes->length > 0) {
            return $nodes->item(0)->nodeValue;
        }

        return null;
    }

    /**
     * Returns the nodeValue of every node in a query result, in result order.
     *
     * @param DOMNodeList|false $nodes Result of DOMXPath::query().
     * @return array Empty when the result is empty or the query failed.
     */
    private function _nodeValues($nodes)
    {
        $values = array();
        if ($nodes instanceof DOMNodeList) {
            foreach ($nodes as $node) {
                $values[] = $node->nodeValue;
            }
        }

        return $values;
    }
}
Pingback: The SEO’s Guide to Scraping Everything – Eppie Vojt | Inbound.org