This tiny class lets you easily extract data from various websites. It uses the fileOpen class I created to fetch pages, but it can easily be changed to use something else if you need to.
The implementation file shows how easy it is to use: you basically need 3 lines of code to do the extraction and a loop to show the data. It can also remove duplicate results.
Sample Implementation:
<?php
require_once('extractThis.class.php');

/*
 * Each example follows the same three steps: build an extractor for a URL,
 * give it a regular expression, run the extraction with duplicate removal
 * enabled. Capture group N of the expression is then available as
 * $extractor->array{N-1}, and all result arrays share the same keys.
 */

/* Example 1: table of contents (link target + link text) */
$extractor = new extractThis('http://www.fortran.com/F77_std/rjcnf0001.html');
$extractor->setReg('/<li><a href="(.*)" name=".*">(.*)<\/a>/Uism');
$extractor->doExtract(true);

foreach ($extractor->array0 as $key => $href) {
    $title = $extractor->array1[$key];
    echo "Link: {$href} » {$title}<br />";
}

/* Example 2: google search results (result URL + result title) */
$extractor = new extractThis('http://www.google.com/search?q=aziz+saleh');
$extractor->setReg('/<h3><a href="(.*)".*>(.*)<\/a><\/h3>/Uism');
$extractor->doExtract(true);

foreach ($extractor->array0 as $key => $resultUrl) {
    $resultTitle = $extractor->array1[$key];
    echo "<a href=\"{$resultUrl}\">{$resultTitle}</a><br />";
}

/* Example 3: vb forum topics (topic id, topic title, member id, member name) */
$extractor = new extractThis('http://forums.digitalpoint.com/forumdisplay.php?f=37');
$extractor->setReg('/<a href="showthread\.php\?t=(.*)" .*>(.*)<\/a>.*<a href="member\.php\?u=(.*)" .*">(.*)<\/a><\/span>/Uism');
$extractor->doExtract(true);

foreach ($extractor->array0 as $key => $threadId) {
    $threadTitle = $extractor->array1[$key];
    $memberId    = $extractor->array2[$key];
    $memberName  = $extractor->array3[$key];

    echo "<a href=\"http://forums.digitalpoint.com/showthread.php?t={$threadId}\">{$threadTitle}</a>"
        . " Posted By "
        . "<a href=\"http://forums.digitalpoint.com/member.php?u={$memberId}\">{$memberName}</a><br />";
}
Main Class:
<?php
/**
 * Extracts data from a web page using one regular expression.
 *
 * Usage: construct with a URL, call setReg() with a pattern that contains
 * capture groups, then call doExtract(). Afterwards the values of capture
 * group 1 are in $array0, group 2 in $array1, and so on. All result arrays
 * use the same keys (the match index), so $array1[$k] belongs to $array0[$k].
 *
 * The attribute below keeps the arrayN result properties (one per capture
 * group, so their number is not known in advance) from raising a deprecation
 * on PHP 8.2+; older PHP versions read that line as a plain comment.
 */
#[\AllowDynamicProperties]
class extractThis
{
    /* URL to extract from */
    public $extractURL;

    /* Regular expression to find */
    public $extractRegs;

    /* Raw capture groups of the last extraction: [group index][match index] */
    public $extractReturns = array();

    /* Values of the first capture group (further groups: array1, array2, ...) */
    public $array0 = array();

    /**
     * Constructor, just add the URL.
     *
     * @param string $url Address of the page to extract from.
     */
    public function __construct($url)
    {
        $this->extractURL = $url;
    }

    /**
     * Set the regular expression used by doExtract().
     *
     * @param string $expression PCRE pattern including delimiters and flags.
     */
    public function setReg($expression)
    {
        $this->extractRegs = $expression;
    }

    /**
     * Fetch the page, run the expression and fill array0..arrayN.
     *
     * Terminates the script with die() when the page cannot be opened.
     *
     * @param bool $doOption True to drop matches whose first capture group
     *                       was already seen, false to keep every match.
     */
    public function doExtract($doOption)
    {
        /*
         * A plain include() here declared openURL again on every call, which
         * is a fatal error as soon as doExtract() runs twice in one request.
         */
        if (!class_exists('openURL')) {
            include_once('fileOpen.class.php');
        }

        $page = new openURL($this->extractURL);
        $page->doOpen(2);
        if ($page->errorNumber) {
            die("$page->errorNumber ($page->errorMessage)<br />\n");
        }

        /* Start clean so groups of an earlier extraction cannot leak in */
        $this->extractReturns = array();
        $returns = array();
        preg_match_all($this->extractRegs, (string) $page->returnPage(), $returns);

        /* $returns[0] holds the full matches; only the capture groups are kept */
        for ($group = 1; isset($returns[$group]); $group++) {
            $this->extractReturns[$group - 1] = $returns[$group];
        }

        if ($doOption) {
            $this->removeDubs();
        } else {
            $this->doOrganize();
        }
    }

    /**
     * Organize the capture groups into array0..arrayN, keeping every match.
     */
    public function doOrganize()
    {
        $this->resetResults();

        for ($group = 0; $this->hasGroup($group); $group++) {
            $this->{"array$group"} = $this->extractReturns[$group];
        }
    }

    /**
     * Organize the capture groups into array0..arrayN and remove duplicates.
     *
     * A match is a duplicate when its first capture group is identical
     * (strict string comparison) to that of an earlier match. Surviving
     * matches keep their original match index as key.
     */
    public function removeDubs()
    {
        $this->resetResults();

        if (!$this->hasGroup(0)) {
            return;
        }

        foreach ($this->extractReturns[0] as $index => $item) {
            /* Strict: loose comparison would treat "10" and "010" as equal */
            if (in_array($item, $this->array0, true)) {
                continue;
            }

            $this->array0[$index] = $item;
            for ($group = 1; $this->hasGroup($group); $group++) {
                $this->{"array$group"}[$index] = isset($this->extractReturns[$group][$index])
                    ? $this->extractReturns[$group][$index]
                    : NULL;
            }
        }
    }

    /* True when capture group $group exists in the current raw results */
    private function hasGroup($group)
    {
        return isset($this->extractReturns[$group]) && is_array($this->extractReturns[$group]);
    }

    /* Empty array0..arrayN for the current groups and any left from earlier runs */
    private function resetResults()
    {
        $groupCount = count($this->extractReturns);
        for ($group = 0; $group < $groupCount || isset($this->{"array$group"}); $group++) {
            $this->{"array$group"} = array();
        }
    }
}
Helper Class (fileOpen.class.php):
<?php
/**
 * Class for the Open: fetches the content of a URL through one of four
 * transports (fopen, file_get_contents, fsockopen or curl).
 *
 * Usage: construct with the URL, optionally add transport specific options,
 * call doOpen() with the transport number, then read returnPage(). A failed
 * open leaves a non-empty $errorNumber and a description in $errorMessage.
 */
class openURL
{
    /* Page content */
    private $pageContent;

    /* Page URL */
    private $pageURL;

    /* Error number */
    public $errorNumber = NULL;

    /* Error message */
    public $errorMessage = NULL;

    /* Fsockopen Headers array */
    private $fsockopenHeaders = array();

    /* Curl Options */
    private $curlOptions = array();

    /* File Get Contents Array */
    private $fgcOptions = array();

    /* Page Handler */
    private $fileHandler;

    /**
     * Constructor, set page url.
     *
     * @param string $pageURL Address to open.
     */
    public function __construct($pageURL)
    {
        $this->pageURL = $pageURL;
    }

    /**
     * Do the actual opening.
     *
     * @param int $openOption 1 = fopen, 2 = file_get_contents,
     *                        3 = fsockopen, 4 = curl. Any other value is
     *                        reported through errorNumber '000-FPE'.
     */
    public function doOpen($openOption)
    {
        switch ($openOption) {
            /* fopen */
            case 1:
                $this->myfopen();
                break;
            /* file_get_contents */
            case 2:
                $this->myfile_get_contents();
                break;
            /* fsockopen */
            case 3:
                $this->myfoscokopen();
                break;
            /* curl */
            case 4:
                $this->mycurl();
                break;
            /* unknown transport: report it instead of silently doing nothing */
            default:
                $this->errorNumber = '000-FPE';
                $this->errorMessage = 'Unknown open option';
                break;
        }
    }

    /* Add to curl options (CURLOPT_* constant => value) */
    public function addToCurl($curlOption, $curlValue)
    {
        $this->curlOptions[$curlOption] = $curlValue;
    }

    /* Add to headers for fsockopen (one request line, without line ending) */
    public function addToFsockopen($headerText)
    {
        $this->fsockopenHeaders[] = $headerText;
    }

    /* Add to stream context (file_get_contents), e.g. 'method' => 'POST' */
    public function addToFileGetContents($fgcOption, $fgcValue)
    {
        $this->fgcOptions[$fgcOption] = $fgcValue;
    }

    /* Return page content */
    public function returnPage()
    {
        return $this->pageContent;
    }

    /* Function to compose headers for fsockopen: CRLF after each line, blank line at the end */
    private function composefsoHeaders()
    {
        $tempUse = '';
        foreach ($this->fsockopenHeaders as $curHeader) {
            $tempUse .= $curHeader . "\r\n";
        }

        return $tempUse . "\r\n";
    }

    /**
     * Function to compose the stream context options for file_get_contents.
     *
     * Only a string 'header' option gets a trailing CRLF. Appending it to
     * every option turned e.g. 'method' => 'GET' into "GET\r\n" and numeric
     * options such as 'timeout' into strings.
     */
    private function composefgcHeaders()
    {
        $tempUse = array('http' => array());
        foreach ($this->fgcOptions as $fgcOption => $fgcValue) {
            if ($fgcOption === 'header' && is_string($fgcValue)) {
                $fgcValue = rtrim($fgcValue, "\r\n") . "\r\n";
            }
            $tempUse['http'][$fgcOption] = $fgcValue;
        }

        return $tempUse;
    }

    /**
     * Work out what fsockopen() has to connect to.
     *
     * fsockopen() expects a host name, not a URL. http/https URLs are split
     * into host and port, with https going through the ssl:// transport on
     * port 443 by default. Anything else (a bare host name, or a target that
     * already carries a transport prefix) is passed through unchanged on
     * port 80, as before.
     *
     * @return array 'remote' (string), 'port' (int) and 'request': a minimal
     *               GET request for http/https URLs, NULL otherwise.
     */
    private function socketTarget()
    {
        $parts = parse_url($this->pageURL);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';

        if (!isset($parts['host']) || ($scheme !== 'http' && $scheme !== 'https')) {
            return array('remote' => $this->pageURL, 'port' => 80, 'request' => NULL);
        }

        $secure = ($scheme === 'https');

        $hostHeader = $parts['host'];
        if (isset($parts['port'])) {
            $hostHeader .= ':' . $parts['port'];
        }

        $path = isset($parts['path']) ? $parts['path'] : '/';
        if (isset($parts['query'])) {
            $path .= '?' . $parts['query'];
        }

        return array(
            'remote' => ($secure ? 'ssl://' : '') . $parts['host'],
            'port' => isset($parts['port']) ? $parts['port'] : ($secure ? 443 : 80),
            /* HTTP/1.0 + Connection: close so the server ends the stream and feof() becomes true */
            'request' => "GET $path HTTP/1.0\r\nHost: $hostHeader\r\nConnection: close\r\n\r\n",
        );
    }

    /* Actual read contents: replaces the page content with everything readable from the handler */
    private function readContents()
    {
        $this->pageContent = '';
        while (!feof($this->fileHandler)) {
            $chunk = fread($this->fileHandler, 1024);
            /* a failed read never reaches EOF; stop instead of looping forever */
            if ($chunk === false) {
                break;
            }
            $this->pageContent .= $chunk;
        }
    }

    /* fopen function; sets errorNumber '001-FPE' on failure */
    public function myfopen()
    {
        /* use output buffering to record (and hide) printed errors */
        ob_start();
        $this->fileHandler = fopen($this->pageURL, 'r');
        $printed = (string) ob_get_clean();

        if ($this->fileHandler) {
            $this->readContents();
            fclose($this->fileHandler);
        } else {
            /* nothing is printed when display_errors is off, so fall back to a generic text */
            $this->errorMessage = ($printed !== '') ? $printed : 'Unable to open ' . $this->pageURL;
            $this->errorNumber = '001-FPE';
        }
    }

    /* file_get_contents function; sets errorNumber '002-FPE' on failure */
    public function myfile_get_contents()
    {
        $context = stream_context_create($this->composefgcHeaders());

        ob_start();
        $this->pageContent = file_get_contents($this->pageURL, false, $context);
        $this->errorMessage = (string) ob_get_clean();

        /*
         * Printed warnings alone are not a reliable failure signal: with
         * display_errors off nothing is printed. The false return value is.
         */
        if ($this->pageContent === false && $this->errorMessage === '') {
            $this->errorMessage = 'Unable to read ' . $this->pageURL;
        }

        /* check for errors */
        if ($this->errorMessage !== '') {
            $this->errorNumber = '002-FPE';
        }
    }

    /**
     * fsockopen function; sets errorNumber '003-FPE' on failure.
     *
     * Sends the lines added with addToFsockopen(). When none were added and
     * the page URL is an http/https URL, a minimal GET request is sent
     * instead of an empty one. The raw response, including the response
     * headers, becomes the page content.
     */
    public function myfoscokopen()
    {
        $target = $this->socketTarget();
        $request = $this->composefsoHeaders();
        if (count($this->fsockopenHeaders) === 0 && $target['request'] !== NULL) {
            $request = $target['request'];
        }

        /* Suppress errors using ob_start */
        ob_start();
        $this->fileHandler = fsockopen($target['remote'], $target['port'], $this->errorNumber, $this->errorMessage, 30);
        if ($this->fileHandler) {
            fwrite($this->fileHandler, $request);
            ob_end_clean();
            $this->readContents();
            /* the socket was never released before */
            fclose($this->fileHandler);
        } else {
            $this->errorMessage .= ob_get_contents();
            $this->errorNumber = '003-FPE';
            ob_end_clean();
        }
    }

    /**
     * curl function.
     *
     * On return errorNumber holds the curl error code (0 when the transfer
     * succeeded) and errorMessage the matching text ('' on success).
     */
    public function mycurl()
    {
        $this->fileHandler = curl_init();
        curl_setopt($this->fileHandler, CURLOPT_URL, $this->pageURL);
        curl_setopt($this->fileHandler, CURLOPT_RETURNTRANSFER, true);
        foreach ($this->curlOptions as $option => $value) {
            curl_setopt($this->fileHandler, $option, $value);
        }

        $this->pageContent = curl_exec($this->fileHandler);

        /* these two were swapped: the code went into the message and vice versa */
        $this->errorNumber = curl_errno($this->fileHandler);
        $this->errorMessage = curl_error($this->fileHandler);

        /* curl_close() has no effect as of PHP 8.0; only older versions need it */
        if (PHP_VERSION_ID < 80000) {
            curl_close($this->fileHandler);
        }
    }
}