This tiny class allows you to easily extract data from various websites. It uses the fileOpen class I created, but that can easily be changed if you need to.
The implementation file shows how easily it can be used: you basically need 3 lines of code to do the extraction and a loop to show the data. It can also remove duplicate results.
Sample Implementation:
<?php
require_once 'extractThis.class.php';

/*
 * Every example follows the same three steps: build an extractor for a URL,
 * give it a pattern, run doExtract(true) (true = drop duplicate rows).
 * Afterwards capture group 1 is in ->array0, group 2 in ->array1, and so on,
 * all sharing the same keys.
 */

// Example 1: table of contents (group 1 = link target, group 2 = link text)
$toc = new extractThis('http://www.fortran.com/F77_std/rjcnf0001.html');
$toc->setReg('/<li><a href="(.*)" name=".*">(.*)<\/a>/Uism');
$toc->doExtract(true);
foreach ($toc->array0 as $row => $href) {
    $caption = $toc->array1[$row];
    echo "Link: $href » " . $caption . "<br />";
}

// Example 2: Google search results (group 1 = result URL, group 2 = result title)
$search = new extractThis('http://www.google.com/search?q=aziz+saleh');
$search->setReg('/<h3><a href="(.*)".*>(.*)<\/a><\/h3>/Uism');
$search->doExtract(true);
foreach ($search->array0 as $row => $resultUrl) {
    $resultTitle = $search->array1[$row];
    echo "<a href=\"$resultUrl\">" . $resultTitle . "</a><br />";
}

// Example 3: vBulletin forum listing
// (group 1 = thread id, group 2 = thread title, group 3 = member id, group 4 = member name)
$threads = new extractThis('http://forums.digitalpoint.com/forumdisplay.php?f=37');
$threads->setReg('/<a href="showthread\.php\?t=(.*)" .*>(.*)<\/a>.*<a href="member\.php\?u=(.*)" .*">(.*)<\/a><\/span>/Uism');
$threads->doExtract(true);
foreach ($threads->array0 as $row => $threadId) {
    // The line break inside the string literal is part of the emitted markup.
    echo "<a href=\"http://forums.digitalpoint.com/showthread.php?t=$threadId\">" . $threads->array1[$row] . "</a>
Posted By <a href=\"http://forums.digitalpoint.com/member.php?u=" . $threads->array2[$row] . "\">" . $threads->array3[$row] . "</a><br />";
}
Main Class:
<?php
/**
 * Fetches a page and exposes the capture groups of a regular expression
 * run against it.
 *
 * After doExtract() the results are available as public arrays named after
 * the capture group they came from: $array0 holds group 1, $array1 holds
 * group 2, and so on. All arrays share the same keys (the match index), so
 * $array1[$k] belongs to $array0[$k].
 *
 * The attribute below is read as a plain comment by PHP versions older than
 * 8.0; on 8.2+ it keeps the dynamically created $array1, $array2, ...
 * properties from raising deprecation notices.
 */
#[\AllowDynamicProperties]
class extractThis
{
    /* URL to extract from */
    public $extractURL;
    /* Regular expression to find */
    public $extractRegs;
    /* Raw capture groups of the last extraction: [groupIndex][matchIndex] */
    public $extractReturns = array();
    /* Values of the first capture group (further groups live in $array1, $array2, ...) */
    public $array0 = array();
    /* How many arrayN properties the last organize pass filled, so they can be cleared */
    private $filledGroups = 0;

    /* Constructor, just add the URL */
    public function __construct($url)
    {
        $this->extractURL = $url;
    }

    /* Set the regular expression (PCRE, including delimiters and modifiers) */
    public function setReg($expression)
    {
        $this->extractRegs = $expression;
    }

    /**
     * Download the page, run the expression and organize the results.
     *
     * Terminates the script with die() when the download reports an error.
     *
     * @param bool $doOption true to drop rows whose first capture group was
     *                       already seen, false to keep every match
     */
    public function doExtract($doOption)
    {
        // A plain include would redeclare openURL (fatal error) on the second
        // call within one request, so load the helper at most once.
        if (!class_exists('openURL', false)) {
            require_once 'fileOpen.class.php';
        }

        $page = new openURL($this->extractURL);
        $page->doOpen(2);
        if ($page->errorNumber) {
            die("$page->errorNumber ($page->errorMessage)<br />\n");
        }

        $returns = array();
        $matched = preg_match_all($this->extractRegs, (string) $page->returnPage(), $returns);

        // Start from a clean slate so groups of an earlier run cannot leak in.
        $this->extractReturns = array();
        if ($matched !== false) {
            foreach ($returns as $key => $values) {
                // Key 0 is the full match; string keys are named-group aliases
                // of a numbered group that is collected anyway.
                if (is_int($key) && $key > 0) {
                    $this->extractReturns[] = $values;
                }
            }
        }

        if ($doOption) {
            $this->removeDubs();
        } else {
            $this->doOrganize();
        }
    }

    /* Organize the raw groups into array0..arrayN, keeping every match */
    public function doOrganize()
    {
        $this->organize(false);
    }

    /* Organize the raw groups into array0..arrayN, skipping duplicate rows */
    public function removeDubs()
    {
        $this->organize(true);
    }

    /**
     * Copy $extractReturns into the public array0..arrayN properties.
     *
     * A row is a duplicate when its first-group value already occurred; the
     * comparison is strict so that distinct numeric strings such as '1e3' and
     * '1000' are not merged. Original match indexes are kept as keys.
     *
     * @param bool $skipDuplicates whether duplicate rows are dropped
     */
    private function organize($skipDuplicates)
    {
        // Clear whatever a previous pass produced, including groups that the
        // current expression no longer has.
        for ($group = 0; $group < $this->filledGroups; $group++) {
            $this->{"array$group"} = array();
        }
        $this->array0 = array();

        // Groups are the consecutive array entries starting at index 0.
        $groupCount = 0;
        while (isset($this->extractReturns[$groupCount]) && is_array($this->extractReturns[$groupCount])) {
            $this->{"array$groupCount"} = array();
            $groupCount++;
        }
        $this->filledGroups = $groupCount;

        if ($groupCount === 0) {
            return;
        }

        foreach ($this->extractReturns[0] as $index => $item) {
            if ($skipDuplicates && in_array($item, $this->array0, true)) {
                continue;
            }
            for ($group = 0; $group < $groupCount; $group++) {
                $this->{"array$group"}[$index] = isset($this->extractReturns[$group][$index])
                    ? $this->extractReturns[$group][$index]
                    : NULL;
            }
        }
    }
}
Helper Class (fileOpen.class.php):
<?php
/* Class for the Open */
/**
 * Fetches the content behind a URL (or local path) using one of four
 * transports: fopen, file_get_contents, fsockopen or curl.
 *
 * Failures are reported through the public $errorNumber / $errorMessage
 * properties; both stay NULL after a successful doOpen().
 */
class openURL
{
    /* Page content */
    private $pageContent;
    /* Page URL */
    private $pageURL;
    /* Error number (NULL when no error occurred) */
    public $errorNumber = NULL;
    /* Error message (NULL when no error occurred) */
    public $errorMessage = NULL;
    /* Lines written to the socket by the fsockopen transport */
    private $fsockopenHeaders = array();
    /* Curl options: CURLOPT_* constant => value */
    private $curlOptions = array();
    /* 'http' stream-context options for file_get_contents */
    private $fgcOptions = array();
    /* Handle of the transport currently or most recently in use */
    private $fileHandler;

    /* Constructor, set page url */
    public function __construct($pageURL)
    {
        $this->pageURL = $pageURL;
    }

    /**
     * Fetch the page with the selected transport.
     *
     * @param int $openOption 1 = fopen, 2 = file_get_contents,
     *                        3 = fsockopen, 4 = curl
     */
    public function doOpen($openOption)
    {
        switch ($openOption) {
            /* fopen */
            case 1:
                $this->myfopen();
                break;
            /* file_get_contents */
            case 2:
                $this->myfile_get_contents();
                break;
            /* fsockopen */
            case 3:
                $this->myfoscokopen();
                break;
            /* curl */
            case 4:
                $this->mycurl();
                break;
            /* anything else used to be ignored silently, leaving no content and no error */
            default:
                $this->errorNumber = '000-FPE';
                $this->errorMessage = 'Unknown open option: ' . var_export($openOption, true);
        }
    }

    /* Add to curl options */
    public function addToCurl($curlOption, $curlValue)
    {
        $this->curlOptions[$curlOption] = $curlValue;
    }

    /* Add a line (request line or header) to send over fsockopen */
    public function addToFsockopen($headerText)
    {
        $this->fsockopenHeaders[] = $headerText;
    }

    /* Add an 'http' stream-context option (file_get_contents) */
    public function addToFileGetContents($fgcOption, $fgcValue)
    {
        $this->fgcOptions[$fgcOption] = $fgcValue;
    }

    /* Return page content */
    public function returnPage()
    {
        return $this->pageContent;
    }

    /* Join the fsockopen lines with CRLF and terminate them with an empty line */
    private function composefsoHeaders()
    {
        $request = '';
        foreach ($this->fsockopenHeaders as $curHeader) {
            $request .= $curHeader . "\r\n";
        }
        return $request . "\r\n";
    }

    /**
     * Build the options array for stream_context_create().
     *
     * Only a string 'header' option gets a trailing CRLF (once); every other
     * option is passed through untouched, because appending CRLF to values
     * such as 'method' or 'timeout' corrupts them.
     */
    private function composefgcHeaders()
    {
        $options = array('http' => array());
        foreach ($this->fgcOptions as $fgcOption => $fgcValue) {
            if ($fgcOption === 'header' && is_string($fgcValue) && $fgcValue !== ''
                && substr($fgcValue, -2) !== "\r\n") {
                $fgcValue .= "\r\n";
            }
            $options['http'][$fgcOption] = $fgcValue;
        }
        return $options;
    }

    /* Read the open handle to its end into the page content */
    private function readContents()
    {
        // Start empty so a second doOpen() does not append to the first result.
        $this->pageContent = '';
        while (!feof($this->fileHandler)) {
            $chunk = fread($this->fileHandler, 1024);
            if ($chunk === false) {
                break;
            }
            if ($chunk === '') {
                // feof() stays false on a timed-out socket; stop instead of spinning.
                $meta = stream_get_meta_data($this->fileHandler);
                if (!empty($meta['timed_out'])) {
                    break;
                }
            }
            $this->pageContent .= $chunk;
        }
    }

    /**
     * Run $callback while collecting the PHP warnings/notices it raises.
     *
     * Works independently of the display_errors setting, unlike capturing
     * the printed warning through output buffering.
     *
     * @return array array(callback result, collected messages joined by "\n")
     */
    private function captureWarnings($callback)
    {
        $messages = array();
        set_error_handler(function ($severity, $message) use (&$messages) {
            $messages[] = $message;
            return true; // handled: keep PHP from printing or logging it again
        });
        try {
            $result = $callback();
        } finally {
            restore_error_handler();
        }
        return array($result, implode("\n", $messages));
    }

    /**
     * Work out what to hand to fsockopen().
     *
     * fsockopen() expects a host name, not a URL. For http/https URLs the
     * host (with ssl:// for https) and port are extracted; anything else is
     * passed on unchanged with port 80, as before.
     *
     * @return array array(host, port)
     */
    private function socketTarget()
    {
        $parts = parse_url($this->pageURL);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return array($this->pageURL, 80);
        }
        $scheme = strtolower($parts['scheme']);
        if ($scheme !== 'http' && $scheme !== 'https') {
            return array($this->pageURL, 80);
        }
        $secure = ($scheme === 'https');
        $port = isset($parts['port']) ? $parts['port'] : ($secure ? 443 : 80);
        return array(($secure ? 'ssl://' : '') . $parts['host'], $port);
    }

    /* fopen function; error code 001-FPE on failure */
    public function myfopen()
    {
        $url = $this->pageURL;
        list($handle, $warning) = $this->captureWarnings(function () use ($url) {
            return fopen($url, 'r');
        });
        $this->fileHandler = $handle;

        if ($handle) {
            $this->readContents();
            fclose($handle);
        } else {
            $this->errorMessage = $warning;
            $this->errorNumber = '001-FPE';
        }
    }

    /* file_get_contents function; error code 002-FPE on failure */
    public function myfile_get_contents()
    {
        $context = stream_context_create($this->composefgcHeaders());
        $url = $this->pageURL;
        list($content, $warning) = $this->captureWarnings(function () use ($url, $context) {
            return file_get_contents($url, false, $context);
        });
        $this->pageContent = $content;

        /* check for errors: a false result or any raised warning counts */
        if ($content === false || $warning !== '') {
            $this->errorMessage = ($warning !== '') ? $warning : 'file_get_contents failed';
            $this->errorNumber = '002-FPE';
        }
    }

    /**
     * fsockopen function; error code 003-FPE on failure.
     *
     * Sends exactly the lines added through addToFsockopen(), so the caller
     * supplies the request line and headers.
     */
    public function myfoscokopen()
    {
        list($host, $port) = $this->socketTarget();
        $errno = 0;
        $errstr = '';
        list($handle, $warning) = $this->captureWarnings(function () use ($host, $port, &$errno, &$errstr) {
            return fsockopen($host, $port, $errno, $errstr, 30);
        });
        $this->fileHandler = $handle;

        if ($handle) {
            fwrite($handle, $this->composefsoHeaders());
            $this->readContents();
            fclose($handle); // the socket used to be left open
        } else {
            $this->errorMessage = ($errstr !== '') ? "$errstr (errno $errno)" : $warning;
            $this->errorNumber = '003-FPE';
        }
    }

    /* curl function; on failure errorNumber is the curl errno and errorMessage its text */
    public function mycurl()
    {
        $handle = curl_init();
        $this->fileHandler = $handle;
        curl_setopt($handle, CURLOPT_URL, $this->pageURL);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        foreach ($this->curlOptions as $option => $value) {
            curl_setopt($handle, $option, $value);
        }
        $this->pageContent = curl_exec($handle);

        // errno is the number and error the text; the two used to be stored swapped.
        $errno = curl_errno($handle);
        if ($errno !== 0) {
            $this->errorNumber = $errno;
            $this->errorMessage = curl_error($handle);
        }

        // curl_close() has had no effect since PHP 8.0; only call it where it still matters.
        if (PHP_VERSION_ID < 80000) {
            curl_close($handle);
        }
    }
}