PHP – Page 2 – Aziz Saleh – Software development

Content Extraction using Preg match

Posted on May 8, 2010 by Aziz

Posted in PHP Leave a comment

This tiny class lets you easily extract data from various websites. It uses the fileOpen class I created, but it can easily be changed to use another fetching method if you need to.

The sample implementation shows how easy it is to use: you basically need three lines of code to do the extraction and a loop to display the data. It can also remove duplicate results.

Sample Implementation:

<?php

require_once('extractThis.class.php');

/*
 * Example 1: table of contents.
 * Capture group 1 ends up in array0 (the href) and group 2 in array1 (the
 * link text). Passing true to doExtract() selects the duplicate-removing pass.
 */
$contents = new extractThis('http://www.fortran.com/F77_std/rjcnf0001.html');
$contents->setReg('/<li><a href="(.*)" name=".*">(.*)<\/a>/Uism');
$contents->doExtract(true);

foreach ($contents->array0 as $row => $href) {
    $label = $contents->array1[$row];
    echo "Link: $href &raquo; $label<br />";
}

/*
 * Example 2: Google search results.
 * array0 holds each result's target, array1 the matching link text.
 */
$search = new extractThis('http://www.google.com/search?q=aziz+saleh');
$search->setReg('/<h3><a href="(.*)".*>(.*)<\/a><\/h3>/Uism');
$search->doExtract(true);

foreach ($search->array0 as $row => $target) {
    $caption = $search->array1[$row];
    echo "<a href=\"$target\">$caption</a><br />";
}

/*
 * Example 3: topics of a vBulletin forum.
 * Four capture groups: topic id, topic title, member id, member name.
 */
$forum = new extractThis('http://forums.digitalpoint.com/forumdisplay.php?f=37');
$forum->setReg('/<a href="showthread\.php\?t=(.*)" .*>(.*)<\/a>.*<a href="member\.php\?u=(.*)" .*">(.*)<\/a><\/span>/Uism');
$forum->doExtract(true);

foreach ($forum->array0 as $row => $topicID) {
    $topicTitle = $forum->array1[$row];
    $memberID   = $forum->array2[$row];
    $memberName = $forum->array3[$row];
    // The line break inside the string is part of the emitted markup
    echo "<a href=\"http://forums.digitalpoint.com/showthread.php?t=$topicID\">$topicTitle</a>
    Posted By <a href=\"http://forums.digitalpoint.com/member.php?u=$memberID\">$memberName</a><br />";
}

Main Class:

<?php

/**
 * Extracts data from a web page with one regular expression.
 *
 * Usage: construct with the URL, call setReg() with a pattern containing
 * capture groups, then doExtract(). Afterwards capture group 1 is available
 * as $this->array0, group 2 as $this->array1, and so on; entries that share
 * an index belong to the same match.
 *
 * The arrayN properties beyond array0 are created on demand, hence the
 * attribute below (it is a plain comment for PHP versions before 8).
 */
#[\AllowDynamicProperties]
class extractThis
{
    /* URL to extract from */
    public $extractURL;

    /* Regular expression to find */
    public $extractRegs;

    /* Raw capture groups of the last extraction: index 0 holds group 1, index 1 group 2, ... */
    public $extractReturns = array();

    /* Organized values of the first capture group; always an array so callers can loop over it */
    public $array0 = array();

    /* Number of arrayN properties filled by the last organize pass, so stale ones can be emptied */
    private $columnCount = 0;

    /* Constructor, just add the URL */
    public function __construct($url) { $this->extractURL = $url; }

    /* Set the regular expression (replaces any previous one) */
    public function setReg($expression) { $this->extractRegs = $expression; }

    /**
     * Fetch the page, run the regular expression and fill array0..arrayN.
     *
     * Terminates the script with die() when the page cannot be opened.
     *
     * @param bool $doOption true to drop matches whose first capture group
     *                       repeats an earlier one, false to keep every match
     */
    public function doExtract($doOption)
    {
        // A plain include would redeclare openURL (a fatal error) on the second call
        if (!class_exists('openURL', false)) {
            require_once('fileOpen.class.php');
        }

        $page = new openURL($this->extractURL);
        $page->doOpen(2);
        if ($page->errorNumber)
        {
            die("$page->errorNumber ($page->errorMessage)<br />\n");
        }

        $returns = array();
        preg_match_all($this->extractRegs, (string) $page->returnPage(), $returns);

        // Start clean so groups from an earlier extraction cannot leak into this one
        $this->extractReturns = array();
        $groupCount = is_array($returns) ? count($returns) : 0;
        for ($group = 1; $group < $groupCount; $group++)
        {
            // $returns[0] is the full match; only the capture groups are kept
            if (isset($returns[$group]))
            {
                $this->extractReturns[$group - 1] = $returns[$group];
            }
        }

        if ($doOption) { $this->removeDubs(); } else { $this->doOrganize(); }
    }

    /**
     * Copy every capture group of extractReturns into its arrayN property,
     * keeping all matches (duplicates included).
     */
    public function doOrganize()
    {
        $this->resetColumns();

        foreach ($this->extractReturns as $group => $matches)
        {
            $this->{"array$group"} = $matches;
        }
    }

    /**
     * Like doOrganize(), but a match is skipped when its first capture group
     * equals that of an earlier match. Kept matches retain their original
     * index in every arrayN property.
     */
    public function removeDubs()
    {
        $this->resetColumns();

        // No capture groups (or nothing extracted yet): nothing to organize
        if (!isset($this->extractReturns[0]) || !is_array($this->extractReturns[0]))
        {
            return;
        }

        foreach ($this->extractReturns[0] as $row => $firstGroup)
        {
            // Strict comparison: loosely, numeric strings such as "1e3" and "1000" would count as equal
            if (in_array($firstGroup, $this->array0, true))
            {
                continue;
            }

            foreach ($this->extractReturns as $group => $matches)
            {
                $this->{"array$group"}[$row] = isset($matches[$row]) ? $matches[$row] : NULL;
            }
        }
    }

    /* Empty array0 plus every arrayN used by this or the previous organize pass */
    private function resetColumns()
    {
        $groupCount = count($this->extractReturns);
        $toClear = max(1, $groupCount, $this->columnCount);

        for ($group = 0; $group < $toClear; $group++)
        {
            $this->{"array$group"} = array();
        }

        $this->columnCount = $groupCount;
    }
}

Helper Class (fileOpen.class.php):

<?php

/* Class for the Open */
/**
 * Fetches a page through one of four interchangeable transports (fopen,
 * file_get_contents, fsockopen or cURL), selected by the argument given to
 * doOpen().
 *
 * After doOpen(), returnPage() returns what was fetched. The public
 * $errorNumber and $errorMessage properties describe a failure; both stay
 * falsy when the transport reported no problem.
 */
class openURL
{
    /* Page content collected by the last doOpen() call */
    private $pageContent;

    /* Page URL; handed to fsockopen() as the host name when transport 3 is used */
    private $pageURL;

    /* Error number */
    public $errorNumber = NULL;

    /* Error message */
    public $errorMessage = NULL;

    /* Request lines and headers written to the socket by the fsockopen transport */
    private $fsockopenHeaders = array();

    /* cURL options, keyed by CURLOPT_* constant */
    private $curlOptions = array();

    /* "http" stream-context options for the file_get_contents transport */
    private $fgcOptions = array();

    /* Handle of the stream / cURL session currently in use */
    private $fileHandler;

    /* constructor, set page url */
    public function __construct($pageURL)
    {
        $this->pageURL = $pageURL;
    }

    /**
     * Fetch the page with the chosen transport.
     *
     * @param int $openOption 1 = fopen, 2 = file_get_contents, 3 = fsockopen,
     *                        4 = cURL; any other value does nothing
     */
    public function doOpen($openOption)
    {
        switch ($openOption)
        {
            /* fopen */
            case 1:
            $this->myfopen();
            break;

            /* file_get_contents */
            case 2:
            $this->myfile_get_contents();
            break;

            /* fsockopen */
            case 3:
            $this->myfoscokopen();
            break;

            /* curl */
            case 4:
            $this->mycurl();
            break;
        }
    }

    /* Add to curl options */
    public function addToCurl($curlOption,$curlValue){$this->curlOptions[$curlOption] = $curlValue;}

    /* Add a request line or header for fsockopen (CRLF is appended when the request is composed) */
    public function addToFsockopen($headerText){$this->fsockopenHeaders[] = $headerText;}

    /* Add to stream context (file_get_contents) */
    public function addToFileGetContents($fgcOption,$fgcValue){$this->fgcOptions[$fgcOption] = $fgcValue;}

    /* Return page content */
    public function returnPage(){ return $this->pageContent;}

    /* Join the fsockopen lines with CRLF and finish with the blank line that ends the request head */
    private function composefsoHeaders()
    {
        $request = '';
        foreach($this->fsockopenHeaders as $curHeader)
        {
            $request .= $curHeader."\r\n";
        }
        $request .= "\r\n";
        return $request;
    }

    /**
     * Build the options array for stream_context_create().
     *
     * Only a string "header" option gets a terminating CRLF. Every other
     * option is passed through untouched: appending CRLF to values such as
     * "method" would inject a line break into the request line.
     */
    private function composefgcHeaders()
    {
        $contextOptions = array(
            'http' => array()
        );

        foreach($this->fgcOptions as $fgcOption => $fgcValue)
        {
            if($fgcOption === 'header' && is_string($fgcValue) && substr($fgcValue, -2) !== "\r\n")
            {
                $fgcValue .= "\r\n";
            }
            $contextOptions['http'][$fgcOption] = $fgcValue;
        }
        return $contextOptions;
    }

    /* Append everything readable from the open handle to the page content */
    private function readContents()
    {
        while (!feof($this->fileHandler))
        {
            $chunk = fread($this->fileHandler,1024);
            // A failed read would otherwise spin forever while feof() stays false
            if($chunk === false)
            {
                break;
            }
            $this->pageContent .= $chunk;
        }
    }

    /* fopen function; failure is reported as error 001-FPE */
    public function myfopen()
    {
        /* use ob_start to record errors */
        ob_start();

        $this->fileHandler = fopen($this->pageURL,'r');

        if($this->fileHandler)
        {
            ob_end_clean();
            $this->readContents();
            fclose($this->fileHandler);
        } else {
            $this->errorMessage = ob_get_contents();
            $this->errorNumber = '001-FPE';
            ob_end_clean();
        }
    }

    /* file_get_contents function; failure is reported as error 002-FPE */
    public function myfile_get_contents()
    {
        $context = stream_context_create($this->composefgcHeaders());
        $errorBefore = error_get_last();

        /* use ob_start to record errors */
        ob_start();
        $this->pageContent = file_get_contents($this->pageURL,false, $context);
        $this->errorMessage = ob_get_contents();
        ob_end_clean();

        // With display_errors off nothing reaches the buffer, so also check the return value
        if($this->pageContent === false && $this->errorMessage == NULL)
        {
            $errorAfter = error_get_last();
            $this->errorMessage = ($errorAfter !== NULL && $errorAfter != $errorBefore)
                ? $errorAfter['message']
                : 'Unable to read '.$this->pageURL;
        }

        /* check for errors */
        if($this->errorMessage != NULL)
        {
            $this->errorNumber = '002-FPE';
        }
    }

    /* fsockopen function (always port 80, 30 second connect timeout); failure is reported as error 003-FPE */
    public function myfoscokopen()
    {
        /* Supress errors using ob_start */
        ob_start();

        $this->fileHandler = fsockopen($this->pageURL, 80,$this->errorNumber, $this->errorMessage, 30);

        if($this->fileHandler)
        {
            fwrite($this->fileHandler,$this->composefsoHeaders());
            ob_end_clean();
            $this->readContents();
            // Release the socket once the response has been read
            fclose($this->fileHandler);
        } else {
            $this->errorMessage .= ob_get_contents();
            $this->errorNumber = '003-FPE';
            ob_end_clean();
        }
    }

    /* curl function; errorNumber / errorMessage receive cURL's own error code and text */
    public function mycurl()
    {
        $this->fileHandler = curl_init();
        curl_setopt($this->fileHandler, CURLOPT_URL, $this->pageURL);
        curl_setopt($this->fileHandler, CURLOPT_RETURNTRANSFER, true);
        // Caller-supplied options come last so they can override the defaults above
        foreach($this->curlOptions as $option => $value)
        {
            curl_setopt($this->fileHandler, $option, $value);
        }
        $this->pageContent = curl_exec($this->fileHandler);
        // curl_errno() is the numeric code (0 when there was no error), curl_error() the text
        $this->errorNumber = curl_errno($this->fileHandler);
        $this->errorMessage = curl_error($this->fileHandler);

        curl_close($this->fileHandler);
    }
}

File open class

Posted on May 8, 2010 by Aziz

Posted in PHP Leave a comment

This class basically encapsulates the following four built-in ways of opening a URL: fopen, file_get_contents, fsockopen, and cURL.

The only reason I created this class is that my code sometimes works on one hosting provider but not on another, depending on which file-open method that host allows. With this class I can simply ask people to switch to a different method without them having to write much code on their side.

The class allows users to add headers and options to file_get_contents, fsockopen, and cURL requests (including referrers, cookies, form field inputs, etc.).

Sample Implementation:

<?php

require_once('fileOpen.class.php');

/*
Transport 1: fopen
*/
$viaFopen = new openURL('http://www.google.com');
$viaFopen->doOpen(1);
echo $viaFopen->returnPage();

/*
Transport 2: file_get_contents, with one stream-context option
*/
$viaFileGetContents = new openURL('http://www.google.com');
$viaFileGetContents->addToFileGetContents('method', 'GET');
$viaFileGetContents->doOpen(2);
echo $viaFileGetContents->returnPage();

/*
Transport 3: fsockopen - the constructor receives a host name and the
request is written line by line
*/
$viaSocket = new openURL('www.google.com');
$requestLines = array(
    'GET / HTTP/1.1',
    'Host: www.google.com',
    "Connection: Close\r\n",
);
foreach ($requestLines as $requestLine) {
    $viaSocket->addToFsockopen($requestLine);
}
$viaSocket->doOpen(3);
echo $viaSocket->returnPage();

/*
Transport 4: cURL, posting a search form
*/
$viaCurl = new openURL('us3.php.net/search.php');
$curlSettings = array(
    CURLOPT_REFERER        => 'us3.php.net',
    CURLOPT_POST           => 1,
    CURLOPT_POSTFIELDS     => 'pattern=curl&show=quickref&lang=en',
    CURLOPT_FOLLOWLOCATION => true,
);
foreach ($curlSettings as $curlOption => $curlValue) {
    $viaCurl->addToCurl($curlOption, $curlValue);
}
$viaCurl->doOpen(4);
echo $viaCurl->returnPage();

/* Optional error check, shown for the last request */
/*
if($viaCurl->errorNumber)
{
    echo "$viaCurl->errorNumber ($viaCurl->errorMessage)<br />\n";
}
*/

Main Class:

<?php

/* Class for the Open */
/**
 * Fetches a page through one of four interchangeable transports (fopen,
 * file_get_contents, fsockopen or cURL), selected by the argument given to
 * doOpen().
 *
 * After doOpen(), returnPage() returns what was fetched. The public
 * $errorNumber and $errorMessage properties describe a failure; both stay
 * falsy when the transport reported no problem.
 */
class openURL
{
    /* Page content collected by the last doOpen() call */
    private $pageContent;

    /* Page URL; handed to fsockopen() as the host name when transport 3 is used */
    private $pageURL;

    /* Error number */
    public $errorNumber = NULL;

    /* Error message */
    public $errorMessage = NULL;

    /* Request lines and headers written to the socket by the fsockopen transport */
    private $fsockopenHeaders = array();

    /* cURL options, keyed by CURLOPT_* constant */
    private $curlOptions = array();

    /* "http" stream-context options for the file_get_contents transport */
    private $fgcOptions = array();

    /* Handle of the stream / cURL session currently in use */
    private $fileHandler;

    /* constructor, set page url */
    public function __construct($pageURL)
    {
        $this->pageURL = $pageURL;
    }

    /**
     * Fetch the page with the chosen transport.
     *
     * @param int $openOption 1 = fopen, 2 = file_get_contents, 3 = fsockopen,
     *                        4 = cURL; any other value does nothing
     */
    public function doOpen($openOption)
    {
        switch ($openOption)
        {
            /* fopen */
            case 1:
            $this->myfopen();
            break;

            /* file_get_contents */
            case 2:
            $this->myfile_get_contents();
            break;

            /* fsockopen */
            case 3:
            $this->myfoscokopen();
            break;

            /* curl */
            case 4:
            $this->mycurl();
            break;
        }
    }

    /* Add to curl options */
    public function addToCurl($curlOption,$curlValue){$this->curlOptions[$curlOption] = $curlValue;}

    /* Add a request line or header for fsockopen (CRLF is appended when the request is composed) */
    public function addToFsockopen($headerText){$this->fsockopenHeaders[] = $headerText;}

    /* Add to stream context (file_get_contents) */
    public function addToFileGetContents($fgcOption,$fgcValue){$this->fgcOptions[$fgcOption] = $fgcValue;}

    /* Return page content */
    public function returnPage(){ return $this->pageContent;}

    /* Join the fsockopen lines with CRLF and finish with the blank line that ends the request head */
    private function composefsoHeaders()
    {
        $request = '';
        foreach($this->fsockopenHeaders as $curHeader)
        {
            $request .= $curHeader."\r\n";
        }
        $request .= "\r\n";
        return $request;
    }

    /**
     * Build the options array for stream_context_create().
     *
     * Only a string "header" option gets a terminating CRLF. Every other
     * option is passed through untouched: appending CRLF to values such as
     * "method" would inject a line break into the request line.
     */
    private function composefgcHeaders()
    {
        $contextOptions = array(
            'http' => array()
        );

        foreach($this->fgcOptions as $fgcOption => $fgcValue)
        {
            if($fgcOption === 'header' && is_string($fgcValue) && substr($fgcValue, -2) !== "\r\n")
            {
                $fgcValue .= "\r\n";
            }
            $contextOptions['http'][$fgcOption] = $fgcValue;
        }
        return $contextOptions;
    }

    /* Append everything readable from the open handle to the page content */
    private function readContents()
    {
        while (!feof($this->fileHandler))
        {
            $chunk = fread($this->fileHandler,1024);
            // A failed read would otherwise spin forever while feof() stays false
            if($chunk === false)
            {
                break;
            }
            $this->pageContent .= $chunk;
        }
    }

    /* fopen function; failure is reported as error 001-FPE */
    public function myfopen()
    {
        /* use ob_start to record errors */
        ob_start();

        $this->fileHandler = fopen($this->pageURL,'r');

        if($this->fileHandler)
        {
            ob_end_clean();
            $this->readContents();
            fclose($this->fileHandler);
        } else {
            $this->errorMessage = ob_get_contents();
            $this->errorNumber = '001-FPE';
            ob_end_clean();
        }
    }

    /* file_get_contents function; failure is reported as error 002-FPE */
    public function myfile_get_contents()
    {
        $context = stream_context_create($this->composefgcHeaders());
        $errorBefore = error_get_last();

        /* use ob_start to record errors */
        ob_start();
        $this->pageContent = file_get_contents($this->pageURL,false, $context);
        $this->errorMessage = ob_get_contents();
        ob_end_clean();

        // With display_errors off nothing reaches the buffer, so also check the return value
        if($this->pageContent === false && $this->errorMessage == NULL)
        {
            $errorAfter = error_get_last();
            $this->errorMessage = ($errorAfter !== NULL && $errorAfter != $errorBefore)
                ? $errorAfter['message']
                : 'Unable to read '.$this->pageURL;
        }

        /* check for errors */
        if($this->errorMessage != NULL)
        {
            $this->errorNumber = '002-FPE';
        }
    }

    /* fsockopen function (always port 80, 30 second connect timeout); failure is reported as error 003-FPE */
    public function myfoscokopen()
    {
        /* Supress errors using ob_start */
        ob_start();

        $this->fileHandler = fsockopen($this->pageURL, 80,$this->errorNumber, $this->errorMessage, 30);

        if($this->fileHandler)
        {
            fwrite($this->fileHandler,$this->composefsoHeaders());
            ob_end_clean();
            $this->readContents();
            // Release the socket once the response has been read
            fclose($this->fileHandler);
        } else {
            $this->errorMessage .= ob_get_contents();
            $this->errorNumber = '003-FPE';
            ob_end_clean();
        }
    }

    /* curl function; errorNumber / errorMessage receive cURL's own error code and text */
    public function mycurl()
    {
        $this->fileHandler = curl_init();
        curl_setopt($this->fileHandler, CURLOPT_URL, $this->pageURL);
        curl_setopt($this->fileHandler, CURLOPT_RETURNTRANSFER, true);
        // Caller-supplied options come last so they can override the defaults above
        foreach($this->curlOptions as $option => $value)
        {
            curl_setopt($this->fileHandler, $option, $value);
        }
        $this->pageContent = curl_exec($this->fileHandler);
        // curl_errno() is the numeric code (0 when there was no error), curl_error() the text
        $this->errorNumber = curl_errno($this->fileHandler);
        $this->errorMessage = curl_error($this->fileHandler);

        curl_close($this->fileHandler);
    }
}

Email Scraper

Posted on September 4, 2009 by Aziz

Posted in PHP 1 Comment

Here is the promised Email scraper class. In 4 hours, I was able to retrieve over 1000 emails!

The package is hosted on PHPClasses.org:

http://www.phpclasses.org/package/5663-PHP-Crawl-pages-and-scrape-e-mail-addresses-into-MySQL.html

You will need to first create your database using the database.sql file:

-- Scraped e-mail addresses. The address itself is the primary key, so
-- inserting an address that is already stored fails instead of duplicating it.
CREATE TABLE IF NOT EXISTS `emaillist` (
  `emailadd` varchar(255) NOT NULL,
  PRIMARY KEY  (`emailadd`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='List of all gotten emails';


-- URLs the scraper has already processed; a row is inserted after each page.
CREATE TABLE IF NOT EXISTS `finishedurls` (
  `urlname` varchar(255) NOT NULL,
  PRIMARY KEY  (`urlname`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='List of finished urls';



-- Queue of URLs still to visit: the scraper inserts the links it finds here,
-- then picks one row at random and deletes it before scraping that page.
CREATE TABLE IF NOT EXISTS `workingurls` (
  `urlname` varchar(255) NOT NULL,
  PRIMARY KEY  (`urlname`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='List of current working urls';

A Sample implementation:

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<?php
// Connection settings for the database created from database.sql
$DB_USER =  'root';
$DB_PASSWORD = '';
$DB_HOST = 'localhost';
$DB_NAME = 'scrape';

// Stop at the first step that fails, so later calls never run against a
// missing connection and the reported error is the one that actually occurred
$dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD);
if(!$dbc){ die(mysql_error());}
if(!mysql_select_db($DB_NAME)){ die(mysql_error());}
if(!mysql_query("SET NAMES `utf8`")){ die(mysql_error());}

include('scraper.class.php');

$new = new scraper;
// The start URL has to be set first: setStartPath() without an argument
// derives the start path from it
$new->startURL('http://forums.digitalpoint.com/forumdisplay.php?f=37');
// Start Path can be empty, which will be extracted from the start URL
$new->setStartPath();
//$new->setStartPath('http://forums.digitalpoint.com');
$new->startScraping();
?>
</body>
</html>

Main Class:

<?php
/*****/
/*
Written by: Aziz S. Hussain
Email: azizsaleh@gmail.com
Website: www.azizsaleh.com
Produced under GPL License
*/
/*****/


/*
Email address scraper based on a URL.
*/      

/**
 * Crawls pages starting from one URL, stores every e-mail address it finds
 * in the `emaillist` table and queues the links of each page in `workingurls`.
 *
 * The class issues its queries through the legacy mysql_* functions and
 * therefore expects the calling script to have opened a connection with
 * mysql_connect() beforehand.
 * NOTE(review): ext/mysql no longer exists in PHP 7+; moving to mysqli/PDO
 * needs a connection handle to be passed in, which would change the interface.
 */
class scraper
{
    // URL of the page that is scraped next
    public $startURL;

    // Extensions of links that are NOT followed (the property name is kept for backward compatibility)
    public $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
        ,'.avi','.mp3','.flash','.swf');

    // Host sent as the referrer of each request when set; nothing in this class assigns it
    public $useURL;

    // Start path, for links that are relative (scheme and host, e.g. http://example.com)
    public $startPath;

    /**
     * Set the prefix used for relative links.
     *
     * @param string|null $path explicit prefix; when omitted the prefix is
     *                          taken from the scheme and host of the start URL
     *                          (empty string if the start URL has no
     *                          "scheme://host" form)
     */
    public function setStartPath($path = NULL)
    {
        if($path != NULL)
        {
            $this->startPath = $path;
            return;
        }

        // "http://host/dir" splits into "http:", "", "host", "dir"
        $parts = explode('/', (string) $this->startURL);
        $this->startPath = (isset($parts[2]) && $parts[1] === '')
            ? $parts[0].'//'.$parts[2]
            : '';
    }

    // Add the start URL
    public function startURL($theURL)
    {
        $this->startURL = $theURL;
    }

    /**
     * Download a URL with cURL.
     *
     * @param string $url address to fetch
     * @return string|false response body, or false when the request failed
     */
    public function getContents($url)
    {
        $ch = curl_init(); // initialize curl handle
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_VERBOSE, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
        curl_setopt($ch, CURLOPT_AUTOREFERER, false);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
        // Only send a referrer when one is configured; an unset useURL would yield a bare "http://"
        if($this->useURL != NULL)
        {
            curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
        }
        curl_setopt($ch, CURLOPT_URL,$url); // address to request
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);// allow redirects
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
        curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
        curl_setopt($ch, CURLOPT_POST, 0); // POST switched off
        $buffer = curl_exec($ch); // run the whole process
        curl_close($ch);
        return $buffer;
    }

    /**
     * Scrape the start URL, then keep taking random URLs from `workingurls`
     * until that table is empty. When no start URL is set, scraping resumes
     * directly from the queue.
     *
     * Runs as a loop: one nested call per page would grow the call stack
     * with every page visited.
     * NOTE(review): `finishedurls` is written but never consulted, so a
     * finished page can be queued and scraped again.
     */
    public function startScraping()
    {
        do
        {
            if($this->startURL != NULL)
            {
                // Covers callers that never called setStartPath(), or called it before startURL()
                if($this->startPath == NULL)
                {
                    $this->setStartPath();
                }
                $this->scrapeCurrentPage();
            }

            // Get the new page ready
            $this->startURL = $this->takeNextURL();
            $this->setStartPath();
        } while($this->startURL != NULL);
    }

    // Scrape the page at startURL: store its e-mails, mark it finished, queue its links
    private function scrapeCurrentPage()
    {
        // Get page content
        $pageContent = (string) $this->getContents($this->startURL);
        echo 'Scraping URL: '.$this->startURL.PHP_EOL;

        // Get list of all emails on page
        $results = array();
        preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);
        $insertCount = 0;
        foreach((isset($results[1]) ? $results[1] : array()) as $curEmail)
        {
            // An insert fails for an address that is already stored, so only new ones are counted
            if($this->insertValue('emaillist','emailadd',$curEmail)){$insertCount++;}
        }

        echo 'Emails found: '.number_format($insertCount).PHP_EOL;

        // Mark the page done
        $this->insertValue('finishedurls','urlname',$this->startURL);

        // Collect the links of this page
        $results = array();
        preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
        $currentList = $this->cleanListURLs(isset($results[1]) ? $results[1] : array());

        $insertURLCount = 0;
        foreach($currentList as $curURL)
        {
            if($this->insertValue('workingurls','urlname',$curURL)){$insertURLCount++;}
        }

        echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
    }

    // Insert one value into a single-column table; the value is escaped because it comes from scraped pages
    private function insertValue($table,$column,$value)
    {
        $safeValue = mysql_real_escape_string((string) $value);
        return (bool) mysql_query("INSERT INTO `$table` (`$column`) VALUES ('$safeValue')");
    }

    // Remove a random URL from the working list and return it, or NULL when the list is empty
    private function takeNextURL()
    {
        $result = mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1");
        if(!$result){ return NULL;}

        $row = mysql_fetch_assoc($result);
        if(!$row){ return NULL;}

        $safeURL = mysql_real_escape_string($row['urlname']);
        mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$safeURL' LIMIT 1");

        return $row['urlname'];
    }

    /**
     * Filter a list of href values and make relative ones absolute.
     *
     * Dropped: values of at most one character, anything containing
     * "javascript", fragment links starting with "#", and links whose path
     * ends in one of the skipped extensions. Values starting with "/", "?"
     * or "=" get the start path prepended. Original array keys are kept.
     *
     * @param array $linkList href values
     * @return array cleaned list
     */
    public function cleanListURLs($linkList)
    {
        foreach($linkList as $sub => $url)
        {
            if($this->shouldSkipURL($url))
            {
                // continue, so a rejected relative link is not re-added below
                unset($linkList[$sub]);
                continue;
            }

            // If everything is OK and path is relative, add starting path
            $firstChar = substr($url,0,1);
            if($firstChar == '/' || $firstChar == '?' || $firstChar == '=')
            {
                $linkList[$sub] = $this->startPath.$url;
            }
        }
        return $linkList;
    }

    // Decide whether a link should be left out of the working list
    private function shouldSkipURL($url)
    {
        // Check if only 1 character - there must exist at least / character
        if(strlen($url) <= 1){ return true;}
        // Check for any javascript (case-insensitive, anywhere in the value)
        if(stripos($url,'javascript') !== false){ return true;}
        // If URL starts with #, ignore
        if(substr($url,0,1) == '#'){ return true;}

        // Compare the extension of the URL path only, so ".js" rejects
        // "/app.js?v=2" but not a host such as "www.jsfiddle.net"
        $path = parse_url($url, PHP_URL_PATH);
        if(!is_string($path) || $path === ''){ return false;}

        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if($extension === ''){ return false;}

        return in_array('.'.$extension, array_map('strtolower',$this->allowedExtensions), true);
    }

}

Working on an email extractor

Posted on September 4, 2009 by Aziz

Posted in PHP Leave a comment

Many of you have seen my old email extractor class and maybe even gave it a run. Personally, I never liked that piece of work. As you might guess, I am working on a new email extractor that stores its results in a database table. It is much more general than the previous one and checks for duplicates before inserting. This will make it possible to fork child processes without worrying about data collisions.

I am in the testing phase.

« Previous Page