Here is the promised Email scraper class. In 4 hours, I was able to retrieve over 1000 emails!
The package is located at PhpClass.org:
You will need to first create your database using the database.sql file:
-- Schema for the email scraper (contents of database.sql).
-- Each table has a single text column that is also its primary key, so a
-- duplicate INSERT is rejected by the key instead of creating a second row.

-- Email addresses collected by the scraper.
CREATE TABLE IF NOT EXISTS `emaillist` (
    `emailadd` varchar(255) NOT NULL PRIMARY KEY
)
ENGINE=MyISAM
DEFAULT CHARSET=utf8
COMMENT='List of all gotten emails';

-- URLs of pages that have already been scraped.
CREATE TABLE IF NOT EXISTS `finishedurls` (
    `urlname` varchar(255) NOT NULL PRIMARY KEY
)
ENGINE=MyISAM
DEFAULT CHARSET=utf8
COMMENT='List of finished urls';

-- URLs discovered on scraped pages and still waiting to be visited.
CREATE TABLE IF NOT EXISTS `workingurls` (
    `urlname` varchar(255) NOT NULL PRIMARY KEY
)
ENGINE=MyISAM
DEFAULT CHARSET=utf8
COMMENT='List of current working urls';
A Sample implementation:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<?php
// Sample driver: connect to MySQL, then crawl outwards from one start URL.
// The scraper class issues its queries on the default (most recently opened)
// mysql connection, so the connection must be open before scraping starts.
$DB_USER     = 'root';
$DB_PASSWORD = '';
$DB_HOST     = 'localhost';
$DB_NAME     = 'scrape';

// Stop at the first failing step so the reported error is the real cause,
// not a follow-on failure from a later call.
$dbc = mysql_connect($DB_HOST, $DB_USER, $DB_PASSWORD);
if (!$dbc) {
    die(mysql_error());
}
if (!mysql_select_db($DB_NAME, $dbc)) {
    die(mysql_error($dbc));
}
if (!mysql_query("SET NAMES `utf8`", $dbc)) {
    die(mysql_error($dbc));
}

include('scraper.class.php');

$new = new scraper;

// The start URL has to be set first: without it there is nothing to fetch
// and no host from which setStartPath() could derive the start path.
$new->startURL('http://forums.digitalpoint.com');

// Start Path can be empty, which will be extracted from the start URL
$new->setStartPath();
//$new->setStartPath('http://forums.digitalpoint.com');

$new->startScraping();
?>
</body>
</html>
Main Class:
<?php
/*****/
/*
Written by: Aziz S. Hussain
Email: azizsaleh@gmail.com
Website: www.azizsaleh.com
Produced under GPL License
*/
/*****/
/*
Email address scraper based on a URL.

Fetches a page, stores every email address found on it in `emaillist`,
queues the page's links in `workingurls`, records the page in `finishedurls`,
then repeats with a random queued URL until the queue is empty.

All queries go through the mysql_* functions on the default connection, so a
connection must already be open and a database selected before
startScraping() is called.
*/
class scraper
{
    // URL of the page to scrape next; set with startURL(), then advanced by the crawl
    var $startURL;

    // NOTE: despite the name this is a block list. Any link that contains one
    // of these substrings is dropped by cleanListURLs().
    var $allowedExtensions = array('.css', '.xml', '.rss', '.ico', '.js', '.gif', '.jpg', '.jpeg', '.png', '.bmp', '.wmv', '.avi', '.mp3', '.flash', '.swf', '.css');

    // Which URL to scrape (not read or written by any method of this class)
    var $useURL;

    // Start path (scheme + host), prepended to links that are relative
    var $startPath;

    // Set the start path.
    // $path: explicit prefix such as 'http://example.com'. When omitted, the
    //        prefix is derived from the current start URL; if that URL has no
    //        "scheme://host" part the existing start path is left untouched.
    function setStartPath($path = NULL)
    {
        if ($path != NULL) {
            $this->startPath = $path;
            return;
        }

        // 'http://host/dir' splits into array('http:', '', 'host', 'dir')
        $parts = explode('/', (string) $this->startURL);
        if (count($parts) >= 3 && $parts[2] != '') {
            $this->startPath = $parts[0] . '//' . $parts[2];
        }
    }

    // Set the URL the crawl starts from
    function startURL($theURL)
    {
        $this->startURL = $theURL;
    }

    // Download a URL with cURL.
    // Returns the response body as a string, or false when the transfer fails
    // (CURLOPT_FAILONERROR makes HTTP status codes >= 400 count as failures).
    function getContents($url)
    {
        $ch = curl_init(); // initialize curl handle
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_VERBOSE, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
        curl_setopt($ch, CURLOPT_AUTOREFERER, false);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 7);
        curl_setopt($ch, CURLOPT_URL, $url); // URL to request
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // allow redirects
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return into a variable
        curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
        curl_setopt($ch, CURLOPT_POST, 0); // POST disabled: plain GET request
        $buffer = curl_exec($ch); // run the whole process
        curl_close($ch);
        return $buffer;
    }

    // Crawl until the working queue is empty.
    // Starts at the configured start URL; when none is set, it resumes from
    // the URLs already queued in `workingurls`. Implemented as a loop rather
    // than one recursive call per page, so a long crawl cannot exhaust the
    // call stack.
    function startScraping()
    {
        if ((string) $this->startURL === '') {
            $this->startURL = $this->_nextWorkingURL();
        }
        if ($this->startPath == NULL) {
            $this->setStartPath();
        }

        while ((string) $this->startURL !== '') {
            $this->_scrapePage($this->startURL);

            // Get the new page ready
            $this->startURL = $this->_nextWorkingURL();
            if ($this->startURL !== NULL) {
                $this->setStartPath();
            }
        }
    }

    // Scrape one page: store its emails, mark it finished, queue its links
    function _scrapePage($url)
    {
        // Get page content
        $pageContent = $this->getContents($url);
        echo 'Scraping URL: ' . $url . PHP_EOL;
        if ($pageContent === false) {
            // Failed download: treat as an empty page, but still mark it done
            $pageContent = '';
        }

        // Get list of all emails on page
        preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is', $pageContent, $results);

        // Add the emails to the email list; an address that is already stored
        // is rejected by the primary key and therefore not counted
        $insertCount = 0;
        foreach (array_unique($results[1]) as $curEmail) {
            $insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('" . mysql_real_escape_string($curEmail) . "')");
            if ($insert) {
                $insertCount++;
            }
        }
        echo 'Emails found: ' . number_format($insertCount) . PHP_EOL;

        // Mark the page done
        mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('" . mysql_real_escape_string($url) . "')");

        // Get the list of new page URLs linked from this page
        preg_match_all('/href="([^"]+)"/Umis', $pageContent, $results);
        $currentList = $this->cleanListURLs($results[1]);

        // Queue every link that has not been scraped already
        $insertURLCount = 0;
        foreach (array_unique($currentList) as $curURL) {
            if ($this->_isFinished($curURL)) {
                continue;
            }
            $insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('" . mysql_real_escape_string($curURL) . "')");
            if ($insert) {
                $insertURLCount++;
            }
        }
        echo 'URLs found: ' . number_format($insertURLCount) . PHP_EOL;
    }

    // True when the URL is already recorded in `finishedurls`
    function _isFinished($url)
    {
        $result = mysql_query("SELECT 1 FROM `finishedurls` WHERE `urlname`='" . mysql_real_escape_string($url) . "' LIMIT 1");
        if (!$result) {
            return false;
        }
        $found = mysql_num_rows($result) > 0;
        mysql_free_result($result);
        return $found;
    }

    // Take one random URL off the working queue.
    // Returns the URL, or NULL when the queue is empty or the query fails.
    function _nextWorkingURL()
    {
        $result = mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1");
        if (!$result) {
            return NULL;
        }
        $row = mysql_fetch_assoc($result);
        mysql_free_result($result);
        if (!$row) {
            return NULL;
        }

        mysql_query("DELETE FROM `workingurls` WHERE `urlname`='" . mysql_real_escape_string($row['urlname']) . "' LIMIT 1");
        return $row['urlname'];
    }

    // Filter a list of href values down to the links worth crawling.
    // Rejected entries are removed (array keys of the survivors are kept);
    // links starting with '/', '?' or '=' get the start path prepended.
    function cleanListURLs($linkList)
    {
        foreach ($linkList as $sub => $url) {
            // Check if only 1 character - there must be more than a lone '/'
            if (strlen($url) <= 1) {
                unset($linkList[$sub]);
                continue;
            }

            // Check for any javascript
            if (stripos($url, 'javascript') !== false) {
                unset($linkList[$sub]);
                continue;
            }

            // Check for blocked extensions (substring match anywhere in the URL)
            str_replace($this->allowedExtensions, '', $url, $count);
            if ($count > 0) {
                unset($linkList[$sub]);
                continue;
            }

            // If URL starts with #, ignore
            $firstChar = substr($url, 0, 1);
            if ($firstChar == '#') {
                unset($linkList[$sub]);
                continue;
            }

            // If everything is OK and path is relative, add starting path.
            // Only reached for links that survived every check above, so a
            // rejected link can no longer be re-added here.
            if ($firstChar == '/' || $firstChar == '?' || $firstChar == '=') {
                $linkList[$sub] = $this->startPath . $url;
            }
        }
        return $linkList;
    }
}
I want to fetch the email IDs from a URL, but I have failed to do this. I need your help.