Tal vez puedas encontrar algo interesante en este código:
Código PHP:
<?php
// Spider bootstrap: load the configuration, identify the crawler to remote
// servers, lift the execution time limit and announce the start time.

// $spiderhost and $spiderday are read below but never assigned in this script,
// so they are presumably defined by this include (which presumably also opens
// the MySQL connection used by the mysql_* calls further down) -- TODO confirm.
require('../includes/config.inc.php');

// The option name must be a quoted string: the bare word `user_agent` is an
// undefined constant (a notice on old PHP, a fatal error on PHP 8).
ini_set('user_agent', "$spiderhost");

// Scale the configured interval by 86400 (seconds per day) so it can later be
// subtracted from a Unix timestamp. Presumably $spiderday is given in days.
$spiderday = $spiderday * 86400;

// A crawl can run for a long time; disable the script time limit.
set_time_limit(0);

$starttime = date("H:i m/d/y");
echo "##### The Spider has started at $starttime, Do Not Close This Console #####\n\n";
// Start the big loop.
//
// Each pass reads every row of `search` still flagged 0, fetches the page,
// refreshes its index entry, harvests same-host links into `indexer`, and
// finally moves those links into `search`. Passes repeat while any row in
// `search` still has checksum '0'.
//
// NOTE(review): checkmetarobots(), hardcleanup(), checkforgarbage(),
// checkrobotstxt() and checkandupdatetoindexer() are defined outside this part
// of the file. Most take no arguments, so they presumably communicate through
// the script's global variables ($robots, $read, $body, $link, $robotay, ...).
// For that reason every variable name used below is kept exactly as it was.
//
// NOTE(review): a row that keeps checksum '0' but is skipped on every pass
// (repeated URL, clean-up failure) would keep this loop running -- confirm
// whether that can happen with real data.

// URL handled by the previous iteration; initialised so the very first
// comparison does not read an undefined variable.
$ourl = null;

do {
    // Open the database and start looking at URLs: select rows with flag 0.
    $sql = mysql_query("SELECT * FROM search WHERE flag='0' ORDER BY date");

    // One iteration per URL.
    while ($rslt = mysql_fetch_array($sql)) {
        $url_id = $rslt["url_id"];   // row id
        $url    = $rslt["url"];      // page address
        $crc    = $rslt["checksum"]; // checksum stored by the previous crawl
        $date   = $rslt["date"];     // date of the previous crawl

        // Same URL as the one that was just processed: skip it.
        if ($url === $ourl) {
            echo "<br>WARNING: Repeditive URL got through: $ourl\n";
            continue;
        }

        // Make an announcement
        echo "<br><br>\n//* Nuevo proceso: $url */\n";

        // Don't go there if you don't have to. The original tested an
        // undefined $flag; the row's own flag column is what is meant.
        if ($rslt["flag"] == 1) {
            echo "<br>This url is already up to date.";
            continue;
        }

        // Open URL for parsing: load the remote page into a variable.
        $read = @file_get_contents($url);
        if ($read === false || $read === '') {
            // The page does not exist or is empty: remove it from the database.
            echo "<br>--Killing off dead URL: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Check for binaries: any byte in the range 14..26 marks the payload
        // as binary. The original `continue` sat inside the scanning loop and
        // skipped its increment, so a binary page made the script spin forever;
        // the scan now only sets a flag and the row is skipped afterwards.
        $isbinary = false;
        for ($ckbin = 14; $ckbin <= 26; ++$ckbin) {
            if (strpos($read, chr($ckbin)) !== false) {
                $isbinary = true;
                break;
            }
        }
        if ($isbinary) {
            // Binary file: remove it from the database.
            echo "<br><br>Killing off binary file URL: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Set date and checksum info
        $checksum = crc32($read);
        $daycheck = time() - $spiderday; // was date(U): bare constant
        $date = strtotime($date);
        if ($date > $daycheck && $crc == $checksum) {
            echo "<br>This url is already up to date\n";
            continue;
        }

        // Get rid of pages whose code starts with "<script".
        // NOTE(review): only the first 8 bytes are compared, so "<script "
        // matches but "<script>" does not -- confirm whether that is intended.
        $firstcheck = trim(strtolower(substr($read, 0, 8)));
        if ($firstcheck == "<script" || $firstcheck == "< script") {
            echo "<br><br>Killing off incompatible file at: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Get meta tags (this fetches the URL a second time). Missing tags
        // fall back to null, which is what the unguarded reads produced.
        $meta = @get_meta_tags($url);
        $robots = isset($meta["robots"]) ? $meta["robots"] : null;
        $keywords = isset($meta["keywords"]) ? $meta["keywords"] : null;
        $keywords = str_replace("'", "`", $keywords);
        $description = isset($meta["description"]) ? $meta["description"] : null;
        $description = str_replace("'", "`", $description);
        if (strlen($description) > 255) {
            $description = substr($description, 0, 255);
        }

        // Check robots meta tags: "noindex" or "none" forbids indexing.
        foreach (array("noindex", "none") as $metarobots) {
            if (checkmetarobots($metarobots)) {
                echo "<br>Indexing disallowed by robots meta tag: $url\n";
                $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
                continue 2; // next URL
            }
        }

        // Get the page title: the text between the first two "title>" markers,
        // minus the trailing "</". preg_split with /i replaces the deprecated
        // spliti() and yields the same pieces.
        $temp = preg_split('/title>/i', $read, 3);
        $title = isset($temp[1]) ? substr($temp[1], 0, -2) : "";
        $title = str_replace("'", "`", $title);
        if (strlen($title) > 128) {
            $title = substr($title, 0, 128);
        }
        if ($title == "") {
            $title = "Documento sin título";
        }

        // Run the cleanup function to parse all the garbage and whitespace out of the code
        if (!hardcleanup()) {
            // Report the page being processed; $link here would still hold a
            // link left over from a previously crawled page.
            echo "<br><br>Clean up error on $url\n";
            continue;
        }

        // Encode stuff and put it in the search database
        echo "<br><b>Updating: $title\n$url\n</b>";
        $title = html_entity_decode($title);
        $description = html_entity_decode($description);
        $body = html_entity_decode($body);

        // Escape the raw values before they are interpolated into SQL.
        // NOTE(review): $body is produced by hardcleanup(), which is not
        // visible here; it is left as-is because it may already be escaped.
        // If it is raw page text it must be escaped the same way.
        $sqlurl = mysql_real_escape_string($url);
        $sqltitle = mysql_real_escape_string($title);
        $sqlkeywords = mysql_real_escape_string($keywords);
        $sqldescription = mysql_real_escape_string($description);
        $renew = @mysql_unbuffered_query("UPDATE search SET url='$sqlurl', title='$sqltitle', metak='$sqlkeywords', metad='$sqldescription', checksum='$checksum', date=CURDATE(), flag=1, body='$body' WHERE url_id='$url_id'");
        if (!$renew) {
            echo "<br><br>NOT UPDATED: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }
        // The statement above already sets flag=1, so the second
        // "UPDATE search SET flag=1" the script used to issue here is dropped.

        // Check robots meta tags: "nofollow" or "none" forbids following links.
        foreach (array("nofollow", "none") as $metarobots) {
            if (checkmetarobots($metarobots)) {
                echo "<br>Following disallowed by robots meta tag: $url\n";
                continue 2; // next URL
            }
        }

        // Parse the main URL into its components (scheme, host, path, query,
        // fragment). Absent components are null, as before, without notices.
        $top = parse_url($url);
        $tschm = isset($top["scheme"]) ? $top["scheme"] : null;
        $thost = isset($top["host"]) ? $top["host"] : null;
        $tpath = isset($top["path"]) ? $top["path"] : null;
        $tqury = isset($top["query"]) ? $top["query"] : null;
        $tfrag = isset($top["fragment"]) ? $top["fragment"] : null;
        $currentdomain = $tschm . "://" . $thost;

        // Load the site's robots.txt as an array of lines (false on failure).
        $getbot = $currentdomain . "/robots.txt";
        $robotay = @file($getbot);

        // Parse all the links on the page
        $rtemp = stristr($read, "<");  // everything from the first "<"
        $temp = stristr($rtemp, "a");  // everything from the first "a" after it
        while ($rtemp) {
            // Parse the href out of the string: take the text between the
            // next pair of double quotes following "href".
            $rtemp = stristr($temp, "href");
            $rtemp = stristr($rtemp, '"');
            $rtemp = substr($rtemp, 1);
            $lpos = strpos($rtemp, '"');
            $link = substr($rtemp, 0, $lpos);
            // Advance the cursor to the next tag before any `continue` below,
            // so every iteration makes progress.
            $temp = stristr($rtemp, "<");
            $link = trim($link);

            // Kill any trailing slashes
            if (substr($link, (strlen($link) - 1)) == "/") {
                $link = substr($link, 0, (strlen($link) - 1));
            }

            // If it just won't fit: skip links longer than 255 characters.
            if (strlen($link) > 255) {
                continue;
            }

            // Skip links the helper rejects.
            if (checkforgarbage()) {
                continue;
            }

            // Parse the current link into its components.
            $bot = @parse_url($link);
            if (!$bot || $bot == "") {
                continue;
            }
            $bschm = isset($bot["scheme"]) ? $bot["scheme"] : null;
            $bhost = trim(urldecode(isset($bot["host"]) ? $bot["host"] : null));
            $bpath = trim(urldecode(isset($bot["path"]) ? $bot["path"] : null));
            $bqury = isset($bot["query"]) ? $bot["query"] : null;
            $bfrag = isset($bot["fragment"]) ? $bot["fragment"] : null;

            // Get rid of outside links
            if ($bhost != "" && $bhost != $thost) {
                continue;
            }

            // Kill off any fragment based URLs
            if (strlen($bfrag) > 0) {
                continue;
            }

            // Kill off any dot dots ../../ and dots ././
            // (the segments are simply removed, not resolved against the path)
            $ddotcheck = substr_count($bpath, "../");
            if ($ddotcheck > 0) {
                $bpath = str_replace("/../", "/", $bpath);
                $bpath = str_replace("../", "/", $bpath);
            }
            $dotcheck = substr_count($bpath, "./");
            if ($dotcheck > 0) {
                $bpath = str_replace("/./", "/", $bpath);
                $bpath = str_replace("./", "/", $bpath);
            }

            // Comparative analysis: make a relative path absolute using the
            // path of the page the link was found on.
            if ($bpath != "" && substr($bpath, 0, 1) != "/") {
                // Page path has no dot: treat it as a directory.
                if (strrpos($tpath, ".") === false) {
                    $bpath = $tpath . "/" . $bpath;
                }
                // Page path has a dot: treat it as a file, use its directory.
                if (strrpos($tpath, ".")) {
                    $ttmp = substr($tpath, 0, (strrpos($tpath, "/") + 1));
                    $bpath = $ttmp . $bpath;
                    if (substr($bpath, 0, 1) != "/") {
                        $bpath = "/" . $bpath;
                    }
                }
            }

            // Kill any trailing slashes
            $link = trim($link);
            if (substr($link, (strlen($link) - 1)) == "/") {
                $link = substr($link, 0, (strlen($link) - 1));
            }

            // Check to see if the scheme and domain are in the url; if the
            // link has no host, rebuild it from the page's scheme and host.
            if ($bhost == "") {
                $link = $thost . $bpath;
                $link = str_replace(" ", "", $link);
                $link = str_replace("//", "/", $link);
                $link = $tschm . "://" . $link;
            }
            $link = urldecode($link);

            // Kill off any remaining query strings
            $kilqu = strpos($link, "?");
            if ($kilqu !== false) {
                $link = substr($link, 0, $kilqu);
                $link = trim(str_replace("?", "", $link));
            }

            // Format the link for inclusion and to avoid stupid looping
            $link = trim(strtolower($link));

            // Kill any trailing slashes
            if (substr($link, (strlen($link) - 1)) == "/") {
                $link = substr($link, 0, (strlen($link) - 1));
            }

            // Don't be overly recursive
            if ($link == $currentdomain) {
                continue;
            }

            // If it's a useless link, kill it
            if ($link == "") {
                continue;
            }

            // Execute robots exclusion standard via robots.txt
            if (checkrobotstxt()) {
                echo "\nDisallowed by robots.txt: $link\n\n";
                continue;
            }

            // Finish it off and prep for the next loop
            if (!checkandupdatetoindexer()) {
                continue;
            }
        }

        // Take the new URLs from the indexer table and put them in the search
        // table. The SELECT is buffered, so the table can be emptied once
        // after the loop instead of on every iteration.
        $movem = mysql_query("SELECT url FROM indexer");
        $moved = 0;
        while ($mvrslt = mysql_fetch_array($movem)) {
            // Value read back from the database is raw: escape it for SQL.
            $murl = mysql_real_escape_string($mvrslt["url"]);
            $putem = mysql_unbuffered_query("INSERT INTO search SET url='$murl'");
            ++$moved;
        }
        if ($moved > 0) {
            $kill = mysql_unbuffered_query("DELETE FROM indexer");
        }

        // Remember this URL for the repeated-URL check above.
        $ourl = $url;
    }

    // Go around again while rows with checksum '0' remain. A buffered query is
    // used because only one row is fetched; an unbuffered result must be fully
    // read before the next query is sent.
    $preloop = mysql_fetch_row(mysql_query("SELECT COUNT(checksum) AS count FROM search WHERE checksum='0'"));
    $loopcount = $preloop ? $preloop[0] : 0;
} while ($loopcount > 0);
// Wrap-up: clear the per-run flag on every row that was marked 1, optimise
// both tables, and print the start/finish summary.
$done = mysql_unbuffered_query(
    "UPDATE search SET flag='0' WHERE flag='1'"
);

print "\n\n<br><br>Optimizing Database...";
$cleans = mysql_query("OPTIMIZE TABLE search");
$cleani = mysql_query("OPTIMIZE TABLE indexer");
print " Done.\n\n";

// Same timestamp format as the start banner.
$endtime = date("H:i m/d/y");
echo "\n\n<br><br>##### Spider started at ", $starttime,
     ", finished at ", $endtime,
     ". #####\n<br>##### You Can Now Close This Console #####\n";