Foros del Web - Ver Mensaje Individual - [APORTE] file_get_contents(), cURL, HTTP_Request

manuel__7 · #97 (**permalink**) 23/05/2012, 15:42

Cita:

Iniciado por abimaelrc

Para poder manipular los links de lo que hayamos obtenido usando cualquiera de los métodos mencionados podemos usar este código. En este caso usaré file_get_contents()

Código PHP:

Ver original<?php
// Download the page whose links will be listed. file_get_contents()
// returns false (and raises a warning) when the request fails.
$pageUrl = 'http://www.example.com/';
$html = file_get_contents($pageUrl);
/**
 * Extract every hyperlink (<a ... href=...>text</a>) from an HTML string.
 *
 * This is a regex-based scan, not a real HTML parser: href values that
 * contain spaces are cut at the first space, and nested anchors are not
 * supported.
 *
 * @param string $html Markup to scan. Anything that is not a non-empty string
 *                     (for example false from a failed download) yields no links.
 * @return array List of array(href, inner HTML of the anchor) pairs, in document order.
 */
function extract_links($html) {
    if (!is_string($html) || $html === '') {
        return array();
    }

    // - "(?:[^>]*?\s)?" keeps the attribute scan inside the opening tag and
    //   requires whitespace before "href", so "data-href" is not mistaken for it.
    // - The "s" modifier lets "." match newlines, so anchors whose text spans
    //   several lines are found too.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']?([^"\' >]*)["\']?[^>]*>(.*?)<\/a>/is';

    // preg_match_all() returns 0 when nothing matches and false on a PCRE error;
    // both cases mean there is nothing to return.
    if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
        return array();
    }

    $links = array();
    foreach ($matches as $match) {
        // $match[1] is the href value, $match[2] the content between <a> and </a>.
        $links[] = array($match[1], $match[2]);
    }
    return $links;
}
// Collect the links of the downloaded page and print each href on its own line.
$links = extract_links($html);
foreach ($links as $entry) {
    // Index 0 holds the href; index 1 (the anchor content) is not printed.
    echo $entry[0], PHP_EOL;
}
?>

Para bajar un archivo usando cURL se puede lograr de esta forma

Código PHP:

Ver original<?php
// Download a remote file with cURL, streaming it straight to disk.
// The file is saved under its base name in the current working directory.
$url = 'http://www.example.com/hola.zip';

$g = basename($url);

// Nothing to do when a file with that name already exists.
if (!is_file($g)) {
    $fp = fopen($g, 'w');

    if ($fp === false) {
        trigger_error('Cannot open ' . $g . ' for writing', E_USER_WARNING);
    } else {
        $ch = curl_init($url);
        // Write the response body directly into the open file handle.
        curl_setopt($ch, CURLOPT_FILE, $fp);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
        // Treat HTTP status >= 400 as a failure instead of saving the error page.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        $ok = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        fclose($fp);

        if ($ok === false) {
            // Remove the empty/partial file; otherwise the is_file() guard
            // above would skip the download on every later run.
            unlink($g);
            trigger_error('Download of ' . $url . ' failed: ' . $error, E_USER_WARNING);
        }
    }
}

Para bajar un archivo usando file_get_contents se puede lograr de esta forma

Código PHP:

Ver original<?php
// Download a remote file fully into memory and save it under its base name
// in the current working directory.
$url = "http://www.example.com/zip.zip";
$g = basename($url);
$content = file_get_contents($url);
// file_get_contents() returns false on failure; writing that value would
// leave an empty file behind, so only a successful download is saved.
if ($content !== false) {
    file_put_contents($g, $content);
}

Para leer un sitio web que verifica si es un navegador o no el que trata de ver la página web y solo despliega la información si es un navegador el que visita el sitio, puedes tratar el siguiente código

file_get_contents

Código PHP:

Ver original<?php
// Request a page through the HTTP stream wrapper while sending a
// browser-like User-Agent header, then output the response body.
$userAgentHeader = 'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6' . PHP_EOL;

$context = stream_context_create(array(
    'http' => array('header' => $userAgentHeader),
));

$page = file_get_contents('http://www.example.com', false, $context);
print $page;

cURL

Código PHP:

Ver original<?php
// Fetch a page with cURL while sending a browser-like User-Agent,
// then output the response body.
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6');
curl_setopt($ch, CURLOPT_URL, 'http://example.com');
// Make curl_exec() return the body instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$page = curl_exec($ch);
if ($page === false) {
    // curl_exec() fails silently; report the reason before the handle is closed.
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
echo $page;

Un ejemplo lo pueden ver en este tema [url]http://www.forosdelweb.com/f18/enigma-con-paginasamarillas-500-internal-server-error-833165/[/url]

Enviar petición, con la mayoría de las cabeceras que envían los navegadores

file_get_contents

Código PHP:

Ver original<?php
// Request a page through the HTTP stream wrapper, sending most of the
// headers a browser would send, then output the response body.
$browserHeaders = array(
    'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
    'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Cache-Control: max-age=0',
    'Connection: keep-alive',
    'Keep-Alive: 300',
    'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept-Language: en-us,en;q=0.5',
    'Pragma: ', // sent with an empty value
);

// The "header" option of the http wrapper is given the list of header lines.
$httpOptions = array('header' => $browserHeaders);
$context = stream_context_create(array('http' => $httpOptions));

$page = file_get_contents('http://www.example.com', false, $context);
print $page;

cURL

Código PHP:

Ver original<?php
// Fetch a page with cURL, sending most of the headers a browser would send,
// then output the response body.
// The list is created explicitly so headers from an earlier $header variable
// can never leak into this request.
$header = array(
    "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
    "Cache-Control: max-age=0",
    "Connection: keep-alive",
    "Keep-Alive: 300",
    "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
    "Accept-Language: en-us,en;q=0.5",
    "Pragma: ", // browsers keep this blank.
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://www.example.com');
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4) Gecko/20030624 Netscape/7.1 (ax)');
// Make curl_exec() return the body instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
// Follow HTTP redirects (Location headers).
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$page = curl_exec($ch);
if ($page === false) {
    // curl_exec() fails silently; report the reason before the handle is closed.
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
echo $page;

Para autenticar, por ejemplo htpasswd se puede lograr de esta forma

Código PHP:

Ver original<?php
// Fetch a page protected by HTTP Basic authentication (e.g. htpasswd)
// with cURL, then output the response body.
$url = "http://example.com/authenticate.php";
$curl = curl_init();
curl_setopt($curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
// Credentials in "user:password" form; replace with real values.
curl_setopt($curl, CURLOPT_USERPWD, "user:pass");
// Make curl_exec() return the body instead of printing it directly.
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, $url);
$data = curl_exec($curl);
if ($data === false) {
    // curl_exec() fails silently; report the reason before the handle is closed.
    trigger_error('cURL request failed: ' . curl_error($curl), E_USER_WARNING);
}
curl_close($curl);
echo $data;

Este fue tomado del siguiente tema [url]http://www.forosdelweb.com/f18/obtener-sitioweb-mediante-curl-metodos-http-956656/[/url]

He avanzado mucho, pero aún no consigo lograrlo

1.- Este código obtiene los enlaces de www.ejemplo.com de acuerdo a la etiqueta <a href=""></a>:

Código PHP:

  <?php

// The URL must include its scheme: without "http://" file_get_contents()
// treats the argument as a local file name and the download never happens.
$html = file_get_contents('http://www.ejemplo.com/');

/**
 * Extract every hyperlink (<a ... href=...>text</a>) from an HTML string.
 *
 * This is a regex-based scan, not a real HTML parser: href values that
 * contain spaces are cut at the first space, and nested anchors are not
 * supported.
 *
 * @param string $html Markup to scan. Anything that is not a non-empty string
 *                     (for example false from a failed download) yields no links.
 * @return array List of array(href, inner HTML of the anchor) pairs, in document order.
 */
function extract_links($html) {
    if (!is_string($html) || $html === '') {
        return array();
    }

    // - "(?:[^>]*?\s)?" keeps the attribute scan inside the opening tag and
    //   requires whitespace before "href", so "data-href" is not mistaken for it.
    // - The "s" modifier lets "." match newlines, so anchors whose text spans
    //   several lines are found too.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']?([^"\' >]*)["\']?[^>]*>(.*?)<\/a>/is';

    // preg_match_all() returns 0 when nothing matches and false on a PCRE error;
    // both cases mean there is nothing to return.
    if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
        return array();
    }

    $links = array();
    foreach ($matches as $match) {
        // $match[1] is the href value, $match[2] the content between <a> and </a>.
        $links[] = array($match[1], $match[2]);
    }
    return $links;
}

// Collect the links of the downloaded page and print each href on its own line.
$links = extract_links($html);
foreach ($links as $entry) {
    // Index 0 holds the href; index 1 (the anchor content) is not printed.
    echo $entry[0], PHP_EOL;
}

?>

pero, ¿cómo hago para obtener otro tipo de etiquetas como input, img, etc.?

2.- Esto me mostrará todos los <a href=""></a> existentes en dicha página, pero cuando la página (www.ejemplo.com) tiene un iframe que llama a otra página que le provee enlaces aleatorios, ¿cómo hago para obtener también esos enlaces (<a href=""></a>) desde www.ejemplo.com?

¡Sabiendo esto, estaría muy cerca de lo que desearía hacer!