79
edits
Line 54: | Line 54: | ||
===Anhang: Crawling Magic=== | ===Anhang: Crawling Magic=== | ||
Für Interessierte hier einmal, wie ich StudiVZ crawle. | |||
Facebook funktioniert ziemlich analog; dazu findet man auch genug, wenn man mal danach googelt. Bei Facebook crawle ich übrigens die Mobile-Version, weil ich kein JavaScript unterstützen kann. Es gibt zwar eine Art "noscript-Flag" als GET-Parameter, nur funktioniert das nicht mehr allzu gut. | |||
Außerdem ist die Mobile-Seite wesentlich schneller abzufragen. | |||
<source lang="php" > | <source lang="php" > | ||
// Crawls the StudiVZ friend list of one account:
//   1. log in via the HTTPS login form,
//   2. locate the "Meine Freunde" link on the landing page and extract the
//      per-user hash from its href,
//   3. request the paginated friend pages and print one HTML card per friend,
//   4. follow the "Raus hier" (logout) link found on the landing page.
//
// Expects $login_email, $login_pass and $limit to be defined by the
// surrounding script. Output is written directly with echo.

// One cookie jar for the whole session. Every request (login, friend pages,
// logout) must read and write the same file, otherwise the session cookies
// set at login are not sent along.
$cookieFile = str_replace('\\', '/', dirname(__FILE__)) . '/sv_cookies.txt';

// Options shared by every request of this crawl.
$curlDefaults = array(
    CURLOPT_HEADER         => 0,
    CURLOPT_FOLLOWLOCATION => 1,
    // NOTE(review): certificate verification is switched off, which leaves the
    // login request open to man-in-the-middle attacks. Kept as in the original
    // because enabling it needs a CA bundle on the host -- confirm and enable.
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_COOKIEJAR      => $cookieFile,
    CURLOPT_COOKIEFILE     => $cookieFile,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)",
);

// --- Login -----------------------------------------------------------------
// NOTE(review): formkey and iv are hard-coded values, presumably captured once
// from the login form; verify that the site still accepts them.
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL        => 'https://secure.studivz.net/Login',
    CURLOPT_POST       => 1,
    CURLOPT_POSTFIELDS => 'email='.urlencode($login_email).'&password='.urlencode($login_pass).'&ipRestriction=1&login=Einloggen&jsEnabled=true&formkey=156b5e49efad4132cbe29d20939778376b0a44edc7d74fac47862a82338411a65dc24b586321a542bf7f95592a0371913275f9ceb9708e99bff5c0192f70bfaeccd6cd59c304bda62e9cd2a835ff7786a5ac330bc6a74baa9a177ce9ee6f2cad&iv=8419a4452f139ba570dd856be3b1c522',
) + $curlDefaults);
$html = curl_exec($ch);
$err = curl_errno($ch);
curl_close($ch);

if ($err == 0) {
    sleep(2);

    // Scraped pages are rarely valid HTML; collect parser warnings internally
    // instead of silencing them with "@", and restore the old setting at the end.
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    // --- Find the friend-list hash and the logout link on the landing page ---
    $hash = null;
    $logout = null;
    if (is_string($html) && $html !== '') {
        $dom = new DOMDocument();
        $dom->loadHTML($html);
        $xpath = new DOMXPath($dom);
        $links = $xpath->query("/html/body//a[@title='Meine Freunde']");
        // Only continue when the link exists AND its href has the expected
        // shape; otherwise $matches[1] would be undefined.
        if ($links->length > 0
            && preg_match("/\/Friends\/All\/(.*)\/tid\/103/", $links->item(0)->getAttribute("href"), $matches)
        ) {
            $hash = $matches[1];
            $logout = $xpath->query("//a[text()='Raus hier']");
        }
    }

    if ($hash !== null) {
        // --- Friend pages ---------------------------------------------------
        // Pages are numbered from 1; the loop stops before $limit.
        // NOTE(review): "$j < $limit" fetches $limit - 1 pages -- confirm intended.
        for ($j = 1; $j < $limit; $j++) {
            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_URL => 'http://www.studivz.net/Friends/All/'.$hash.'/p/'.$j,
            ) + $curlDefaults);
            $html = curl_exec($ch);
            curl_close($ch);

            // A failed or empty response has nothing to parse.
            if (!is_string($html) || $html === '') {
                continue;
            }

            $dom = new DOMDocument();
            $dom->loadHTML($html);
            $xpath = new DOMXPath($dom);

            foreach ($xpath->query("/html/body//img") as $img) {
                // Swap the last six characters of the image URL for ".jpg"
                // before handing it to phpThumb (same rewrite as before).
                $thumbSrc = substr_replace($img->getAttribute("src"), ".jpg", -6);

                // Only images served from imagevz.net are profile pictures.
                if (strpos($thumbSrc, "imagevz.net") === false) {
                    continue;
                }

                // Everything below comes from a remote page or from user input,
                // so it is escaped before being written into HTML.
                $parent = $img->parentNode;
                $profileLink = ($parent instanceof DOMElement) ? $parent->getAttribute("href") : '';
                $profileLink = htmlspecialchars($profileLink, ENT_QUOTES, 'UTF-8');
                $name = htmlspecialchars($img->getAttribute("alt"), ENT_QUOTES, 'UTF-8');
                $email = htmlspecialchars($login_email, ENT_QUOTES, 'UTF-8');
                $imgSrc = htmlspecialchars('../phpthumb/phpThumb.php?src='.$thumbSrc.'&w=170&h=170&zc=1', ENT_QUOTES, 'UTF-8');

                echo "<div class='friend'><a href='http://www.studivz.net".$profileLink."' target='_blank'>
<img src='".$imgSrc."' alt='".$name."'></a>
<div class='friend_txt'>
<p>".$name."<br>StudiVZ<br>".$email."</p>
</div>
</div>";
            }
        }

        // --- Logout ---------------------------------------------------------
        // Uses the same StudiVZ cookie jar as the login, so the request
        // actually belongs to the session it is supposed to end.
        if ($logout->length > 0) {
            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_URL => 'http://www.studivz.net'.$logout->item(0)->getAttribute("href"),
            ) + $curlDefaults);
            curl_exec($ch);
            curl_close($ch);
        }
    } else {
        // No usable friend link on the landing page: most likely the login failed.
        echo "<p>Could not find any friends for ".htmlspecialchars($login_email, ENT_QUOTES, 'UTF-8').", maybe wrong password or email!</p>";
    }

    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
}
</source> | </source> |
edits