IFD:WebApps/Felix Trojan: Difference between revisions

From Medien Wiki
Line 54: Line 54:


===Anhang: Crawling Magic===
===Anhang: Crawling Magic===
Für Interessierte hier einmal, wie ich StudiVZ crawle.
Facebook funktioniert ziemlich analog; dazu findet man auch genug, wenn man mal danach googelt. Bei Facebook crawle ich übrigens die Mobile-Version, weil ich kein JavaScript unterstützen kann. Es gibt zwar eine Art "noscript-Flag" als GET-Parameter, nur das funktioniert nicht mehr allzu gut.
Außerdem ist die mobile Seite wesentlich schneller abzufragen.


<source lang="php" >
<source lang="php" >
<?php
/*
 * StudiVZ friend-list crawler.
 *
 * Logs in with cURL, reads the "Meine Freunde" link from the landing page to
 * obtain the friend-list hash, fetches the friend-list pages 1 .. $limit-1,
 * echoes one <div class='friend'> per profile picture hosted on imagevz.net,
 * and finally follows the "Raus hier" (logout) link.
 *
 * Expects $login_email, $login_pass and $limit to be set by the surrounding
 * script (they are not defined in this snippet).
 */

// One cookie jar for the whole session; the logout request must use the same
// jar as the login, otherwise it is sent without the session cookies.
$sv_cookie_file = str_replace('\\', '/', dirname(__FILE__)) . '/sv_cookies.txt';
$sv_user_agent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)";

// Performs a GET (or a POST when $postFields is given) and returns
// array(response body or false, cURL error number).
$sv_request = function ($url, $postFields = null) use ($sv_cookie_file, $sv_user_agent) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    if ($postFields !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        curl_setopt($ch, CURLOPT_POST, 1);
    }
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // Credentials travel over this connection, so the peer certificate is
    // verified (the cURL default) instead of being switched off.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $sv_cookie_file);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $sv_cookie_file);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, $sv_user_agent);
    $body = curl_exec($ch);
    $errno = curl_errno($ch);
    curl_close($ch);
    return array($body, $errno);
};

// Parses an HTML string and returns a DOMXPath for it, or null when there is
// nothing to parse (failed request or empty body).
$sv_parse = function ($markup) {
    if (!is_string($markup) || $markup === '') {
        return null;
    }
    $dom = new DOMDocument();
    $dom->loadHTML($markup);
    libxml_clear_errors();
    return new DOMXPath($dom);
};

// Escapes a value for use in HTML text or inside a quoted attribute.
$sv_esc = function ($value) {
    return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
};

// NOTE(review): formkey and iv are hard-coded; they look like per-form tokens
// and may need to be scraped from the login page first -- confirm.
list($html, $err) = $sv_request(
    'https://secure.studivz.net/Login',
    'email='.urlencode($login_email).'&password='.urlencode($login_pass).'&ipRestriction=1&login=Einloggen&jsEnabled=true&formkey=156b5e49efad4132cbe29d20939778376b0a44edc7d74fac47862a82338411a65dc24b586321a542bf7f95592a0371913275f9ceb9708e99bff5c0192f70bfaeccd6cd59c304bda62e9cd2a835ff7786a5ac330bc6a74baa9a177ce9ee6f2cad&iv=8419a4452f139ba570dd856be3b1c522'
);

if ($err == 0) {
    sleep(2);

    // Collect markup problems internally instead of silencing them with "@";
    // the previous setting is restored at the end.
    $sv_prev_libxml = libxml_use_internal_errors(true);

    // Get friend link and logout link from the page returned by the login.
    $hash = null;
    $logout_href = null;
    $xpath = $sv_parse($html);
    if ($xpath !== null) {
        $links = $xpath->query("/html/body//a[@title='Meine Freunde']");
        if ($links->length > 0
            && preg_match('#/Friends/All/(.*)/tid/103#', $links->item(0)->getAttribute("href"), $matches) === 1
        ) {
            $hash = $matches[1];
        }
        $logout = $xpath->query("//a[text()='Raus hier']");
        if ($logout->length > 0) {
            $logout_href = $logout->item(0)->getAttribute("href");
        }
    }

    if ($hash !== null) {
        // Get friends, one result page per iteration.
        for ($j = 1; $j < $limit; $j++) {
            list($html, ) = $sv_request('http://www.studivz.net/Friends/All/'.$hash.'/p/'.$j);
            $xpath = $sv_parse($html);
            if ($xpath === null) {
                // Request failed or returned nothing: no pictures on this page.
                continue;
            }
            $imgs = $xpath->query("/html/body//img");
            foreach ($imgs as $img) {
                // Replace the last six characters of the source with ".jpg".
                $src = substr_replace($img->getAttribute("src"), ".jpg", -6);
                if (strpos($src, "imagevz.net") === false) {
                    continue; // not hosted on imagevz.net, skip
                }
                $name = $img->getAttribute("alt");
                $parent = $img->parentNode;
                $profileLink = ($parent instanceof DOMElement) ? $parent->getAttribute("href") : '';
                // The image URL is a query-string value, so it is URL-encoded
                // before the whole thumbnail URL is HTML-escaped.
                $thumb = '../phpthumb/phpThumb.php?src='.rawurlencode($src).'&w=170&h=170&zc=1';
                echo "<div class='friend'><a href='".$sv_esc('http://www.studivz.net'.$profileLink)."' target='_blank'>\n"
                    ."<img src='".$sv_esc($thumb)."' alt='".$sv_esc($name)."'></a>\n"
                    ."<div class='friend_txt'>\n"
                    ."<p>".$sv_esc($name)."<br>StudiVZ<br>".$sv_esc($login_email)."</p>\n"
                    ."</div>\n"
                    ."</div>";
            }
        }
    } else {
        // No usable friend link on the page returned by the login request.
        echo "<p>Could not find any friends for ".$sv_esc($login_email).", maybe wrong password or email!</p>";
    }

    // Log out whenever the landing page offered a logout link.
    if ($logout_href !== null) {
        $sv_request('http://www.studivz.net'.$logout_href);
    }

    libxml_use_internal_errors($sv_prev_libxml);
} else {
    // Page output stays empty as before; the failure is only logged.
    error_log('StudiVZ login request failed, cURL errno '.$err);
}
</source>
</source>