JOHNYS22031990 Δημοσ. 12 Μαΐου 2014 Δημοσ. 12 Μαΐου 2014 εχω αυτο το κομματι του κωδικα $url_to_crawl = $argv[1];//auto to exo antikatastisei me to url pou thelo na kano crawl $depth = isset($argv[2])?$argv[2]:3; apla dn katalavano ti prepi na grapso edo anti gia auto(isset($argv[2])?$argv[2]:3; ) giati to trexo kai dn proxoraei sto vathos tis selidas ti einai to $argv[2]
theodoros8 Δημοσ. 12 Μαΐου 2014 Δημοσ. 12 Μαΐου 2014 Λογικα το argv[2] ειναι η ιδια url γιατι μετα απο αυτο εχει ? αρα μετα πανε οι παραμετροι της url πχ google.com/index.php?id=1
defacer Δημοσ. 12 Μαΐου 2014 Δημοσ. 12 Μαΐου 2014 Φαντάσου να υπήρχε ένα site που να μπορούσες να γράψεις "php argv" και να σου πει πού μπορείς να διαβάσεις τι είναι αυτό. Θα ήταν μεγάλη διευκόλυνση. 2
JOHNYS22031990 Δημοσ. 12 Μαΐου 2014 Μέλος Δημοσ. 12 Μαΐου 2014 ναι εχω αντικατστασει το argv[1] me ena url pou pairno apo mia forma prepei na antikatastiso to argv alla ti na grapso
theodoros8 Δημοσ. 12 Μαΐου 2014 Δημοσ. 12 Μαΐου 2014 νομιζω πως πρεπει να μας δωσεις λιγο περισσοτερο κωδικα, ετσι οπως τα λες δεν μπορω να καταλαβω!
georgemarios Δημοσ. 12 Μαΐου 2014 Δημοσ. 12 Μαΐου 2014 αυτο... $depth = isset($argv[2])?$argv[2]:3; ....ισοδυναμει με αυτο.... if (isset($argv[2])) $depth = $argv[2]; else $depth = 3; και λεγεται ternary operator (google for it) Με 2 κουβεντες, αν εχει καπου οριστει το argv[2] να κουβαλαει το επιθυμητο depth, χρησιμοποιησε το αλλιως βαλε την default τιμη 3. Προφανως, αφου δεν υπαρχει ουτε το argv[1] (το αντικατεστησες με το δικο σου url -- φανταζομαι hardcoded), ετσι θα αντικαταστησεις και το argv[2] με το βαθος που θες......
JOHNYS22031990 Δημοσ. 12 Μαΐου 2014 Μέλος Δημοσ. 12 Μαΐου 2014 oriste <?php require_once 'goutte.phar'; use Goutte\Client; class simpleCrawler { private $base_url; private $site_links; private $max_depth; public function __construct($base_url, $max_depth = 10) { if (strpos($base_url, 'http') === false) { // http protocol not included, prepend it to the base url $base_url = 'http://' . $base_url; } $this->base_url = $base_url; $this->site_links = array(); $this->max_depth = $max_depth; } /** * checks the uri if can be crawled or not * in order to prevent links like "javascript:void(0)" or "#something" from being crawled again * @param string $uri * @return boolean */ protected function checkIfCrawlable($uri) { if (empty($uri)) { return false; } $stop_links = array(//returned deadlinks '@^javascript\:void\(0\)$@', '@^#.*@', ); foreach ($stop_links as $ptrn) { if (preg_match($ptrn, $uri)) { return false; } } return true; } /** * normalize link before visiting it * currently just remove url hash from the string * @param string $uri * @return string */ protected function normalizeLink($uri) { $uri = preg_replace('@#.*$@', '', $uri); return $uri; } /** * initiate the crawling mechanism on all links * @param string $url_to_traverse */ public function traverse($url_to_traverse = null) { if (is_null($url_to_traverse)) { $url_to_traverse = $this->base_url; $this->site_links[$url_to_traverse] = array(//initialize first element in the site_links 'links_text' => array("BASE_URL"), 'absolute_url' => $url_to_traverse, 'frequency' => 1, 'visited' => false, 'external_link' => false, 'original_urls' => array($url_to_traverse), ); } $this->_traverseSingle($url_to_traverse, $this->max_depth); } /** * crawling single url after checking the depth value * @param string $url_to_traverse * @param int $depth */ protected function _traverseSingle($url_to_traverse, $depth) { //echo $url_to_traverse . chr(10); try { $client = new Client(); $crawler = $client->request('GET', $url_to_traverse); $status_code = $client->getResponse()->getStatus(); $this->site_links[$url_to_traverse]['status_code'] = $status_code; if ($status_code == 200) { // valid url and not reached depth limit yet $this->extractTitleInfo($crawler, $url_to_traverse); $current_links = array(); if (@$this->site_links[$url_to_traverse]['external_link'] == false) { // for internal uris, get all links inside $current_links = $this->extractLinksInfo($crawler, $url_to_traverse); } $this->site_links[$url_to_traverse]['visited'] = true; // mark current url as visited $this->traverseChildLinks($current_links, $depth - 1); } } catch (Guzzle\Http\Exception\CurlException $ex) { error_log("CURL exception: " . $url_to_traverse); $this->site_links[$url_to_traverse]['status_code'] = '404'; } catch (Exception $ex) { error_log("error retrieving data from link: " . $url_to_traverse); $this->site_links[$url_to_traverse]['status_code'] = '404'; } } /** * after checking the depth limit of the links array passed * check if the link if the link is not visited/traversed yet, in order to traverse * @param array $current_links * @param int $depth */ protected function traverseChildLinks($current_links, $depth) { if ($depth == 0) { return; } foreach ($current_links as $uri => $info) { if (!isset($this->site_links[$uri])) { $this->site_links[$uri] = $info; } else{ $this->site_links[$uri]['original_urls'] = isset($this->site_links[$uri]['original_urls'])?array_merge($this->site_links[$uri]['original_urls'], $info['original_urls']):$info['original_urls']; $this->site_links[$uri]['links_text'] = isset($this->site_links[$uri]['links_text'])?array_merge($this->site_links[$uri]['links_text'], $info['links_text']):$info['links_text']; if(@$this->site_links[$uri]['visited']) { //already visited link) $this->site_links[$uri]['frequency'] = @$this->site_links[$uri]['frequency'] + @$info['frequency']; } } if (!empty($uri) && !$this->site_links[$uri]['visited'] && !isset($this->site_links[$uri]['dont_visit']) ) { //traverse those that not visited yet $this->_traverseSingle($this->normalizeLink($current_links[$uri]['absolute_url']), $depth); } } } /** * extracting all <a> tags in the crawled document, * and return an array containing information about links like: uri, absolute_url, frequency in document * @param Symfony\Component\DomCrawler\Crawler $crawler * @param string $url_to_traverse * @return array */ protected function extractLinksInfo(Symfony\Component\DomCrawler\Crawler &$crawler, $url_to_traverse) { $current_links = array(); $crawler->filter('a')->each(function(Symfony\Component\DomCrawler\Crawler $node, $i) use (&$current_links) { $node_text = trim($node->text()); $node_url = $node->attr('href'); $hash = $this->normalizeLink($node_url); if (!isset($this->site_links[$hash])) { $current_links[$hash]['original_urls'][$node_url] = $node_url; $current_links[$hash]['links_text'][$node_text] = $node_text; if (!$this->checkIfCrawlable($node_url)){ }elseif (!preg_match("@^http(s)?@", $node_url)) { //not absolute link $current_links[$hash]['absolute_url'] = $this->base_url . $node_url; } else { $current_links[$hash]['absolute_url'] = $node_url; } if (!$this->checkIfCrawlable($node_url)) { $current_links[$hash]['dont_visit'] = true; $current_links[$hash]['external_link'] = false; } elseif ($this->checkIfExternal($current_links[$hash]['absolute_url'])) { // mark external url as marked $current_links[$hash]['external_link'] = true; } else { $current_links[$hash]['external_link'] = false; } $current_links[$hash]['visited'] = false; $current_links[$hash]['frequency'] = isset($current_links[$hash]['frequency']) ? $current_links[$hash]['frequency']++ : 1; // increase the counter } }); if (isset($current_links[$url_to_traverse])) { // if page is linked to itself, ex. homepage $current_links[$url_to_traverse]['visited'] = true; // avoid cyclic loop } return $current_links; } /** * extract information about document title, and h1 * @param Symfony\Component\DomCrawler\Crawler $crawler * @param string $uri */ protected function extractTitleInfo(Symfony\Component\DomCrawler\Crawler &$crawler, $url) { $this->site_links[$url]['title'] = trim($crawler->filterXPath('html/head/title')->text()); $h1_count = $crawler->filter('h1')->count(); $this->site_links[$url]['h1_count'] = $h1_count; $this->site_links[$url]['h1_contents'] = array(); if ($h1_count) { $crawler->filter('h1')->each(function(Symfony\Component\DomCrawler\Crawler $node, $i) use($url) { $this->site_links[$url]['h1_contents'][$i] = trim($node->text()); }); } } /** * getting information about links crawled * @return array */ public function getLinksInfo() { return $this->site_links; } /** * check if the link leads to external site or not * @param string $url * @return boolean */ public function checkIfExternal($url) { $base_url_trimmed = str_replace(array('http://', 'https://'), '', $this->base_url); if (preg_match("@http(s)?\://$base_url_trimmed@", $url)) { //base url is not the first portion of the url return false; } else { return true; } } } error_reporting(E_ALL); set_time_limit(300); //include_once ('../src/SimpleCrawler.php'); $url_to_traverse = $_POST['url']; $client = new Client(); $crawler = $client->request('POST', $url_to_traverse); //$url_to_crawl = $argv[1]; $depth = isset($argv[2])?$argv[2]:3; if($url_to_traverse){ echo "Begin crawling ".$url_to_traverse.' with links in depth '.$max_depth.chr(10); $start_time = time(); $simple_crawler = new simpleCrawler($url_to_traverse, $max_depth); $simple_crawler->traverse(); $links_data = $simple_crawler->getLinksInfo(); $end_time = time(); $duration = $end_time - $start_time; echo 'crawling approximate duration, '.$duration.' seconds'.chr(10); echo count($links_data)." unique links found".chr(10); $con=mysqli_connect("localhost","root","");//sindesi // Check connection if (mysqli_connect_errno()) { echo "Failed to connect to MySQL: " . mysqli_connect_error(); } // Create database $sql="CREATE DATABASE phpcrawl"; if (mysqli_query($con,$sql)) { echo "Database my_db created successfully"; } else { echo "Error creating database: " . mysqli_error($con); } foreach($links_data as $uri=>$info){ if(!isset($info['status_code'])){ $info['status_code']=000;//tmp } $h1_contents = implode("\n\r", isset($info['h1_contents'])?$info['h1_contents']:array() ); $original_urls = implode('\n\r', isset($info['original_urls'])?$info['original_urls']:array() ); $links_text = implode('\n\r', isset($info['links_text'])?$info['links_text']:array() ); $is_external = $info['external_link']?'1':'0'; $title = @$info['title']; $h1_count = isset($info['h1_count'])?$info['h1_count']:0; $sql_query = "insert into pages_crawled(url, frequency, status_code, is_external, title, h1_count, h1_content, source_link_text, original_urls) values('$uri', {$info['frequency']}, {$info['status_code']}, {$is_external}, '{$title}', {$h1_count}, '$h1_contents', '$links_text', '$original_urls')"; if (mysqli_query($con,$sql)) { echo " table1 created successfully"; } else { echo "Error creating table: " . mysqli_error($con); } } } ?> αυτο που εχω αντικαταστισει ειναι στη γραμμη 244 και απο πανω ειναι η αντικατασταση απο το 240-243 αυτο που θελω να αλλαξω ειναι το απο κατω στην γραμμη 245 δηλαδη θα βαλω αντι για argv[2].to $depth=2
Προτεινόμενες αναρτήσεις
Δημιουργήστε ένα λογαριασμό ή συνδεθείτε για να σχολιάσετε
Πρέπει να είστε μέλος για να αφήσετε σχόλιο
Δημιουργία λογαριασμού
Εγγραφείτε με νέο λογαριασμό στην κοινότητα μας. Είναι πανεύκολο!
Δημιουργία νέου λογαριασμούΣύνδεση
Έχετε ήδη λογαριασμό; Συνδεθείτε εδώ.
Συνδεθείτε τώρα