From efc189351d2f6e1feacb00050d6778201333f19d Mon Sep 17 00:00:00 2001
From: Banana
Date: Thu, 10 Oct 2024 00:42:52 +0200
Subject: [PATCH] adding stats to crawler

Signed-off-by: Banana
---
 crawler/cleanup.pl       |  7 ++++---
 crawler/fetch.pl         | 25 +++++++++++++++++++++++++
 crawler/parse-results.pl | 23 +++++++++++++++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl
index df73b06..e4d500d 100644
--- a/crawler/cleanup.pl
+++ b/crawler/cleanup.pl
@@ -85,7 +85,6 @@ sayLog($queryStr) if $DEBUG;
 $query = $dbh->prepare($queryStr);
 foreach my $invalidId (@invalidUrls) {
     $query->execute($invalidId);
-    #$query->finish();
     sayLog "Removed $invalidId from unique_domain" if $DEBUG;
 }
 sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
@@ -104,7 +103,6 @@ while(my @row = $query->fetchrow_array) {
     my $baseUrl = $row[1];
     push(@toBeDeletedFromFetchAgain, $baseUrl);
 }
-#$query->finish();
 
 sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
 $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
@@ -112,7 +110,6 @@ sayLog($queryStr) if $DEBUG;
 $query = $dbh->prepare($queryStr);
 foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
     $query->execute($baseUrl);
-    #$query->finish();
     sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
 }
 sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
@@ -133,5 +130,9 @@ $query = $dbh->prepare($queryStr);
 $query->execute();
 sayYellow "Remove invalid urls done";
 
+$queryStr = "INSERT INTO `stats` SET `action` = 'cleanup', `value` = NOW()
+                ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
 
 sayGreen "Cleanup complete";
diff --git a/crawler/fetch.pl b/crawler/fetch.pl
index 3a57d7c..3c32b9d 100644
--- a/crawler/fetch.pl
+++ b/crawler/fetch.pl
@@ -80,6 +80,8 @@ $ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
 
 ## now loop over them and store the results
 my $counter = 0;
+my $allFetched = 0;
+my $allFailed = 0;
 while ( my ($id, $url) = each %urlsToFetch ) {
     sayYellow "Fetching: $id $url";
 
@@ -89,22 +91,26 @@ while ( my ($id, $url) = each %urlsToFetch ) {
         # callback tells us to stop
         if($res->header('Client-Aborted')) {
             sayYellow "Aborted, too big.";
+            $allFailed++;
             next;
         }
         if(index($res->content_type, "text/html") == -1) {
             sayYellow "Fetching: $id ignored. Not html";
             push(@urlsFailed, $id);
+            $allFailed++;
             next;
         }
         open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
         print $fh $res->decoded_content();
         close($fh);
         push(@urlsFetched, $id);
+        $allFetched++;
         sayGreen"Fetching: $id ok";
     }
     else {
         sayRed "Fetching: $id failed: $res->code ".$res->status_line;
         push(@urlsFailed, $id);
+        $allFailed++;
     }
 
     if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
@@ -122,6 +128,25 @@ while ( my ($id, $url) = each %urlsToFetch ) {
 updateFetched($dbh, @urlsFetched);
 updateFailed($dbh, @urlsFailed);
 
+# some stats stuff
+my $queryStr = "INSERT INTO `stats` SET `action` = 'fetch', `value` = NOW()
+                ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+
+$queryStr = "INSERT INTO `stats` SET `action` = 'fetchfailed', `value` = '".$allFailed."'
+                ON DUPLICATE KEY UPDATE `value` = '".$allFailed."'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+
+$queryStr = "INSERT INTO `stats` SET `action` = 'fetchsuccess', `value` = '$allFetched'
+                ON DUPLICATE KEY UPDATE `value` = '$allFetched'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+
+$dbh->commit();
+
+# end
 $dbh->disconnect();
 
 sayGreen "Fetch complete";
diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl
index c91efd3..9583ee9 100644
--- a/crawler/parse-results.pl
+++ b/crawler/parse-results.pl
@@ -130,6 +130,12 @@ foreach my $resultFile (@results) {
 @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
 insertIntoDb($dbh, \@links);
 
+$queryStr = "INSERT INTO `stats` SET `action` = 'parse', `value` = NOW()
+                ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+$dbh->commit();
+
 $dbh->disconnect();
 
 sayGreen "Parse complete";
@@ -166,18 +172,22 @@ sub insertIntoDb {
     $query = $dbh->prepare($queryStr);
     my $md5 = Digest::MD5->new;
     my $counter = 0;
+    my $allLinks = 0;
+    my $allFailedLinks = 0;
 
     foreach my $link (@links) {
         sayLog $link if ($DEBUG);
 
         if(!is_uri($link)) {
             sayYellow "Ignore URL it is invalid: $link";
+            $allFailedLinks++;
             next;
         }
 
         my $url = url($link);
         if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
             sayYellow "Ignore URL because of scheme: $link";
+            $allFailedLinks++;
             next;
         }
 
@@ -187,6 +197,7 @@ sub insertIntoDb {
         $md5->reset;
 
         $counter++;
+        $allLinks++;
 
         if($counter >= 500) {
             $counter = 0;
@@ -203,5 +214,17 @@ sub insertIntoDb {
         #sayLog "Inserted: $link" if($DEBUG);
     }
     sayYellow "Final commit";
+
+    # stats stuff
+    $queryStr = "INSERT INTO `stats` SET `action` = 'parsesuccess', `value` = '$allLinks'
+                ON DUPLICATE KEY UPDATE `value` = '$allLinks'";
+    $query = $dbh->prepare($queryStr);
+    $query->execute();
+
+    $queryStr = "INSERT INTO `stats` SET `action` = 'parsefailed', `value` = '$allFailedLinks'
+                ON DUPLICATE KEY UPDATE `value` = '$allFailedLinks'";
+    $query = $dbh->prepare($queryStr);
+    $query->execute();
+
     $dbh->commit();
 }
-- 
2.39.5