# Prepare the statement once and run it for every id in @invalidUrls.
# $queryStr is assigned above this excerpt; judging by the log text it removes
# rows from unique_domain -- TODO confirm against the statement itself.
$query = $dbh->prepare($queryStr);
foreach my $invalidId (@invalidUrls) {
$query->execute($invalidId);
- #$query->finish();
sayLog "Removed $invalidId from unique_domain" if $DEBUG;
}
# Reports the size of @invalidUrls, not a row count returned by the database.
sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
my $baseUrl = $row[1];
push(@toBeDeletedFromFetchAgain, $baseUrl);
}
-#$query->finish();
# Announce how many base URLs are queued for removal from url_to_fetch.
sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
# One parameterized DELETE, prepared once and executed per base URL; the base
# URL is passed as a bind value for the `?` placeholder.
$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
$query = $dbh->prepare($queryStr);
foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
$query->execute($baseUrl);
- #$query->finish();
sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
}
# Reports the number of base URLs processed (array size), not rows deleted.
sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
$query->execute();
sayYellow "Remove invalid urls done";
# Upsert the 'cleanup' row in `stats` with the database's current time (NOW()).
# NOTE(review): no $dbh->commit() is visible after this write, unlike the
# fetch and parse stats writes -- confirm whether AutoCommit is enabled here.
+$queryStr = "INSERT INTO `stats` SET `action` = 'cleanup', `value` = NOW()
+ ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
sayGreen "Cleanup complete";
## now loop over them and store the results
my $counter = 0;
+my $allFetched = 0;
+my $allFailed = 0;
while ( my ($id, $url) = each %urlsToFetch ) {
sayYellow "Fetching: $id $url";
# callback tells us to stop
if($res->header('Client-Aborted')) {
sayYellow "Aborted, too big.";
+ $allFailed++;
next;
}
if(index($res->content_type, "text/html") == -1) {
sayYellow "Fetching: $id ignored. Not html";
push(@urlsFailed, $id);
+ $allFailed++;
next;
}
open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
print $fh $res->decoded_content();
close($fh);
push(@urlsFetched, $id);
+ $allFetched++;
sayGreen"Fetching: $id ok";
}
else {
# Failure branch: report the error, remember the id as failed and count it.
# Method calls are not interpolated inside double-quoted strings, so the
# status is appended by concatenation; status_line already carries both the
# numeric code and the reason phrase.
sayRed "Fetching: $id failed: ".$res->status_line;
push(@urlsFailed, $id);
+ $allFailed++;
}
if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
updateFetched($dbh, @urlsFetched);
updateFailed($dbh, @urlsFailed);
+# some stats stuff
+# Upsert three rows in `stats`: 'fetch' gets the database's current time,
+# 'fetchfailed' and 'fetchsuccess' get the counters accumulated while fetching.
+# ON DUPLICATE KEY UPDATE overwrites the previous value -- presumably `action`
+# is the unique key; confirm against the schema.
+my $queryStr = "INSERT INTO `stats` SET `action` = 'fetch', `value` = NOW()
+ ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+
+# The counters are passed as bind values instead of being spliced into the
+# SQL text, so a single prepared statement serves both rows.
+$queryStr = "INSERT INTO `stats` SET `action` = ?, `value` = ?
+ ON DUPLICATE KEY UPDATE `value` = ?";
+$query = $dbh->prepare($queryStr);
+$query->execute('fetchfailed', $allFailed, $allFailed);
+$query->execute('fetchsuccess', $allFetched, $allFetched);
+
+$dbh->commit();
+
+# end
$dbh->disconnect();
sayGreen "Fetch complete";
# Run the collected links through cleanLinks together with the ignore list,
# then hand the result to insertIntoDb (presumably persists them -- verify
# against its definition).
@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
insertIntoDb($dbh, \@links);
# Upsert the 'parse' row in `stats` with the database's current time and
# commit before the handle is disconnected.
+$queryStr = "INSERT INTO `stats` SET `action` = 'parse', `value` = NOW()
+ ON DUPLICATE KEY UPDATE `value` = NOW()";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+$dbh->commit();
+
$dbh->disconnect();
sayGreen "Parse complete";
$query = $dbh->prepare($queryStr);
my $md5 = Digest::MD5->new;
my $counter = 0;
+ my $allLinks = 0;
+ my $allFailedLinks = 0;
foreach my $link (@links) {
sayLog $link if ($DEBUG);
if(!is_uri($link)) {
sayYellow "Ignore URL it is invalid: $link";
+ $allFailedLinks++;
next;
}
my $url = url($link);
if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
sayYellow "Ignore URL because of scheme: $link";
+ $allFailedLinks++;
next;
}
$md5->reset;
$counter++;
+ $allLinks++;
if($counter >= 500) {
$counter = 0;
#sayLog "Inserted: $link" if($DEBUG);
}
sayYellow "Final commit";
+
+ # stats stuff
+ # Upsert the accepted / rejected link counters into `stats`. The values are
+ # passed as bind parameters rather than interpolated into the SQL text, so
+ # one prepared statement is reused for both rows.
+ $queryStr = "INSERT INTO `stats` SET `action` = ?, `value` = ?
+ ON DUPLICATE KEY UPDATE `value` = ?";
+ $query = $dbh->prepare($queryStr);
+ $query->execute('parsesuccess', $allLinks, $allLinks);
+ $query->execute('parsefailed', $allFailedLinks, $allFailedLinks);
+
$dbh->commit();
}