]> 91.132.146.200 Git - aranea.git/commitdiff
avoid big downloads
authorBanana <mail@bananas-playground.net>
Sat, 7 Sep 2024 23:04:26 +0000 (01:04 +0200)
committerBanana <mail@bananas-playground.net>
Sat, 7 Sep 2024 23:04:26 +0000 (01:04 +0200)
Signed-off-by: Banana <mail@bananas-playground.net>
CHANGELOG
TODO
config.default.txt
fetch.pl

index 05a9261e631d6c321b75cb4a49334dd8a196ba93..05f786a45fd00b00b8e7cb3cbfb7666068c73157 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 + Some db improvements
 + Default config file added
 + Updated requirements file
++ Avoid big downloads with MAX_BYTES_PER_PAGE setting.
 
 0.1
 + initial release
diff --git a/TODO b/TODO
index 613e35e1dcc8a595cdcaf12639738bc3983be582..c51e172d62a48775d20ea632125a90ef7770bdd6 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,4 +1,3 @@
-Avoid download to mutch data. Content check before?
 Set correct timezone. Maybe in config?
 Some sort of matching against spam domain list?
 A web view for the results?
index c1cca48d630ff32e55f6ebefb0248aa203b6ec17..22ef6944de71e2110a05a46583696382e3ce8512 100644 (file)
@@ -14,3 +14,4 @@ FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=30
 PARSE_FILES_PER_PACKAGE=50
 CLEANUP_URLS_AMOUNT_ABOVE=40
+MAX_BYTES_PER_PAGE=5000000
index b45e51e34c617901ea71a8243307e06a39748c04..ceb45b2c6b49d26526617575b09c94c5e2254b2a 100644 (file)
--- a/fetch.pl
+++ b/fetch.pl
@@ -39,9 +39,9 @@ die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
 ## DB connection
 my %dbAttr = (
-       PrintError=>0,# turn off error reporting via warn()
+    PrintError=>0,# turn off error reporting via warn()
     RaiseError=>1, # turn on error reporting via die()
-       AutoCommit=>0 # manually use transactions
+    AutoCommit=>0 # manually use transactions
 );
 my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
 my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
@@ -51,16 +51,15 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 ## fetch the urls to fetch from the table
 my %urlsToFetch;
 my $query = $dbh->prepare("SELECT `id`, `url`
-                                                       FROM `url_to_fetch`
-                                                       WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
-                                                               OR `last_fetched` IS NULL
-                                                               AND `fetch_failed` = 0
-                                                       LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
+                            FROM `url_to_fetch`
+                            WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
+                                OR `last_fetched` IS NULL
+                                AND `fetch_failed` = 0
+                            LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 $query->execute();
 while(my @row = $query->fetchrow_array) {
-       $urlsToFetch{$row[0]} = $row[1];
+    $urlsToFetch{$row[0]} = $row[1];
 }
-#$query->finish();
 
 # successful fetches
 my @urlsFetched;
@@ -68,50 +67,56 @@ my @urlsFailed;
 
 # config the user agent for the request
 my $request_headers = [
-  'User-Agent' => $config->get("UA_AGENT"),
-  'Accept' => $config->get("UA_ACCEPT"),
-  'Accept-Language' => $config->get("UA_LANG"),
-  'Accept-Encoding' => HTTP::Message::decodable,
-  'Cache-Control' => $config->get("UA_CACHE")
+    'User-Agent' => $config->get("UA_AGENT"),
+    'Accept' => $config->get("UA_ACCEPT"),
+    'Accept-Language' => $config->get("UA_LANG"),
+    'Accept-Encoding' => HTTP::Message::decodable,
+    'Cache-Control' => $config->get("UA_CACHE")
 ];
-my $ua = LWP::UserAgent->new;
+my $ua = LWP::UserAgent->new();
 $ua->timeout($config->get("UA_TIMEOUT"));
 
 ## now loop over them and store the results
 my $counter = 0;
+my $fetchedData;
 while ( my ($id, $url) = each %urlsToFetch ) {
-       sayYellow "Fetching: $id $url";
-
-       my $req = HTTP::Request->new(GET => $url, $request_headers);
-       my $res = $ua->request($req);
-       if ($res->is_success) {
-               if(index($res->content_type, "text/html") == -1) {
-                       sayYellow "Fetching: $id ignored. Not html";
-                       push(@urlsFailed, $id);
-                       next;
-               }
-               open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
-               print $fh $res->decoded_content();
-               close($fh);
-               push(@urlsFetched, $id);
-               sayGreen"Fetching: $id ok";
-       }
-       else {
-               sayRed "Fetching: $id failed: $res->code ".$res->status_line;
-               push(@urlsFailed, $id);
-       }
-
-       if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
-               updateFetched($dbh, @urlsFetched);
-               updateFailed($dbh, @urlsFailed);
-               sleep(rand(7));
-
-               $counter = 0;
-               @urlsFetched = ();
-               @urlsFailed = ();
-       }
-
-       $counter++;
+    sayYellow "Fetching: $id $url";
+    $fetchedData = '';
+    my $req = HTTP::Request->new(GET => $url, $request_headers);
+    my $res = $ua->request($req, \&getCallback);
+    if ($res->is_success) {
+        # callback died (size limit reached): record as failed so it is not retried endlessly
+        if($res->header('X-Died')) {
+            push(@urlsFailed, $id); next;
+        }
+        if(index($res->content_type, "text/html") == -1) {
+            sayYellow "Fetching: $id ignored. Not html";
+            push(@urlsFailed, $id);
+            next;
+        }
+        open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+        print $fh $fetchedData; # with a content callback, $res->decoded_content() is empty; write the accumulated chunks
+        close($fh);
+        push(@urlsFetched, $id);
+        sayGreen "Fetching: $id ok";
+    }
+    else {
+        sayRed "Fetching: $id failed: ".$res->status_line;
+        push(@urlsFailed, $id);
+    }
+
+    if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
+        updateFetched($dbh, @urlsFetched);
+        updateFailed($dbh, @urlsFailed);
+        sleep(rand(7));
+
+        $counter = 0;
+        @urlsFetched = ();
+        @urlsFailed = ();
+    }
+
+    $counter++;
+    $fetchedData = 0;
 }
 updateFetched($dbh, @urlsFetched);
 updateFailed($dbh, @urlsFailed);
@@ -123,29 +128,42 @@ sayGreen "Fetch complete";
 
 ## update last_fetched in the table
 sub updateFetched {
-       my ($dbh, @urls) = @_;
-       sayYellow "Update fetch timestamps: ".scalar @urls;
-       $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
-       foreach my $idToUpdate (@urls) {
-               sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
-               $query->bind_param(1,$idToUpdate);
-               $query->execute();
-       }
-       $dbh->commit();
-       sayGreen "Update fetch timestamps done";
+    my ($dbh, @urls) = @_;
+    sayYellow "Update fetch timestamps: ".scalar @urls;
+    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
+        $query->bind_param(1,$idToUpdate);
+        $query->execute();
+    }
+    $dbh->commit();
+    sayGreen "Update fetch timestamps done";
 }
 
 ## update fetch_failed in the table
 sub updateFailed {
-       my ($dbh, @urls) = @_;
-
-       sayYellow "Update fetch failed: ".scalar @urls;
-       $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
-       foreach my $idToUpdate (@urls) {
-               sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
-               $query->bind_param(1,$idToUpdate);
-               $query->execute();
-       }
-       $dbh->commit();
-       sayGreen "Update fetch failed done";
+    my ($dbh, @urls) = @_;
+
+    sayYellow "Update fetch failed: ".scalar @urls;
+    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
+        $query->bind_param(1,$idToUpdate);
+        $query->execute();
+    }
+    $dbh->commit();
+    sayGreen "Update fetch failed done";
+}
+
+## callback for request to check the already downloaded size.
+## Avoid big downloads
+## $fetchedData is set and reset out this sub
+## the die sets x-died header
+sub getCallback {
+    my ( $chunk, $res, $proto ) = @_;
+    $fetchedData .= $chunk;
+    if(length($fetchedData) > $config->get("MAX_BYTES_PER_PAGE")) {
+        sayLog "Download size maximum reached." if($DEBUG);
+        die("Download size maximum reached\n");
+    }
 }