From 36b6e40cf81ea44fb3c2e5687086474e0fef3079 Mon Sep 17 00:00:00 2001
From: Banana
Date: Sun, 8 Sep 2024 01:04:26 +0200
Subject: [PATCH] avoid big downloads

Add a MAX_BYTES_PER_PAGE setting and pass a content callback to the
request. The callback aborts the transfer via die() once the limit is
exceeded; LWP reports the abort through the X-Died response header,
which the fetch loop checks before storing the result.

Signed-off-by: Banana
---
 CHANGELOG          |   1 +
 TODO               |   1 -
 config.default.txt |   1 +
 fetch.pl           | 154 +++++++++++++++++++++++++--------------------
 4 files changed, 88 insertions(+), 69 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 05a9261..05f786a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 + Some db improvements
 + Default config file added
 + Updated requirements file
++ Avoid big downloads with MAX_BYTES_PER_PAGE setting.
 
 0.1
 + initial release
diff --git a/TODO b/TODO
index 613e35e..c51e172 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,3 @@
-Avoid download to mutch data. Content check before?
 Set correct timezone. Maybe in config?
 Some sort of matching against spam domain list?
 A web view for the results?
diff --git a/config.default.txt b/config.default.txt
index c1cca48..22ef694 100644
--- a/config.default.txt
+++ b/config.default.txt
@@ -14,3 +14,4 @@ FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=30
 PARSE_FILES_PER_PACKAGE=50
 CLEANUP_URLS_AMOUNT_ABOVE=40
+MAX_BYTES_PER_PAGE=5000000
diff --git a/fetch.pl b/fetch.pl
index b45e51e..ceb45b2 100644
--- a/fetch.pl
+++ b/fetch.pl
@@ -39,9 +39,9 @@ die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
 ## DB connection
 my %dbAttr = (
-    PrintError=>0,# turn off error reporting via warn()
+	PrintError=>0, # turn off error reporting via warn()
 	RaiseError=>1, # turn on error reporting via die()
-    AutoCommit=>0 # manually use transactions
+	AutoCommit=>0 # manually use transactions
 );
 my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
 my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
@@ -51,16 +51,15 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 ## fetch the urls to fetch from the table
 my %urlsToFetch;
 my $query = $dbh->prepare("SELECT `id`, `url`
-    FROM `url_to_fetch`
-    WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
-        OR `last_fetched` IS NULL
-        AND `fetch_failed` = 0
-    LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
+	FROM `url_to_fetch`
+	WHERE (`last_fetched` < NOW() - INTERVAL 1 MONTH
+		OR `last_fetched` IS NULL)
+		AND `fetch_failed` = 0
+	LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 $query->execute();
 while(my @row = $query->fetchrow_array) {
-    $urlsToFetch{$row[0]} = $row[1];
+	$urlsToFetch{$row[0]} = $row[1];
 }
-#$query->finish();
 
 # successful fetches
 my @urlsFetched;
@@ -68,50 +67,56 @@ my @urlsFailed;
 
 # config the user agent for the request
 my $request_headers = [
-    'User-Agent' => $config->get("UA_AGENT"),
-    'Accept' => $config->get("UA_ACCEPT"),
-    'Accept-Language' => $config->get("UA_LANG"),
-    'Accept-Encoding' => HTTP::Message::decodable,
-    'Cache-Control' => $config->get("UA_CACHE")
+	'User-Agent' => $config->get("UA_AGENT"),
+	'Accept' => $config->get("UA_ACCEPT"),
+	'Accept-Language' => $config->get("UA_LANG"),
+	'Accept-Encoding' => HTTP::Message::decodable,
+	'Cache-Control' => $config->get("UA_CACHE")
 ];
-my $ua = LWP::UserAgent->new;
+my $ua = LWP::UserAgent->new();
 $ua->timeout($config->get("UA_TIMEOUT"));
 
 ## now loop over them and store the results
 my $counter = 0;
+my $fetchedData = '';
 while ( my ($id, $url) = each %urlsToFetch ) {
-    sayYellow "Fetching: $id $url";
-
-    my $req = HTTP::Request->new(GET => $url, $request_headers);
-    my $res = $ua->request($req);
-    if ($res->is_success) {
-        if(index($res->content_type, "text/html") == -1) {
-            sayYellow "Fetching: $id ignored. Not html";
-            push(@urlsFailed, $id);
-            next;
-        }
-        open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
-        print $fh $res->decoded_content();
-        close($fh);
-        push(@urlsFetched, $id);
-        sayGreen"Fetching: $id ok";
-    }
-    else {
-        sayRed "Fetching: $id failed: $res->code ".$res->status_line;
-        push(@urlsFailed, $id);
-    }
-
-    if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
-        updateFetched($dbh, @urlsFetched);
-        updateFailed($dbh, @urlsFailed);
-        sleep(rand(7));
-
-        $counter = 0;
-        @urlsFetched = ();
-        @urlsFailed = ();
-    }
-
-    $counter++;
+	sayYellow "Fetching: $id $url";
+
+	my $req = HTTP::Request->new(GET => $url, $request_headers);
+	$fetchedData = ''; # reset the byte counter used by getCallback
+	my $res = $ua->request($req, \&getCallback);
+	if ($res->is_success) {
+		# the callback died: the page exceeded MAX_BYTES_PER_PAGE
+		if($res->header('X-Died')) {
+			next;
+		}
+		if(index($res->content_type, "text/html") == -1) {
+			sayYellow "Fetching: $id ignored. Not html";
+			push(@urlsFailed, $id);
+			next;
+		}
+		open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+		print $fh $res->decoded_content();
+		close($fh);
+		push(@urlsFetched, $id);
+		sayGreen "Fetching: $id ok";
+	}
+	else {
+		sayRed "Fetching: $id failed: ".$res->status_line;
+		push(@urlsFailed, $id);
+	}
+
+	if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
+		updateFetched($dbh, @urlsFetched);
+		updateFailed($dbh, @urlsFailed);
+		sleep(rand(7));
+
+		$counter = 0;
+		@urlsFetched = ();
+		@urlsFailed = ();
+	}
+
+	$counter++;
 }
 updateFetched($dbh, @urlsFetched);
 updateFailed($dbh, @urlsFailed);
@@ -123,29 +128,42 @@ sayGreen "Fetch complete";
 
 ## update last_fetched in the table
 sub updateFetched {
-    my ($dbh, @urls) = @_;
-    sayYellow "Update fetch timestamps: ".scalar @urls;
-    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
-    foreach my $idToUpdate (@urls) {
-        sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
-        $query->bind_param(1,$idToUpdate);
-        $query->execute();
-    }
-    $dbh->commit();
-    sayGreen "Update fetch timestamps done";
+	my ($dbh, @urls) = @_;
+	sayYellow "Update fetch timestamps: ".scalar @urls;
+	$query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
+	foreach my $idToUpdate (@urls) {
+		sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
+		$query->bind_param(1,$idToUpdate);
+		$query->execute();
+	}
+	$dbh->commit();
+	sayGreen "Update fetch timestamps done";
 }
 
 ## update fetch_failed in the table
 sub updateFailed {
-    my ($dbh, @urls) = @_;
-
-    sayYellow "Update fetch failed: ".scalar @urls;
-    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
-    foreach my $idToUpdate (@urls) {
-        sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
-        $query->bind_param(1,$idToUpdate);
-        $query->execute();
-    }
-    $dbh->commit();
-    sayGreen "Update fetch failed done";
+	my ($dbh, @urls) = @_;
+
+	sayYellow "Update fetch failed: ".scalar @urls;
+	$query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
+	foreach my $idToUpdate (@urls) {
+		sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
+		$query->bind_param(1,$idToUpdate);
+		$query->execute();
+	}
+	$dbh->commit();
+	sayGreen "Update fetch failed done";
+}
+
+## callback for the request, checks the size downloaded so far
+## and avoids big downloads.
+## $fetchedData is declared and reset outside this sub.
+## the die() makes LWP abort the transfer and set the X-Died response header.
+sub getCallback {
+	my ( $chunk, $res, $proto ) = @_;
+	$fetchedData .= $chunk;
+	if(length($fetchedData) > $config->get("MAX_BYTES_PER_PAGE")) {
+		sayLog "Download size maximum reached." if($DEBUG);
+		die("Download size maximum reached\n");
+	}
 }
-- 
2.39.5
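
The size-capping pattern above can be exercised on its own. Below is a
minimal, self-contained sketch of the same idea; the URL and byte limit
are illustrative stand-ins, it assumes only that LWP::UserAgent is
installed, and it is not code from this repository:

    use strict;
    use warnings;
    use LWP::UserAgent;

    my $max_bytes = 5_000_000;   # stand-in for MAX_BYTES_PER_PAGE
    my $ua = LWP::UserAgent->new(timeout => 30);

    my $fetched = '';
    # LWP feeds each received chunk to the :content_cb callback; dying
    # inside it aborts the transfer, and LWP records the message in the
    # X-Died response header instead of raising it.
    my $res = $ua->get('https://example.com/', ':content_cb' => sub {
        my ($chunk, $response, $protocol) = @_;
        $fetched .= $chunk;
        die "size limit exceeded\n" if length($fetched) > $max_bytes;
    });

    if ($res->header('X-Died')) {
        print "aborted after ".length($fetched)." bytes\n";
    }
    elsif ($res->is_success) {
        print "fetched ".length($fetched)." bytes\n";
    }
    else {
        print "failed: ".$res->status_line."\n";
    }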
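Worth noting as a design alternative: LWP::UserAgent has a built-in
max_size attribute that serves the same purpose without a hand-written
callback. When the limit is exceeded it truncates the response and sets
the Client-Aborted header to "max_size". A sketch under the same
assumptions:

    use strict;
    use warnings;
    use LWP::UserAgent;

    my $ua = LWP::UserAgent->new(timeout => 30);
    $ua->max_size(5_000_000);   # same intent as MAX_BYTES_PER_PAGE

    my $res = $ua->get('https://example.com/');
    if (($res->header('Client-Aborted') // '') eq 'max_size') {
        print "aborted: page exceeded the configured maximum\n";
    }

The hand-rolled callback keeps the running byte count in the caller's
scope, which fetch.pl uses; max_size is the shorter option when only
the cut-off matters.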