123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- #!/usr/bin/perl -w
- # 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
- use 5.20.0;
- use strict;
- use warnings;
- use utf8;
- use Data::Dumper;
- use Term::ANSIColor qw(:constants);
- use lib './lib';
- use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
- use open qw( :std :encoding(UTF-8) );
- use DBI;
- use ConfigReader::Simple;
- use LWP::UserAgent;
- use HTTP::Request;
- my $DEBUG = 0;
- my $config = ConfigReader::Simple->new("config.txt");
- die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
- ## DB connection
- my %dbAttr = (
- PrintError=>0,# turn off error reporting via warn()
- RaiseError=>1, # turn on error reporting via die()
- AutoCommit=>0 # manually use transactions
- );
- my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
- my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
- die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
- ## fetch the urls to fetch from the table
- my %urlsToFetch;
- my $query = $dbh->prepare("SELECT `id`, `url`
- FROM `url_to_fetch`
- WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
- OR `last_fetched` IS NULL
- AND `fetch_failed` = 0
- LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
- $query->execute();
- while(my @row = $query->fetchrow_array) {
- $urlsToFetch{$row[0]} = $row[1];
- }
- # successful fetches
- my @urlsFetched;
- my @urlsFailed;
- # config the user agent for the request
- my $request_headers = [
- 'User-Agent' => $config->get("UA_AGENT"),
- 'Accept' => $config->get("UA_ACCEPT"),
- 'Accept-Language' => $config->get("UA_LANG"),
- 'Accept-Encoding' => HTTP::Message::decodable,
- 'Cache-Control' => $config->get("UA_CACHE")
- ];
- my $ua = LWP::UserAgent->new();
- $ua->timeout($config->get("UA_TIMEOUT"));
- ## now loop over them and store the results
- my $counter = 0;
- my $fetchedData;
- while ( my ($id, $url) = each %urlsToFetch ) {
- sayYellow "Fetching: $id $url";
- my $req = HTTP::Request->new(GET => $url, $request_headers);
- my $res = $ua->request($req, \&getCallback);
- if ($res->is_success) {
- # callback tells us to stop
- if($res->header('X-Died')) {
- next;
- }
- if(index($res->content_type, "text/html") == -1) {
- sayYellow "Fetching: $id ignored. Not html";
- push(@urlsFailed, $id);
- next;
- }
- open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
- print $fh $res->decoded_content();
- close($fh);
- push(@urlsFetched, $id);
- sayGreen"Fetching: $id ok";
- }
- else {
- sayRed "Fetching: $id failed: $res->code ".$res->status_line;
- push(@urlsFailed, $id);
- }
- if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
- updateFetched($dbh, @urlsFetched);
- updateFailed($dbh, @urlsFailed);
- sleep(rand(7));
- $counter = 0;
- @urlsFetched = ();
- @urlsFailed = ();
- }
- $counter++;
- $fetchedData = 0;
- }
- updateFetched($dbh, @urlsFetched);
- updateFailed($dbh, @urlsFailed);
- $dbh->disconnect();
- sayGreen "Fetch complete";
- ## update last_fetched in the table
- sub updateFetched {
- my ($dbh, @urls) = @_;
- sayYellow "Update fetch timestamps: ".scalar @urls;
- $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
- foreach my $idToUpdate (@urls) {
- sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
- $query->bind_param(1,$idToUpdate);
- $query->execute();
- }
- $dbh->commit();
- sayGreen "Update fetch timestamps done";
- }
- ## update fetch_failed in the table
- sub updateFailed {
- my ($dbh, @urls) = @_;
- sayYellow "Update fetch failed: ".scalar @urls;
- $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
- foreach my $idToUpdate (@urls) {
- sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
- $query->bind_param(1,$idToUpdate);
- $query->execute();
- }
- $dbh->commit();
- sayGreen "Update fetch failed done";
- }
- ## callback for request to check the already downloaded size.
- ## Avoid big downloads
- ## $fetchedData is set and reset out this sub
- ## the die sets x-died header
- sub getCallback {
- my ( $chunk, $res, $proto ) = @_;
- $fetchedData .= $chunk;
- if(length($fetchedData) > $config->get("MAX_BYTES_PER_PAGE")) {
- sayLog "Download size maximum reached." if($DEBUG);
- die();
- }
- }
|