fetch.pl 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/perl -w
  2. # 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
  15. use 5.20.0;
  16. use strict;
  17. use warnings;
  18. use utf8;
  19. use Data::Dumper;
  20. use Term::ANSIColor qw(:constants);
  21. use lib './lib';
  22. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  23. use open qw( :std :encoding(UTF-8) );
  24. use DBI;
  25. use ConfigReader::Simple;
  26. use LWP::UserAgent;
  27. use HTTP::Request;
  28. my $DEBUG = 0;
  29. my $config = ConfigReader::Simple->new("config.txt");
  30. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  31. ## DB connection
  32. my %dbAttr = (
  33. PrintError=>0,# turn off error reporting via warn()
  34. RaiseError=>1, # turn on error reporting via die()
  35. AutoCommit=>0 # manually use transactions
  36. );
  37. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  38. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  39. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  40. ## fetch the urls to fetch from the table
  41. my %urlsToFetch;
  42. my $query = $dbh->prepare("SELECT `id`, `url`
  43. FROM `url_to_fetch`
  44. WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
  45. OR `last_fetched` IS NULL
  46. AND `fetch_failed` = 0
  47. LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
  48. $query->execute();
  49. while(my @row = $query->fetchrow_array) {
  50. $urlsToFetch{$row[0]} = $row[1];
  51. }
  52. # successful fetches
  53. my @urlsFetched;
  54. my @urlsFailed;
  55. # config the user agent for the request
  56. my $request_headers = [
  57. 'User-Agent' => $config->get("UA_AGENT"),
  58. 'Accept' => $config->get("UA_ACCEPT"),
  59. 'Accept-Language' => $config->get("UA_LANG"),
  60. 'Accept-Encoding' => HTTP::Message::decodable,
  61. 'Cache-Control' => $config->get("UA_CACHE")
  62. ];
  63. my $ua = LWP::UserAgent->new();
  64. $ua->timeout($config->get("UA_TIMEOUT"));
  65. $ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
  66. ## now loop over them and store the results
  67. my $counter = 0;
  68. while ( my ($id, $url) = each %urlsToFetch ) {
  69. sayYellow "Fetching: $id $url";
  70. my $req = HTTP::Request->new(GET => $url, $request_headers);
  71. my $res = $ua->request($req);
  72. if ($res->is_success) {
  73. # callback tells us to stop
  74. if($res->header('Client-Aborted')) {
  75. sayYellow "Aborted, too big.";
  76. next;
  77. }
  78. if(index($res->content_type, "text/html") == -1) {
  79. sayYellow "Fetching: $id ignored. Not html";
  80. push(@urlsFailed, $id);
  81. next;
  82. }
  83. open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
  84. print $fh $res->decoded_content();
  85. close($fh);
  86. push(@urlsFetched, $id);
  87. sayGreen"Fetching: $id ok";
  88. }
  89. else {
  90. sayRed "Fetching: $id failed: $res->code ".$res->status_line;
  91. push(@urlsFailed, $id);
  92. }
  93. if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
  94. updateFetched($dbh, @urlsFetched);
  95. updateFailed($dbh, @urlsFailed);
  96. sleep(rand(7));
  97. $counter = 0;
  98. @urlsFetched = ();
  99. @urlsFailed = ();
  100. }
  101. $counter++;
  102. }
  103. updateFetched($dbh, @urlsFetched);
  104. updateFailed($dbh, @urlsFailed);
  105. $dbh->disconnect();
  106. sayGreen "Fetch complete";
  107. ## update last_fetched in the table
  108. sub updateFetched {
  109. my ($dbh, @urls) = @_;
  110. sayYellow "Update fetch timestamps: ".scalar @urls;
  111. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
  112. foreach my $idToUpdate (@urls) {
  113. sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
  114. $query->bind_param(1,$idToUpdate);
  115. $query->execute();
  116. }
  117. $dbh->commit();
  118. sayGreen "Update fetch timestamps done";
  119. }
  120. ## update fetch_failed in the table
  121. sub updateFailed {
  122. my ($dbh, @urls) = @_;
  123. sayYellow "Update fetch failed: ".scalar @urls;
  124. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
  125. foreach my $idToUpdate (@urls) {
  126. sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
  127. $query->bind_param(1,$idToUpdate);
  128. $query->execute();
  129. }
  130. $dbh->commit();
  131. sayGreen "Update fetch failed done";
  132. }