fetch.pl 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #!/usr/bin/perl -w
  2. # 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
  15. use 5.20.0;
  16. use strict;
  17. use warnings;
  18. use utf8;
  19. use Data::Dumper;
  20. use Term::ANSIColor qw(:constants);
  21. use lib './lib';
  22. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  23. use open qw( :std :encoding(UTF-8) );
  24. use DBI;
  25. use ConfigReader::Simple;
  26. use LWP::UserAgent;
  27. use HTTP::Request;
  28. my $DEBUG = 0;
  29. my $config = ConfigReader::Simple->new("config.txt");
  30. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  31. ## DB connection
  32. my %dbAttr = (
  33. PrintError=>0,# turn off error reporting via warn()
  34. RaiseError=>1, # turn on error reporting via die()
  35. AutoCommit=>0 # manually use transactions
  36. );
  37. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  38. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  39. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  40. ## fetch the urls to fetch from the table
  41. my %urlsToFetch;
  42. my $query = $dbh->prepare("SELECT `id`, `url`
  43. FROM `url_to_fetch`
  44. WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
  45. OR `last_fetched` IS NULL
  46. AND `fetch_failed` = 0
  47. LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
  48. $query->execute();
  49. while(my @row = $query->fetchrow_array) {
  50. $urlsToFetch{$row[0]} = $row[1];
  51. }
  52. #$query->finish();
  53. # successful fetches
  54. my @urlsFetched;
  55. my @urlsFailed;
  56. # config the user agent for the request
  57. my $request_headers = [
  58. 'User-Agent' => $config->get("UA_AGENT"),
  59. 'Accept' => $config->get("UA_ACCEPT"),
  60. 'Accept-Language' => $config->get("UA_LANG"),
  61. 'Accept-Encoding' => HTTP::Message::decodable,
  62. 'Cache-Control' => $config->get("UA_CACHE")
  63. ];
  64. my $ua = LWP::UserAgent->new;
  65. $ua->timeout($config->get("UA_TIMEOUT"));
  66. ## now loop over them and store the results
  67. my $counter = 0;
  68. while ( my ($id, $url) = each %urlsToFetch ) {
  69. sayYellow "Fetching: $id $url";
  70. my $req = HTTP::Request->new(GET => $url, $request_headers);
  71. my $res = $ua->request($req);
  72. if ($res->is_success) {
  73. if(index($res->content_type, "text/html") == -1) {
  74. sayYellow "Fetching: $id ignored. Not html";
  75. push(@urlsFailed, $id);
  76. next;
  77. }
  78. open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
  79. print $fh $res->decoded_content();
  80. close($fh);
  81. push(@urlsFetched, $id);
  82. sayGreen"Fetching: $id ok";
  83. }
  84. else {
  85. sayRed "Fetching: $id failed: $res->code ".$res->status_line;
  86. push(@urlsFailed, $id);
  87. }
  88. if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
  89. updateFetched($dbh, @urlsFetched);
  90. updateFailed($dbh, @urlsFailed);
  91. sleep(rand(7));
  92. $counter = 0;
  93. @urlsFetched = ();
  94. @urlsFailed = ();
  95. }
  96. $counter++;
  97. }
  98. updateFetched($dbh, @urlsFetched);
  99. updateFailed($dbh, @urlsFailed);
  100. $dbh->disconnect();
  101. sayGreen "Fetch complete";
  102. ## update last_fetched in the table
  103. sub updateFetched {
  104. my ($dbh, @urls) = @_;
  105. sayYellow "Update fetch timestamps: ".scalar @urls;
  106. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
  107. foreach my $idToUpdate (@urls) {
  108. sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
  109. $query->bind_param(1,$idToUpdate);
  110. $query->execute();
  111. }
  112. $dbh->commit();
  113. sayGreen "Update fetch timestamps done";
  114. }
  115. ## update fetch_failed in the table
  116. sub updateFailed {
  117. my ($dbh, @urls) = @_;
  118. sayYellow "Update fetch failed: ".scalar @urls;
  119. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
  120. foreach my $idToUpdate (@urls) {
  121. sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
  122. $query->bind_param(1,$idToUpdate);
  123. $query->execute();
  124. }
  125. $dbh->commit();
  126. sayGreen "Update fetch failed done";
  127. }