fetch.pl 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/perl -w
  2. # 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
  15. use 5.20.0;
  16. use strict;
  17. use warnings;
  18. use utf8;
  19. use Data::Dumper;
  20. use Term::ANSIColor qw(:constants);
  21. use lib './lib';
  22. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  23. use open qw( :std :encoding(UTF-8) );
  24. use DBI;
  25. use ConfigReader::Simple;
  26. use LWP::UserAgent;
  27. use HTTP::Request;
  28. my $DEBUG = 0;
  29. my $config = ConfigReader::Simple->new("config.txt");
  30. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  31. ## DB connection
  32. my %dbAttr = (
  33. PrintError=>0,# turn off error reporting via warn()
  34. RaiseError=>1 # turn on error reporting via die()
  35. );
  36. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  37. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  38. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  39. ## fetch the urls to fetch from the table
  40. my %urlsToFetch;
  41. my $query = $dbh->prepare("SELECT `id`, `url`
  42. FROM `url_to_fetch`
  43. WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
  44. OR `last_fetched` IS NULL
  45. AND `fetch_failed` = 0
  46. LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
  47. $query->execute();
  48. while(my @row = $query->fetchrow_array) {
  49. $urlsToFetch{$row[0]} = $row[1];
  50. }
  51. $query->finish();
  52. # successful fetches
  53. my @urlsFetched;
  54. my @urlsFailed;
  55. # config the user agent for the request
  56. my $request_headers = [
  57. 'User-Agent' => $config->get("UA_AGENT"),
  58. 'Accept' => $config->get("UA_ACCEPT"),
  59. 'Accept-Language' => $config->get("UA_LANG"),
  60. 'Accept-Encoding' => HTTP::Message::decodable,
  61. 'Cache-Control' => $config->get("UA_CACHE")
  62. ];
  63. my $ua = LWP::UserAgent->new;
  64. $ua->timeout($config->get("UA_TIMEOUT"));
  65. ## now loop over them and store the results
  66. my $counter = 0;
  67. while ( my ($id, $url) = each %urlsToFetch ) {
  68. sayYellow "Fetching: $id $url";
  69. my $req = HTTP::Request->new(GET => $url, $request_headers);
  70. my $res = $ua->request($req);
  71. if ($res->is_success) {
  72. if(index($res->content_type, "text/html") == -1) {
  73. sayYellow "Fetching: $id ignored. Not html";
  74. push(@urlsFailed, $id);
  75. next;
  76. }
  77. open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
  78. print $fh $res->decoded_content();
  79. close($fh);
  80. push(@urlsFetched, $id);
  81. sayGreen"Fetching: $id ok";
  82. }
  83. else {
  84. sayRed "Fetching: $id failed: $res->code ".$res->status_line;
  85. push(@urlsFailed, $id);
  86. }
  87. if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
  88. updateFetched($dbh, @urlsFetched);
  89. updateFailed($dbh, @urlsFailed);
  90. sleep(rand(7));
  91. $counter = 0;
  92. @urlsFetched = ();
  93. @urlsFailed = ();
  94. }
  95. $counter++;
  96. }
  97. updateFetched($dbh, @urlsFetched);
  98. updateFailed($dbh, @urlsFailed);
  99. $dbh->disconnect();
  100. sayGreen "Fetch complete";
  101. ## update last_fetched in the table
  102. sub updateFetched {
  103. my ($dbh, @urls) = @_;
  104. sayYellow "Update fetch timestamps: ".scalar @urls;
  105. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
  106. foreach my $idToUpdate (@urls) {
  107. sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
  108. $query->bind_param(1,$idToUpdate);
  109. $query->execute();
  110. }
  111. $query->finish();
  112. sayGreen "Update fetch timestamps done";
  113. }
  114. ## update fetch_failed in the table
  115. sub updateFailed {
  116. my ($dbh, @urls) = @_;
  117. sayYellow "Update fetch failed: ".scalar @urls;
  118. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
  119. foreach my $idToUpdate (@urls) {
  120. sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
  121. $query->bind_param(1,$idToUpdate);
  122. $query->execute();
  123. }
  124. $query->finish();
  125. sayGreen "Update fetch failed done";
  126. }