fetch.pl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #!/usr/bin/perl -w
  2. # This program is free software: you can redistribute it and/or modify
  3. # it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  4. #
  5. # You should have received a copy of the
  6. # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
  7. # along with this program. If not, see http://www.sun.com/cddl/cddl.html
  8. #
  9. # 2022 https://://www.bananas-playground.net/projekt/aranea
  10. use 5.20.0;
  11. use strict;
  12. use warnings;
  13. use utf8;
  14. use Data::Dumper;
  15. use Term::ANSIColor qw(:constants);
  16. use lib './lib';
  17. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  18. use open qw( :std :encoding(UTF-8) );
  19. use DBI;
  20. use ConfigReader::Simple;
  21. use LWP::UserAgent;
  22. use HTTP::Request;
  23. my $DEBUG = 0;
  24. my $config = ConfigReader::Simple->new("config.txt");
  25. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  26. ## DB connection
  27. my %dbAttr = (
  28. PrintError=>0,# turn off error reporting via warn()
  29. RaiseError=>1 # turn on error reporting via die()
  30. );
  31. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  32. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  33. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  34. ## fetch the urls to fetch from the table
  35. my %urlsToFetch;
  36. my $query = $dbh->prepare("SELECT `id`, `url`
  37. FROM `url_to_fetch`
  38. WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
  39. OR `last_fetched` IS NULL
  40. AND `fetch_failed` = 0
  41. LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
  42. $query->execute();
  43. while(my @row = $query->fetchrow_array) {
  44. $urlsToFetch{$row[0]} = $row[1];
  45. }
  46. $query->finish();
  47. # successful fetches
  48. my @urlsFetched;
  49. my @urlsFailed;
  50. # config the user agent for the request
  51. my $request_headers = [
  52. 'User-Agent' => $config->get("UA_AGENT"),
  53. 'Accept' => $config->get("UA_ACCEPT"),
  54. 'Accept-Language' => $config->get("UA_LANG"),
  55. 'Accept-Encoding' => HTTP::Message::decodable,
  56. 'Cache-Control' => $config->get("UA_CACHE")
  57. ];
  58. my $ua = LWP::UserAgent->new;
  59. ## now loop over them and store the results
  60. my $counter = 0;
  61. while ( my ($id, $url) = each %urlsToFetch ) {
  62. sayYellow "Fetching: $id $url";
  63. my $req = HTTP::Request->new(GET => $url, $request_headers);
  64. my $res = $ua->request($req);
  65. if ($res->is_success) {
  66. if(index($res->content_type, "text/html") == -1) {
  67. sayYellow "Fetching: $id ignored. Not html";
  68. push(@urlsFailed, $id);
  69. next;
  70. }
  71. open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
  72. print $fh $res->decoded_content();
  73. close($fh);
  74. push(@urlsFetched, $id);
  75. sayGreen"Fetching: $id ok";
  76. }
  77. else {
  78. sayRed "Fetching: $id failed: $res->code ".$res->status_line;
  79. push(@urlsFailed, $id);
  80. }
  81. if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
  82. updateFetched($dbh, @urlsFetched);
  83. updateFailed($dbh, @urlsFailed);
  84. sleep(rand(7));
  85. $counter = 0;
  86. @urlsFetched = ();
  87. @urlsFailed = ();
  88. }
  89. $counter++;
  90. }
  91. updateFetched($dbh, @urlsFetched);
  92. updateFailed($dbh, @urlsFailed);
  93. $dbh->disconnect();
  94. sayGreen "Fetch complete";
  95. ## update last_fetched in the table
  96. sub updateFetched {
  97. my ($dbh, @urls) = @_;
  98. sayYellow "Update fetch timestamps: ".scalar @urls;
  99. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
  100. foreach my $idToUpdate (@urls) {
  101. sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
  102. $query->bind_param(1,$idToUpdate);
  103. $query->execute();
  104. }
  105. $query->finish();
  106. sayGreen "Update fetch timestamps done";
  107. }
  108. ## update fetch_failed in the table
  109. sub updateFailed {
  110. my ($dbh, @urls) = @_;
  111. sayYellow "Update fetch failed: ".scalar @urls;
  112. $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
  113. foreach my $idToUpdate (@urls) {
  114. sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
  115. $query->bind_param(1,$idToUpdate);
  116. $query->execute();
  117. }
  118. $query->finish();
  119. sayGreen "Update fetch failed done";
  120. }