parse-results.pl 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/perl -w
  2. # 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
  15. use 5.20.0;
  16. use strict;
  17. use warnings;
  18. use utf8;
  19. use Data::Dumper;
  20. use Term::ANSIColor qw(:constants);
  21. use lib './lib';
  22. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  23. use open qw( :std :encoding(UTF-8) );
  24. use DBI;
  25. use ConfigReader::Simple;
  26. use HTML::LinkExtor;
  27. use URI::URL;
  28. use File::Basename;
  29. use Digest::MD5 qw(md5_hex);
  30. use Data::Validate::URI qw(is_uri);
  31. my $DEBUG = 0;
  32. my $config = ConfigReader::Simple->new("config.txt");
  33. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  34. ## DB connection
  35. my %dbAttr = (
  36. PrintError=>0,# turn off error reporting via warn()
  37. RaiseError=>1, # turn on error reporting via die()
  38. AutoCommit=>0 # manually use transactions
  39. );
  40. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  41. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  42. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  43. ## get the fetched files
  44. my @results = glob("storage/*.result");
  45. die "Nothing to parse. No files found." unless(@results);
  46. ## build clean ids for query
  47. my @queryIds = @results;
  48. foreach (@queryIds) {
  49. $_ =~ s/.result//g;
  50. $_ =~ s|storage/||g;
  51. }
  52. # get the baseurls
  53. my %baseUrls;
  54. my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
  55. sayLog($queryStr) if $DEBUG;
  56. my $query = $dbh->prepare($queryStr);
  57. $query->execute(@queryIds);
  58. while(my @row = $query->fetchrow_array) {
  59. $baseUrls{$row[0]} = $row[1];
  60. }
  61. # get the string to ignore
  62. my @urlStringsToIgnore;
  63. $queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
  64. sayLog($queryStr) if $DEBUG;
  65. $query = $dbh->prepare($queryStr);
  66. $query->execute();
  67. while(my @row = $query->fetchrow) {
  68. push(@urlStringsToIgnore, $row[0])
  69. }
  70. ## prepare linkExtor
  71. my @links = ();
  72. my @workingLinks = ();
  73. sub leCallback {
  74. my($tag, %attr) = @_;
  75. return if $tag ne 'a'; # we only look closer at <a ...>
  76. push(@workingLinks, values %attr);
  77. }
  78. my $le = HTML::LinkExtor->new(\&leCallback);
  79. ## now parse each file and get the links
  80. my $counter = 0;
  81. foreach my $resultFile (@results) {
  82. sayYellow "Parsing file: $resultFile";
  83. my $fileId = basename($resultFile,".result");
  84. if (exists $baseUrls{$fileId}) {
  85. sayYellow "Baseurl: $baseUrls{$fileId}";
  86. $le->parse_file($resultFile);
  87. @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
  88. push(@links,@workingLinks);
  89. unlink($resultFile);
  90. sayGreen "Parsing done: ".scalar @workingLinks;
  91. }
  92. else {
  93. sayRed "No entry found for file $resultFile";
  94. }
  95. if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
  96. @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
  97. insertIntoDb($dbh, \@links);
  98. $counter = 0;
  99. @links = ();
  100. }
  101. @workingLinks = ();
  102. $counter++;
  103. }
  104. @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
  105. insertIntoDb($dbh, \@links);
  106. $dbh->disconnect();
  107. sayGreen "Parse complete";
  108. ## cleanup the found links
  109. sub cleanLinks {
  110. my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
  111. my @linkArray = @{ $linkArray };
  112. my @urlStringsToIgnore = @{ $urlStringsToIgnore };
  113. sayYellow "Clean found links: ".scalar @linkArray;
  114. foreach my $toSearch (@urlStringsToIgnore) {
  115. sayYellow "Clean links from: ".$toSearch;
  116. @linkArray = grep {!/$toSearch/i} @linkArray;
  117. }
  118. sayGreen "Cleaned found links: ".scalar @linkArray;
  119. return @linkArray;
  120. }
  121. ## update the DB with the new found links
  122. sub insertIntoDb {
  123. my ($dbh, $links) = @_;
  124. my @links = @{ $links };
  125. sayYellow "Insert links into DB: ".scalar @links;
  126. $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
  127. `id` = ?,
  128. `url` = ?,
  129. `baseurl` = ?,
  130. `created` = NOW()";
  131. sayLog $queryStr if $DEBUG;
  132. $query = $dbh->prepare($queryStr);
  133. my $md5 = Digest::MD5->new;
  134. my $counter = 0;
  135. foreach my $link (@links) {
  136. sayLog $link if ($DEBUG);
  137. if(!is_uri($link)) {
  138. sayYellow "Ignore URL it is invalid: $link";
  139. next;
  140. }
  141. my $url = url($link);
  142. if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
  143. sayYellow "Ignore URL because of scheme: $link";
  144. next;
  145. }
  146. $md5->add($link);
  147. my $digest = $md5->hexdigest;
  148. $query->execute($digest, $link, $url->scheme."://".$url->host);
  149. $md5->reset;
  150. $counter++;
  151. if($counter >= 500) {
  152. $counter = 0;
  153. sayYellow "Commit counter of 500 reached. Commiting";
  154. $dbh->commit();
  155. }
  156. #sayLog $digest if ($DEBUG);
  157. #sayLog $url->scheme if ($DEBUG);
  158. #sayLog $url->host if ($DEBUG);
  159. #sayLog $query->{Statement} if ($DEBUG);
  160. #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
  161. #sayLog "Inserted: $link" if($DEBUG);
  162. }
  163. sayYellow "Final commit";
  164. $dbh->commit();
  165. }