parse-results.pl 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #!/usr/bin/perl -w
  2. # This program is free software: you can redistribute it and/or modify
  3. # it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  4. #
  5. # You should have received a copy of the
  6. # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
  7. # along with this program. If not, see http://www.sun.com/cddl/cddl.html
  8. #
  9. # 2022 https://www.bananas-playground.net/projekt/aranea
  10. use 5.20.0;
  11. use strict;
  12. use warnings;
  13. use utf8;
  14. use Data::Dumper;
  15. use Term::ANSIColor qw(:constants);
  16. use lib './lib';
  17. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  18. use open qw( :std :encoding(UTF-8) );
  19. use DBI;
  20. use ConfigReader::Simple;
  21. use HTML::LinkExtor;
  22. use URI::URL;
  23. use File::Basename;
  24. use Digest::MD5 qw(md5_hex);
  25. use Data::Validate::URI qw(is_uri);
  # Set to a true value to have SQL statements and processed links logged via sayLog.
  my $DEBUG = 0;

  # Settings are read from "config.txt", resolved relative to the current
  # working directory.
  my $config = ConfigReader::Simple->new("config.txt");
  die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;

  ## DB connection
  my %dbAttr = (
      PrintError => 0,    # turn off error reporting via warn()
      RaiseError => 1,    # turn on error reporting via die()
  );
  my $dbDsn = "DBI:mysql:database=" . $config->get("DB_NAME")
      . ";host=" . $config->get("DB_HOST")
      . ";port=" . $config->get("DB_PORT");
  my $dbh = DBI->connect($dbDsn, $config->get("DB_USER"), $config->get("DB_PASS"), \%dbAttr);

  # A method call is not interpolated inside a double-quoted string, so the
  # package variable $DBI::errstr is used to report the actual reason.
  die "failed to connect to MySQL database: " . ($DBI::errstr // 'unknown error')
      unless $dbh;
  ## get the fetched files
  my @results = glob("storage/*.result");
  die "Nothing to parse. No files found." unless @results;

  ## build clean ids for query
  # basename() removes the directory and only a trailing ".result" suffix.
  # This yields the same key that is later used to look up %baseUrls for
  # each result file.
  my @queryIds = map { basename($_, ".result") } @results;

  # get the baseurls, keyed by the id of the fetched url
  my %baseUrls;
  my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN ("
      . join(', ', ('?') x @queryIds) . ")";
  sayLog($queryStr) if $DEBUG;
  my $query = $dbh->prepare($queryStr);
  $query->execute(@queryIds);
  while (my ($id, $baseUrl) = $query->fetchrow_array) {
      $baseUrls{$id} = $baseUrl;
  }
  $query->finish();

  # get the strings to ignore
  my @urlStringsToIgnore;
  $queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
  sayLog($queryStr) if $DEBUG;
  $query = $dbh->prepare($queryStr);
  $query->execute();
  # fetchrow_array is the documented DBI method; it is used for both queries.
  while (my ($searchFor) = $query->fetchrow_array) {
      push(@urlStringsToIgnore, $searchFor);
  }
  $query->finish();
  ## prepare linkExtor
  # @links accumulates the links that are handed to cleanLinks/insertIntoDb.
  # @workingLinks is filled by leCallback while a single file is parsed.
  my @links;
  my @workingLinks;

  # Callback for HTML::LinkExtor: receives the tag name followed by
  # attribute/value pairs and stores the attribute values of <a ...> tags
  # in @workingLinks. Every other tag is ignored.
  sub leCallback {
      my $tag = shift;
      return unless $tag eq 'a';

      my %attr = @_;
      push @workingLinks, values %attr;
  }

  my $le = HTML::LinkExtor->new(\&leCallback);
  ## now parse each file and get the links
  # Collected links are cleaned and written to the DB in packages of
  # PARSE_FILES_PER_PACKAGE processed files. Whatever is left after the loop
  # is flushed once more at the end.
  my $filesPerPackage = $config->get("PARSE_FILES_PER_PACKAGE") // 0;
  my $counter = 0;
  foreach my $resultFile (@results) {
      sayYellow "Parsing file: $resultFile";
      my $fileId = basename($resultFile, ".result");

      if (exists $baseUrls{$fileId}) {
          my $baseUrl = $baseUrls{$fileId};
          sayYellow "Baseurl: $baseUrl";

          # parse_file returns undef if the file could not be opened. In that
          # case the file is kept instead of being deleted unparsed.
          if ($le->parse_file($resultFile)) {
              # make every found link absolute, relative to the base url
              @workingLinks = map { url($_, $baseUrl)->abs->as_string } @workingLinks;
              push(@links, @workingLinks);
              unlink($resultFile)
                  or sayRed "Could not remove file $resultFile: $!";
              sayGreen "Parsing done: " . scalar @workingLinks;
          }
          else {
              sayRed "Could not read file $resultFile: $!";
          }
      }
      else {
          sayRed "No entry found for file $resultFile";
      }
      @workingLinks = ();

      # Count the file first and compare afterwards, so a package holds
      # exactly $filesPerPackage files and not one more.
      $counter++;
      if ($counter >= $filesPerPackage) {
          @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
          insertIntoDb($dbh, \@links);
          $counter = 0;
          @links = ();
      }
  }

  # flush the links of the last, incomplete package
  @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
  insertIntoDb($dbh, \@links);
  $dbh->disconnect();
  say CLEAR, GREEN, "Parse complete", RESET;
  ## cleanup the found links
  # Removes every link that matches one of the ignore patterns. Matching is
  # case-insensitive.
  #
  #   $dbh                - database handle; accepted but not used here
  #   $linkArray          - array ref with the links to filter
  #   $urlStringsToIgnore - array ref with the patterns to search for
  #
  # Returns the remaining links as a list. The passed arrays are not modified.
  sub cleanLinks {
      my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
      my @linkArray = @{ $linkArray };

      sayYellow "Clean found links: " . scalar @linkArray;
      foreach my $toSearch (@{ $urlStringsToIgnore }) {
          # An empty pattern does not mean "match nothing" in Perl: it can
          # re-use the last successfully matched pattern. Skip such entries.
          next unless defined $toSearch && length $toSearch;

          sayYellow "Clean links from: " . $toSearch;

          # Compile once per entry. An entry that is not a valid regex is
          # matched as a literal string instead of aborting the whole run.
          my $pattern = eval { qr/$toSearch/i } // qr/\Q$toSearch\E/i;
          @linkArray = grep { $_ !~ $pattern } @linkArray;
      }
      sayGreen "Cleaned found links: " . scalar @linkArray;

      return @linkArray;
  }
  ## update the DB with the new found links
  # Inserts every valid http/https link into `url_to_fetch`. The row id is
  # the MD5 hex digest of the link, and the base url is "scheme://host" of
  # the link. The statement is INSERT IGNORE, so the database skips
  # conflicting rows.
  #
  #   $dbh   - connected database handle
  #   $links - array ref with the links to insert
  #
  # Returns a true value.
  sub insertIntoDb {
      my ($dbh, $links) = @_;
      my @links = @{ $links };

      sayYellow "Insert links into DB: " . scalar @links;
      # nothing to do, so do not prepare a statement
      return 1 unless @links;

      # Lexicals of this sub; the file-level query variables stay untouched.
      my $insertSql = "INSERT IGNORE INTO `url_to_fetch` SET
          `id` = ?,
          `url` = ?,
          `baseurl` = ?,
          `created` = NOW()";
      sayLog $insertSql if $DEBUG;
      my $insert = $dbh->prepare($insertSql);

      foreach my $link (@links) {
          sayLog $link if $DEBUG;

          if (!is_uri($link)) {
              sayYellow "Ignore URL it is invalid: $link";
              next;
          }

          my $url    = url($link);
          my $scheme = $url->scheme;
          if (!defined($scheme) || ($scheme ne "http" && $scheme ne "https")) {
              sayYellow "Ignore URL because of scheme: $link";
              next;
          }

          $insert->execute(md5_hex($link), $link, $scheme . "://" . $url->host);
      }
      $insert->finish();

      return 1;
  }
  153. }