cleanup.pl 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #!/usr/bin/perl -w
  2. # 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
  15. use 5.20.0;
  16. use strict;
  17. use warnings;
  18. use utf8;
  19. use Data::Dumper;
  20. use Term::ANSIColor qw(:constants);
  21. use lib './lib';
  22. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  23. use DBI;
  24. use ConfigReader::Simple;
  25. use URI::URL;
  26. use Data::Validate::URI qw(is_uri);
  # When true, the SQL statements and the per-row actions are passed to sayLog.
  my $DEBUG = 0;

  # Settings are read from config.txt relative to the current working directory.
  my $config = ConfigReader::Simple->new("config.txt");
  die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;

  ## DB connection
  my %dbAttr = (
      PrintError => 0,    # turn off error reporting via warn()
      RaiseError => 1,    # turn on error reporting via die()
  );
  my $dbDsn = "DBI:mysql:database=" . $config->get("DB_NAME")
      . ";host=" . $config->get("DB_HOST")
      . ";port=" . $config->get("DB_PORT");
  my $dbh = DBI->connect($dbDsn, $config->get("DB_USER"), $config->get("DB_PASS"), \%dbAttr);

  # A method call is not interpolated inside a double-quoted string, so the
  # DBI error text has to be concatenated to actually appear in the message.
  # NOTE(review): with RaiseError enabled connect() is expected to die on its
  # own; this guard is kept as a fallback.
  die "failed to connect to MySQL database: " . (DBI->errstr // 'unknown error') . "\n"
      unless $dbh;
  # update the unique domains
  # Copies every distinct baseurl of the url_to_fetch rows with fetch_failed = 0
  # into unique_domain.
  # NOTE(review): INSERT IGNORE presumably relies on a unique key on
  # unique_domain.url to skip already known entries - confirm against the schema.
  my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
  if ($DEBUG) {
      sayLog($queryStr);
  }
  my $query = $dbh->prepare($queryStr);
  $query->execute;
  # now validate the unique ones
  # Walks over every unique_domain row and collects the entries which are not
  # a valid URI or which do not use the http/https scheme.
  $queryStr = "SELECT `id`, `url` FROM unique_domain";
  sayLog($queryStr) if $DEBUG;
  $query = $dbh->prepare($queryStr);
  $query->execute();

  # Ids of the unique_domain rows which failed the validation.
  my @invalidUrls = ();
  # Base urls whose rows are removed from url_to_fetch further down.
  my @toBeDeletedFromFetchAgain = ();

  while (my ($id, $link) = $query->fetchrow_array) {
      my $reason;

      if (!is_uri($link)) {
          $reason = "Ignore URL it is invalid: $link";
      }
      else {
          # Match the complete scheme. A substring search for "http" would
          # also accept unrelated schemes which merely contain those letters.
          my $scheme = url($link)->scheme;
          if (!defined $scheme || $scheme !~ /\Ahttps?\z/i) {
              $reason = "Ignore URL because of scheme: $link";
          }
      }

      next unless defined $reason;

      sayYellow($reason);
      push(@invalidUrls, $id);
      push(@toBeDeletedFromFetchAgain, $link);
  }
  # Delete the unique_domain rows collected in @invalidUrls, one id at a time,
  # reusing a single prepared statement.
  sayYellow("Invalid unique_domain: " . scalar(@invalidUrls));
  $queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
  if ($DEBUG) {
      sayLog($queryStr);
  }
  $query = $dbh->prepare($queryStr);
  for my $invalidId (@invalidUrls) {
      $query->execute($invalidId);
      #$query->finish();
      next unless $DEBUG;
      sayLog("Removed $invalidId from unique_domain");
  }
  sayGreen("Invalid unique_domain removed: " . scalar(@invalidUrls));
  # remove urls from fetch since we have enough already
  # Collects every baseurl which has more already fetched rows
  # (last_fetched <> 0) in url_to_fetch than CLEANUP_URLS_AMOUNT_ABOVE allows.
  # The threshold is validated and handed over as a bind value, so a missing
  # or malformed config entry fails with a clear message instead of producing
  # a broken SQL statement.
  my $cleanupAbove = $config->get("CLEANUP_URLS_AMOUNT_ABOVE");
  die "CLEANUP_URLS_AMOUNT_ABOVE in config.txt must be a non-negative integer\n"
      unless defined $cleanupAbove && $cleanupAbove =~ /\A\s*\d+\s*\z/;

  $queryStr = "SELECT count(baseurl) AS amount, baseurl
      FROM `url_to_fetch`
      WHERE last_fetched <> 0
      GROUP BY baseurl
      HAVING amount > ?";
  sayLog($queryStr) if $DEBUG;
  $query = $dbh->prepare($queryStr);
  # "+ 0" hands the threshold over as a number rather than as a string.
  $query->execute($cleanupAbove + 0);
  while (my @row = $query->fetchrow_array) {
      # Column 1 is the baseurl, column 0 the row count which is not needed here.
      my $baseUrl = $row[1];
      push(@toBeDeletedFromFetchAgain, $baseUrl);
  }
  #$query->finish();
  # Delete all url_to_fetch rows of every baseurl collected in
  # @toBeDeletedFromFetchAgain, reusing a single prepared statement.
  sayYellow("Remove baseurls from url_to_fetch: " . scalar(@toBeDeletedFromFetchAgain));
  $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
  if ($DEBUG) {
      sayLog($queryStr);
  }
  $query = $dbh->prepare($queryStr);
  for my $baseUrl (@toBeDeletedFromFetchAgain) {
      $query->execute($baseUrl);
      #$query->finish();
      next unless $DEBUG;
      sayLog("Removed $baseUrl from url_to_fetch");
  }
  sayGreen("Removed baseurls from url_to_fetch: " . scalar(@toBeDeletedFromFetchAgain));
  # remove failed fetches
  # Drops every url_to_fetch row which is flagged with fetch_failed = 1.
  sayYellow("Remove fetch_failed");
  $queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
  $query    = $dbh->prepare($queryStr);
  $query->execute;
  sayGreen("Remove fetch_failed done");
  # Final sweep done directly in SQL: unique_domain entries without any dot
  # in the url and url_to_fetch entries with a space in the baseurl are removed.
  sayYellow("Remove invalid urls which the is_uri check does let pass");

  $queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
  $query    = $dbh->prepare($queryStr);
  $query->execute;

  $queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
  $query    = $dbh->prepare($queryStr);
  $query->execute;

  sayYellow("Remove invalid urls done");
  sayGreen("Cleanup complete");