cleanup.pl 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. #!/usr/bin/perl -w
  2. # This program is free software: you can redistribute it and/or modify
  3. # it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  4. #
  5. # You should have received a copy of the
  6. # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
  7. # along with this program. If not, see http://www.sun.com/cddl/cddl.html
  8. #
  9. # 2022 https://://www.bananas-playground.net/projekt/aranea
  10. use 5.20.0;
  11. use strict;
  12. use warnings;
  13. use utf8;
  14. use Data::Dumper;
  15. use Term::ANSIColor qw(:constants);
  16. use lib './lib';
  17. use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
  18. use DBI;
  19. use ConfigReader::Simple;
  20. use URI::URL;
  21. use Data::Validate::URI qw(is_uri);
  22. my $DEBUG = 0;
  23. my $config = ConfigReader::Simple->new("config.txt");
  24. die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
  25. ## DB connection
  26. my %dbAttr = (
  27. PrintError=>0,# turn off error reporting via warn()
  28. RaiseError=>1 # turn on error reporting via die()
  29. );
  30. my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
  31. my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
  32. die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
  33. # update the uniqe domains
  34. my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
  35. sayLog($queryStr) if $DEBUG;
  36. my $query = $dbh->prepare($queryStr);
  37. $query->execute();
  38. # now validate the unique ones
  39. $queryStr = "SELECT `id`, `url` FROM unique_domain";
  40. sayLog($queryStr) if $DEBUG;
  41. $query = $dbh->prepare($queryStr);
  42. $query->execute();
  43. my @invalidUrls = ();
  44. my @toBeDeletedFromFetchAgain = ();
  45. while(my @row = $query->fetchrow_array) {
  46. my $link = $row[1];
  47. my $id = $row[0];
  48. if(!is_uri($link)) {
  49. sayYellow "Ignore URL it is invalid: $link";
  50. push(@invalidUrls, $id);
  51. push(@toBeDeletedFromFetchAgain, $link);
  52. next;
  53. }
  54. my $url = url($link);
  55. if(!defined($url->scheme) || index($url->scheme,"http") == -1) {
  56. sayYellow "Ignore URL because of scheme: $link";
  57. push(@invalidUrls, $id);
  58. push(@toBeDeletedFromFetchAgain, $link);
  59. next;
  60. }
  61. }
  62. sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
  63. $queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
  64. sayLog($queryStr) if $DEBUG;
  65. $query = $dbh->prepare($queryStr);
  66. foreach my $invalidId (@invalidUrls) {
  67. $query->execute($invalidId);
  68. $query->finish();
  69. sayLog "Removed $invalidId from unique_domain" if $DEBUG;
  70. }
  71. sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
  72. # remove urls from fetch since we have enough already
  73. $queryStr = "SELECT count(baseurl) AS amount, baseurl
  74. FROM `url_to_fetch`
  75. WHERE last_fetched <> 0
  76. GROUP BY baseurl
  77. HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
  78. sayLog($queryStr) if $DEBUG;
  79. $query = $dbh->prepare($queryStr);
  80. $query->execute();
  81. while(my @row = $query->fetchrow_array) {
  82. my $baseUrl = $row[1];
  83. push(@toBeDeletedFromFetchAgain, $baseUrl);
  84. }
  85. $query->finish();
  86. sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
  87. $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
  88. sayLog($queryStr) if $DEBUG;
  89. $query = $dbh->prepare($queryStr);
  90. foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
  91. $query->execute($baseUrl);
  92. $query->finish();
  93. sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
  94. }
  95. sayGreen "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
  96. # remove failed fetches
  97. sayYellow "Remove fetch_failed";
  98. $queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
  99. $query = $dbh->prepare($queryStr);
  100. $query->execute();
  101. sayGreen "Remove fetch_failed done";
  102. sayYellow "Remove invalid urls which the is_uri check does let pass";
  103. $queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
  104. $query = $dbh->prepare($queryStr);
  105. $query->execute();
  106. $queryStr = "SELECT * FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
  107. $query = $dbh->prepare($queryStr);
  108. $query->execute();
  109. sayYellow "Remove invalid urls done";
  110. sayGreen "Cleanup complete";