Преглед на файлове

project cleanup and updated project website links

Banana преди 2 години
родител
ревизия
cfdca6000e
променени са 7 файла, в които са добавени 19 реда и са изтрити 7 реда
  1. 2 0
      README
  2. 1 1
      cleanup.pl
  3. 1 0
      config.txt
  4. 2 3
      fetch.pl
  5. 9 0
      lib/Aranea/Common.pm
  6. 1 1
      parse-results.pl
  7. 3 2
      setup.sql

+ 2 - 0
README

@@ -1,3 +1,5 @@
+https://www.bananas-playground.net/projekt/aranea
+
 A small web crawler named aranea (Latin for spider).
 The aim is to gather unique domains to show what is out there.
 

+ 1 - 1
cleanup.pl

@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;

+ 1 - 0
config.txt

@@ -9,6 +9,7 @@ UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;
 UA_LANG="en-US"
 UA_CACHE="no-cache"
 
+FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=30
 PARSE_FILES_PER_PACKAGE=50
 CLEANUP_URLS_AMOUNT_ABOVE=40

+ 2 - 3
fetch.pl

@@ -7,8 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
-
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;
@@ -49,7 +48,7 @@ my $query = $dbh->prepare("SELECT `id`, `url`
 							WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
 								OR `last_fetched` IS NULL
 								AND `fetch_failed` = 0
-							LIMIT 5000");
+							LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 $query->execute();
 while(my @row = $query->fetchrow_array) {
 	$urlsToFetch{$row[0]} = $row[1];

+ 9 - 0
lib/Aranea/Common.pm

@@ -1,3 +1,12 @@
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+#
+# You should have received a copy of the
+# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+# along with this program.  If not, see http://www.sun.com/cddl/cddl.html
+#
+# 2022 https://www.bananas-playground.net/projekt/aranea
+
 package Aranea::Common;
 use 5.20.0;
 use strict;

+ 1 - 1
parse-results.pl

@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;

+ 3 - 2
setup.sql

@@ -113,7 +113,8 @@ INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
 (51, 'bitcoin:', '2022-01-16 19:48:41'),
 (52, 'webcal:', '2022-05-08 09:39:02'),
 (53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone', '2022-05-08 09:44:19');
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
 
 --
 -- Indexes for dumped tables
@@ -156,7 +157,7 @@ ALTER TABLE `unique_domain`
 -- AUTO_INCREMENT for table `url_to_ignore`
 --
 ALTER TABLE `url_to_ignore`
-  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=55;
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
 COMMIT;
 
 /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;