Browse Source

project cleanup and updated project website links

Banana 2 years ago
parent
commit
cfdca6000e
7 changed files with 19 additions and 7 deletions
  1. 2 0
      README
  2. 1 1
      cleanup.pl
  3. 1 0
      config.txt
  4. 2 3
      fetch.pl
  5. 9 0
      lib/Aranea/Common.pm
  6. 1 1
      parse-results.pl
  7. 3 2
      setup.sql

+ 2 - 0
README

@@ -1,3 +1,5 @@
+https://www.bananas-playground.net/projekt/aranea
+
 A small web crawler named aranea (Latin for spider).
 A small web crawler named aranea (Latin for spider).
 The aim is to gather unique domains to show what is out there.
 The aim is to gather unique domains to show what is out there.
 
 

+ 1 - 1
cleanup.pl

@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 
 use 5.20.0;
 use 5.20.0;
 use strict;
 use strict;

+ 1 - 0
config.txt

@@ -9,6 +9,7 @@ UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;
 UA_LANG="en-US"
 UA_LANG="en-US"
 UA_CACHE="no-cache"
 UA_CACHE="no-cache"
 
 
+FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=30
 FETCH_URLS_PER_PACKAGE=30
 PARSE_FILES_PER_PACKAGE=50
 PARSE_FILES_PER_PACKAGE=50
 CLEANUP_URLS_AMOUNT_ABOVE=40
 CLEANUP_URLS_AMOUNT_ABOVE=40

+ 2 - 3
fetch.pl

@@ -7,8 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
 #
-# 2022 https://www.bananas-playground.net
-
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 
 use 5.20.0;
 use 5.20.0;
 use strict;
 use strict;
@@ -49,7 +48,7 @@ my $query = $dbh->prepare("SELECT `id`, `url`
 							WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
 							WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
 								OR `last_fetched` IS NULL
 								OR `last_fetched` IS NULL
 								AND `fetch_failed` = 0
 								AND `fetch_failed` = 0
-							LIMIT 5000");
+							LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 $query->execute();
 $query->execute();
 while(my @row = $query->fetchrow_array) {
 while(my @row = $query->fetchrow_array) {
 	$urlsToFetch{$row[0]} = $row[1];
 	$urlsToFetch{$row[0]} = $row[1];

+ 9 - 0
lib/Aranea/Common.pm

@@ -1,3 +1,12 @@
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+#
+# You should have received a copy of the
+# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+# along with this program.  If not, see http://www.sun.com/cddl/cddl.html
+#
+# 2022 https://www.bananas-playground.net/projekt/aranea
+
 package Aranea::Common;
 package Aranea::Common;
 use 5.20.0;
 use 5.20.0;
 use strict;
 use strict;

+ 1 - 1
parse-results.pl

@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 
 use 5.20.0;
 use 5.20.0;
 use strict;
 use strict;

+ 3 - 2
setup.sql

@@ -113,7 +113,8 @@ INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
 (51, 'bitcoin:', '2022-01-16 19:48:41'),
 (51, 'bitcoin:', '2022-01-16 19:48:41'),
 (52, 'webcal:', '2022-05-08 09:39:02'),
 (52, 'webcal:', '2022-05-08 09:39:02'),
 (53, 'source:', '2022-05-08 09:43:19'),
 (53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone', '2022-05-08 09:44:19');
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
 
 
 --
 --
 -- Indexes for dumped tables
 -- Indexes for dumped tables
@@ -156,7 +157,7 @@ ALTER TABLE `unique_domain`
 -- AUTO_INCREMENT for table `url_to_ignore`
 -- AUTO_INCREMENT for table `url_to_ignore`
 --
 --
 ALTER TABLE `url_to_ignore`
 ALTER TABLE `url_to_ignore`
-  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=55;
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
 COMMIT;
 COMMIT;
 
 
 /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
 /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;