91.132.146.200 Git - aranea.git/commitdiff
project cleanup and updated project website links
author Banana <mail@bananas-playground.net>
Sun, 8 May 2022 08:07:04 +0000 (10:07 +0200)
committer Banana <mail@bananas-playground.net>
Sun, 8 May 2022 08:07:04 +0000 (10:07 +0200)
README
cleanup.pl
config.txt
fetch.pl
lib/Aranea/Common.pm
parse-results.pl
setup.sql

diff --git a/README b/README
index 2f628fe920746fe33683d8cec9e80091cd1b51a3..7a1866105941b907e52e2480dde10f948f06f637 100644 (file)
--- a/README
+++ b/README
@@ -1,3 +1,5 @@
+https://www.bananas-playground.net/projekt/aranea
+
 A small web crawler named aranea (Latin for spider).
 The aim is to gather unique domains to show what is out there.
 
diff --git a/cleanup.pl b/cleanup.pl
index 339987abeeeb8a35eaa9a01a55618181b78773c3..eee4432b7df0afb8a6440d6574e1029c2997cc48 100644 (file)
--- a/cleanup.pl
+++ b/cleanup.pl
@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;
diff --git a/config.txt b/config.txt
index 1473c12144e4a6b4904c5a70c9b2044ddf898f9b..cd2c4317e0fcdf7a1d7d59fed995d16fd72c5369 100644 (file)
--- a/config.txt
+++ b/config.txt
@@ -9,6 +9,7 @@ UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;
 UA_LANG="en-US"
 UA_CACHE="no-cache"
 
+FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=30
 PARSE_FILES_PER_PACKAGE=50
 CLEANUP_URLS_AMOUNT_ABOVE=40
\ No newline at end of file
index 04e7e576dda9cfa1149f77acac629366525cf866..663fd8640083246b362be1a8c85c9c5ee22faa89 100644 (file)
--- a/fetch.pl
+++ b/fetch.pl
@@ -7,8 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
-
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;
@@ -49,7 +48,7 @@ my $query = $dbh->prepare("SELECT `id`, `url`
                                                        WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
                                                                OR `last_fetched` IS NULL
                                                                AND `fetch_failed` = 0
-                                                       LIMIT 5000");
+                                                       LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 $query->execute();
 while(my @row = $query->fetchrow_array) {
        $urlsToFetch{$row[0]} = $row[1];
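diff --git a/lib/Aranea/Common.pm b/lib/Aranea/Common.pm
index 4d8ec9a9cc547d4ac3a5b464e7f2502ea64033e9..3bce2c5585ccf8e4a7f3082aa1c83ace691bf35b 100644 (file)
--- a/lib/Aranea/Common.pm
+++ b/lib/Aranea/Common.pm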
@@ -1,3 +1,12 @@
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+#
+# You should have received a copy of the
+# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+# along with this program.  If not, see http://www.sun.com/cddl/cddl.html
+#
+# 2022 https://www.bananas-playground.net/projekt/aranea
+
 package Aranea::Common;
 use 5.20.0;
 use strict;
diff --git a/parse-results.pl b/parse-results.pl
index 832521849363b79576f1d2e16fde4b66cdf35dbb..e66188f64e79ddee20f1126ba21c90fa1f9a839f 100644 (file)
--- a/parse-results.pl
+++ b/parse-results.pl
@@ -7,7 +7,7 @@
 # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
 # along with this program.  If not, see http://www.sun.com/cddl/cddl.html
 #
-# 2022 https://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
 
 use 5.20.0;
 use strict;
index b2e37dec1ecd1818afdb133274b37e18dfb9ded3..7a349e6cc34a401b9d6485bf84bc9b363ae75c66 100644 (file)
--- a/setup.sql
+++ b/setup.sql
@@ -113,7 +113,8 @@ INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
 (51, 'bitcoin:', '2022-01-16 19:48:41'),
 (52, 'webcal:', '2022-05-08 09:39:02'),
 (53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone', '2022-05-08 09:44:19');
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
 
 --
 -- Indexes for dumped tables
@@ -156,7 +157,7 @@ ALTER TABLE `unique_domain`
 -- AUTO_INCREMENT for table `url_to_ignore`
 --
 ALTER TABLE `url_to_ignore`
-  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=55;
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
 COMMIT;
 
 /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;