From cfdca6000e1fb108da902cebb34ee94124105553 Mon Sep 17 00:00:00 2001 From: Banana Date: Sun, 8 May 2022 10:07:04 +0200 Subject: [PATCH] project cleanup and updated project website links --- README | 2 ++ cleanup.pl | 2 +- config.txt | 1 + fetch.pl | 5 ++--- lib/Aranea/Common.pm | 9 +++++++++ parse-results.pl | 2 +- setup.sql | 5 +++-- 7 files changed, 19 insertions(+), 7 deletions(-) diff --git a/README b/README index 2f628fe..7a18661 100644 --- a/README +++ b/README @@ -1,3 +1,5 @@ +https://www.bananas-playground.net/projekt/aranea + A small web crawler named aranea (Latin for spider). The aim is to gather unique domains to show what is out there. diff --git a/cleanup.pl b/cleanup.pl index 339987a..eee4432 100644 --- a/cleanup.pl +++ b/cleanup.pl @@ -7,7 +7,7 @@ # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 # along with this program. If not, see http://www.sun.com/cddl/cddl.html # -# 2022 https://www.bananas-playground.net +# 2022 https://www.bananas-playground.net/projekt/aranea use 5.20.0; use strict; diff --git a/config.txt b/config.txt index 1473c12..cd2c431 100644 --- a/config.txt +++ b/config.txt @@ -9,6 +9,7 @@ UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*; UA_LANG="en-US" UA_CACHE="no-cache" +FETCH_URLS_PER_RUN=5000 FETCH_URLS_PER_PACKAGE=30 PARSE_FILES_PER_PACKAGE=50 CLEANUP_URLS_AMOUNT_ABOVE=40 \ No newline at end of file diff --git a/fetch.pl b/fetch.pl index 04e7e57..663fd86 100644 --- a/fetch.pl +++ b/fetch.pl @@ -7,8 +7,7 @@ # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 # along with this program. 
If not, see http://www.sun.com/cddl/cddl.html # -# 2022 https://www.bananas-playground.net - +# 2022 https://www.bananas-playground.net/projekt/aranea use 5.20.0; use strict; @@ -49,7 +48,7 @@ my $query = $dbh->prepare("SELECT `id`, `url` WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK OR `last_fetched` IS NULL AND `fetch_failed` = 0 - LIMIT 5000"); + LIMIT ".$config->get("FETCH_URLS_PER_RUN")); $query->execute(); while(my @row = $query->fetchrow_array) { $urlsToFetch{$row[0]} = $row[1]; diff --git a/lib/Aranea/Common.pm b/lib/Aranea/Common.pm index 4d8ec9a..3bce2c5 100644 --- a/lib/Aranea/Common.pm +++ b/lib/Aranea/Common.pm @@ -1,3 +1,12 @@ +# This program is free software: you can redistribute it and/or modify +# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE +# +# You should have received a copy of the +# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 +# along with this program. If not, see http://www.sun.com/cddl/cddl.html +# +# 2022 https://www.bananas-playground.net/projekt/aranea + package Aranea::Common; use 5.20.0; use strict; diff --git a/parse-results.pl b/parse-results.pl index 8325218..e66188f 100644 --- a/parse-results.pl +++ b/parse-results.pl @@ -7,7 +7,7 @@ # COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 # along with this program. 
If not, see http://www.sun.com/cddl/cddl.html # -# 2022 https://www.bananas-playground.net +# 2022 https://www.bananas-playground.net/projekt/aranea use 5.20.0; use strict; diff --git a/setup.sql b/setup.sql index b2e37de..7a349e6 100644 --- a/setup.sql +++ b/setup.sql @@ -113,7 +113,8 @@ INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES (51, 'bitcoin:', '2022-01-16 19:48:41'), (52, 'webcal:', '2022-05-08 09:39:02'), (53, 'source:', '2022-05-08 09:43:19'), -(54, 'phone', '2022-05-08 09:44:19'); +(54, 'phone:', '2022-05-08 09:44:19'), +(55, 'threema:', '2022-05-08 09:45:19'); -- -- Indexes for dumped tables @@ -156,7 +157,7 @@ ALTER TABLE `unique_domain` -- AUTO_INCREMENT for table `url_to_ignore` -- ALTER TABLE `url_to_ignore` - MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=55; + MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56; COMMIT; /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; -- 2.39.5