+https://www.bananas-playground.net/projekt/aranea
+
A small web crawler named aranea (Latin for spider).
The aim is to gather unique domains to show what is out there.
# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
# along with this program. If not, see http://www.sun.com/cddl/cddl.html
#
-# 2022 https://://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
use 5.20.0;
use strict;
UA_LANG="en-US"
UA_CACHE="no-cache"
+FETCH_URLS_PER_RUN=5000
FETCH_URLS_PER_PACKAGE=30
PARSE_FILES_PER_PACKAGE=50
CLEANUP_URLS_AMOUNT_ABOVE=40
\ No newline at end of file
# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
# along with this program. If not, see http://www.sun.com/cddl/cddl.html
#
-# 2022 https://://www.bananas-playground.net
-
+# 2022 https://www.bananas-playground.net/projekt/aranea
use 5.20.0;
use strict;
WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
OR `last_fetched` IS NULL
AND `fetch_failed` = 0
- LIMIT 5000");
+ LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
$query->execute();
while(my @row = $query->fetchrow_array) {
$urlsToFetch{$row[0]} = $row[1];
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+#
+# You should have received a copy of the
+# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+# along with this program. If not, see http://www.sun.com/cddl/cddl.html
+#
+# 2022 https://www.bananas-playground.net/projekt/aranea
+
package Aranea::Common;
use 5.20.0;
use strict;
# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
# along with this program. If not, see http://www.sun.com/cddl/cddl.html
#
-# 2022 https://://www.bananas-playground.net
+# 2022 https://www.bananas-playground.net/projekt/aranea
use 5.20.0;
use strict;
(51, 'bitcoin:', '2022-01-16 19:48:41'),
(52, 'webcal:', '2022-05-08 09:39:02'),
(53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone', '2022-05-08 09:44:19');
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
--
-- Indexes for dumped tables
-- AUTO_INCREMENT for table `url_to_ignore`
--
ALTER TABLE `url_to_ignore`
- MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=55;
+ MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
COMMIT;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;