and parts of domains which will be ignored. Adding a global spam list would be overkill.
A good approach is to run it behind a DNS filter that uses a good blocklist.
+
+# Web interface
+
+The folder `webroot` contains a web interface which displays the gathered data and status.
+It does not provide a way to execute the crawler.
+++ /dev/null
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use DBI;
-use ConfigReader::Simple;
-use URI::URL;
-use Data::Validate::URI qw(is_uri);
-
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-## DB connection
-my %dbAttr = (
- PrintError=>0,# turn off error reporting via warn()
- RaiseError=>1 # turn on error reporting via die()
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-
-# update the unique domains
-my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
-sayLog($queryStr) if $DEBUG;
-my $query = $dbh->prepare($queryStr);
-$query->execute();
-
-# now validate the unique ones
-$queryStr = "SELECT `id`, `url` FROM unique_domain";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-my @invalidUrls = ();
-my @toBeDeletedFromFetchAgain = ();
-while(my @row = $query->fetchrow_array) {
- my $link = $row[1];
- my $id = $row[0];
- if(!is_uri($link)) {
- sayYellow "Ignore URL it is invalid: $link";
- push(@invalidUrls, $id);
- push(@toBeDeletedFromFetchAgain, $link);
- next;
- }
-
- my $url = url($link);
- if(!defined($url->scheme) || index($url->scheme,"http") == -1) {
- sayYellow "Ignore URL because of scheme: $link";
- push(@invalidUrls, $id);
- push(@toBeDeletedFromFetchAgain, $link);
- next;
- }
-}
-
-sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
-$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-foreach my $invalidId (@invalidUrls) {
- $query->execute($invalidId);
- #$query->finish();
- sayLog "Removed $invalidId from unique_domain" if $DEBUG;
-}
-sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
-
-
-# remove urls from fetch since we have enough already
-$queryStr = "SELECT count(baseurl) AS amount, baseurl
- FROM `url_to_fetch`
- WHERE last_fetched <> 0
- GROUP BY baseurl
- HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-while(my @row = $query->fetchrow_array) {
- my $baseUrl = $row[1];
- push(@toBeDeletedFromFetchAgain, $baseUrl);
-}
-#$query->finish();
-
-sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
-$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
- $query->execute($baseUrl);
- #$query->finish();
- sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
-}
-sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
-
-# remove failed fetches
-sayYellow "Remove fetch_failed";
-$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-sayGreen "Remove fetch_failed done";
-
-sayYellow "Remove invalid urls which the is_uri check does let pass";
-$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-sayYellow "Remove invalid urls done";
-
-
-sayGreen "Cleanup complete";
+++ /dev/null
-DB_HOST=localhost
-DB_PORT=3306
-DB_NAME=aranea
-DB_USER=user
-DB_PASS=test
-
-UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
-UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
-UA_LANG="en-US"
-UA_CACHE="no-cache"
-UA_TIMEOUT=5
-
-FETCH_URLS_PER_RUN=5000
-FETCH_URLS_PER_PACKAGE=30
-PARSE_FILES_PER_PACKAGE=50
-CLEANUP_URLS_AMOUNT_ABOVE=40
-MAX_BYTES_PER_PAGE=5000000
--- /dev/null
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use DBI;
+use ConfigReader::Simple;
+use URI::URL;
+use Data::Validate::URI qw(is_uri);
+
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+ PrintError=>0,# turn off error reporting via warn()
+ RaiseError=>1 # turn on error reporting via die()
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+
+# update the unique domains
+my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute();
+
+# now validate the unique ones
+$queryStr = "SELECT `id`, `url` FROM unique_domain";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+my @invalidUrls = ();
+my @toBeDeletedFromFetchAgain = ();
+while(my @row = $query->fetchrow_array) {
+ my $link = $row[1];
+ my $id = $row[0];
+ if(!is_uri($link)) {
+		sayYellow "Ignoring invalid URL: $link";
+ push(@invalidUrls, $id);
+ push(@toBeDeletedFromFetchAgain, $link);
+ next;
+ }
+
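+	# keep only http(s) URLs: index() accepts any scheme containing "http", which covers both http and https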
+ my $url = url($link);
+ if(!defined($url->scheme) || index($url->scheme,"http") == -1) {
+ sayYellow "Ignore URL because of scheme: $link";
+ push(@invalidUrls, $id);
+ push(@toBeDeletedFromFetchAgain, $link);
+ next;
+ }
+}
+
+sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
+$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $invalidId (@invalidUrls) {
+ $query->execute($invalidId);
+ #$query->finish();
+ sayLog "Removed $invalidId from unique_domain" if $DEBUG;
+}
+sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
+
+
+# collect baseurls that already have more than CLEANUP_URLS_AMOUNT_ABOVE fetched URLs; their entries will be removed from url_to_fetch
+$queryStr = "SELECT count(baseurl) AS amount, baseurl
+ FROM `url_to_fetch`
+ WHERE last_fetched <> 0
+ GROUP BY baseurl
+ HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+ my $baseUrl = $row[1];
+ push(@toBeDeletedFromFetchAgain, $baseUrl);
+}
+#$query->finish();
+
+sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
+ $query->execute($baseUrl);
+ #$query->finish();
+ sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
+}
+sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+
+# remove failed fetches
+sayYellow "Remove fetch_failed";
+$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayGreen "Remove fetch_failed done";
+
+sayYellow "Remove invalid urls which the is_uri check does let pass";
+$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayYellow "Remove invalid urls done";
+
+
+sayGreen "Cleanup complete";
--- /dev/null
+DB_HOST=localhost
+DB_PORT=3306
+DB_NAME=aranea
+DB_USER=user
+DB_PASS=test
+
+UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
+UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
+UA_LANG="en-US"
+UA_CACHE="no-cache"
+UA_TIMEOUT=5
+
+FETCH_URLS_PER_RUN=5000
+FETCH_URLS_PER_PACKAGE=30
+PARSE_FILES_PER_PACKAGE=50
+CLEANUP_URLS_AMOUNT_ABOVE=40
+MAX_BYTES_PER_PAGE=5000000
--- /dev/null
+# Requirements
+
+Please check the requirements file first.
+
+# Database
+
+You need a MySQL installation and an existing database.
+
+Use `setup.sql` to create the tables in your existing database: `mysql --user=user -p databasename < setup.sql`
+
+# Config
+
+Copy `config.default.txt` to `config.txt` and edit it to match at least the database name and server settings.
+
+Make sure the directory `storage` is writable.
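+For example, assuming the crawler runs as your own user: `chmod u+rwx storage`.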
--- /dev/null
+# MySQL
+
+Tested with a MySQL server 8.x or newer.
+
+# Perl modules
+
+Extra modules needed in addition to the ones usually already installed.
+
++ [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
+
+## Debian
+
+These are the packages that needed to be installed after a fresh Debian (stable) install. The list may vary.
+
++ libdatetime-perl
++ libdbi-perl
++ libconfigreader-simple-perl
++ libhtml-linkextractor-perl
++ libdata-validate-uri-perl
++ libdbd-mysql-perl
++ libwww-perl
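+
+On a Debian system they can be installed in one go (package names may vary between releases): `apt install libdatetime-perl libdbi-perl libconfigreader-simple-perl libhtml-linkextractor-perl libdata-validate-uri-perl libdbd-mysql-perl libwww-perl`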
--- /dev/null
+SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
+START TRANSACTION;
+SET time_zone = "+00:00";
+
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+
+--
+-- Table structure for table `unique_domain`
+--
+
+CREATE TABLE `unique_domain` (
+ `id` int(11) NOT NULL,
+ `url` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+ `created` datetime NOT NULL DEFAULT current_timestamp()
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_to_fetch`
+--
+
+CREATE TABLE `url_to_fetch` (
+ `id` char(32) COLLATE utf8mb4_bin NOT NULL,
+ `url` text COLLATE utf8mb4_bin NOT NULL,
+ `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+ `created` datetime NOT NULL,
+ `last_fetched` datetime DEFAULT NULL,
+ `fetch_failed` tinyint(1) NOT NULL DEFAULT 0
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_to_ignore`
+--
+
+CREATE TABLE `url_to_ignore` (
+ `id` int(11) NOT NULL,
+ `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+ `created` datetime NOT NULL DEFAULT current_timestamp()
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+--
+-- Dumping data for table `url_to_ignore`
+--
+
+INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
+(1, 'mailto:', '2022-01-05 10:46:10'),
+(2, 'javascript:', '2022-01-05 10:46:10'),
+(3, 'google.', '2022-01-05 10:46:29'),
+(4, 'amazon.', '2022-01-05 10:46:29'),
+(5, 'youtube.', '2022-01-05 10:46:47'),
+(6, '.onion', '2022-01-05 17:21:45'),
+(7, 'instagram.', '2022-01-05 20:15:21'),
+(8, 'twitter.', '2022-01-05 20:16:31'),
+(9, 'facebook.', '2022-01-05 20:16:31'),
+(10, 'skype:', '2022-01-05 21:29:53'),
+(11, 'xmpp:', '2022-01-05 21:30:22'),
+(12, 'tel:', '2022-01-05 21:30:50'),
+(13, 'fax:', '2022-01-05 21:30:50'),
+(14, 'whatsapp:', '2022-01-05 21:31:24'),
+(15, 'intent:', '2022-01-05 21:31:24'),
+(16, 'ftp:', '2022-01-05 21:33:34'),
+(17, 'youtu.', '2022-01-05 21:50:26'),
+(18, 'pinterest.', '2022-01-05 21:51:31'),
+(19, 'microsoft.', '2022-01-05 21:52:30'),
+(20, 'apple.', '2022-01-05 21:52:30'),
+(21, 'xing.', '2022-01-05 22:03:07'),
+(22, 'linked.', '2022-01-05 22:03:07'),
+(26, 't.co', '2022-01-05 22:05:07'),
+(27, 'tinyurl.', '2022-01-05 22:07:03'),
+(28, 'bitly.', '2022-01-05 22:07:03'),
+(29, 'bit.ly', '2022-01-05 22:07:23'),
+(30, 'wikipedia.', '2022-01-06 09:58:46'),
+(31, 'gstatic.', '2022-01-06 09:59:47'),
+(32, 'wikimedia.', '2022-01-06 10:00:20'),
+(33, 'goo.', '2022-01-06 10:02:11'),
+(34, 'cdn.', '2022-01-06 10:02:59'),
+(35, 'flickr.', '2022-01-06 10:05:46'),
+(36, '.mp3', '2022-01-07 13:11:49'),
+(40, '.aac', '2022-01-08 13:33:22'),
+(41, '.opus', '2022-01-08 13:33:22'),
+(42, 'awin1.', '2022-01-08 13:39:14'),
+(43, 'sms:', '2022-01-09 10:32:46'),
+(45, 'hhttps:', '2022-01-09 12:20:43'),
+(46, 'httpss:', '2022-01-09 13:12:34'),
+(47, 'soundcloud.', '2022-01-16 10:37:04'),
+(48, 'fb-messenger:', '2022-01-16 14:42:18'),
+(49, 'smartadserver.', '2022-01-16 16:48:46'),
+(50, 'ispgateway.', '2022-01-16 16:56:11'),
+(51, 'bitcoin:', '2022-01-16 19:48:41'),
+(52, 'webcal:', '2022-05-08 09:39:02'),
+(53, 'source:', '2022-05-08 09:43:19'),
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
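+
+-- The patterns above are matched as substrings (and regular expressions) against found URLs.
+-- Optional example of adding your own pattern (hypothetical value):
+-- INSERT INTO `url_to_ignore` (`searchfor`) VALUES ('tiktok.');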
+
+--
+-- Indexes for dumped tables
+--
+
+--
+-- Indexes for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+ ADD PRIMARY KEY (`id`),
+ ADD UNIQUE KEY `url` (`url`);
+
+--
+-- Indexes for table `url_to_fetch`
+--
+ALTER TABLE `url_to_fetch`
+ ADD PRIMARY KEY (`id`),
+ ADD KEY `baseurl` (`baseurl`),
+ ADD KEY `last_fetched` (`last_fetched`),
+ ADD KEY `fetch_failed` (`fetch_failed`);
+
+--
+-- Indexes for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+ ADD PRIMARY KEY (`id`),
+ ADD UNIQUE KEY `url` (`searchfor`);
+
+--
+-- AUTO_INCREMENT for dumped tables
+--
+
+--
+-- AUTO_INCREMENT for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+ MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+ MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
+COMMIT;
+
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
--- /dev/null
+# Upgrade information
+
+Each release has its own upgrade file.
+Follow the instructions in each file step by step, starting from your current version.
+
+Make sure paths and SQL table names are adjusted to your setup.
--- /dev/null
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use open qw( :std :encoding(UTF-8) );
+use DBI;
+use ConfigReader::Simple;
+use LWP::UserAgent;
+use HTTP::Request;
+
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+
+## DB connection
+my %dbAttr = (
+ PrintError=>0,# turn off error reporting via warn()
+ RaiseError=>1, # turn on error reporting via die()
+ AutoCommit=>0 # manually use transactions
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+## select the URLs to fetch from the table
+my %urlsToFetch;
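+# pick URLs never fetched or last fetched more than a month ago, skipping permanently failed ones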
+my $query = $dbh->prepare("SELECT `id`, `url`
+ FROM `url_to_fetch`
+	WHERE (`last_fetched` < NOW() - INTERVAL 1 MONTH
+	OR `last_fetched` IS NULL)
+	AND `fetch_failed` = 0
+ LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+ $urlsToFetch{$row[0]} = $row[1];
+}
+
+# ids of successful and failed fetches
+my @urlsFetched;
+my @urlsFailed;
+
+# configure the user agent for the request
+my $request_headers = [
+ 'User-Agent' => $config->get("UA_AGENT"),
+ 'Accept' => $config->get("UA_ACCEPT"),
+ 'Accept-Language' => $config->get("UA_LANG"),
+ 'Accept-Encoding' => HTTP::Message::decodable,
+ 'Cache-Control' => $config->get("UA_CACHE")
+];
+my $ua = LWP::UserAgent->new();
+$ua->timeout($config->get("UA_TIMEOUT"));
+$ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
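+# max_size makes LWP stop reading after MAX_BYTES_PER_PAGE bytes and set the Client-Aborted header, which is checked below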
+
+## now loop over them and store the results
+my $counter = 0;
+while ( my ($id, $url) = each %urlsToFetch ) {
+ sayYellow "Fetching: $id $url";
+
+ my $req = HTTP::Request->new(GET => $url, $request_headers);
+ my $res = $ua->request($req);
+ if ($res->is_success) {
+		# LWP aborted the download because max_size was exceeded
+ if($res->header('Client-Aborted')) {
+ sayYellow "Aborted, too big.";
+ next;
+ }
+ if(index($res->content_type, "text/html") == -1) {
+			sayYellow "Fetching: $id ignored. Not HTML";
+ push(@urlsFailed, $id);
+ next;
+ }
+ open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+ print $fh $res->decoded_content();
+ close($fh);
+ push(@urlsFetched, $id);
+		sayGreen "Fetching: $id ok";
+ }
+ else {
+		sayRed "Fetching: $id failed: ".$res->status_line;
+ push(@urlsFailed, $id);
+ }
+
+ if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
+ updateFetched($dbh, @urlsFetched);
+ updateFailed($dbh, @urlsFailed);
+ sleep(rand(7));
+
+ $counter = 0;
+ @urlsFetched = ();
+ @urlsFailed = ();
+ }
+
+ $counter++;
+}
+updateFetched($dbh, @urlsFetched);
+updateFailed($dbh, @urlsFailed);
+
+$dbh->disconnect();
+sayGreen "Fetch complete";
+
+
+
+## update last_fetched in the table
+sub updateFetched {
+ my ($dbh, @urls) = @_;
+ sayYellow "Update fetch timestamps: ".scalar @urls;
+ $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
+ foreach my $idToUpdate (@urls) {
+ sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
+ $query->bind_param(1,$idToUpdate);
+ $query->execute();
+ }
+ $dbh->commit();
+ sayGreen "Update fetch timestamps done";
+}
+
+## update fetch_failed in the table
+sub updateFailed {
+ my ($dbh, @urls) = @_;
+
+ sayYellow "Update fetch failed: ".scalar @urls;
+ $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
+ foreach my $idToUpdate (@urls) {
+ sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
+ $query->bind_param(1,$idToUpdate);
+ $query->execute();
+ }
+ $dbh->commit();
+ sayGreen "Update fetch failed done";
+}
--- /dev/null
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+package Aranea::Common;
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Term::ANSIColor qw(:constants);
+
+use DateTime;
+use Exporter qw(import);
+
+
+our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed);
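+# timestamped output helpers: sayLog prints plain DEBUG lines, the others print in color via Term::ANSIColor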
+
+sub sayLog {
+ my ($string) = @_;
+ my $dt = DateTime->now;
+ say "[".$dt->datetime."] DEBUG: ".$string;
+}
+
+sub sayYellow {
+ my ($string) = @_;
+ my $dt = DateTime->now;
+ say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayGreen {
+ my ($string) = @_;
+ my $dt = DateTime->now;
+ say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayRed {
+ my ($string) = @_;
+ my $dt = DateTime->now;
+ say BOLD, RED, "[".$dt->datetime."] ".$string, RESET;
+}
+
+1;
--- /dev/null
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use open qw( :std :encoding(UTF-8) );
+use DBI;
+use ConfigReader::Simple;
+use HTML::LinkExtor;
+use URI::URL;
+use File::Basename;
+use Digest::MD5 qw(md5_hex);
+use Data::Validate::URI qw(is_uri);
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+ PrintError=>0,# turn off error reporting via warn()
+ RaiseError=>1, # turn on error reporting via die()
+ AutoCommit=>0 # manually use transactions
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+## get the fetched files
+my @results = glob("storage/*.result");
+die "Nothing to parse. No files found." unless(@results);
+
+## build clean ids for query
+my @queryIds = @results;
+foreach (@queryIds) {
+	$_ =~ s/\.result$//;
+	$_ =~ s|^storage/||;
+}
+
+# get the baseurls
+my %baseUrls;
+my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute(@queryIds);
+while(my @row = $query->fetchrow_array) {
+ $baseUrls{$row[0]} = $row[1];
+}
+
+
+# get the strings to ignore
+my @urlStringsToIgnore;
+$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow) {
+	push(@urlStringsToIgnore, $row[0]);
+}
+
+
+## prepare linkExtor
+my @links = ();
+my @workingLinks = ();
+sub leCallback {
+ my($tag, %attr) = @_;
+ return if $tag ne 'a'; # we only look closer at <a ...>
+ push(@workingLinks, values %attr);
+}
+my $le = HTML::LinkExtor->new(\&leCallback);
+
+## now parse each file and get the links
+my $counter = 0;
+foreach my $resultFile (@results) {
+ sayYellow "Parsing file: $resultFile";
+
+ my $fileId = basename($resultFile,".result");
+
+ if (exists $baseUrls{$fileId}) {
+ sayYellow "Baseurl: $baseUrls{$fileId}";
+
+ $le->parse_file($resultFile);
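+		# resolve every extracted link against the page's baseurl so relative links become absolute URLs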
+ @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
+ push(@links,@workingLinks);
+
+ unlink($resultFile);
+ sayGreen "Parsing done: ".scalar @workingLinks;
+ }
+ else {
+ sayRed "No entry found for file $resultFile";
+ }
+
+ if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
+
+ @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+ insertIntoDb($dbh, \@links);
+
+ $counter = 0;
+ @links = ();
+ }
+
+ @workingLinks = ();
+ $counter++;
+}
+
+@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+insertIntoDb($dbh, \@links);
+
+$dbh->disconnect();
+sayGreen "Parse complete";
+
+
+## cleanup the found links
+sub cleanLinks {
+ my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
+ my @linkArray = @{ $linkArray };
+ my @urlStringsToIgnore = @{ $urlStringsToIgnore };
+
+ sayYellow "Clean found links: ".scalar @linkArray;
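+	# each ignore string is applied as a case-insensitive regular expression, so a dot matches any character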
+ foreach my $toSearch (@urlStringsToIgnore) {
+ sayYellow "Clean links from: ".$toSearch;
+ @linkArray = grep {!/$toSearch/i} @linkArray;
+ }
+ sayGreen "Cleaned found links: ".scalar @linkArray;
+
+ return @linkArray;
+}
+
+
+## update the DB with the new found links
+sub insertIntoDb {
+ my ($dbh, $links) = @_;
+ my @links = @{ $links };
+
+ sayYellow "Insert links into DB: ".scalar @links;
+ $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
+ `id` = ?,
+ `url` = ?,
+ `baseurl` = ?,
+ `created` = NOW()";
+ sayLog $queryStr if $DEBUG;
+ $query = $dbh->prepare($queryStr);
+ my $md5 = Digest::MD5->new;
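+	# the MD5 hex digest of the full URL is the row id, so INSERT IGNORE skips already known URLs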
+ my $counter = 0;
+ foreach my $link (@links) {
+
+ sayLog $link if ($DEBUG);
+
+ if(!is_uri($link)) {
+			sayYellow "Ignoring invalid URL: $link";
+ next;
+ }
+
+ my $url = url($link);
+ if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
+ sayYellow "Ignore URL because of scheme: $link";
+ next;
+ }
+
+ $md5->add($link);
+ my $digest = $md5->hexdigest;
+ $query->execute($digest, $link, $url->scheme."://".$url->host);
+ $md5->reset;
+
+ $counter++;
+
+ if($counter >= 500) {
+ $counter = 0;
+			sayYellow "Commit counter of 500 reached. Committing";
+ $dbh->commit();
+ }
+
+ #sayLog $digest if ($DEBUG);
+ #sayLog $url->scheme if ($DEBUG);
+ #sayLog $url->host if ($DEBUG);
+ #sayLog $query->{Statement} if ($DEBUG);
+ #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
+
+ #sayLog "Inserted: $link" if($DEBUG);
+ }
+ sayYellow "Final commit";
+ $dbh->commit();
+}
--- /dev/null
+*
+!.gitignore
+++ /dev/null
-# Requirements
-
-Please check the requirements file first.
-
-# Database
-
-You need a MySQL installation and a user which can create a database.
-
-Use `setup.sql` to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql`
-
-# Config
-
-Copy `config.default.txt` to `config.txt` and edit at least to match the database server settings.
-
-Make sure the directory `storage` can be written.
+++ /dev/null
-# MySQL
-
-Tested with a MySQL server 8.+
-
-# Perl modules
-
-Extra modules along with the more already installed ones.
-
-+ [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple)
-+ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
-
-## Debian
-
-Those are the ones which needed to be installed after a fresh debian(stable) install. May vary.
-
-+ libdatetime-perl
-+ libdbi-perl
-+ libconfigreader-simple-perl
-+ libhtml-linkextractor-perl
-+ libdata-validate-uri-perl
-+ libdbd-mysql-perl
-+ libwww-perl
+++ /dev/null
-SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
-START TRANSACTION;
-SET time_zone = "+00:00";
-
-
-/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
-/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
-/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
-/*!40101 SET NAMES utf8mb4 */;
-
---
--- Table structure for table `unique_domain`
---
-
-CREATE TABLE `unique_domain` (
- `id` int(11) NOT NULL,
- `url` varchar(255) COLLATE utf8mb4_bin NOT NULL,
- `created` datetime NOT NULL DEFAULT current_timestamp()
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
--- --------------------------------------------------------
-
---
--- Table structure for table `url_to_fetch`
---
-
-CREATE TABLE `url_to_fetch` (
- `id` char(32) COLLATE utf8mb4_bin NOT NULL,
- `url` text COLLATE utf8mb4_bin NOT NULL,
- `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL,
- `created` datetime NOT NULL,
- `last_fetched` datetime DEFAULT NULL,
- `fetch_failed` tinyint(1) NOT NULL DEFAULT 0
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
--- --------------------------------------------------------
-
---
--- Table structure for table `url_to_ignore`
---
-
-CREATE TABLE `url_to_ignore` (
- `id` int(11) NOT NULL,
- `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL,
- `created` datetime NOT NULL DEFAULT current_timestamp()
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
---
--- Dumping data for table `url_to_ignore`
---
-
-INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
-(1, 'mailto:', '2022-01-05 10:46:10'),
-(2, 'javascript:', '2022-01-05 10:46:10'),
-(3, 'google.', '2022-01-05 10:46:29'),
-(4, 'amazon.', '2022-01-05 10:46:29'),
-(5, 'youtube.', '2022-01-05 10:46:47'),
-(6, '.onion', '2022-01-05 17:21:45'),
-(7, 'instagram.', '2022-01-05 20:15:21'),
-(8, 'twitter.', '2022-01-05 20:16:31'),
-(9, 'facebook.', '2022-01-05 20:16:31'),
-(10, 'skype:', '2022-01-05 21:29:53'),
-(11, 'xmpp:', '2022-01-05 21:30:22'),
-(12, 'tel:', '2022-01-05 21:30:50'),
-(13, 'fax:', '2022-01-05 21:30:50'),
-(14, 'whatsapp:', '2022-01-05 21:31:24'),
-(15, 'intent:', '2022-01-05 21:31:24'),
-(16, 'ftp:', '2022-01-05 21:33:34'),
-(17, 'youtu.', '2022-01-05 21:50:26'),
-(18, 'pinterest.', '2022-01-05 21:51:31'),
-(19, 'microsoft.', '2022-01-05 21:52:30'),
-(20, 'apple.', '2022-01-05 21:52:30'),
-(21, 'xing.', '2022-01-05 22:03:07'),
-(22, 'linked.', '2022-01-05 22:03:07'),
-(26, 't.co', '2022-01-05 22:05:07'),
-(27, 'tinyurl.', '2022-01-05 22:07:03'),
-(28, 'bitly.', '2022-01-05 22:07:03'),
-(29, 'bit.ly', '2022-01-05 22:07:23'),
-(30, 'wikipedia.', '2022-01-06 09:58:46'),
-(31, 'gstatic.', '2022-01-06 09:59:47'),
-(32, 'wikimedia.', '2022-01-06 10:00:20'),
-(33, 'goo.', '2022-01-06 10:02:11'),
-(34, 'cdn.', '2022-01-06 10:02:59'),
-(35, 'flickr.', '2022-01-06 10:05:46'),
-(36, '.mp3', '2022-01-07 13:11:49'),
-(40, '.aac', '2022-01-08 13:33:22'),
-(41, '.opus', '2022-01-08 13:33:22'),
-(42, 'awin1.', '2022-01-08 13:39:14'),
-(43, 'sms:', '2022-01-09 10:32:46'),
-(45, 'hhttps:', '2022-01-09 12:20:43'),
-(46, 'httpss:', '2022-01-09 13:12:34'),
-(47, 'soundcloud.', '2022-01-16 10:37:04'),
-(48, 'fb-messenger:', '2022-01-16 14:42:18'),
-(49, 'smartadserver.', '2022-01-16 16:48:46'),
-(50, 'ispgateway.', '2022-01-16 16:56:11'),
-(51, 'bitcoin:', '2022-01-16 19:48:41'),
-(52, 'webcal:', '2022-05-08 09:39:02'),
-(53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone:', '2022-05-08 09:44:19'),
-(55, 'threema:', '2022-05-08 09:45:19');
-
---
--- Indexes for dumped tables
---
-
---
--- Indexes for table `unique_domain`
---
-ALTER TABLE `unique_domain`
- ADD PRIMARY KEY (`id`),
- ADD UNIQUE KEY `url` (`url`);
-
---
--- Indexes for table `url_to_fetch`
---
-ALTER TABLE `url_to_fetch`
- ADD PRIMARY KEY (`id`),
- ADD KEY `baseurl` (`baseurl`),
- ADD KEY `last_fetched` (`last_fetched`),
- ADD KEY `fetch_failed` (`fetch_failed`);
-
---
--- Indexes for table `url_to_ignore`
---
-ALTER TABLE `url_to_ignore`
- ADD PRIMARY KEY (`id`),
- ADD UNIQUE KEY `url` (`searchfor`);
-
---
--- AUTO_INCREMENT for dumped tables
---
-
---
--- AUTO_INCREMENT for table `unique_domain`
---
-ALTER TABLE `unique_domain`
- MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
-
---
--- AUTO_INCREMENT for table `url_to_ignore`
---
-ALTER TABLE `url_to_ignore`
- MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
-COMMIT;
-
-/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
-/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
-/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
+++ /dev/null
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use open qw( :std :encoding(UTF-8) );
-use DBI;
-use ConfigReader::Simple;
-use LWP::UserAgent;
-use HTTP::Request;
-
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-
-## DB connection
-my %dbAttr = (
- PrintError=>0,# turn off error reporting via warn()
- RaiseError=>1, # turn on error reporting via die()
- AutoCommit=>0 # manually use transactions
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-## fetch the urls to fetch from the table
-my %urlsToFetch;
-my $query = $dbh->prepare("SELECT `id`, `url`
- FROM `url_to_fetch`
- WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
- OR `last_fetched` IS NULL
- AND `fetch_failed` = 0
- LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
-$query->execute();
-while(my @row = $query->fetchrow_array) {
- $urlsToFetch{$row[0]} = $row[1];
-}
-
-# successful fetches
-my @urlsFetched;
-my @urlsFailed;
-
-# config the user agent for the request
-my $request_headers = [
- 'User-Agent' => $config->get("UA_AGENT"),
- 'Accept' => $config->get("UA_ACCEPT"),
- 'Accept-Language' => $config->get("UA_LANG"),
- 'Accept-Encoding' => HTTP::Message::decodable,
- 'Cache-Control' => $config->get("UA_CACHE")
-];
-my $ua = LWP::UserAgent->new();
-$ua->timeout($config->get("UA_TIMEOUT"));
-$ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
-
-## now loop over them and store the results
-my $counter = 0;
-while ( my ($id, $url) = each %urlsToFetch ) {
- sayYellow "Fetching: $id $url";
-
- my $req = HTTP::Request->new(GET => $url, $request_headers);
- my $res = $ua->request($req);
- if ($res->is_success) {
- # callback tells us to stop
- if($res->header('Client-Aborted')) {
- sayYellow "Aborted, too big.";
- next;
- }
- if(index($res->content_type, "text/html") == -1) {
- sayYellow "Fetching: $id ignored. Not html";
- push(@urlsFailed, $id);
- next;
- }
- open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
- print $fh $res->decoded_content();
- close($fh);
- push(@urlsFetched, $id);
- sayGreen"Fetching: $id ok";
- }
- else {
- sayRed "Fetching: $id failed: $res->code ".$res->status_line;
- push(@urlsFailed, $id);
- }
-
- if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
- updateFetched($dbh, @urlsFetched);
- updateFailed($dbh, @urlsFailed);
- sleep(rand(7));
-
- $counter = 0;
- @urlsFetched = ();
- @urlsFailed = ();
- }
-
- $counter++;
-}
-updateFetched($dbh, @urlsFetched);
-updateFailed($dbh, @urlsFailed);
-
-$dbh->disconnect();
-sayGreen "Fetch complete";
-
-
-
-## update last_fetched in the table
-sub updateFetched {
- my ($dbh, @urls) = @_;
- sayYellow "Update fetch timestamps: ".scalar @urls;
- $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
- foreach my $idToUpdate (@urls) {
- sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
- $query->bind_param(1,$idToUpdate);
- $query->execute();
- }
- $dbh->commit();
- sayGreen "Update fetch timestamps done";
-}
-
-## update fetch_failed in the table
-sub updateFailed {
- my ($dbh, @urls) = @_;
-
- sayYellow "Update fetch failed: ".scalar @urls;
- $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
- foreach my $idToUpdate (@urls) {
- sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
- $query->bind_param(1,$idToUpdate);
- $query->execute();
- }
- $dbh->commit();
- sayGreen "Update fetch failed done";
-}
+++ /dev/null
-# 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-package Aranea::Common;
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Term::ANSIColor qw(:constants);
-
-use DateTime;
-use Exporter qw(import);
-
-
-our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed);
-
-sub sayLog {
- my ($string) = @_;
- my $dt = DateTime->now;
- say "[".$dt->datetime."] DEBUG: ".$string;
-}
-
-sub sayYellow {
- my ($string) = @_;
- my $dt = DateTime->now;
- say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET;
-}
-
-sub sayGreen {
- my ($string) = @_;
- my $dt = DateTime->now;
- say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET;
-}
-
-sub sayRed {
- my ($string) = @_;
- my $dt = DateTime->now;
- say BOLD, RED, "[".$dt->datetime."] ".$string, RESET;
-}
-
-1;
+++ /dev/null
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use open qw( :std :encoding(UTF-8) );
-use DBI;
-use ConfigReader::Simple;
-use HTML::LinkExtor;
-use URI::URL;
-use File::Basename;
-use Digest::MD5 qw(md5_hex);
-use Data::Validate::URI qw(is_uri);
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-## DB connection
-my %dbAttr = (
- PrintError=>0,# turn off error reporting via warn()
- RaiseError=>1, # turn on error reporting via die()
- AutoCommit=>0 # manually use transactions
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-## get the fetched files
-my @results = glob("storage/*.result");
-die "Nothing to parse. No files found." unless(@results);
-
-## build clean ids for query
-my @queryIds = @results;
-foreach (@queryIds) {
- $_ =~ s/.result//g;
- $_ =~ s|storage/||g;
-}
-
-# get the baseurls
-my %baseUrls;
-my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
-sayLog($queryStr) if $DEBUG;
-my $query = $dbh->prepare($queryStr);
-$query->execute(@queryIds);
-while(my @row = $query->fetchrow_array) {
- $baseUrls{$row[0]} = $row[1];
-}
-
-
-# get the string to ignore
-my @urlStringsToIgnore;
-$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-while(my @row = $query->fetchrow) {
- push(@urlStringsToIgnore, $row[0])
-}
-
-
-## prepare linkExtor
-my @links = ();
-my @workingLinks = ();
-sub leCallback {
- my($tag, %attr) = @_;
- return if $tag ne 'a'; # we only look closer at <a ...>
- push(@workingLinks, values %attr);
-}
-my $le = HTML::LinkExtor->new(\&leCallback);
-
-## now parse each file and get the links
-my $counter = 0;
-foreach my $resultFile (@results) {
- sayYellow "Parsing file: $resultFile";
-
- my $fileId = basename($resultFile,".result");
-
- if (exists $baseUrls{$fileId}) {
- sayYellow "Baseurl: $baseUrls{$fileId}";
-
- $le->parse_file($resultFile);
- @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
- push(@links,@workingLinks);
-
- unlink($resultFile);
- sayGreen "Parsing done: ".scalar @workingLinks;
- }
- else {
- sayRed "No entry found for file $resultFile";
- }
-
- if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
-
- @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
- insertIntoDb($dbh, \@links);
-
- $counter = 0;
- @links = ();
- }
-
- @workingLinks = ();
- $counter++;
-}
-
-@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-insertIntoDb($dbh, \@links);
-
-$dbh->disconnect();
-sayGreen "Parse complete";
-
-
-## cleanup the found links
-sub cleanLinks {
- my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
- my @linkArray = @{ $linkArray };
- my @urlStringsToIgnore = @{ $urlStringsToIgnore };
-
- sayYellow "Clean found links: ".scalar @linkArray;
- foreach my $toSearch (@urlStringsToIgnore) {
- sayYellow "Clean links from: ".$toSearch;
- @linkArray = grep {!/$toSearch/i} @linkArray;
- }
- sayGreen "Cleaned found links: ".scalar @linkArray;
-
- return @linkArray;
-}
-
-
-## update the DB with the new found links
-sub insertIntoDb {
- my ($dbh, $links) = @_;
- my @links = @{ $links };
-
- sayYellow "Insert links into DB: ".scalar @links;
- $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
- `id` = ?,
- `url` = ?,
- `baseurl` = ?,
- `created` = NOW()";
- sayLog $queryStr if $DEBUG;
- $query = $dbh->prepare($queryStr);
- my $md5 = Digest::MD5->new;
- my $counter = 0;
- foreach my $link (@links) {
-
- sayLog $link if ($DEBUG);
-
- if(!is_uri($link)) {
- sayYellow "Ignore URL it is invalid: $link";
- next;
- }
-
- my $url = url($link);
- if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
- sayYellow "Ignore URL because of scheme: $link";
- next;
- }
-
- $md5->add($link);
- my $digest = $md5->hexdigest;
- $query->execute($digest, $link, $url->scheme."://".$url->host);
- $md5->reset;
-
- $counter++;
-
- if($counter >= 500) {
- $counter = 0;
- sayYellow "Commit counter of 500 reached. Commiting";
- $dbh->commit();
- }
-
- #sayLog $digest if ($DEBUG);
- #sayLog $url->scheme if ($DEBUG);
- #sayLog $url->host if ($DEBUG);
- #sayLog $query->{Statement} if ($DEBUG);
- #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
-
- #sayLog "Inserted: $link" if($DEBUG);
- }
- sayYellow "Final commit";
- $dbh->commit();
-}
+++ /dev/null
-*
-!.gitignore