new directory structure to seperate the crawler and the upcomming webinterface

author Banana <mail@bananas-playground.net>

Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)

committer Banana <mail@bananas-playground.net>

Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)
author Banana <mail@bananas-playground.net>
Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)
committer Banana <mail@bananas-playground.net>
Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)
diff --git a/README.md b/README.md

index 3ac7ebbe4fef20b0904ebf518b9ac63638d8495d..df524bdfd0dde937770847df1cd0004a7bf99183 100644 (file)
--- a/README.md
+++ b/README.md
@@ -27,3 +27,8 @@ The table `url_to_ignore` does have a small amount of domains
  and part of domains which will be ignored.  Adding a global SPAM list would be overkill.
  
  A good idea is to run it with a DNS filter, which has a good blocklist.
+
+# Webinterface
+
+The folder `webroot` does contain a webinterface which displays the gathered data and status.
+It does not provide a way to execute the crawler.
diff --git a/cleanup.pl b/cleanup.pl

deleted file mode 100644 (file)

index 4dce87c..0000000
--- a/cleanup.pl
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use DBI;
-use ConfigReader::Simple;
-use URI::URL;
-use Data::Validate::URI qw(is_uri);
-
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-## DB connection
-my %dbAttr = (
-       PrintError=>0,# turn off error reporting via warn()
-    RaiseError=>1 # turn on error reporting via die()
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-
-# update the unique domains
-my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
-sayLog($queryStr) if $DEBUG;
-my $query = $dbh->prepare($queryStr);
-$query->execute();
-
-# now validate the unique ones
-$queryStr = "SELECT `id`, `url` FROM unique_domain";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-my @invalidUrls = ();
-my @toBeDeletedFromFetchAgain = ();
-while(my @row = $query->fetchrow_array) {
-       my $link = $row[1];
-       my $id = $row[0];
-       if(!is_uri($link)) {
-               sayYellow "Ignore URL it is invalid: $link";
-               push(@invalidUrls, $id);
-               push(@toBeDeletedFromFetchAgain, $link);
-               next;
-       }
-
-       my $url = url($link);
-       if(!defined($url->scheme) || index($url->scheme,"http") == -1) {
-               sayYellow "Ignore URL because of scheme: $link";
-               push(@invalidUrls, $id);
-               push(@toBeDeletedFromFetchAgain, $link);
-               next;
-       }
-}
-
-sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
-$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-foreach my $invalidId (@invalidUrls) {
-       $query->execute($invalidId);
-       #$query->finish();
-       sayLog "Removed $invalidId from unique_domain" if $DEBUG;
-}
-sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
-
-
-# remove urls from fetch since we have enough already
-$queryStr = "SELECT count(baseurl) AS amount, baseurl
-                               FROM `url_to_fetch`
-                               WHERE last_fetched <> 0
-                               GROUP BY baseurl
-                               HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-while(my @row = $query->fetchrow_array) {
-       my $baseUrl = $row[1];
-       push(@toBeDeletedFromFetchAgain, $baseUrl);
-}
-#$query->finish();
-
-sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
-$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
-       $query->execute($baseUrl);
-       #$query->finish();
-       sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
-}
-sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
-
-# remove failed fetches
-sayYellow "Remove fetch_failed";
-$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-sayGreen "Remove fetch_failed done";
-
-sayYellow "Remove invalid urls which the is_uri check does let pass";
-$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
-$query = $dbh->prepare($queryStr);
-$query->execute();
-sayYellow "Remove invalid urls done";
-
-
-sayGreen "Cleanup complete";
diff --git a/config.default.txt b/config.default.txt

deleted file mode 100644 (file)

index 22ef694..0000000
--- a/config.default.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-DB_HOST=localhost
-DB_PORT=3306
-DB_NAME=aranea
-DB_USER=user
-DB_PASS=test
-
-UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
-UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
-UA_LANG="en-US"
-UA_CACHE="no-cache"
-UA_TIMEOUT=5
-
-FETCH_URLS_PER_RUN=5000
-FETCH_URLS_PER_PACKAGE=30
-PARSE_FILES_PER_PACKAGE=50
-CLEANUP_URLS_AMOUNT_ABOVE=40
-MAX_BYTES_PER_PAGE=5000000
diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl

new file mode 100644 (file)

index 0000000..4dce87c
--- /dev/null
+++ b/crawler/cleanup.pl
@@ -0,0 +1,136 @@
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use DBI;
+use ConfigReader::Simple;
+use URI::URL;
+use Data::Validate::URI qw(is_uri);
+
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+       PrintError=>0,# turn off error reporting via warn()
+    RaiseError=>1 # turn on error reporting via die()
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+
+# update the unique domains
+my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute();
+
+# now validate the unique ones
+$queryStr = "SELECT `id`, `url` FROM unique_domain";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+my @invalidUrls = ();
+my @toBeDeletedFromFetchAgain = ();
+while(my @row = $query->fetchrow_array) {
+       my $link = $row[1];
+       my $id = $row[0];
+       if(!is_uri($link)) {
+               sayYellow "Ignore URL it is invalid: $link";
+               push(@invalidUrls, $id);
+               push(@toBeDeletedFromFetchAgain, $link);
+               next;
+       }
+
+       my $url = url($link);
+       if(!defined($url->scheme) || index($url->scheme,"http") == -1) {
+               sayYellow "Ignore URL because of scheme: $link";
+               push(@invalidUrls, $id);
+               push(@toBeDeletedFromFetchAgain, $link);
+               next;
+       }
+}
+
+sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
+$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $invalidId (@invalidUrls) {
+       $query->execute($invalidId);
+       #$query->finish();
+       sayLog "Removed $invalidId from unique_domain" if $DEBUG;
+}
+sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
+
+
+# remove urls from fetch since we have enough already
+$queryStr = "SELECT count(baseurl) AS amount, baseurl
+                               FROM `url_to_fetch`
+                               WHERE last_fetched <> 0
+                               GROUP BY baseurl
+                               HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+       my $baseUrl = $row[1];
+       push(@toBeDeletedFromFetchAgain, $baseUrl);
+}
+#$query->finish();
+
+sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
+       $query->execute($baseUrl);
+       #$query->finish();
+       sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
+}
+sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+
+# remove failed fetches
+sayYellow "Remove fetch_failed";
+$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayGreen "Remove fetch_failed done";
+
+sayYellow "Remove invalid urls which the is_uri check does let pass";
+$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayYellow "Remove invalid urls done";
+
+
+sayGreen "Cleanup complete";
diff --git a/crawler/config.default.txt b/crawler/config.default.txt

new file mode 100644 (file)

index 0000000..22ef694
--- /dev/null
+++ b/crawler/config.default.txt
@@ -0,0 +1,17 @@
+DB_HOST=localhost
+DB_PORT=3306
+DB_NAME=aranea
+DB_USER=user
+DB_PASS=test
+
+UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
+UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
+UA_LANG="en-US"
+UA_CACHE="no-cache"
+UA_TIMEOUT=5
+
+FETCH_URLS_PER_RUN=5000
+FETCH_URLS_PER_PACKAGE=30
+PARSE_FILES_PER_PACKAGE=50
+CLEANUP_URLS_AMOUNT_ABOVE=40
+MAX_BYTES_PER_PAGE=5000000
diff --git a/crawler/documentation/install.md b/crawler/documentation/install.md

new file mode 100644 (file)

index 0000000..c17acff
--- /dev/null
+++ b/crawler/documentation/install.md
@@ -0,0 +1,15 @@
+# Requirements
+
+Please check the requirements file first.
+
+# Database
+
+You need a MySQL installation and an existing database.
+
+Use `setup.sql` to create the tables into your existing database: `mysql --user=user -p databasename < setup.sql`
+
+# Config
+
+Copy `config.default.txt` to `config.txt` and edit at least to match the database name and server settings.
+
+Make sure the directory `storage` can be written.
diff --git a/crawler/documentation/requirements.md b/crawler/documentation/requirements.md

new file mode 100644 (file)

index 0000000..0375df0
--- /dev/null
+++ b/crawler/documentation/requirements.md
@@ -0,0 +1,22 @@
+# MySQL
+
+Tested with a MySQL server 8.+
+
+# Perl modules
+
+Extra modules along with the more already installed ones.
+
++ [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple)
++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
+
+## Debian
+
+Those are the ones which needed to be installed after a fresh debian(stable) install. May vary.
+
++ libdatetime-perl
++ libdbi-perl
++ libconfigreader-simple-perl
++ libhtml-linkextractor-perl
++ libdata-validate-uri-perl
++ libdbd-mysql-perl
++ libwww-perl
diff --git a/crawler/documentation/setup.sql b/crawler/documentation/setup.sql

new file mode 100644 (file)

index 0000000..2fa4c9b
--- /dev/null
+++ b/crawler/documentation/setup.sql
@@ -0,0 +1,148 @@
+SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
+START TRANSACTION;
+SET time_zone = "+00:00";
+
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+
+--
+-- Table structure for table `unique_domain`
+--
+
+CREATE TABLE `unique_domain` (
+  `id` int(11) NOT NULL,
+  `url` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL DEFAULT current_timestamp()
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_to_fetch`
+--
+
+CREATE TABLE `url_to_fetch` (
+  `id` char(32) COLLATE utf8mb4_bin NOT NULL,
+  `url` text COLLATE utf8mb4_bin NOT NULL,
+  `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL,
+  `last_fetched` datetime DEFAULT NULL,
+  `fetch_failed` tinyint(1) NOT NULL DEFAULT 0
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_to_ignore`
+--
+
+CREATE TABLE `url_to_ignore` (
+  `id` int(11) NOT NULL,
+  `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL DEFAULT current_timestamp()
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+--
+-- Dumping data for table `url_to_ignore`
+--
+
+INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
+(1, 'mailto:', '2022-01-05 10:46:10'),
+(2, 'javascript:', '2022-01-05 10:46:10'),
+(3, 'google.', '2022-01-05 10:46:29'),
+(4, 'amazon.', '2022-01-05 10:46:29'),
+(5, 'youtube.', '2022-01-05 10:46:47'),
+(6, '.onion', '2022-01-05 17:21:45'),
+(7, 'instagram.', '2022-01-05 20:15:21'),
+(8, 'twitter.', '2022-01-05 20:16:31'),
+(9, 'facebook.', '2022-01-05 20:16:31'),
+(10, 'skype:', '2022-01-05 21:29:53'),
+(11, 'xmpp:', '2022-01-05 21:30:22'),
+(12, 'tel:', '2022-01-05 21:30:50'),
+(13, 'fax:', '2022-01-05 21:30:50'),
+(14, 'whatsapp:', '2022-01-05 21:31:24'),
+(15, 'intent:', '2022-01-05 21:31:24'),
+(16, 'ftp:', '2022-01-05 21:33:34'),
+(17, 'youtu.', '2022-01-05 21:50:26'),
+(18, 'pinterest.', '2022-01-05 21:51:31'),
+(19, 'microsoft.', '2022-01-05 21:52:30'),
+(20, 'apple.', '2022-01-05 21:52:30'),
+(21, 'xing.', '2022-01-05 22:03:07'),
+(22, 'linked.', '2022-01-05 22:03:07'),
+(26, 't.co', '2022-01-05 22:05:07'),
+(27, 'tinyurl.', '2022-01-05 22:07:03'),
+(28, 'bitly.', '2022-01-05 22:07:03'),
+(29, 'bit.ly', '2022-01-05 22:07:23'),
+(30, 'wikipedia.', '2022-01-06 09:58:46'),
+(31, 'gstatic.', '2022-01-06 09:59:47'),
+(32, 'wikimedia.', '2022-01-06 10:00:20'),
+(33, 'goo.', '2022-01-06 10:02:11'),
+(34, 'cdn.', '2022-01-06 10:02:59'),
+(35, 'flickr.', '2022-01-06 10:05:46'),
+(36, '.mp3', '2022-01-07 13:11:49'),
+(40, '.aac', '2022-01-08 13:33:22'),
+(41, '.opus', '2022-01-08 13:33:22'),
+(42, 'awin1.', '2022-01-08 13:39:14'),
+(43, 'sms:', '2022-01-09 10:32:46'),
+(45, 'hhttps:', '2022-01-09 12:20:43'),
+(46, 'httpss:', '2022-01-09 13:12:34'),
+(47, 'soundcloud.', '2022-01-16 10:37:04'),
+(48, 'fb-messenger:', '2022-01-16 14:42:18'),
+(49, 'smartadserver.', '2022-01-16 16:48:46'),
+(50, 'ispgateway.', '2022-01-16 16:56:11'),
+(51, 'bitcoin:', '2022-01-16 19:48:41'),
+(52, 'webcal:', '2022-05-08 09:39:02'),
+(53, 'source:', '2022-05-08 09:43:19'),
+(54, 'phone:', '2022-05-08 09:44:19'),
+(55, 'threema:', '2022-05-08 09:45:19');
+
+--
+-- Indexes for dumped tables
+--
+
+--
+-- Indexes for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+  ADD PRIMARY KEY (`id`),
+  ADD UNIQUE KEY `url` (`url`);
+
+--
+-- Indexes for table `url_to_fetch`
+--
+ALTER TABLE `url_to_fetch`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `baseurl` (`baseurl`),
+  ADD KEY `last_fetched` (`last_fetched`),
+  ADD KEY `fetch_failed` (`fetch_failed`);
+
+--
+-- Indexes for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+  ADD PRIMARY KEY (`id`),
+  ADD UNIQUE KEY `url` (`searchfor`);
+
+--
+-- AUTO_INCREMENT for dumped tables
+--
+
+--
+-- AUTO_INCREMENT for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
+COMMIT;
+
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
diff --git a/crawler/documentation/upgrade.md b/crawler/documentation/upgrade.md

new file mode 100644 (file)

index 0000000..b5780e6
--- /dev/null
+++ b/crawler/documentation/upgrade.md
@@ -0,0 +1,6 @@
+# Upgrade information
+
+Each release has its own upgrade file.
+Follow the instructions in each file step by step, starting from your current one.
+
+Make sure paths and sql table names are modified to your settings.
diff --git a/crawler/fetch.pl b/crawler/fetch.pl

new file mode 100644 (file)

index 0000000..3110158
--- /dev/null
+++ b/crawler/fetch.pl
@@ -0,0 +1,156 @@
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use open qw( :std :encoding(UTF-8) );
+use DBI;
+use ConfigReader::Simple;
+use LWP::UserAgent;
+use HTTP::Request;
+
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+
+## DB connection
+my %dbAttr = (
+    PrintError=>0,# turn off error reporting via warn()
+    RaiseError=>1, # turn on error reporting via die()
+    AutoCommit=>0 # manually use transactions
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+## fetch the urls to fetch from the table
+my %urlsToFetch;
+my $query = $dbh->prepare("SELECT `id`, `url`
+                            FROM `url_to_fetch`
+                            WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
+                                OR `last_fetched` IS NULL
+                                AND `fetch_failed` = 0
+                            LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+    $urlsToFetch{$row[0]} = $row[1];
+}
+
+# successful fetches
+my @urlsFetched;
+my @urlsFailed;
+
+# config the user agent for the request
+my $request_headers = [
+    'User-Agent' => $config->get("UA_AGENT"),
+    'Accept' => $config->get("UA_ACCEPT"),
+    'Accept-Language' => $config->get("UA_LANG"),
+    'Accept-Encoding' => HTTP::Message::decodable,
+    'Cache-Control' => $config->get("UA_CACHE")
+];
+my $ua = LWP::UserAgent->new();
+$ua->timeout($config->get("UA_TIMEOUT"));
+$ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
+
+## now loop over them and store the results
+my $counter = 0;
+while ( my ($id, $url) = each %urlsToFetch ) {
+    sayYellow "Fetching: $id $url";
+
+    my $req = HTTP::Request->new(GET => $url, $request_headers);
+    my $res = $ua->request($req);
+    if ($res->is_success) {
+        # callback tells us to stop
+        if($res->header('Client-Aborted')) {
+            sayYellow "Aborted, too big.";
+            next;
+        }
+        if(index($res->content_type, "text/html") == -1) {
+            sayYellow "Fetching: $id ignored. Not html";
+            push(@urlsFailed, $id);
+            next;
+        }
+        open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+        print $fh $res->decoded_content();
+        close($fh);
+        push(@urlsFetched, $id);
+        sayGreen"Fetching: $id ok";
+    }
+    else {
+        sayRed "Fetching: $id failed: $res->code ".$res->status_line;
+        push(@urlsFailed, $id);
+    }
+
+    if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
+        updateFetched($dbh, @urlsFetched);
+        updateFailed($dbh, @urlsFailed);
+        sleep(rand(7));
+
+        $counter = 0;
+        @urlsFetched = ();
+        @urlsFailed = ();
+    }
+
+    $counter++;
+}
+updateFetched($dbh, @urlsFetched);
+updateFailed($dbh, @urlsFailed);
+
+$dbh->disconnect();
+sayGreen "Fetch complete";
+
+
+
+## update last_fetched in the table
+sub updateFetched {
+    my ($dbh, @urls) = @_;
+    sayYellow "Update fetch timestamps: ".scalar @urls;
+    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
+        $query->bind_param(1,$idToUpdate);
+        $query->execute();
+    }
+    $dbh->commit();
+    sayGreen "Update fetch timestamps done";
+}
+
+## update fetch_failed in the table
+sub updateFailed {
+    my ($dbh, @urls) = @_;
+
+    sayYellow "Update fetch failed: ".scalar @urls;
+    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
+        $query->bind_param(1,$idToUpdate);
+        $query->execute();
+    }
+    $dbh->commit();
+    sayGreen "Update fetch failed done";
+}
diff --git a/crawler/lib/Aranea/Common.pm b/crawler/lib/Aranea/Common.pm

new file mode 100644 (file)

index 0000000..b832ffb
--- /dev/null
+++ b/crawler/lib/Aranea/Common.pm
@@ -0,0 +1,53 @@
+# 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+package Aranea::Common;
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Term::ANSIColor qw(:constants);
+
+use DateTime;
+use Exporter qw(import);
+
+
+our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed);
+
+sub sayLog {
+       my ($string) = @_;
+       my $dt = DateTime->now;
+       say "[".$dt->datetime."] DEBUG: ".$string;
+}
+
+sub sayYellow {
+       my ($string) = @_;
+       my $dt = DateTime->now;
+       say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayGreen {
+       my ($string) = @_;
+       my $dt = DateTime->now;
+       say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayRed {
+       my ($string) = @_;
+       my $dt = DateTime->now;
+       say BOLD, RED, "[".$dt->datetime."] ".$string, RESET;
+}
+
+1;
diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl

new file mode 100644 (file)

index 0000000..9a785d5
--- /dev/null
+++ b/crawler/parse-results.pl
@@ -0,0 +1,206 @@
+#!/usr/bin/perl -w
+
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
+
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Data::Dumper;
+use Term::ANSIColor qw(:constants);
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use open qw( :std :encoding(UTF-8) );
+use DBI;
+use ConfigReader::Simple;
+use HTML::LinkExtor;
+use URI::URL;
+use File::Basename;
+use Digest::MD5 qw(md5_hex);
+use Data::Validate::URI qw(is_uri);
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+       PrintError=>0,# turn off error reporting via warn()
+    RaiseError=>1, # turn on error reporting via die()
+       AutoCommit=>0 # manually use transactions
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
+die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+
+
+## get the fetched files
+my @results = glob("storage/*.result");
+die "Nothing to parse. No files found." unless(@results);
+
+## build clean ids for query
+my @queryIds = @results;
+foreach (@queryIds) {
+       $_ =~ s/.result//g;
+       $_ =~ s|storage/||g;
+}
+
+# get the baseurls
+my %baseUrls;
+my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute(@queryIds);
+while(my @row = $query->fetchrow_array) {
+       $baseUrls{$row[0]} = $row[1];
+}
+
+
+# get the string to ignore
+my @urlStringsToIgnore;
+$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow) {
+       push(@urlStringsToIgnore, $row[0])
+}
+
+
+## prepare linkExtor
+my @links = ();
+my @workingLinks = ();
+sub leCallback {
+   my($tag, %attr) = @_;
+   return if $tag ne 'a';  # we only look closer at <a ...>
+   push(@workingLinks, values %attr);
+}
+my $le = HTML::LinkExtor->new(\&leCallback);
+
+## now parse each file and get the links
+my $counter = 0;
+foreach my $resultFile (@results) {
+       sayYellow "Parsing file: $resultFile";
+
+       my $fileId = basename($resultFile,".result");
+
+       if (exists $baseUrls{$fileId}) {
+               sayYellow "Baseurl: $baseUrls{$fileId}";
+
+               $le->parse_file($resultFile);
+               @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
+               push(@links,@workingLinks);
+
+               unlink($resultFile);
+               sayGreen "Parsing done: ".scalar @workingLinks;
+       }
+       else {
+               sayRed "No entry found for file $resultFile";
+       }
+
+       if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
+
+               @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+               insertIntoDb($dbh, \@links);
+
+               $counter = 0;
+               @links = ();
+       }
+
+       @workingLinks = ();
+       $counter++;
+}
+
+@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+insertIntoDb($dbh, \@links);
+
+$dbh->disconnect();
+sayGreen "Parse complete";
+
+
+## cleanup the found links
+sub cleanLinks {
+       my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
+       my @linkArray = @{ $linkArray };
+       my @urlStringsToIgnore = @{ $urlStringsToIgnore };
+
+       sayYellow "Clean found links: ".scalar @linkArray;
+       foreach my $toSearch (@urlStringsToIgnore) {
+               sayYellow "Clean links from: ".$toSearch;
+               @linkArray = grep {!/$toSearch/i} @linkArray;
+       }
+       sayGreen "Cleaned found links: ".scalar @linkArray;
+
+       return @linkArray;
+}
+
+
+## update the DB with the new found links
+sub insertIntoDb {
+       my ($dbh, $links) = @_;
+       my @links = @{ $links };
+
+       sayYellow "Insert links into DB: ".scalar @links;
+       $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
+                                       `id` = ?,
+                                       `url` = ?,
+                                       `baseurl` = ?,
+                                       `created` = NOW()";
+       sayLog $queryStr if $DEBUG;
+       $query = $dbh->prepare($queryStr);
+       my $md5 = Digest::MD5->new;
+       my $counter = 0;
+       foreach my $link (@links) {
+
+               sayLog $link if ($DEBUG);
+
+               if(!is_uri($link)) {
+                       sayYellow "Ignore URL it is invalid: $link";
+                       next;
+               }
+
+               my $url = url($link);
+               if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
+                       sayYellow "Ignore URL because of scheme: $link";
+                       next;
+               }
+
+               $md5->add($link);
+               my $digest = $md5->hexdigest;
+               $query->execute($digest, $link, $url->scheme."://".$url->host);
+               $md5->reset;
+
+               $counter++;
+
+               if($counter >= 500) {
+                       $counter = 0;
+                       sayYellow "Commit counter of 500 reached. Commiting";
+                       $dbh->commit();
+               }
+
+               #sayLog $digest if ($DEBUG);
+               #sayLog $url->scheme if ($DEBUG);
+               #sayLog $url->host if ($DEBUG);
+               #sayLog $query->{Statement} if ($DEBUG);
+               #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
+
+               #sayLog "Inserted: $link" if($DEBUG);
+       }
+       sayYellow "Final commit";
+       $dbh->commit();
+}
diff --git a/crawler/storage/.gitignore b/crawler/storage/.gitignore

new file mode 100644 (file)

index 0000000..d6b7ef3
--- /dev/null
+++ b/crawler/storage/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/documentation/install.md b/documentation/install.md

deleted file mode 100644 (file)

index b0a5198..0000000
--- a/documentation/install.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Requirements
-
-Please check the requirements file first.
-
-# Database
-
-You need a MySQL installation and a user which can create a database.
-
-Use `setup.sql` to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql`
-
-# Config
-
-Copy `config.default.txt` to `config.txt` and edit at least to match the database server settings.
-
-Make sure the directory `storage` can be written.
diff --git a/documentation/requirements.md b/documentation/requirements.md

deleted file mode 100644 (file)

index 0375df0..0000000
--- a/documentation/requirements.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# MySQL
-
-Tested with a MySQL server 8.+
-
-# Perl modules
-
-Extra modules along with the more already installed ones.
-
-+ [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple)
-+ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
-
-## Debian
-
-Those are the ones which needed to be installed after a fresh debian(stable) install. May vary.
-
-+ libdatetime-perl
-+ libdbi-perl
-+ libconfigreader-simple-perl
-+ libhtml-linkextractor-perl
-+ libdata-validate-uri-perl
-+ libdbd-mysql-perl
-+ libwww-perl
diff --git a/documentation/setup.sql b/documentation/setup.sql

deleted file mode 100644 (file)

index 2fa4c9b..0000000
--- a/documentation/setup.sql
+++ /dev/null
@@ -1,148 +0,0 @@
-SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
-START TRANSACTION;
-SET time_zone = "+00:00";
-
-
-/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
-/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
-/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
-/*!40101 SET NAMES utf8mb4 */;
-
---
--- Table structure for table `unique_domain`
---
-
-CREATE TABLE `unique_domain` (
-  `id` int(11) NOT NULL,
-  `url` varchar(255) COLLATE utf8mb4_bin NOT NULL,
-  `created` datetime NOT NULL DEFAULT current_timestamp()
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
--- --------------------------------------------------------
-
---
--- Table structure for table `url_to_fetch`
---
-
-CREATE TABLE `url_to_fetch` (
-  `id` char(32) COLLATE utf8mb4_bin NOT NULL,
-  `url` text COLLATE utf8mb4_bin NOT NULL,
-  `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL,
-  `created` datetime NOT NULL,
-  `last_fetched` datetime DEFAULT NULL,
-  `fetch_failed` tinyint(1) NOT NULL DEFAULT 0
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
--- --------------------------------------------------------
-
---
--- Table structure for table `url_to_ignore`
---
-
-CREATE TABLE `url_to_ignore` (
-  `id` int(11) NOT NULL,
-  `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL,
-  `created` datetime NOT NULL DEFAULT current_timestamp()
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
-
---
--- Dumping data for table `url_to_ignore`
---
-
-INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
-(1, 'mailto:', '2022-01-05 10:46:10'),
-(2, 'javascript:', '2022-01-05 10:46:10'),
-(3, 'google.', '2022-01-05 10:46:29'),
-(4, 'amazon.', '2022-01-05 10:46:29'),
-(5, 'youtube.', '2022-01-05 10:46:47'),
-(6, '.onion', '2022-01-05 17:21:45'),
-(7, 'instagram.', '2022-01-05 20:15:21'),
-(8, 'twitter.', '2022-01-05 20:16:31'),
-(9, 'facebook.', '2022-01-05 20:16:31'),
-(10, 'skype:', '2022-01-05 21:29:53'),
-(11, 'xmpp:', '2022-01-05 21:30:22'),
-(12, 'tel:', '2022-01-05 21:30:50'),
-(13, 'fax:', '2022-01-05 21:30:50'),
-(14, 'whatsapp:', '2022-01-05 21:31:24'),
-(15, 'intent:', '2022-01-05 21:31:24'),
-(16, 'ftp:', '2022-01-05 21:33:34'),
-(17, 'youtu.', '2022-01-05 21:50:26'),
-(18, 'pinterest.', '2022-01-05 21:51:31'),
-(19, 'microsoft.', '2022-01-05 21:52:30'),
-(20, 'apple.', '2022-01-05 21:52:30'),
-(21, 'xing.', '2022-01-05 22:03:07'),
-(22, 'linked.', '2022-01-05 22:03:07'),
-(26, 't.co', '2022-01-05 22:05:07'),
-(27, 'tinyurl.', '2022-01-05 22:07:03'),
-(28, 'bitly.', '2022-01-05 22:07:03'),
-(29, 'bit.ly', '2022-01-05 22:07:23'),
-(30, 'wikipedia.', '2022-01-06 09:58:46'),
-(31, 'gstatic.', '2022-01-06 09:59:47'),
-(32, 'wikimedia.', '2022-01-06 10:00:20'),
-(33, 'goo.', '2022-01-06 10:02:11'),
-(34, 'cdn.', '2022-01-06 10:02:59'),
-(35, 'flickr.', '2022-01-06 10:05:46'),
-(36, '.mp3', '2022-01-07 13:11:49'),
-(40, '.aac', '2022-01-08 13:33:22'),
-(41, '.opus', '2022-01-08 13:33:22'),
-(42, 'awin1.', '2022-01-08 13:39:14'),
-(43, 'sms:', '2022-01-09 10:32:46'),
-(45, 'hhttps:', '2022-01-09 12:20:43'),
-(46, 'httpss:', '2022-01-09 13:12:34'),
-(47, 'soundcloud.', '2022-01-16 10:37:04'),
-(48, 'fb-messenger:', '2022-01-16 14:42:18'),
-(49, 'smartadserver.', '2022-01-16 16:48:46'),
-(50, 'ispgateway.', '2022-01-16 16:56:11'),
-(51, 'bitcoin:', '2022-01-16 19:48:41'),
-(52, 'webcal:', '2022-05-08 09:39:02'),
-(53, 'source:', '2022-05-08 09:43:19'),
-(54, 'phone:', '2022-05-08 09:44:19'),
-(55, 'threema:', '2022-05-08 09:45:19');
-
---
--- Indexes for dumped tables
---
-
---
--- Indexes for table `unique_domain`
---
-ALTER TABLE `unique_domain`
-  ADD PRIMARY KEY (`id`),
-  ADD UNIQUE KEY `url` (`url`);
-
---
--- Indexes for table `url_to_fetch`
---
-ALTER TABLE `url_to_fetch`
-  ADD PRIMARY KEY (`id`),
-  ADD KEY `baseurl` (`baseurl`),
-  ADD KEY `last_fetched` (`last_fetched`),
-  ADD KEY `fetch_failed` (`fetch_failed`);
-
---
--- Indexes for table `url_to_ignore`
---
-ALTER TABLE `url_to_ignore`
-  ADD PRIMARY KEY (`id`),
-  ADD UNIQUE KEY `url` (`searchfor`);
-
---
--- AUTO_INCREMENT for dumped tables
---
-
---
--- AUTO_INCREMENT for table `unique_domain`
---
-ALTER TABLE `unique_domain`
-  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
-
---
--- AUTO_INCREMENT for table `url_to_ignore`
---
-ALTER TABLE `url_to_ignore`
-  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56;
-COMMIT;
-
-/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
-/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
-/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
diff --git a/fetch.pl b/fetch.pl

deleted file mode 100644 (file)

index 3110158..0000000
--- a/fetch.pl
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use open qw( :std :encoding(UTF-8) );
-use DBI;
-use ConfigReader::Simple;
-use LWP::UserAgent;
-use HTTP::Request;
-
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-
-## DB connection
-my %dbAttr = (
-    PrintError=>0,# turn off error reporting via warn()
-    RaiseError=>1, # turn on error reporting via die()
-    AutoCommit=>0 # manually use transactions
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-## fetch the urls to fetch from the table
-my %urlsToFetch;
-my $query = $dbh->prepare("SELECT `id`, `url`
-                            FROM `url_to_fetch`
-                            WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
-                                OR `last_fetched` IS NULL
-                                AND `fetch_failed` = 0
-                            LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
-$query->execute();
-while(my @row = $query->fetchrow_array) {
-    $urlsToFetch{$row[0]} = $row[1];
-}
-
-# successful fetches
-my @urlsFetched;
-my @urlsFailed;
-
-# config the user agent for the request
-my $request_headers = [
-    'User-Agent' => $config->get("UA_AGENT"),
-    'Accept' => $config->get("UA_ACCEPT"),
-    'Accept-Language' => $config->get("UA_LANG"),
-    'Accept-Encoding' => HTTP::Message::decodable,
-    'Cache-Control' => $config->get("UA_CACHE")
-];
-my $ua = LWP::UserAgent->new();
-$ua->timeout($config->get("UA_TIMEOUT"));
-$ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
-
-## now loop over them and store the results
-my $counter = 0;
-while ( my ($id, $url) = each %urlsToFetch ) {
-    sayYellow "Fetching: $id $url";
-
-    my $req = HTTP::Request->new(GET => $url, $request_headers);
-    my $res = $ua->request($req);
-    if ($res->is_success) {
-        # callback tells us to stop
-        if($res->header('Client-Aborted')) {
-            sayYellow "Aborted, too big.";
-            next;
-        }
-        if(index($res->content_type, "text/html") == -1) {
-            sayYellow "Fetching: $id ignored. Not html";
-            push(@urlsFailed, $id);
-            next;
-        }
-        open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
-        print $fh $res->decoded_content();
-        close($fh);
-        push(@urlsFetched, $id);
-        sayGreen"Fetching: $id ok";
-    }
-    else {
-        sayRed "Fetching: $id failed: $res->code ".$res->status_line;
-        push(@urlsFailed, $id);
-    }
-
-    if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
-        updateFetched($dbh, @urlsFetched);
-        updateFailed($dbh, @urlsFailed);
-        sleep(rand(7));
-
-        $counter = 0;
-        @urlsFetched = ();
-        @urlsFailed = ();
-    }
-
-    $counter++;
-}
-updateFetched($dbh, @urlsFetched);
-updateFailed($dbh, @urlsFailed);
-
-$dbh->disconnect();
-sayGreen "Fetch complete";
-
-
-
-## update last_fetched in the table
-sub updateFetched {
-    my ($dbh, @urls) = @_;
-    sayYellow "Update fetch timestamps: ".scalar @urls;
-    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
-    foreach my $idToUpdate (@urls) {
-        sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
-        $query->bind_param(1,$idToUpdate);
-        $query->execute();
-    }
-    $dbh->commit();
-    sayGreen "Update fetch timestamps done";
-}
-
-## update fetch_failed in the table
-sub updateFailed {
-    my ($dbh, @urls) = @_;
-
-    sayYellow "Update fetch failed: ".scalar @urls;
-    $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
-    foreach my $idToUpdate (@urls) {
-        sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
-        $query->bind_param(1,$idToUpdate);
-        $query->execute();
-    }
-    $dbh->commit();
-    sayGreen "Update fetch failed done";
-}
diff --git a/lib/Aranea/Common.pm b/lib/Aranea/Common.pm

deleted file mode 100644 (file)

index b832ffb..0000000
--- a/lib/Aranea/Common.pm
+++ /dev/null
@@ -1,53 +0,0 @@
-# 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-package Aranea::Common;
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Term::ANSIColor qw(:constants);
-
-use DateTime;
-use Exporter qw(import);
-
-
-our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed);
-
-sub sayLog {
-       my ($string) = @_;
-       my $dt = DateTime->now;
-       say "[".$dt->datetime."] DEBUG: ".$string;
-}
-
-sub sayYellow {
-       my ($string) = @_;
-       my $dt = DateTime->now;
-       say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET;
-}
-
-sub sayGreen {
-       my ($string) = @_;
-       my $dt = DateTime->now;
-       say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET;
-}
-
-sub sayRed {
-       my ($string) = @_;
-       my $dt = DateTime->now;
-       say BOLD, RED, "[".$dt->datetime."] ".$string, RESET;
-}
-
-1;
diff --git a/parse-results.pl b/parse-results.pl

deleted file mode 100644 (file)

index 9a785d5..0000000
--- a/parse-results.pl
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use open qw( :std :encoding(UTF-8) );
-use DBI;
-use ConfigReader::Simple;
-use HTML::LinkExtor;
-use URI::URL;
-use File::Basename;
-use Digest::MD5 qw(md5_hex);
-use Data::Validate::URI qw(is_uri);
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-
-## DB connection
-my %dbAttr = (
-       PrintError=>0,# turn off error reporting via warn()
-    RaiseError=>1, # turn on error reporting via die()
-       AutoCommit=>0 # manually use transactions
-);
-my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
-my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-
-
-## get the fetched files
-my @results = glob("storage/*.result");
-die "Nothing to parse. No files found." unless(@results);
-
-## build clean ids for query
-my @queryIds = @results;
-foreach (@queryIds) {
-       $_ =~ s/.result//g;
-       $_ =~ s|storage/||g;
-}
-
-# get the baseurls
-my %baseUrls;
-my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
-sayLog($queryStr) if $DEBUG;
-my $query = $dbh->prepare($queryStr);
-$query->execute(@queryIds);
-while(my @row = $query->fetchrow_array) {
-       $baseUrls{$row[0]} = $row[1];
-}
-
-
-# get the string to ignore
-my @urlStringsToIgnore;
-$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
-sayLog($queryStr) if $DEBUG;
-$query = $dbh->prepare($queryStr);
-$query->execute();
-while(my @row = $query->fetchrow) {
-       push(@urlStringsToIgnore, $row[0])
-}
-
-
-## prepare linkExtor
-my @links = ();
-my @workingLinks = ();
-sub leCallback {
-   my($tag, %attr) = @_;
-   return if $tag ne 'a';  # we only look closer at <a ...>
-   push(@workingLinks, values %attr);
-}
-my $le = HTML::LinkExtor->new(\&leCallback);
-
-## now parse each file and get the links
-my $counter = 0;
-foreach my $resultFile (@results) {
-       sayYellow "Parsing file: $resultFile";
-
-       my $fileId = basename($resultFile,".result");
-
-       if (exists $baseUrls{$fileId}) {
-               sayYellow "Baseurl: $baseUrls{$fileId}";
-
-               $le->parse_file($resultFile);
-               @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
-               push(@links,@workingLinks);
-
-               unlink($resultFile);
-               sayGreen "Parsing done: ".scalar @workingLinks;
-       }
-       else {
-               sayRed "No entry found for file $resultFile";
-       }
-
-       if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
-
-               @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-               insertIntoDb($dbh, \@links);
-
-               $counter = 0;
-               @links = ();
-       }
-
-       @workingLinks = ();
-       $counter++;
-}
-
-@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-insertIntoDb($dbh, \@links);
-
-$dbh->disconnect();
-sayGreen "Parse complete";
-
-
-## cleanup the found links
-sub cleanLinks {
-       my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
-       my @linkArray = @{ $linkArray };
-       my @urlStringsToIgnore = @{ $urlStringsToIgnore };
-
-       sayYellow "Clean found links: ".scalar @linkArray;
-       foreach my $toSearch (@urlStringsToIgnore) {
-               sayYellow "Clean links from: ".$toSearch;
-               @linkArray = grep {!/$toSearch/i} @linkArray;
-       }
-       sayGreen "Cleaned found links: ".scalar @linkArray;
-
-       return @linkArray;
-}
-
-
-## update the DB with the new found links
-sub insertIntoDb {
-       my ($dbh, $links) = @_;
-       my @links = @{ $links };
-
-       sayYellow "Insert links into DB: ".scalar @links;
-       $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
-                                       `id` = ?,
-                                       `url` = ?,
-                                       `baseurl` = ?,
-                                       `created` = NOW()";
-       sayLog $queryStr if $DEBUG;
-       $query = $dbh->prepare($queryStr);
-       my $md5 = Digest::MD5->new;
-       my $counter = 0;
-       foreach my $link (@links) {
-
-               sayLog $link if ($DEBUG);
-
-               if(!is_uri($link)) {
-                       sayYellow "Ignore URL it is invalid: $link";
-                       next;
-               }
-
-               my $url = url($link);
-               if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) {
-                       sayYellow "Ignore URL because of scheme: $link";
-                       next;
-               }
-
-               $md5->add($link);
-               my $digest = $md5->hexdigest;
-               $query->execute($digest, $link, $url->scheme."://".$url->host);
-               $md5->reset;
-
-               $counter++;
-
-               if($counter >= 500) {
-                       $counter = 0;
-                       sayYellow "Commit counter of 500 reached. Commiting";
-                       $dbh->commit();
-               }
-
-               #sayLog $digest if ($DEBUG);
-               #sayLog $url->scheme if ($DEBUG);
-               #sayLog $url->host if ($DEBUG);
-               #sayLog $query->{Statement} if ($DEBUG);
-               #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
-
-               #sayLog "Inserted: $link" if($DEBUG);
-       }
-       sayYellow "Final commit";
-       $dbh->commit();
-}
diff --git a/storage/.gitignore b/storage/.gitignore

deleted file mode 100644 (file)

index d6b7ef3..0000000
--- a/storage/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
author	Banana <mail@bananas-playground.net>
	Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)
committer	Banana <mail@bananas-playground.net>
	Tue, 8 Oct 2024 08:04:19 +0000 (10:04 +0200)
README.md		patch \| blob \| history
cleanup.pl	[deleted file]	patch \| blob \| history
config.default.txt	[deleted file]	patch \| blob \| history
crawler/cleanup.pl	[new file with mode: 0644]	patch \| blob
crawler/config.default.txt	[new file with mode: 0644]	patch \| blob
crawler/documentation/install.md	[new file with mode: 0644]	patch \| blob
crawler/documentation/requirements.md	[new file with mode: 0644]	patch \| blob
crawler/documentation/setup.sql	[new file with mode: 0644]	patch \| blob
crawler/documentation/upgrade.md	[new file with mode: 0644]	patch \| blob
crawler/fetch.pl	[new file with mode: 0644]	patch \| blob
crawler/lib/Aranea/Common.pm	[new file with mode: 0644]	patch \| blob
crawler/parse-results.pl	[new file with mode: 0644]	patch \| blob
crawler/storage/.gitignore	[new file with mode: 0644]	patch \| blob
documentation/install.md	[deleted file]	patch \| blob \| history
documentation/requirements.md	[deleted file]	patch \| blob \| history
documentation/setup.sql	[deleted file]	patch \| blob \| history
fetch.pl	[deleted file]	patch \| blob \| history
lib/Aranea/Common.pm	[deleted file]	patch \| blob \| history
parse-results.pl	[deleted file]	patch \| blob \| history
storage/.gitignore	[deleted file]	patch \| blob \| history