From: Banana
Date: Mon, 11 Nov 2024 14:39:16 +0000 (+0100)
Subject: adding aranea-runner and made some changes
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=129acaa9868eb8ba1f59102bb54df01c10750e6f;p=aranea.git

adding aranea-runner and made some changes

Signed-off-by: Banana
---

diff --git a/.gitignore b/.gitignore
index c86abc3..4e327dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 .idea
 _Deparsed_XSubs.pm
 config.txt
+*.pid
+*.log
+*.run
diff --git a/CHANGELOG b/CHANGELOG
index 66ae802..745ff89 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,7 @@
 0.3
 + Crawler config change. Please update config first. Compare it with config.default.txt
++ New log folder. Create it and make sure it is writable.
+* Add: aranea-runner script to be used in a cron schedule.
 + Add: Web interface
 + Folder structure to separate crawler and web interface.
 + Setup sql file changed. Creation of the database needs to be done beforehand.
diff --git a/README.md b/README.md
index df524bd..ee37a60 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,13 @@ After a run cleanup will gather all the unique Domains into a table. Removes URLs
 from the fetch table which are already enough.
 
 `perl cleanup.pl`
 
+# Usage
+
+Either run `fetch.pl`, `parse-results.pl` and `cleanup.pl` in the given order manually,
+or use `aranea-runner` with a cron job. The cron schedule depends on the number of URLs to be fetched and parsed.
+Higher numbers need longer run times, so plan the schedule around that by running the Perl scripts
+manually first.
+
 # Ignores
 
 The table `url_to_ignore` does have a small amount of domains
diff --git a/crawler/aranea-runner b/crawler/aranea-runner
new file mode 100755
index 0000000..7928acb
--- /dev/null
+++ b/crawler/aranea-runner
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+#
+# To be executed as a cron job. Checks whether a part of the crawler is still running and which one needs to run next.
+set -uo pipefail
+IFS=$'\n\t'
+
+err() {
+	echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
+}
+
+declare -A COMMANDS
+COMMANDS["fetch"]="parse-results.pl"
+COMMANDS["parse"]="cleanup.pl"
+COMMANDS["cleanup"]="fetch.pl"
+
+CWD=$(pwd)
+PIDFILE="$CWD/log/aranea.pid"
+LASTRUNFILE="$CWD/last.run"
+TORUN="cleanup"
+
+if [ ! -e "$PIDFILE" ]; then
+	if [ -e "$LASTRUNFILE" ]; then
+		read -r LASTRUN < "$LASTRUNFILE"
+		TORUN="${LASTRUN//[[:space:]]/}"
+	fi
+
+	if [[ -v COMMANDS[$TORUN] ]]; then
+		/usr/bin/perl "${COMMANDS[$TORUN]}"
+	else
+		err "Invalid contents of last run file: '${TORUN}'"
+		exit 1
+	fi
+fi
diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl
index e03d3ca..41286c5 100644
--- a/crawler/cleanup.pl
+++ b/crawler/cleanup.pl
@@ -29,12 +29,22 @@ use DBI;
 use ConfigReader::Simple;
 use URI::URL;
 use Data::Validate::URI qw(is_uri);
-
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
 # DB connection
 my %dbAttr = (
 	PrintError=>0,# turn off error reporting via warn()
@@ -132,4 +142,12 @@ sayYellow "Remove invalid urls done";
 
 addToStats($dbh, "cleanup");
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "cleanup";
+close($fh);
+
+# end
+$dbh->disconnect();
 sayGreen "Cleanup complete";
+select STDOUT;
diff --git a/crawler/documentation/install.md b/crawler/documentation/install.md
index c17acff..291c528 100644
--- a/crawler/documentation/install.md
+++ b/crawler/documentation/install.md
@@ -12,4 +12,4 @@ Use `setup.sql` to create the tables into your existing database: `mysql --user=
 
 Copy `config.default.txt` to `config.txt` and edit at least to match the database name and server settings.
 
-Make sure the directory `storage` can be written.
+Make sure the directories `storage` and `log` are writable.
diff --git a/crawler/documentation/requirements.md b/crawler/documentation/requirements.md
index 57f67f4..dd00b1c 100644
--- a/crawler/documentation/requirements.md
+++ b/crawler/documentation/requirements.md
@@ -8,6 +8,7 @@ Extra modules along with the already installed ones.
 
 + [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
 + [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
++ [Proc::Pidfile](https://metacpan.org/pod/Proc::Pidfile)
 
 ## Debian
@@ -20,3 +21,4 @@ Those are the ones which needed to be installed after a fresh debian(stable) ins
 + libdata-validate-uri-perl
 + libdbd-mysql-perl
 + libwww-perl
++ libproc-pid-file-perl
diff --git a/crawler/fetch.pl b/crawler/fetch.pl
index 66cc1ab..91b84c8 100644
--- a/crawler/fetch.pl
+++ b/crawler/fetch.pl
@@ -30,12 +30,22 @@ use DBI;
 use ConfigReader::Simple;
 use LWP::UserAgent;
 use HTTP::Request;
-
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+# write everything into log file
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
 
 # DB connection
 my %dbAttr = (
@@ -137,12 +147,15 @@ addToStats($dbh, 'fetchfailed', $allFailed, $allFailed);
 addToStats($dbh, 'fetchsuccess', $allFetched, $allFetched);
 $dbh->commit();
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "fetch";
+close($fh);
 
 # end
 $dbh->disconnect();
 sayGreen "Fetch complete";
-
-
+select STDOUT;
 
 ## update last_fetched in the table
 sub updateFetched {
diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl
index 780a7c6..8c97a14 100644
--- a/crawler/parse-results.pl
+++ b/crawler/parse-results.pl
@@ -33,11 +33,22 @@ use URI::URL;
 use File::Basename;
 use Digest::MD5 qw(md5_hex);
 use Data::Validate::URI qw(is_uri);
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
 # DB connection
 my %dbAttr = (
 	PrintError=>0,# Turn off error reporting via warn()
@@ -125,8 +136,15 @@ foreach my $resultFile (@results) {
 
 addToStats($dbh, 'parse');
 $dbh->commit();
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "parse";
+close($fh);
+
+# end
 $dbh->disconnect();
 sayGreen "Parse complete";
+select STDOUT;
 
 
 ## cleanup the found links
@@ -137,7 +155,7 @@ sub cleanLinks {
 
 	sayYellow "Clean found links: ".scalar @linkArray;
 	foreach my $toSearch (@urlsToIgnore) {
-		sayYellow "Clean links from: ".$toSearch;
+		sayYellow "Clean links from: ".$toSearch if $DEBUG;
 		@linkArray = grep {!/$toSearch/i} @linkArray;
 	}
 	sayGreen "Cleaned found links: ".scalar @linkArray;
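
A note on scheduling aranea-runner: it resolves log/aranea.pid and last.run relative
to its working directory, so the cron job should change into the crawler directory
first. A minimal crontab sketch, assuming the repository lives at /opt/aranea (the
path and the 15-minute interval are placeholders; tune the interval to your measured
run times as the README suggests):

# m h dom mon dow command
# Runs the next crawler step (fetch -> parse -> cleanup) every 15 minutes.
# aranea-runner does nothing if log/aranea.pid shows a step is still running.
*/15 * * * * cd /opt/aranea/crawler && ./aranea-runner

One full crawl cycle therefore spans three cron runs: aranea-runner reads last.run
to decide which of fetch.pl, parse-results.pl or cleanup.pl comes next.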