.idea
_Deparsed_XSubs.pm
config.txt
+*.pid
+*.log
+*.run
0.3
+ Crawler config change. Please update config first. Compare it with config.default.txt
++ New log folder. Create it and make sure it is writable.
+* Add: aranea-runner script to be used in a cron schedule.
+ Add: Web interface
+ Folder structure to separate crawler and web interface.
+ Setup sql file changed. Creation of the database needs to be done beforehand.
a table. Removes URLs from the fetch table which are already
enough. `perl cleanup.pl`
+# Usage
+
+Either run `fetch.pl`, `parse-results.pl` and `cleanup.pl` in the given order manually
+or use `aranea-runner` with a cron job. The cron schedule depends on the number of URLs to be fetched and parsed.
+Higher numbers need longer run times, so plan the schedule around that by running the Perl files
+manually first.
+
# Ignores
The table `url_to_ignore` does have a small amount of domains
--- /dev/null
+#!/bin/bash
+#
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+#
+# To be executed as a cron. Checks which part of the crawler is running and which needs to be run next.
+set -uo pipefail
+IFS=$'\n\t'
+
+err() {
+ echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
+}
+
+declare -A COMMANDS
+COMMANDS["fetch"]="parse-results.pl"
+COMMANDS["parse"]="cleanup.pl"
+COMMANDS["cleanup"]="fetch.pl"
+
+CWD=$(pwd);
+PIDFILE="$CWD/log/aranea.pid";
+LASTRUNFILE="$CWD/last.run";
+TORUN="cleanup";
+
+if [ ! -e "$PIDFILE" ]; then
+ if [ -e "$LASTRUNFILE" ]; then
+ read -r LASTRUN < "$LASTRUNFILE";
+ TORUN="${LASTRUN//[[:blank:]]\n/}";
+ fi;
+
+ if [[ -v COMMANDS[$TORUN] ]]; then
+ /usr/bin/perl ${COMMANDS[$TORUN]};
+ else
+ err "Invalid contents of last run file: '${TORUN}'";
+ exit 1;
+ fi;
+fi;
use ConfigReader::Simple;
use URI::URL;
use Data::Validate::URI qw(is_uri);
-
+use Proc::Pidfile;
+use Cwd;
my $DEBUG = 0;
my $config = ConfigReader::Simple->new("config.txt");
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+# write everything into log file
+# Outside of debug mode the log file becomes the default output handle, so
+# print calls without an explicit handle are appended to it.
+if(!$DEBUG) {
+ open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+ # $| = 1 enables autoflush on the currently selected handle ($LOG).
+ select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
# DB connection
my %dbAttr = (
PrintError=>0,# turn off error reporting via warn()
addToStats($dbh, "cleanup");
+# write itself to the last run file
+# aranea-runner reads this file to decide which script has to run next.
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print {$fh} "cleanup" or die "Could not write to file 'last.run' $!";
+# Buffered write errors only surface at close, so check it as well.
+close($fh) or die "Could not close file 'last.run' $!";
+
+# end
+$dbh->disconnect();
sayGreen "Cleanup complete";
+select STDOUT;
Copy `config.default.txt` to `config.txt` and edit at least to match the database name and server settings.
-Make sure the directory `storage` can be written.
+Make sure the directories `storage` and `log` are writable.
+ [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
+ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
++ [Proc::Pidfile](https://metacpan.org/pod/Proc::Pidfile)
## Debian
+ libdata-validate-uri-perl
+ libdbd-mysql-perl
+ libwww-perl
++ libproc-pid-file-perl
use ConfigReader::Simple;
use LWP::UserAgent;
use HTTP::Request;
-
+use Proc::Pidfile;
+use Cwd;
my $DEBUG = 0;
my $config = ConfigReader::Simple->new("config.txt");
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+# write everything into log file
+if(!$DEBUG) {
+ open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+ select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
# DB connection
my %dbAttr = (
addToStats($dbh, 'fetchsuccess', $allFetched, $allFetched);
$dbh->commit();
+# write itself to the last run file
+# aranea-runner reads this file to decide which script has to run next.
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print {$fh} "fetch" or die "Could not write to file 'last.run' $!";
+# Buffered write errors only surface at close, so check it as well.
+close($fh) or die "Could not close file 'last.run' $!";
# end
$dbh->disconnect();
sayGreen "Fetch complete";
-
-
+select STDOUT;
## update last_fetched in the table
sub updateFetched {
use File::Basename;
use Digest::MD5 qw(md5_hex);
use Data::Validate::URI qw(is_uri);
+use Proc::Pidfile;
+use Cwd;
my $DEBUG = 0;
my $config = ConfigReader::Simple->new("config.txt");
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+# write everything into log file
+# Outside of debug mode the log file becomes the default output handle, so
+# print calls without an explicit handle are appended to it.
+if(!$DEBUG) {
+ open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+ # $| = 1 enables autoflush on the currently selected handle ($LOG).
+ select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
# DB connection
my %dbAttr = (
PrintError=>0,# Turn off error reporting via warn()
addToStats($dbh, 'parse');
$dbh->commit();
+# write itself to the last run file
+# aranea-runner reads this file to decide which script has to run next.
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print {$fh} "parse" or die "Could not write to file 'last.run' $!";
+# Buffered write errors only surface at close, so check it as well.
+close($fh) or die "Could not close file 'last.run' $!";
+
+# end
$dbh->disconnect();
sayGreen "Parse complete";
+select STDOUT;
## cleanup the found links
sayYellow "Clean found links: ".scalar @linkArray;
foreach my $toSearch (@urlsToIgnore) {
- sayYellow "Clean links from: ".$toSearch;
+ sayYellow "Clean links from: ".$toSearch if $DEBUG;
@linkArray = grep {!/$toSearch/i} @linkArray;
}
sayGreen "Cleaned found links: ".scalar @linkArray;