From: Banana
Date: Mon, 11 Nov 2024 14:39:16 +0000 (+0100)
Subject: adding aranea-runner and made some changes
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=129acaa9868eb8ba1f59102bb54df01c10750e6f;p=aranea.git

adding aranea-runner and made some changes

Signed-off-by: Banana
---

diff --git a/.gitignore b/.gitignore
index c86abc3..4e327dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 .idea
 _Deparsed_XSubs.pm
 config.txt
+*.pid
+*.log
+*.run
diff --git a/CHANGELOG b/CHANGELOG
index 66ae802..745ff89 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,7 @@
 0.3
 + Crawler config change. Please update config first. Compare it with config.default.txt
++ New log folder. Create it and make sure it is writable.
+* Add: aranea-runner script to be used in a cron schedule.
 + Add: Web interface
 + Folder structure to separate crawler and web interface.
 + Setup sql file changed. Creation of the database needs to be done beforehand.
diff --git a/README.md b/README.md
index df524bd..ee37a60 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,13 @@ After a run cleanup will gather all the unique Domains into a table. Removes URLs
 from the fetch table which are already enough.
 
 `perl cleanup.pl`
 
+# Usage
+
+Either run `fetch.pl`, `parse-results.pl` and `cleanup.pl` in the given order manually,
+or use `aranea-runner` with a cron job. The cron schedule depends on the number of URLs to be fetched and parsed.
+Higher numbers need longer run times, so plan the schedule around that by running the Perl scripts
+manually first.
+
 # Ignores
 
 The table `url_to_ignore` does have a small amount of domains
diff --git a/crawler/aranea-runner b/crawler/aranea-runner
new file mode 100755
index 0000000..7928acb
--- /dev/null
+++ b/crawler/aranea-runner
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
+#
+# To be executed as a cron job. Checks whether a part of the crawler is still running and which one needs to run next.
+set -uo pipefail
+IFS=$'\n\t'
+
+err() {
+	echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
+}
+
+declare -A COMMANDS
+COMMANDS["fetch"]="parse-results.pl"
+COMMANDS["parse"]="cleanup.pl"
+COMMANDS["cleanup"]="fetch.pl"
+
+CWD=$(pwd)
+PIDFILE="$CWD/log/aranea.pid"
+LASTRUNFILE="$CWD/last.run"
+TORUN="cleanup"
+
+if [ ! -e "$PIDFILE" ]; then
+	if [ -e "$LASTRUNFILE" ]; then
+		read -r LASTRUN < "$LASTRUNFILE"
+		TORUN="${LASTRUN//[[:space:]]/}"
+	fi
+
+	if [[ -v COMMANDS[$TORUN] ]]; then
+		/usr/bin/perl "${COMMANDS[$TORUN]}"
+	else
+		err "Invalid contents of last run file: '${TORUN}'"
+		exit 1
+	fi
+fi
diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl
index e03d3ca..41286c5 100644
--- a/crawler/cleanup.pl
+++ b/crawler/cleanup.pl
@@ -29,12 +29,22 @@ use DBI;
 use ConfigReader::Simple;
 use URI::URL;
 use Data::Validate::URI qw(is_uri);
-
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
 # DB connection
 my %dbAttr = (
 	PrintError=>0,# turn off error reporting via warn()
@@ -132,4 +142,12 @@ sayYellow "Remove invalid urls done";
 
 addToStats($dbh, "cleanup");
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "cleanup";
+close($fh);
+
+# end
+$dbh->disconnect();
 sayGreen "Cleanup complete";
+select STDOUT;
diff --git a/crawler/documentation/install.md b/crawler/documentation/install.md
index c17acff..291c528 100644
--- a/crawler/documentation/install.md
+++ b/crawler/documentation/install.md
@@ -12,4 +12,4 @@ Use `setup.sql` to create the tables into your existing database: `mysql --user=
 
 Copy `config.default.txt` to `config.txt` and edit at least to match the database name and server settings.
 
-Make sure the directory `storage` can be written.
+Make sure the directories `storage` and `log` are writable.
diff --git a/crawler/documentation/requirements.md b/crawler/documentation/requirements.md
index 57f67f4..dd00b1c 100644
--- a/crawler/documentation/requirements.md
+++ b/crawler/documentation/requirements.md
@@ -8,6 +8,7 @@ Extra modules along with the already installed ones.
 
 + [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
 + [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
++ [Proc::Pidfile](https://metacpan.org/pod/Proc::Pidfile)
 
 ## Debian
@@ -20,3 +21,4 @@ Those are the ones which needed to be installed after a fresh debian(stable) ins
 + libdata-validate-uri-perl
 + libdbd-mysql-perl
 + libwww-perl
++ libproc-pid-file-perl
diff --git a/crawler/fetch.pl b/crawler/fetch.pl
index 66cc1ab..91b84c8 100644
--- a/crawler/fetch.pl
+++ b/crawler/fetch.pl
@@ -30,12 +30,22 @@ use DBI;
 use ConfigReader::Simple;
 use LWP::UserAgent;
 use HTTP::Request;
-
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+# write everything into log file
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
 
 # DB connection
 my %dbAttr = (
@@ -137,12 +147,15 @@ addToStats($dbh, 'fetchfailed', $allFailed, $allFailed);
 addToStats($dbh, 'fetchsuccess', $allFetched, $allFetched);
 $dbh->commit();
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "fetch";
+close($fh);
 
 # end
 $dbh->disconnect();
 sayGreen "Fetch complete";
-
-
+select STDOUT;
 
 ## update last_fetched in the table
 sub updateFetched {
diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl
index 780a7c6..8c97a14 100644
--- a/crawler/parse-results.pl
+++ b/crawler/parse-results.pl
@@ -33,11 +33,22 @@ use URI::URL;
 use File::Basename;
 use Digest::MD5 qw(md5_hex);
 use Data::Validate::URI qw(is_uri);
+use Proc::Pidfile;
+use Cwd;
 
 my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
+# create the PID file and exit silently if it is already running.
+my $currentdir = getcwd;
+my $pid = Proc::Pidfile->new(pidfile => $currentdir."/log/aranea.pid", silent => 1);
+
+if(!$DEBUG) {
+	open (my $LOG, '>>', 'log/aranea.log') or die "Could not open file 'log/aranea.log' $!";
+	select $LOG; $| = 1; # https://perl.plover.com/FAQs/Buffering.html
+}
+
 # DB connection
 my %dbAttr = (
 	PrintError=>0,# Turn off error reporting via warn()
@@ -125,8 +136,15 @@ foreach my $resultFile (@results) {
 
 addToStats($dbh, 'parse');
 $dbh->commit();
 
+# write itself to the last run file
+open(my $fh, '>:encoding(UTF-8)', "last.run") or die "Could not open file 'last.run' $!";
+print $fh "parse";
+close($fh);
+
+# end
 $dbh->disconnect();
 sayGreen "Parse complete";
+select STDOUT;
 
 
 ## cleanup the found links
@@ -137,7 +155,7 @@ sub cleanLinks {
 
 	sayYellow "Clean found links: ".scalar @linkArray;
 	foreach my $toSearch (@urlsToIgnore) {
-		sayYellow "Clean links from: ".$toSearch;
+		sayYellow "Clean links from: ".$toSearch if $DEBUG;
 		@linkArray = grep {!/$toSearch/i} @linkArray;
 	}
 	sayGreen "Cleaned found links: ".scalar @linkArray;
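
A note on scheduling aranea-runner: it resolves log/aranea.pid and last.run relative
to its working directory, so the cron job should change into the crawler directory
first. A minimal crontab sketch, assuming the repository lives at /opt/aranea (the
path and the 15-minute interval are placeholders; tune the interval to your measured
run times as the README suggests):

# m h dom mon dow command
# Runs the next crawler step (fetch -> parse -> cleanup) every 15 minutes.
# aranea-runner does nothing if log/aranea.pid shows a step is still running.
*/15 * * * * cd /opt/aranea/crawler && ./aranea-runner

One full crawl cycle therefore spans three cron runs: aranea-runner reads last.run
to decide which of fetch.pl, parse-results.pl or cleanup.pl comes next.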