From ddb211b53a5acc70c1fc95e5a5556c329da42cf6 Mon Sep 17 00:00:00 2001 From: Banana Date: Sun, 4 Aug 2024 14:11:26 +0200 Subject: [PATCH] docu and some other updates Signed-off-by: Banana --- README.md | 11 +++++++++-- cleanup.pl | 1 + documentation/install.md | 4 ++++ documentation/requirements.md | 1 + fetch.pl | 4 ++-- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 63f7657..04117d3 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,22 @@ It starts with a given set of URL(s) and parses them for more URLs. Stores them and fetches them too. -> fetch.pl -# Parse +## Parse Each URL result (Stored result from the call) will be parsed for other URLs to follow. -> parse-results.pl -# Cleanup +## Cleanup After a run cleanup will gather all the unique Domains into a table. Removes URLs from the fetch table which are already enough. -> cleanup.pl + +# Ignores + +The table `url_to_ignore` has a small number of domains and partial domains which will be ignored. +Adding a global SPAM list would be overkill. + +A good idea is to run it with a DNS filter, which has a good blocklist. diff --git a/cleanup.pl b/cleanup.pl index 8562f60..d0f515a 100644 --- a/cleanup.pl +++ b/cleanup.pl @@ -104,6 +104,7 @@ while(my @row = $query->fetchrow_array) { push(@toBeDeletedFromFetchAgain, $baseUrl); } $query->finish(); + sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain; $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?"; sayLog($queryStr) if $DEBUG; diff --git a/documentation/install.md b/documentation/install.md index b30abbe..53365d2 100644 --- a/documentation/install.md +++ b/documentation/install.md @@ -9,3 +9,7 @@ You need a MySQL installation and a user which can create a database. Use setup.sql to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql` # Config + +Edit `config.txt` at least to match the database server settings. 
+ +Make sure the directory `storage` is writable. diff --git a/documentation/requirements.md b/documentation/requirements.md index 0d14f66..e512394 100644 --- a/documentation/requirements.md +++ b/documentation/requirements.md @@ -5,3 +5,4 @@ Tested with a MySQL server 8.+ # Perl modules + [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple) ++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI) diff --git a/fetch.pl b/fetch.pl index c59c4db..d9fa7d2 100644 --- a/fetch.pl +++ b/fetch.pl @@ -51,7 +51,7 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); my %urlsToFetch; my $query = $dbh->prepare("SELECT `id`, `url` FROM `url_to_fetch` - WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK + WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH OR `last_fetched` IS NULL AND `fetch_failed` = 0 LIMIT ".$config->get("FETCH_URLS_PER_RUN")); @@ -89,7 +89,7 @@ while ( my ($id, $url) = each %urlsToFetch ) { push(@urlsFailed, $id); next; } - open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; + open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; print $fh $res->decoded_content(); close($fh); push(@urlsFetched, $id); -- 2.39.5