From ddb211b53a5acc70c1fc95e5a5556c329da42cf6 Mon Sep 17 00:00:00 2001 From: Banana Date: Sun, 4 Aug 2024 14:11:26 +0200 Subject: [PATCH] docu and some other updates Signed-off-by: Banana --- README.md | 11 +++++++++-- cleanup.pl | 1 + documentation/install.md | 4 ++++ documentation/requirements.md | 1 + fetch.pl | 4 ++-- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 63f7657..04117d3 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,22 @@ It starts with a given set of URL(s) and parses them for more URLs. Stores them and fetches them too. -> fetch.pl -# Parse +## Parse Each URL result (Stored result from the call) will be parsed for other URLs to follow. -> parse-results.pl -# Cleanup +## Cleanup After a run cleanup will gather all the unique Domains into a table. Removes URLs from the fetch table which are already enough. -> cleanup.pl + +# Ignores + +The table `url_to_ignore` has a small number of domains and partial domains which will be ignored. +Adding a global SPAM list would be overkill. + +A good idea is to run it with a DNS filter, which has a good blocklist. diff --git a/cleanup.pl b/cleanup.pl index 8562f60..d0f515a 100644 --- a/cleanup.pl +++ b/cleanup.pl @@ -104,6 +104,7 @@ while(my @row = $query->fetchrow_array) { push(@toBeDeletedFromFetchAgain, $baseUrl); } $query->finish(); + sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain; $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?"; sayLog($queryStr) if $DEBUG; diff --git a/documentation/install.md b/documentation/install.md index b30abbe..53365d2 100644 --- a/documentation/install.md +++ b/documentation/install.md @@ -9,3 +9,7 @@ You need a MySQL installation and a user which can create a database. Use setup.sql to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql` # Config + +Edit `config.txt` at least to match the database server settings. 
+ +Make sure the directory `storage` is writable. diff --git a/documentation/requirements.md b/documentation/requirements.md index 0d14f66..e512394 100644 --- a/documentation/requirements.md +++ b/documentation/requirements.md @@ -5,3 +5,4 @@ Tested with a MySQL server 8.+ # Perl modules + [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple) ++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI) diff --git a/fetch.pl b/fetch.pl index c59c4db..d9fa7d2 100644 --- a/fetch.pl +++ b/fetch.pl @@ -51,7 +51,7 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); my %urlsToFetch; my $query = $dbh->prepare("SELECT `id`, `url` FROM `url_to_fetch` - WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK + WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH OR `last_fetched` IS NULL AND `fetch_failed` = 0 LIMIT ".$config->get("FETCH_URLS_PER_RUN")); @@ -89,7 +89,7 @@ while ( my ($id, $url) = each %urlsToFetch ) { push(@urlsFailed, $id); next; } - open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; + open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; print $fh $res->decoded_content(); close($fh); push(@urlsFetched, $id); -- 2.39.5