]> 91.132.146.200 Git - aranea.git/commitdiff
docu and some other updates
authorBanana <mail@bananas-playground.net>
Sun, 4 Aug 2024 12:11:26 +0000 (14:11 +0200)
committerBanana <mail@bananas-playground.net>
Sun, 4 Aug 2024 12:11:26 +0000 (14:11 +0200)
Signed-off-by: Banana <mail@bananas-playground.net>
README.md
cleanup.pl
documentation/install.md
documentation/requirements.md
fetch.pl

index 63f76577748e7c6870fcca3ea11e3aab839b0c0f..04117d39bd6e3f1c5813be310698407a053c8b86 100644 (file)
--- a/README.md
+++ b/README.md
@@ -11,15 +11,22 @@ It starts with a given set of URL(s) and parses them for more
 URLs. Stores them and fetches them too.
 -> fetch.pl
 
-# Parse
+## Parse
 
 Each URL result (Stored result from the call) will be parsed
 for other URLs to follow.
 -> parse-results.pl
 
-# Cleanup
+## Cleanup
 
 After a run cleanup will gather all the unique Domains into
 a table. Removes URLs from the fetch table which are already
 enough.
 -> cleanup.pl
+
+# Ignores
+
+The table `url_to_ignore` contains a small number of domains and partial domain matches which will be ignored.
+Adding a global SPAM list would be overkill.
+
+It is a good idea to run it behind a DNS filter that has a good blocklist.
index 8562f6065f086c4357ff52e58960c0d14a3290c8..d0f515a65f88d39e3c13c446b6ea5adf6ee61b7d 100644 (file)
@@ -104,6 +104,7 @@ while(my @row = $query->fetchrow_array) {
        push(@toBeDeletedFromFetchAgain, $baseUrl);
 }
 $query->finish();
+
 sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
 $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
 sayLog($queryStr) if $DEBUG;
index b30abbe9fac44fe266ce793524f20c6ae08c606d..53365d2beb0f2482a7806ec6f14345f762658fcc 100644 (file)
@@ -9,3 +9,7 @@ You need a MySQL installation and a user which can create a database.
 Use setup.sql to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql`
 
 # Config
+
+Edit `config.txt`, at least to match the database server settings.
+
+Make sure the directory `storage` can be written to.
index 0d14f66a93b4ae6c8a77010c3f260005522a3914..e51239454e75ef27a8c685e128361c9221d028b5 100644 (file)
@@ -5,3 +5,4 @@ Tested with a MySQL server 8.+
 # Perl modules
 
 + [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
index c59c4dbda584cf0cc10ce1bcfebe44693effd28e..d9fa7d21a3b37acb9b6170cc34c612a3ccf3c571 100644 (file)
--- a/fetch.pl
+++ b/fetch.pl
@@ -51,7 +51,7 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 my %urlsToFetch;
 my $query = $dbh->prepare("SELECT `id`, `url`
                                                        FROM `url_to_fetch`
-                                                       WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
+                                                       WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
                                                                OR `last_fetched` IS NULL
                                                                AND `fetch_failed` = 0
                                                        LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
@@ -89,7 +89,7 @@ while ( my ($id, $url) = each %urlsToFetch ) {
                        push(@urlsFailed, $id);
                        next;
                }
-               open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+               open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
                print $fh $res->decoded_content();
                close($fh);
                push(@urlsFetched, $id);