Browse Source

docu and some other updates

Signed-off-by: Banana <mail@bananas-playground.net>
Banana 1 month ago
parent
commit
ddb211b53a
5 changed files with 17 additions and 4 deletions
  1. 9 2
      README.md
  2. 1 0
      cleanup.pl
  3. 4 0
      documentation/install.md
  4. 1 0
      documentation/requirements.md
  5. 2 2
      fetch.pl

+ 9 - 2
README.md

@@ -11,15 +11,22 @@ It starts with a given set of URL(s) and parses them for more
 URLs. Stores them and fetches them too.
 URLs. Stores them and fetches them too.
 -> fetch.pl
 -> fetch.pl
 
 
-# Parse
+## Parse
 
 
 Each URL result (Stored result from the call) will be parsed
 Each URL result (Stored result from the call) will be parsed
 for other URLs to follow.
 for other URLs to follow.
 -> parse-results.pl
 -> parse-results.pl
 
 
-# Cleanup
+## Cleanup
 
 
 After a run cleanup will gather all the unique Domains into
 After a run cleanup will gather all the unique Domains into
 a table. Removes URLs from the fetch table which are already
 a table. Removes URLs from the fetch table which are already
 enough.
 enough.
 -> cleanup.pl
 -> cleanup.pl
+
+# Ignores
+
+The table `url_to_ignore` contains a small number of domains and partial domains which will be ignored.
+Adding a global SPAM list would be overkill.
+
+It is a good idea to run it behind a DNS filter that has a good blocklist.

+ 1 - 0
cleanup.pl

@@ -104,6 +104,7 @@ while(my @row = $query->fetchrow_array) {
 	push(@toBeDeletedFromFetchAgain, $baseUrl);
 	push(@toBeDeletedFromFetchAgain, $baseUrl);
 }
 }
 $query->finish();
 $query->finish();
+
 sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
 sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
 $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
 $queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
 sayLog($queryStr) if $DEBUG;
 sayLog($queryStr) if $DEBUG;

+ 4 - 0
documentation/install.md

@@ -9,3 +9,7 @@ You need a MySQL installation and a user which can create a database.
 Use setup.sql to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql`
 Use setup.sql to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql`
 
 
 # Config
 # Config
+
+Edit `config.txt` at least to match the database server settings.
+
+Make sure the directory `storage` is writable.

+ 1 - 0
documentation/requirements.md

@@ -5,3 +5,4 @@ Tested with a MySQL server 8.+
 # Perl modules
 # Perl modules
 
 
 + [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
 + [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)

+ 2 - 2
fetch.pl

@@ -51,7 +51,7 @@ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 my %urlsToFetch;
 my %urlsToFetch;
 my $query = $dbh->prepare("SELECT `id`, `url`
 my $query = $dbh->prepare("SELECT `id`, `url`
 							FROM `url_to_fetch`
 							FROM `url_to_fetch`
-							WHERE `last_fetched` < NOW() - INTERVAL 1 WEEK
+							WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH
 								OR `last_fetched` IS NULL
 								OR `last_fetched` IS NULL
 								AND `fetch_failed` = 0
 								AND `fetch_failed` = 0
 							LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
 							LIMIT ".$config->get("FETCH_URLS_PER_RUN"));
@@ -89,7 +89,7 @@ while ( my ($id, $url) = each %urlsToFetch ) {
 			push(@urlsFailed, $id);
 			push(@urlsFailed, $id);
 			next;
 			next;
 		}
 		}
-		open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
+		open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
 		print $fh $res->decoded_content();
 		print $fh $res->decoded_content();
 		close($fh);
 		close($fh);
 		push(@urlsFetched, $id);
 		push(@urlsFetched, $id);