From: Banana
Date: Tue, 8 Oct 2024 08:04:19 +0000 (+0200)
Subject: new directory structure to separate the crawler and the upcoming web interface
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=e9978a772218368f852d2e74741f0a1ae01c871a;p=aranea.git

new directory structure to separate the crawler and the upcoming web interface

Signed-off-by: Banana
---

diff --git a/README.md b/README.md
index 3ac7ebb..df524bd 100644
--- a/README.md
+++ b/README.md
@@ -27,3 +27,8 @@
 The table `url_to_ignore` does have a small amount of domains and part of
 domains which will be ignored. Adding a global SPAM list would be overkill.
 A good idea is to run it with a DNS filter, which has a good blocklist.
+
+# Web interface
+
+The folder `webroot` contains a web interface which displays the gathered data and status.
+It does not provide a way to execute the crawler.
diff --git a/cleanup.pl b/cleanup.pl
deleted file mode 100644
index 4dce87c..0000000
--- a/cleanup.pl
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/perl -w
-
-# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.
-
-use 5.20.0;
-use strict;
-use warnings;
-use utf8;
-use Data::Dumper;
-use Term::ANSIColor qw(:constants);
-
-use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
-
-use DBI;
-use ConfigReader::Simple;
-use URI::URL;
-use Data::Validate::URI qw(is_uri);
-
-
-my $DEBUG = 0;
-my $config = ConfigReader::Simple->new("config.txt");
-die "Could not read config! 
$ConfigReader::Simple::ERROR\n" unless ref $config; - -## DB connection -my %dbAttr = ( - PrintError=>0,# turn off error reporting via warn() - RaiseError=>1 # turn on error reporting via die() -); -my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); -my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); -die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); - - - -# update the unique domains -my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0"; -sayLog($queryStr) if $DEBUG; -my $query = $dbh->prepare($queryStr); -$query->execute(); - -# now validate the unique ones -$queryStr = "SELECT `id`, `url` FROM unique_domain"; -sayLog($queryStr) if $DEBUG; -$query = $dbh->prepare($queryStr); -$query->execute(); -my @invalidUrls = (); -my @toBeDeletedFromFetchAgain = (); -while(my @row = $query->fetchrow_array) { - my $link = $row[1]; - my $id = $row[0]; - if(!is_uri($link)) { - sayYellow "Ignore URL it is invalid: $link"; - push(@invalidUrls, $id); - push(@toBeDeletedFromFetchAgain, $link); - next; - } - - my $url = url($link); - if(!defined($url->scheme) || index($url->scheme,"http") == -1) { - sayYellow "Ignore URL because of scheme: $link"; - push(@invalidUrls, $id); - push(@toBeDeletedFromFetchAgain, $link); - next; - } -} - -sayYellow "Invalid unique_domain: ".scalar @invalidUrls; -$queryStr = "DELETE FROM unique_domain WHERE `id` = ?"; -sayLog($queryStr) if $DEBUG; -$query = $dbh->prepare($queryStr); -foreach my $invalidId (@invalidUrls) { - $query->execute($invalidId); - #$query->finish(); - sayLog "Removed $invalidId from unique_domain" if $DEBUG; -} -sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls; - - -# remove urls from fetch since we have enough already -$queryStr = "SELECT count(baseurl) AS amount, baseurl - FROM `url_to_fetch` - WHERE last_fetched <> 0 - GROUP BY baseurl - HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE"); -sayLog($queryStr) if $DEBUG; -$query = $dbh->prepare($queryStr); -$query->execute(); -while(my @row = $query->fetchrow_array) { - my $baseUrl = $row[1]; - push(@toBeDeletedFromFetchAgain, $baseUrl); -} -#$query->finish(); - -sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain; -$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?"; -sayLog($queryStr) if $DEBUG; -$query = $dbh->prepare($queryStr); -foreach my $baseUrl (@toBeDeletedFromFetchAgain) { - $query->execute($baseUrl); - #$query->finish(); - sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG; -} -sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain; - -# remove failed fetches -sayYellow "Remove fetch_failed"; -$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1"; -$query = $dbh->prepare($queryStr); -$query->execute(); -sayGreen "Remove fetch_failed done"; - -sayYellow "Remove invalid urls which the is_uri check does let pass"; -$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'"; -$query = $dbh->prepare($queryStr); -$query->execute(); -$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'"; -$query = $dbh->prepare($queryStr); -$query->execute(); -sayYellow "Remove invalid urls done"; - - -sayGreen "Cleanup complete"; diff --git a/config.default.txt b/config.default.txt deleted file mode 100644 index 22ef694..0000000 --- a/config.default.txt +++ /dev/null @@ -1,17 +0,0 @@ 
-DB_HOST=localhost -DB_PORT=3306 -DB_NAME=aranea -DB_USER=user -DB_PASS=test - -UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0" -UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" -UA_LANG="en-US" -UA_CACHE="no-cache" -UA_TIMEOUT=5 - -FETCH_URLS_PER_RUN=5000 -FETCH_URLS_PER_PACKAGE=30 -PARSE_FILES_PER_PACKAGE=50 -CLEANUP_URLS_AMOUNT_ABOVE=40 -MAX_BYTES_PER_PAGE=5000000 diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl new file mode 100644 index 0000000..4dce87c --- /dev/null +++ b/crawler/cleanup.pl @@ -0,0 +1,136 @@ +#!/usr/bin/perl -w + +# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. + +use 5.20.0; +use strict; +use warnings; +use utf8; +use Data::Dumper; +use Term::ANSIColor qw(:constants); + +use lib './lib'; +use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); + +use DBI; +use ConfigReader::Simple; +use URI::URL; +use Data::Validate::URI qw(is_uri); + + +my $DEBUG = 0; +my $config = ConfigReader::Simple->new("config.txt"); +die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config; + +## DB connection +my %dbAttr = ( + PrintError=>0,# turn off error reporting via warn() + RaiseError=>1 # turn on error reporting via die() +); +my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); +my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); +die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); + + + +# update the unique domains +my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0"; +sayLog($queryStr) if $DEBUG; +my $query = $dbh->prepare($queryStr); +$query->execute(); + +# now validate the unique ones +$queryStr = "SELECT `id`, `url` FROM unique_domain"; +sayLog($queryStr) if $DEBUG; +$query = $dbh->prepare($queryStr); +$query->execute(); +my @invalidUrls = (); +my @toBeDeletedFromFetchAgain = (); +while(my @row = $query->fetchrow_array) { + my $link = $row[1]; + my $id = $row[0]; + if(!is_uri($link)) { + sayYellow "Ignore URL it is invalid: $link"; + push(@invalidUrls, $id); + push(@toBeDeletedFromFetchAgain, $link); + next; + } + + my $url = url($link); + if(!defined($url->scheme) || index($url->scheme,"http") == -1) { + sayYellow "Ignore URL because of scheme: $link"; + push(@invalidUrls, $id); + push(@toBeDeletedFromFetchAgain, $link); + next; + } +} + +sayYellow "Invalid unique_domain: ".scalar @invalidUrls; +$queryStr = "DELETE FROM unique_domain WHERE `id` = ?"; +sayLog($queryStr) if $DEBUG; +$query = $dbh->prepare($queryStr); +foreach my $invalidId (@invalidUrls) { + $query->execute($invalidId); + #$query->finish(); + sayLog "Removed $invalidId from unique_domain" if $DEBUG; +} +sayGreen "Invalid unique_domain removed: ".scalar 
@invalidUrls;
+
+
+# remove urls from fetch since we have enough already
+$queryStr = "SELECT count(baseurl) AS amount, baseurl
+	FROM `url_to_fetch`
+	WHERE last_fetched <> 0
+	GROUP BY baseurl
+	HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+	my $baseUrl = $row[1];
+	push(@toBeDeletedFromFetchAgain, $baseUrl);
+}
+#$query->finish();
+
+sayYellow "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $baseUrl (@toBeDeletedFromFetchAgain) {
+	$query->execute($baseUrl);
+	#$query->finish();
+	sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG;
+}
+sayGreen "Removed baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
+
+# remove failed fetches
+sayYellow "Remove fetch_failed";
+$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayGreen "Remove fetch_failed done";
+
+sayYellow "Remove invalid urls which the is_uri check does let pass";
+$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+$queryStr = "DELETE FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'";
+$query = $dbh->prepare($queryStr);
+$query->execute();
+sayYellow "Remove invalid urls done";
+
+
+sayGreen "Cleanup complete";
diff --git a/crawler/config.default.txt b/crawler/config.default.txt
new file mode 100644
index 0000000..22ef694
--- /dev/null
+++ b/crawler/config.default.txt
@@ -0,0 +1,17 @@
+DB_HOST=localhost
+DB_PORT=3306
+DB_NAME=aranea
+DB_USER=user
+DB_PASS=test
+
+UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
+UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
+UA_LANG="en-US"
+UA_CACHE="no-cache"
+UA_TIMEOUT=5
+
+FETCH_URLS_PER_RUN=5000
+FETCH_URLS_PER_PACKAGE=30
+PARSE_FILES_PER_PACKAGE=50
+CLEANUP_URLS_AMOUNT_ABOVE=40
+MAX_BYTES_PER_PAGE=5000000
diff --git a/crawler/documentation/install.md b/crawler/documentation/install.md
new file mode 100644
index 0000000..c17acff
--- /dev/null
+++ b/crawler/documentation/install.md
@@ -0,0 +1,15 @@
+# Requirements
+
+Please check the requirements file first.
+
+# Database
+
+You need a MySQL installation and an existing database.
+
+Use `setup.sql` to create the tables in your existing database: `mysql --user=user -p databasename < setup.sql`
+
+# Config
+
+Copy `config.default.txt` to `config.txt` and edit it to match at least the database name and server settings.
+
+Make sure the directory `storage` is writable.
diff --git a/crawler/documentation/requirements.md b/crawler/documentation/requirements.md
new file mode 100644
index 0000000..0375df0
--- /dev/null
+++ b/crawler/documentation/requirements.md
@@ -0,0 +1,22 @@
+# MySQL
+
+Tested with MySQL server 8 or newer.
+
+# Perl modules
+
+Extra modules needed in addition to the ones usually already installed.
+
++ [ConfigReader::Simple](https://metacpan.org/pod/ConfigReader::Simple)
++ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI)
+
+## Debian
+
+These are the ones that needed to be installed after a fresh Debian (stable) install. May vary. 
+ ++ libdatetime-perl ++ libdbi-perl ++ libconfigreader-simple-perl ++ libhtml-linkextractor-perl ++ libdata-validate-uri-perl ++ libdbd-mysql-perl ++ libwww-perl diff --git a/crawler/documentation/setup.sql b/crawler/documentation/setup.sql new file mode 100644 index 0000000..2fa4c9b --- /dev/null +++ b/crawler/documentation/setup.sql @@ -0,0 +1,148 @@ +SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; +START TRANSACTION; +SET time_zone = "+00:00"; + + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8mb4 */; + +-- +-- Table structure for table `unique_domain` +-- + +CREATE TABLE `unique_domain` ( + `id` int(11) NOT NULL, + `url` varchar(255) COLLATE utf8mb4_bin NOT NULL, + `created` datetime NOT NULL DEFAULT current_timestamp() +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + +-- -------------------------------------------------------- + +-- +-- Table structure for table `url_to_fetch` +-- + +CREATE TABLE `url_to_fetch` ( + `id` char(32) COLLATE utf8mb4_bin NOT NULL, + `url` text COLLATE utf8mb4_bin NOT NULL, + `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL, + `created` datetime NOT NULL, + `last_fetched` datetime DEFAULT NULL, + `fetch_failed` tinyint(1) NOT NULL DEFAULT 0 +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + +-- -------------------------------------------------------- + +-- +-- Table structure for table `url_to_ignore` +-- + +CREATE TABLE `url_to_ignore` ( + `id` int(11) NOT NULL, + `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL, + `created` datetime NOT NULL DEFAULT current_timestamp() +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + +-- +-- Dumping data for table `url_to_ignore` +-- + +INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES +(1, 'mailto:', '2022-01-05 10:46:10'), +(2, 'javascript:', '2022-01-05 10:46:10'), +(3, 'google.', '2022-01-05 10:46:29'), +(4, 'amazon.', '2022-01-05 10:46:29'), +(5, 'youtube.', '2022-01-05 10:46:47'), +(6, '.onion', '2022-01-05 17:21:45'), +(7, 'instagram.', '2022-01-05 20:15:21'), +(8, 'twitter.', '2022-01-05 20:16:31'), +(9, 'facebook.', '2022-01-05 20:16:31'), +(10, 'skype:', '2022-01-05 21:29:53'), +(11, 'xmpp:', '2022-01-05 21:30:22'), +(12, 'tel:', '2022-01-05 21:30:50'), +(13, 'fax:', '2022-01-05 21:30:50'), +(14, 'whatsapp:', '2022-01-05 21:31:24'), +(15, 'intent:', '2022-01-05 21:31:24'), +(16, 'ftp:', '2022-01-05 21:33:34'), +(17, 'youtu.', '2022-01-05 21:50:26'), +(18, 'pinterest.', '2022-01-05 21:51:31'), +(19, 'microsoft.', '2022-01-05 21:52:30'), +(20, 'apple.', '2022-01-05 21:52:30'), +(21, 'xing.', '2022-01-05 22:03:07'), +(22, 'linked.', '2022-01-05 22:03:07'), +(26, 't.co', '2022-01-05 22:05:07'), +(27, 'tinyurl.', '2022-01-05 22:07:03'), +(28, 'bitly.', '2022-01-05 22:07:03'), +(29, 'bit.ly', '2022-01-05 22:07:23'), +(30, 'wikipedia.', '2022-01-06 09:58:46'), +(31, 'gstatic.', '2022-01-06 09:59:47'), +(32, 'wikimedia.', '2022-01-06 10:00:20'), +(33, 'goo.', '2022-01-06 10:02:11'), +(34, 'cdn.', '2022-01-06 10:02:59'), +(35, 'flickr.', '2022-01-06 10:05:46'), +(36, '.mp3', '2022-01-07 13:11:49'), +(40, '.aac', '2022-01-08 13:33:22'), +(41, '.opus', '2022-01-08 13:33:22'), +(42, 'awin1.', '2022-01-08 13:39:14'), +(43, 'sms:', '2022-01-09 10:32:46'), +(45, 'hhttps:', '2022-01-09 12:20:43'), +(46, 'httpss:', '2022-01-09 13:12:34'), +(47, 'soundcloud.', '2022-01-16 10:37:04'), +(48, 
'fb-messenger:', '2022-01-16 14:42:18'), +(49, 'smartadserver.', '2022-01-16 16:48:46'), +(50, 'ispgateway.', '2022-01-16 16:56:11'), +(51, 'bitcoin:', '2022-01-16 19:48:41'), +(52, 'webcal:', '2022-05-08 09:39:02'), +(53, 'source:', '2022-05-08 09:43:19'), +(54, 'phone:', '2022-05-08 09:44:19'), +(55, 'threema:', '2022-05-08 09:45:19'); + +-- +-- Indexes for dumped tables +-- + +-- +-- Indexes for table `unique_domain` +-- +ALTER TABLE `unique_domain` + ADD PRIMARY KEY (`id`), + ADD UNIQUE KEY `url` (`url`); + +-- +-- Indexes for table `url_to_fetch` +-- +ALTER TABLE `url_to_fetch` + ADD PRIMARY KEY (`id`), + ADD KEY `baseurl` (`baseurl`), + ADD KEY `last_fetched` (`last_fetched`), + ADD KEY `fetch_failed` (`fetch_failed`); + +-- +-- Indexes for table `url_to_ignore` +-- +ALTER TABLE `url_to_ignore` + ADD PRIMARY KEY (`id`), + ADD UNIQUE KEY `url` (`searchfor`); + +-- +-- AUTO_INCREMENT for dumped tables +-- + +-- +-- AUTO_INCREMENT for table `unique_domain` +-- +ALTER TABLE `unique_domain` + MODIFY `id` int(11) NOT NULL AUTO_INCREMENT; + +-- +-- AUTO_INCREMENT for table `url_to_ignore` +-- +ALTER TABLE `url_to_ignore` + MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56; +COMMIT; + +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; diff --git a/crawler/documentation/upgrade.md b/crawler/documentation/upgrade.md new file mode 100644 index 0000000..b5780e6 --- /dev/null +++ b/crawler/documentation/upgrade.md @@ -0,0 +1,6 @@ +# Upgrade information + +Each release has its own upgrade file. +Follow the instructions in each file step by step, starting from your current one. + +Make sure paths and sql table names are modified to your settings. diff --git a/crawler/fetch.pl b/crawler/fetch.pl new file mode 100644 index 0000000..3110158 --- /dev/null +++ b/crawler/fetch.pl @@ -0,0 +1,156 @@ +#!/usr/bin/perl -w + +# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. + +use 5.20.0; +use strict; +use warnings; +use utf8; +use Data::Dumper; +use Term::ANSIColor qw(:constants); + +use lib './lib'; +use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); + +use open qw( :std :encoding(UTF-8) ); +use DBI; +use ConfigReader::Simple; +use LWP::UserAgent; +use HTTP::Request; + + +my $DEBUG = 0; +my $config = ConfigReader::Simple->new("config.txt"); +die "Could not read config! 
$ConfigReader::Simple::ERROR\n" unless ref $config; + + +## DB connection +my %dbAttr = ( + PrintError=>0,# turn off error reporting via warn() + RaiseError=>1, # turn on error reporting via die() + AutoCommit=>0 # manually use transactions +); +my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); +my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); +die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); + + +## fetch the urls to fetch from the table +my %urlsToFetch; +my $query = $dbh->prepare("SELECT `id`, `url` + FROM `url_to_fetch` + WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH + OR `last_fetched` IS NULL + AND `fetch_failed` = 0 + LIMIT ".$config->get("FETCH_URLS_PER_RUN")); +$query->execute(); +while(my @row = $query->fetchrow_array) { + $urlsToFetch{$row[0]} = $row[1]; +} + +# successful fetches +my @urlsFetched; +my @urlsFailed; + +# config the user agent for the request +my $request_headers = [ + 'User-Agent' => $config->get("UA_AGENT"), + 'Accept' => $config->get("UA_ACCEPT"), + 'Accept-Language' => $config->get("UA_LANG"), + 'Accept-Encoding' => HTTP::Message::decodable, + 'Cache-Control' => $config->get("UA_CACHE") +]; +my $ua = LWP::UserAgent->new(); +$ua->timeout($config->get("UA_TIMEOUT")); +$ua->max_size($config->get("MAX_BYTES_PER_PAGE")); + +## now loop over them and store the results +my $counter = 0; +while ( my ($id, $url) = each %urlsToFetch ) { + sayYellow "Fetching: $id $url"; + + my $req = HTTP::Request->new(GET => $url, $request_headers); + my $res = $ua->request($req); + if ($res->is_success) { + # callback tells us to stop + if($res->header('Client-Aborted')) { + sayYellow "Aborted, too big."; + next; + } + if(index($res->content_type, "text/html") == -1) { + sayYellow "Fetching: $id ignored. 
Not html"; + push(@urlsFailed, $id); + next; + } + open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; + print $fh $res->decoded_content(); + close($fh); + push(@urlsFetched, $id); + sayGreen"Fetching: $id ok"; + } + else { + sayRed "Fetching: $id failed: $res->code ".$res->status_line; + push(@urlsFailed, $id); + } + + if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) { + updateFetched($dbh, @urlsFetched); + updateFailed($dbh, @urlsFailed); + sleep(rand(7)); + + $counter = 0; + @urlsFetched = (); + @urlsFailed = (); + } + + $counter++; +} +updateFetched($dbh, @urlsFetched); +updateFailed($dbh, @urlsFailed); + +$dbh->disconnect(); +sayGreen "Fetch complete"; + + + +## update last_fetched in the table +sub updateFetched { + my ($dbh, @urls) = @_; + sayYellow "Update fetch timestamps: ".scalar @urls; + $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?"); + foreach my $idToUpdate (@urls) { + sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG); + $query->bind_param(1,$idToUpdate); + $query->execute(); + } + $dbh->commit(); + sayGreen "Update fetch timestamps done"; +} + +## update fetch_failed in the table +sub updateFailed { + my ($dbh, @urls) = @_; + + sayYellow "Update fetch failed: ".scalar @urls; + $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?"); + foreach my $idToUpdate (@urls) { + sayLog "Update fetch failed for: $idToUpdate" if($DEBUG); + $query->bind_param(1,$idToUpdate); + $query->execute(); + } + $dbh->commit(); + sayGreen "Update fetch failed done"; +} diff --git a/crawler/lib/Aranea/Common.pm b/crawler/lib/Aranea/Common.pm new file mode 100644 index 0000000..b832ffb --- /dev/null +++ b/crawler/lib/Aranea/Common.pm @@ -0,0 +1,53 @@ +# 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. 
+ +package Aranea::Common; +use 5.20.0; +use strict; +use warnings; +use utf8; +use Term::ANSIColor qw(:constants); + +use DateTime; +use Exporter qw(import); + + +our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed); + +sub sayLog { + my ($string) = @_; + my $dt = DateTime->now; + say "[".$dt->datetime."] DEBUG: ".$string; +} + +sub sayYellow { + my ($string) = @_; + my $dt = DateTime->now; + say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET; +} + +sub sayGreen { + my ($string) = @_; + my $dt = DateTime->now; + say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET; +} + +sub sayRed { + my ($string) = @_; + my $dt = DateTime->now; + say BOLD, RED, "[".$dt->datetime."] ".$string, RESET; +} + +1; diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl new file mode 100644 index 0000000..9a785d5 --- /dev/null +++ b/crawler/parse-results.pl @@ -0,0 +1,206 @@ +#!/usr/bin/perl -w + +# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. + +use 5.20.0; +use strict; +use warnings; +use utf8; +use Data::Dumper; +use Term::ANSIColor qw(:constants); + +use lib './lib'; +use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); + +use open qw( :std :encoding(UTF-8) ); +use DBI; +use ConfigReader::Simple; +use HTML::LinkExtor; +use URI::URL; +use File::Basename; +use Digest::MD5 qw(md5_hex); +use Data::Validate::URI qw(is_uri); + +my $DEBUG = 0; +my $config = ConfigReader::Simple->new("config.txt"); +die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config; + +## DB connection +my %dbAttr = ( + PrintError=>0,# turn off error reporting via warn() + RaiseError=>1, # turn on error reporting via die() + AutoCommit=>0 # manually use transactions +); +my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); +my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); +die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); + + +## get the fetched files +my @results = glob("storage/*.result"); +die "Nothing to parse. No files found." 
unless(@results); + +## build clean ids for query +my @queryIds = @results; +foreach (@queryIds) { + $_ =~ s/.result//g; + $_ =~ s|storage/||g; +} + +# get the baseurls +my %baseUrls; +my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")"; +sayLog($queryStr) if $DEBUG; +my $query = $dbh->prepare($queryStr); +$query->execute(@queryIds); +while(my @row = $query->fetchrow_array) { + $baseUrls{$row[0]} = $row[1]; +} + + +# get the string to ignore +my @urlStringsToIgnore; +$queryStr = "SELECT `searchfor` FROM `url_to_ignore`"; +sayLog($queryStr) if $DEBUG; +$query = $dbh->prepare($queryStr); +$query->execute(); +while(my @row = $query->fetchrow) { + push(@urlStringsToIgnore, $row[0]) +} + + +## prepare linkExtor +my @links = (); +my @workingLinks = (); +sub leCallback { + my($tag, %attr) = @_; + return if $tag ne 'a'; # we only look closer at + push(@workingLinks, values %attr); +} +my $le = HTML::LinkExtor->new(\&leCallback); + +## now parse each file and get the links +my $counter = 0; +foreach my $resultFile (@results) { + sayYellow "Parsing file: $resultFile"; + + my $fileId = basename($resultFile,".result"); + + if (exists $baseUrls{$fileId}) { + sayYellow "Baseurl: $baseUrls{$fileId}"; + + $le->parse_file($resultFile); + @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks; + push(@links,@workingLinks); + + unlink($resultFile); + sayGreen "Parsing done: ".scalar @workingLinks; + } + else { + sayRed "No entry found for file $resultFile"; + } + + if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) { + + @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore); + insertIntoDb($dbh, \@links); + + $counter = 0; + @links = (); + } + + @workingLinks = (); + $counter++; +} + +@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore); +insertIntoDb($dbh, \@links); + +$dbh->disconnect(); +sayGreen "Parse complete"; + + +## cleanup the found links +sub cleanLinks { + my ($dbh, $linkArray, $urlStringsToIgnore) = @_; + my @linkArray = @{ $linkArray }; + my @urlStringsToIgnore = @{ $urlStringsToIgnore }; + + sayYellow "Clean found links: ".scalar @linkArray; + foreach my $toSearch (@urlStringsToIgnore) { + sayYellow "Clean links from: ".$toSearch; + @linkArray = grep {!/$toSearch/i} @linkArray; + } + sayGreen "Cleaned found links: ".scalar @linkArray; + + return @linkArray; +} + + +## update the DB with the new found links +sub insertIntoDb { + my ($dbh, $links) = @_; + my @links = @{ $links }; + + sayYellow "Insert links into DB: ".scalar @links; + $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET + `id` = ?, + `url` = ?, + `baseurl` = ?, + `created` = NOW()"; + sayLog $queryStr if $DEBUG; + $query = $dbh->prepare($queryStr); + my $md5 = Digest::MD5->new; + my $counter = 0; + foreach my $link (@links) { + + sayLog $link if ($DEBUG); + + if(!is_uri($link)) { + sayYellow "Ignore URL it is invalid: $link"; + next; + } + + my $url = url($link); + if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) { + sayYellow "Ignore URL because of scheme: $link"; + next; + } + + $md5->add($link); + my $digest = $md5->hexdigest; + $query->execute($digest, $link, $url->scheme."://".$url->host); + $md5->reset; + + $counter++; + + if($counter >= 500) { + $counter = 0; + sayYellow "Commit counter of 500 reached. 
Commiting"; + $dbh->commit(); + } + + #sayLog $digest if ($DEBUG); + #sayLog $url->scheme if ($DEBUG); + #sayLog $url->host if ($DEBUG); + #sayLog $query->{Statement} if ($DEBUG); + #sayLog Dumper($query->{ParamValues}) if ($DEBUG); + + #sayLog "Inserted: $link" if($DEBUG); + } + sayYellow "Final commit"; + $dbh->commit(); +} diff --git a/crawler/storage/.gitignore b/crawler/storage/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/crawler/storage/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/documentation/install.md b/documentation/install.md deleted file mode 100644 index b0a5198..0000000 --- a/documentation/install.md +++ /dev/null @@ -1,15 +0,0 @@ -# Requirements - -Please check the requirements file first. - -# Database - -You need a MySQL installation and a user which can create a database. - -Use `setup.sql` to create the `aranea` database and its tables. `mysql --user=user -p < setup.sql` - -# Config - -Copy `config.default.txt` to `config.txt` and edit at least to match the database server settings. - -Make sure the directory `storage` can be written. diff --git a/documentation/requirements.md b/documentation/requirements.md deleted file mode 100644 index 0375df0..0000000 --- a/documentation/requirements.md +++ /dev/null @@ -1,22 +0,0 @@ -# MySQL - -Tested with a MySQL server 8.+ - -# Perl modules - -Extra modules along with the more already installed ones. - -+ [ConfigRead::Simple](https://metacpan.org/pod/ConfigReader::Simple) -+ [Data::Validate::URI](https://metacpan.org/pod/Data::Validate::URI) - -## Debian - -Those are the ones which needed to be installed after a fresh debian(stable) install. May vary. - -+ libdatetime-perl -+ libdbi-perl -+ libconfigreader-simple-perl -+ libhtml-linkextractor-perl -+ libdata-validate-uri-perl -+ libdbd-mysql-perl -+ libwww-perl diff --git a/documentation/setup.sql b/documentation/setup.sql deleted file mode 100644 index 2fa4c9b..0000000 --- a/documentation/setup.sql +++ /dev/null @@ -1,148 +0,0 @@ -SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; -START TRANSACTION; -SET time_zone = "+00:00"; - - -/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; -/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; -/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; -/*!40101 SET NAMES utf8mb4 */; - --- --- Table structure for table `unique_domain` --- - -CREATE TABLE `unique_domain` ( - `id` int(11) NOT NULL, - `url` varchar(255) COLLATE utf8mb4_bin NOT NULL, - `created` datetime NOT NULL DEFAULT current_timestamp() -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; - --- -------------------------------------------------------- - --- --- Table structure for table `url_to_fetch` --- - -CREATE TABLE `url_to_fetch` ( - `id` char(32) COLLATE utf8mb4_bin NOT NULL, - `url` text COLLATE utf8mb4_bin NOT NULL, - `baseurl` varchar(255) COLLATE utf8mb4_bin NOT NULL, - `created` datetime NOT NULL, - `last_fetched` datetime DEFAULT NULL, - `fetch_failed` tinyint(1) NOT NULL DEFAULT 0 -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; - --- -------------------------------------------------------- - --- --- Table structure for table `url_to_ignore` --- - -CREATE TABLE `url_to_ignore` ( - `id` int(11) NOT NULL, - `searchfor` varchar(255) COLLATE utf8mb4_bin NOT NULL, - `created` datetime NOT NULL DEFAULT current_timestamp() -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; - --- --- Dumping data for table `url_to_ignore` --- - -INSERT INTO `url_to_ignore` (`id`, 
`searchfor`, `created`) VALUES -(1, 'mailto:', '2022-01-05 10:46:10'), -(2, 'javascript:', '2022-01-05 10:46:10'), -(3, 'google.', '2022-01-05 10:46:29'), -(4, 'amazon.', '2022-01-05 10:46:29'), -(5, 'youtube.', '2022-01-05 10:46:47'), -(6, '.onion', '2022-01-05 17:21:45'), -(7, 'instagram.', '2022-01-05 20:15:21'), -(8, 'twitter.', '2022-01-05 20:16:31'), -(9, 'facebook.', '2022-01-05 20:16:31'), -(10, 'skype:', '2022-01-05 21:29:53'), -(11, 'xmpp:', '2022-01-05 21:30:22'), -(12, 'tel:', '2022-01-05 21:30:50'), -(13, 'fax:', '2022-01-05 21:30:50'), -(14, 'whatsapp:', '2022-01-05 21:31:24'), -(15, 'intent:', '2022-01-05 21:31:24'), -(16, 'ftp:', '2022-01-05 21:33:34'), -(17, 'youtu.', '2022-01-05 21:50:26'), -(18, 'pinterest.', '2022-01-05 21:51:31'), -(19, 'microsoft.', '2022-01-05 21:52:30'), -(20, 'apple.', '2022-01-05 21:52:30'), -(21, 'xing.', '2022-01-05 22:03:07'), -(22, 'linked.', '2022-01-05 22:03:07'), -(26, 't.co', '2022-01-05 22:05:07'), -(27, 'tinyurl.', '2022-01-05 22:07:03'), -(28, 'bitly.', '2022-01-05 22:07:03'), -(29, 'bit.ly', '2022-01-05 22:07:23'), -(30, 'wikipedia.', '2022-01-06 09:58:46'), -(31, 'gstatic.', '2022-01-06 09:59:47'), -(32, 'wikimedia.', '2022-01-06 10:00:20'), -(33, 'goo.', '2022-01-06 10:02:11'), -(34, 'cdn.', '2022-01-06 10:02:59'), -(35, 'flickr.', '2022-01-06 10:05:46'), -(36, '.mp3', '2022-01-07 13:11:49'), -(40, '.aac', '2022-01-08 13:33:22'), -(41, '.opus', '2022-01-08 13:33:22'), -(42, 'awin1.', '2022-01-08 13:39:14'), -(43, 'sms:', '2022-01-09 10:32:46'), -(45, 'hhttps:', '2022-01-09 12:20:43'), -(46, 'httpss:', '2022-01-09 13:12:34'), -(47, 'soundcloud.', '2022-01-16 10:37:04'), -(48, 'fb-messenger:', '2022-01-16 14:42:18'), -(49, 'smartadserver.', '2022-01-16 16:48:46'), -(50, 'ispgateway.', '2022-01-16 16:56:11'), -(51, 'bitcoin:', '2022-01-16 19:48:41'), -(52, 'webcal:', '2022-05-08 09:39:02'), -(53, 'source:', '2022-05-08 09:43:19'), -(54, 'phone:', '2022-05-08 09:44:19'), -(55, 'threema:', '2022-05-08 09:45:19'); - --- --- Indexes for dumped tables --- - --- --- Indexes for table `unique_domain` --- -ALTER TABLE `unique_domain` - ADD PRIMARY KEY (`id`), - ADD UNIQUE KEY `url` (`url`); - --- --- Indexes for table `url_to_fetch` --- -ALTER TABLE `url_to_fetch` - ADD PRIMARY KEY (`id`), - ADD KEY `baseurl` (`baseurl`), - ADD KEY `last_fetched` (`last_fetched`), - ADD KEY `fetch_failed` (`fetch_failed`); - --- --- Indexes for table `url_to_ignore` --- -ALTER TABLE `url_to_ignore` - ADD PRIMARY KEY (`id`), - ADD UNIQUE KEY `url` (`searchfor`); - --- --- AUTO_INCREMENT for dumped tables --- - --- --- AUTO_INCREMENT for table `unique_domain` --- -ALTER TABLE `unique_domain` - MODIFY `id` int(11) NOT NULL AUTO_INCREMENT; - --- --- AUTO_INCREMENT for table `url_to_ignore` --- -ALTER TABLE `url_to_ignore` - MODIFY `id` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=56; -COMMIT; - -/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; -/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; -/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; diff --git a/fetch.pl b/fetch.pl deleted file mode 100644 index 3110158..0000000 --- a/fetch.pl +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/perl -w - -# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. - -use 5.20.0; -use strict; -use warnings; -use utf8; -use Data::Dumper; -use Term::ANSIColor qw(:constants); - -use lib './lib'; -use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); - -use open qw( :std :encoding(UTF-8) ); -use DBI; -use ConfigReader::Simple; -use LWP::UserAgent; -use HTTP::Request; - - -my $DEBUG = 0; -my $config = ConfigReader::Simple->new("config.txt"); -die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config; - - -## DB connection -my %dbAttr = ( - PrintError=>0,# turn off error reporting via warn() - RaiseError=>1, # turn on error reporting via die() - AutoCommit=>0 # manually use transactions -); -my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); -my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); -die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); - - -## fetch the urls to fetch from the table -my %urlsToFetch; -my $query = $dbh->prepare("SELECT `id`, `url` - FROM `url_to_fetch` - WHERE `last_fetched` < NOW() - INTERVAL 1 MONTH - OR `last_fetched` IS NULL - AND `fetch_failed` = 0 - LIMIT ".$config->get("FETCH_URLS_PER_RUN")); -$query->execute(); -while(my @row = $query->fetchrow_array) { - $urlsToFetch{$row[0]} = $row[1]; -} - -# successful fetches -my @urlsFetched; -my @urlsFailed; - -# config the user agent for the request -my $request_headers = [ - 'User-Agent' => $config->get("UA_AGENT"), - 'Accept' => $config->get("UA_ACCEPT"), - 'Accept-Language' => $config->get("UA_LANG"), - 'Accept-Encoding' => HTTP::Message::decodable, - 'Cache-Control' => $config->get("UA_CACHE") -]; -my $ua = LWP::UserAgent->new(); -$ua->timeout($config->get("UA_TIMEOUT")); -$ua->max_size($config->get("MAX_BYTES_PER_PAGE")); - -## now loop over them and store the results -my $counter = 0; -while ( my ($id, $url) = each %urlsToFetch ) { - sayYellow "Fetching: $id $url"; - - my $req = HTTP::Request->new(GET => $url, $request_headers); - my $res = $ua->request($req); - if ($res->is_success) { - # callback tells us to stop - if($res->header('Client-Aborted')) { - sayYellow "Aborted, too big."; - next; - } - if(index($res->content_type, "text/html") == -1) { - sayYellow "Fetching: $id ignored. 
Not html"; - push(@urlsFailed, $id); - next; - } - open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!"; - print $fh $res->decoded_content(); - close($fh); - push(@urlsFetched, $id); - sayGreen"Fetching: $id ok"; - } - else { - sayRed "Fetching: $id failed: $res->code ".$res->status_line; - push(@urlsFailed, $id); - } - - if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) { - updateFetched($dbh, @urlsFetched); - updateFailed($dbh, @urlsFailed); - sleep(rand(7)); - - $counter = 0; - @urlsFetched = (); - @urlsFailed = (); - } - - $counter++; -} -updateFetched($dbh, @urlsFetched); -updateFailed($dbh, @urlsFailed); - -$dbh->disconnect(); -sayGreen "Fetch complete"; - - - -## update last_fetched in the table -sub updateFetched { - my ($dbh, @urls) = @_; - sayYellow "Update fetch timestamps: ".scalar @urls; - $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?"); - foreach my $idToUpdate (@urls) { - sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG); - $query->bind_param(1,$idToUpdate); - $query->execute(); - } - $dbh->commit(); - sayGreen "Update fetch timestamps done"; -} - -## update fetch_failed in the table -sub updateFailed { - my ($dbh, @urls) = @_; - - sayYellow "Update fetch failed: ".scalar @urls; - $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?"); - foreach my $idToUpdate (@urls) { - sayLog "Update fetch failed for: $idToUpdate" if($DEBUG); - $query->bind_param(1,$idToUpdate); - $query->execute(); - } - $dbh->commit(); - sayGreen "Update fetch failed done"; -} diff --git a/lib/Aranea/Common.pm b/lib/Aranea/Common.pm deleted file mode 100644 index b832ffb..0000000 --- a/lib/Aranea/Common.pm +++ /dev/null @@ -1,53 +0,0 @@ -# 2022 - 2024 https://://www.bananas-playground.net/projekt/aranea - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. 
- -package Aranea::Common; -use 5.20.0; -use strict; -use warnings; -use utf8; -use Term::ANSIColor qw(:constants); - -use DateTime; -use Exporter qw(import); - - -our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed); - -sub sayLog { - my ($string) = @_; - my $dt = DateTime->now; - say "[".$dt->datetime."] DEBUG: ".$string; -} - -sub sayYellow { - my ($string) = @_; - my $dt = DateTime->now; - say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET; -} - -sub sayGreen { - my ($string) = @_; - my $dt = DateTime->now; - say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET; -} - -sub sayRed { - my ($string) = @_; - my $dt = DateTime->now; - say BOLD, RED, "[".$dt->datetime."] ".$string, RESET; -} - -1; diff --git a/parse-results.pl b/parse-results.pl deleted file mode 100644 index 9a785d5..0000000 --- a/parse-results.pl +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/perl -w - -# 2022 - 2024 https://www.bananas-playground.net/projekt/aranea - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0. - -use 5.20.0; -use strict; -use warnings; -use utf8; -use Data::Dumper; -use Term::ANSIColor qw(:constants); - -use lib './lib'; -use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); - -use open qw( :std :encoding(UTF-8) ); -use DBI; -use ConfigReader::Simple; -use HTML::LinkExtor; -use URI::URL; -use File::Basename; -use Digest::MD5 qw(md5_hex); -use Data::Validate::URI qw(is_uri); - -my $DEBUG = 0; -my $config = ConfigReader::Simple->new("config.txt"); -die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config; - -## DB connection -my %dbAttr = ( - PrintError=>0,# turn off error reporting via warn() - RaiseError=>1, # turn on error reporting via die() - AutoCommit=>0 # manually use transactions -); -my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); -my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr); -die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); - - -## get the fetched files -my @results = glob("storage/*.result"); -die "Nothing to parse. No files found." 
unless(@results); - -## build clean ids for query -my @queryIds = @results; -foreach (@queryIds) { - $_ =~ s/.result//g; - $_ =~ s|storage/||g; -} - -# get the baseurls -my %baseUrls; -my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")"; -sayLog($queryStr) if $DEBUG; -my $query = $dbh->prepare($queryStr); -$query->execute(@queryIds); -while(my @row = $query->fetchrow_array) { - $baseUrls{$row[0]} = $row[1]; -} - - -# get the string to ignore -my @urlStringsToIgnore; -$queryStr = "SELECT `searchfor` FROM `url_to_ignore`"; -sayLog($queryStr) if $DEBUG; -$query = $dbh->prepare($queryStr); -$query->execute(); -while(my @row = $query->fetchrow) { - push(@urlStringsToIgnore, $row[0]) -} - - -## prepare linkExtor -my @links = (); -my @workingLinks = (); -sub leCallback { - my($tag, %attr) = @_; - return if $tag ne 'a'; # we only look closer at - push(@workingLinks, values %attr); -} -my $le = HTML::LinkExtor->new(\&leCallback); - -## now parse each file and get the links -my $counter = 0; -foreach my $resultFile (@results) { - sayYellow "Parsing file: $resultFile"; - - my $fileId = basename($resultFile,".result"); - - if (exists $baseUrls{$fileId}) { - sayYellow "Baseurl: $baseUrls{$fileId}"; - - $le->parse_file($resultFile); - @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks; - push(@links,@workingLinks); - - unlink($resultFile); - sayGreen "Parsing done: ".scalar @workingLinks; - } - else { - sayRed "No entry found for file $resultFile"; - } - - if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) { - - @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore); - insertIntoDb($dbh, \@links); - - $counter = 0; - @links = (); - } - - @workingLinks = (); - $counter++; -} - -@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore); -insertIntoDb($dbh, \@links); - -$dbh->disconnect(); -sayGreen "Parse complete"; - - -## cleanup the found links -sub cleanLinks { - my ($dbh, $linkArray, $urlStringsToIgnore) = @_; - my @linkArray = @{ $linkArray }; - my @urlStringsToIgnore = @{ $urlStringsToIgnore }; - - sayYellow "Clean found links: ".scalar @linkArray; - foreach my $toSearch (@urlStringsToIgnore) { - sayYellow "Clean links from: ".$toSearch; - @linkArray = grep {!/$toSearch/i} @linkArray; - } - sayGreen "Cleaned found links: ".scalar @linkArray; - - return @linkArray; -} - - -## update the DB with the new found links -sub insertIntoDb { - my ($dbh, $links) = @_; - my @links = @{ $links }; - - sayYellow "Insert links into DB: ".scalar @links; - $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET - `id` = ?, - `url` = ?, - `baseurl` = ?, - `created` = NOW()"; - sayLog $queryStr if $DEBUG; - $query = $dbh->prepare($queryStr); - my $md5 = Digest::MD5->new; - my $counter = 0; - foreach my $link (@links) { - - sayLog $link if ($DEBUG); - - if(!is_uri($link)) { - sayYellow "Ignore URL it is invalid: $link"; - next; - } - - my $url = url($link); - if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) { - sayYellow "Ignore URL because of scheme: $link"; - next; - } - - $md5->add($link); - my $digest = $md5->hexdigest; - $query->execute($digest, $link, $url->scheme."://".$url->host); - $md5->reset; - - $counter++; - - if($counter >= 500) { - $counter = 0; - sayYellow "Commit counter of 500 reached. 
Commiting"; - $dbh->commit(); - } - - #sayLog $digest if ($DEBUG); - #sayLog $url->scheme if ($DEBUG); - #sayLog $url->host if ($DEBUG); - #sayLog $query->{Statement} if ($DEBUG); - #sayLog Dumper($query->{ParamValues}) if ($DEBUG); - - #sayLog "Inserted: $link" if($DEBUG); - } - sayYellow "Final commit"; - $dbh->commit(); -} diff --git a/storage/.gitignore b/storage/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/storage/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore