From: Banana
Date: Sun, 13 Oct 2024 18:05:53 +0000 (+0200)
Subject: config update and origin table
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=7708535e4a9dab27f9b98089bb256a5d44ed5a6a;p=aranea.git

config update and origin table

Signed-off-by: Banana
---

diff --git a/CHANGELOG b/CHANGELOG
index da9ae53..66ae802 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,5 @@
 0.3
++ Crawler config change. Please update config first. Compare it with config.default.txt
 + Add: Web interface
 + Folder structure to separate crawler and web interface.
 + Setup sql file changed. Creation of the database needs to be done beforehand.
@@ -10,7 +11,7 @@
 + Some db improvements
 + Default config file added
 + Updated requirements file
-+ Avoid big downloads with MAX_BYTES_PER_PAGE setting.
++ Avoid big downloads with FETCH_MAX_BYTES_PER_PAGE setting.
 
 0.1
 + initial release
diff --git a/crawler/cleanup.pl b/crawler/cleanup.pl
index e4d500d..e03d3ca 100644
--- a/crawler/cleanup.pl
+++ b/crawler/cleanup.pl
@@ -23,7 +23,7 @@ use Data::Dumper;
 use Term::ANSIColor qw(:constants);
 
 use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed addToStats);
 
 use DBI;
 use ConfigReader::Simple;
@@ -35,7 +35,7 @@ my $DEBUG = 0;
 my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
-## DB connection
+# DB connection
 my %dbAttr = (
     PrintError=>0,# turn off error reporting via warn()
     RaiseError=>1, # turn on error reporting via die()
@@ -46,15 +46,15 @@ my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \
 
 die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 
-
-# update the unique domains
-my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
+# Update the unique domains
+my $queryStr = "INSERT IGNORE INTO `unique_domain` (url) select DISTINCT(baseurl) as url FROM `url_to_fetch`
+    WHERE `fetch_failed` = 0 AND `last_fetched` IS NOT NULL";
 sayLog($queryStr) if $DEBUG;
 my $query = $dbh->prepare($queryStr);
 $query->execute();
 
-# now validate the unique ones
-$queryStr = "SELECT `id`, `url` FROM unique_domain";
+# Now validate the unique ones
+$queryStr = "SELECT `id`, `url` FROM `unique_domain`";
 sayLog($queryStr) if $DEBUG;
 $query = $dbh->prepare($queryStr);
 $query->execute();
@@ -80,7 +80,7 @@ while(my @row = $query->fetchrow_array) {
 }
 sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
 
-$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
+$queryStr = "DELETE FROM `unique_domain` WHERE `id` = ?";
 sayLog($queryStr) if $DEBUG;
 $query = $dbh->prepare($queryStr);
 foreach my $invalidId (@invalidUrls) {
@@ -93,7 +93,7 @@ sayGreen "Invalid unique_domain removed: ".scalar @invalidUrls;
 
 # remove urls from fetch since we have enough already
 $queryStr = "SELECT count(baseurl) AS amount, baseurl FROM `url_to_fetch`
-    WHERE last_fetched <> 0
+    WHERE `last_fetched` <> 0
     GROUP BY baseurl
     HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
 sayLog($queryStr) if $DEBUG;
@@ -130,9 +130,6 @@ $query = $dbh->prepare($queryStr);
 $query->execute();
 sayYellow "Remove invalid urls done";
 
-$queryStr = "INSERT INTO `stats` SET `action` = 'cleanup', `value` = NOW()
-    ON DUPLICATE KEY UPDATE `value` = NOW()";
-$query = $dbh->prepare($queryStr);
-$query->execute();
+addToStats($dbh, "cleanup");
 
 sayGreen "Cleanup complete";
diff --git a/crawler/config.default.txt b/crawler/config.default.txt
index 4f5cf27..06916a2 100644
--- a/crawler/config.default.txt
+++ b/crawler/config.default.txt
@@ -1,17 +1,24 @@
+# Database settings
 DB_HOST=localhost
 DB_PORT=3306
 DB_NAME=aranea
 DB_USER=user
 DB_PASS=test
 
+# Settings for the http call
 UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0"
 UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
 UA_LANG="en-US"
 UA_CACHE="no-cache"
 UA_TIMEOUT=5
 
+# Setting for fetch.pl
 FETCH_URLS_PER_RUN=5000
-FETCH_URLS_PER_PACKAGE=30
-PARSE_FILES_PER_PACKAGE=50
+FETCH_URLS_PER_PACKAGE=100
+FETCH_MAX_BYTES_PER_PAGE=5000000
+
+# Settings for parse.pl
+PARSE_URLS_PER_PACKAGE=500
+
+# Settings for cleanup
 CLEANUP_URLS_AMOUNT_ABOVE=40
-MAX_BYTES_PER_PAGE=5000000
diff --git a/crawler/fetch.pl b/crawler/fetch.pl
index 583d38b..66cc1ab 100644
--- a/crawler/fetch.pl
+++ b/crawler/fetch.pl
@@ -37,7 +37,7 @@ my $config = ConfigReader::Simple->new("config.txt");
 die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
 
-## DB connection
+# DB connection
 my %dbAttr = (
     PrintError=>0,# turn off error reporting via warn()
     RaiseError=>1, # turn on error reporting via die()
@@ -46,10 +46,10 @@ my %dbAttr = (
 );
 my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
 my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+die "Failed to connect to MySQL database:DBI->errstr()" unless($dbh);
 
-
-## fetch the urls to fetch from the table
+# Fetch the urls to fetch from the table
 my %urlsToFetch;
 my $query = $dbh->prepare("SELECT `id`, `url` FROM `url_to_fetch`
@@ -62,11 +62,11 @@ while(my @row = $query->fetchrow_array) {
     $urlsToFetch{$row[0]} = $row[1];
 }
 
-# successful fetches
+# Successful and failed fetches
 my @urlsFetched;
 my @urlsFailed;
 
-# config the user agent for the request
+# Config the user agent for the request
 my $request_headers = [
     'User-Agent' => $config->get("UA_AGENT"),
     'Accept' => $config->get("UA_ACCEPT"),
@@ -76,9 +76,9 @@ my $request_headers = [
 ];
 my $ua = LWP::UserAgent->new();
 $ua->timeout($config->get("UA_TIMEOUT"));
-$ua->max_size($config->get("MAX_BYTES_PER_PAGE"));
+$ua->max_size($config->get("FETCH_MAX_BYTES_PER_PAGE"));
 
-## now loop over them and store the results
+## Now loop over them and store the results
 my $counter = 0;
 my $allFetched = 0;
 my $allFailed = 0;
@@ -101,7 +101,6 @@ while ( my ($id, $url) = each %urlsToFetch ) {
         next;
     }
     open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
-    print $fh $url."\n"; # to know where it comes from
     print $fh $res->decoded_content();
     close($fh);
     push(@urlsFetched, $id);
@@ -117,6 +116,8 @@ while ( my ($id, $url) = each %urlsToFetch ) {
     if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
         updateFetched($dbh, @urlsFetched);
         updateFailed($dbh, @urlsFailed);
+        $dbh->commit();
+
         sleep(rand(7));
 
         $counter = 0;
@@ -128,6 +129,7 @@ while ( my ($id, $url) = each %urlsToFetch ) {
 }
 updateFetched($dbh, @urlsFetched);
 updateFailed($dbh, @urlsFailed);
+$dbh->commit();
 
 # some stats stuff
 addToStats($dbh, 'fetch');
@@ -153,7 +155,6 @@ sub updateFetched {
         $query->bind_param(1,$idToUpdate);
         $query->execute();
     }
-    $dbh->commit();
 
     sayGreen "Update fetch timestamps done";
 }
@@ -168,6 +169,5 @@ sub updateFailed {
         $query->bind_param(1,$idToUpdate);
         $query->execute();
     }
-    $dbh->commit();
sayGreen "Update fetch failed done"; } diff --git a/crawler/parse-results.pl b/crawler/parse-results.pl index 9583ee9..780a7c6 100644 --- a/crawler/parse-results.pl +++ b/crawler/parse-results.pl @@ -23,7 +23,7 @@ use Data::Dumper; use Term::ANSIColor qw(:constants); use lib './lib'; -use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); +use Aranea::Common qw(sayLog sayYellow sayGreen sayRed addToStats); use open qw( :std :encoding(UTF-8) ); use DBI; @@ -38,11 +38,11 @@ my $DEBUG = 0; my $config = ConfigReader::Simple->new("config.txt"); die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config; -## DB connection +# DB connection my %dbAttr = ( - PrintError=>0,# turn off error reporting via warn() - RaiseError=>1, # turn on error reporting via die() - AutoCommit=>0, # manually use transactions + PrintError=>0,# Turn off error reporting via warn() + RaiseError=>1, # Turn on error reporting via die() + AutoCommit=>0, # Manually use transactions mysql_enable_utf8mb4 => 1 ); my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT"); @@ -50,18 +50,18 @@ my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \ die "failed to connect to MySQL database:DBI->errstr()" unless($dbh); -## get the fetched files +# Get the fetched files my @results = glob("storage/*.result"); die "Nothing to parse. No files found." unless(@results); -## build clean ids for query +# Build clean ids for query my @queryIds = @results; foreach (@queryIds) { $_ =~ s/.result//g; $_ =~ s|storage/||g; } -# get the baseurls +# Get the baseurls to create absolute links to insert while parsing the file my %baseUrls; my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")"; sayLog($queryStr) if $DEBUG; @@ -71,8 +71,7 @@ while(my @row = $query->fetchrow_array) { $baseUrls{$row[0]} = $row[1]; } - -# get the string to ignore +# Get the string to ignore my @urlStringsToIgnore; $queryStr = "SELECT `searchfor` FROM `url_to_ignore`"; sayLog($queryStr) if $DEBUG; @@ -82,58 +81,48 @@ while(my @row = $query->fetchrow) { push(@urlStringsToIgnore, $row[0]) } - -## prepare linkExtor +# Prepare linkExtor and its callback. +# The callback extracts only a tags. my @links = (); -my @workingLinks = (); sub leCallback { my($tag, %attr) = @_; return if $tag ne 'a'; # we only look closer at - push(@workingLinks, values %attr); + # do some cleanup first to avoid empty or urls which point to itself + return if $attr{"href"} eq ""; + return if rindex($attr{"href"}, "#", 0) != -1; # does not begin with # + return if $attr{"href"} eq "/"; + push(@links, $attr{'href'}); } my $le = HTML::LinkExtor->new(\&leCallback); -## now parse each file and get the links -my $counter = 0; +# Now parse each file and get the links from it. 
 foreach my $resultFile (@results) {
     sayYellow "Parsing file: $resultFile";
-
+    @links = ();
     my $fileId = basename($resultFile,".result");
     if (exists $baseUrls{$fileId}) {
         sayYellow "Baseurl: $baseUrls{$fileId}";
+        my $origin = $baseUrls{$fileId};
+
         $le->parse_file($resultFile);
-        @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
-        push(@links,@workingLinks);
+
+        # Create absolute links with the help of the baseurl if the url is not already absolute
+        @links = map { $_ = url($_, $origin)->abs->as_string; } @links;
+
+        @links = cleanLinks(\@links, \@urlStringsToIgnore);
+        insertIntoDb($dbh, \@links, $origin);
         unlink($resultFile);
-        sayGreen "Parsing done: ".scalar @workingLinks;
+        sayGreen "Parsing done: ".scalar @links;
     }
     else {
         sayRed "No entry found for file $resultFile";
     }
-
-    if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
-
-        @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-        insertIntoDb($dbh, \@links);
-
-        $counter = 0;
-        @links = ();
-    }
-
-    @workingLinks = ();
-    $counter++;
 }
 
-@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-insertIntoDb($dbh, \@links);
-
-$queryStr = "INSERT INTO `stats` SET `action` = 'parse', `value` = NOW()
-    ON DUPLICATE KEY UPDATE `value` = NOW()";
-$query = $dbh->prepare($queryStr);
-$query->execute();
+addToStats($dbh, 'parse');
 
 $dbh->commit();
 $dbh->disconnect();
@@ -142,12 +131,12 @@ sayGreen "Parse complete";
 
 ## cleanup the found links
 sub cleanLinks {
-    my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
+    my ($linkArray, $urlStringsToIgnore) = @_;
     my @linkArray = @{ $linkArray };
-    my @urlStringsToIgnore = @{ $urlStringsToIgnore };
+    my @urlsToIgnore = @{ $urlStringsToIgnore };
 
     sayYellow "Clean found links: ".scalar @linkArray;
-    foreach my $toSearch (@urlStringsToIgnore) {
+    foreach my $toSearch (@urlsToIgnore) {
         sayYellow "Clean links from: ".$toSearch;
         @linkArray = grep {!/$toSearch/i} @linkArray;
     }
@@ -159,7 +148,7 @@ sub cleanLinks {
 
 ## update the DB with the new found links
 sub insertIntoDb {
-    my ($dbh, $links) = @_;
+    my ($dbh, $links, $origin) = @_;
     my @links = @{ $links };
 
     sayYellow "Insert links into DB: ".scalar @links;
@@ -170,6 +159,16 @@
         `created` = NOW()";
     sayLog $queryStr if $DEBUG;
     $query = $dbh->prepare($queryStr);
+
+    my $queryOriginStr = "INSERT INTO `url_origin` SET
+        `origin` = ?,
+        `target` = ?,
+        `created` = NOW(),
+        `amount` = 1
+        ON DUPLICATE KEY UPDATE `amount` = `amount`+1";
+    sayLog $queryOriginStr if $DEBUG;
+    my $queryOrigin = $dbh->prepare($queryOriginStr);
+
     my $md5 = Digest::MD5->new;
     my $counter = 0;
     my $allLinks = 0;
@@ -193,38 +192,27 @@ sub insertIntoDb {
         $md5->add($link);
         my $digest = $md5->hexdigest;
-        $query->execute($digest, $link, $url->scheme."://".$url->host);
+        my $baseurl = $url->scheme."://".$url->host;
+        $query->execute($digest, $link, $baseurl);
         $md5->reset;
 
+        # update relation
+        $queryOrigin->execute($origin, $baseurl) if($origin ne $baseurl);
+
        $counter++;
        $allLinks++;
 
-        if($counter >= 500) {
+        if($counter >= $config->get("PARSE_URLS_PER_PACKAGE")) {
            $counter = 0;
-            sayYellow "Commit counter of 500 reached. Commiting";
+            sayYellow "Commit counter of PARSE_URLS_PER_PACKAGE reached. Commiting";
Commiting"; $dbh->commit(); } - - #sayLog $digest if ($DEBUG); - #sayLog $url->scheme if ($DEBUG); - #sayLog $url->host if ($DEBUG); - #sayLog $query->{Statement} if ($DEBUG); - #sayLog Dumper($query->{ParamValues}) if ($DEBUG); - - #sayLog "Inserted: $link" if($DEBUG); } - sayYellow "Final commit"; # stats stuff - $queryStr = "INSERT INTO `stats` SET `action` = 'parsesuccess', `value` = '$allLinks' - ON DUPLICATE KEY UPDATE `value` = '$allLinks'"; - $query = $dbh->prepare($queryStr); - $query->execute(); - - $queryStr = "INSERT INTO `stats` SET `action` = 'parsefailed', `value` = '$allFailedLinks' - ON DUPLICATE KEY UPDATE `value` = '$allFailedLinks'"; - $query = $dbh->prepare($queryStr); - $query->execute(); + addToStats($dbh, 'parsesuccess', $allLinks, $allLinks); + addToStats($dbh, 'parsefailed', $allFailedLinks, $allFailedLinks); + sayYellow "Final commit"; $dbh->commit(); }