use Term::ANSIColor qw(:constants);
use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed addToStats);
use DBI;
use ConfigReader::Simple;
my $config = ConfigReader::Simple->new("config.txt");
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-## DB connection
+# DB connection
my %dbAttr = (
PrintError=>0,# turn off error reporting via warn()
RaiseError=>1, # turn on error reporting via die()
die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-# update the unique domains
-my $queryStr = "INSERT IGNORE INTO unique_domain (url) select DISTINCT(baseurl) as url FROM url_to_fetch WHERE fetch_failed = 0";
+# Update the unique domains
+my $queryStr = "INSERT IGNORE INTO `unique_domain` (url) select DISTINCT(baseurl) as url FROM `url_to_fetch`
+ WHERE `fetch_failed` = 0 AND `last_fetched` IS NOT NULL";
sayLog($queryStr) if $DEBUG;
my $query = $dbh->prepare($queryStr);
-# now validate the unique ones
-$queryStr = "SELECT `id`, `url` FROM unique_domain";
+# Now validate the unique ones
+$queryStr = "SELECT `id`, `url` FROM `unique_domain`";
sayLog($queryStr) if $DEBUG;
$query = $dbh->prepare($queryStr);
sayYellow "Invalid unique_domain: ".scalar @invalidUrls;
-$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
+$queryStr = "DELETE FROM `unique_domain` WHERE `id` = ?";
sayLog($queryStr) if $DEBUG;
$query = $dbh->prepare($queryStr);
foreach my $invalidId (@invalidUrls) {
# remove urls from fetch since we have enough already
$queryStr = "SELECT count(baseurl) AS amount, baseurl
FROM `url_to_fetch`
- WHERE last_fetched <> 0
+ WHERE `last_fetched` <> 0
GROUP BY baseurl
HAVING amount > ".$config->get("CLEANUP_URLS_AMOUNT_ABOVE");
sayLog($queryStr) if $DEBUG;
sayYellow "Remove invalid urls done";
-$queryStr = "INSERT INTO `stats` SET `action` = 'cleanup', `value` = NOW()
-$query = $dbh->prepare($queryStr);
+addToStats($dbh, "cleanup");
sayGreen "Cleanup complete";
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-## DB connection
+# DB connection
my %dbAttr = (
PrintError=>0,# turn off error reporting via warn()
RaiseError=>1, # turn on error reporting via die()
my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
my $dbh = DBI->connect($dbDsn,$config->get("DB_USER"),$config->get("DB_PASS"), \%dbAttr);
-die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
+die "Failed to connect to MySQL database: $DBI::errstr" unless($dbh); # method calls are not interpolated in strings, so the package variable carries the connect error
-## fetch the urls to fetch from the table
+# Fetch the urls to fetch from the table
my %urlsToFetch;
my $query = $dbh->prepare("SELECT `id`, `url`
FROM `url_to_fetch`
$urlsToFetch{$row[0]} = $row[1];
-# successful fetches
+# Successful and failed fetches
my @urlsFetched;
my @urlsFailed;
-# config the user agent for the request
+# Config the user agent for the request
my $request_headers = [
'User-Agent' => $config->get("UA_AGENT"),
'Accept' => $config->get("UA_ACCEPT"),
my $ua = LWP::UserAgent->new();
-## now loop over them and store the results
+# Now loop over them and store the results
my $counter = 0;
my $allFetched = 0;
my $allFailed = 0;
open(my $fh, '>:encoding(UTF-8)', "storage/$id.result") or die "Could not open file 'storage/$id.result' $!";
- print $fh $url."\n"; # to know where it comes from
print $fh $res->decoded_content();
push(@urlsFetched, $id);
if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) {
updateFetched($dbh, @urlsFetched);
updateFailed($dbh, @urlsFailed);
+ $dbh->commit();
$counter = 0;
updateFetched($dbh, @urlsFetched);
updateFailed($dbh, @urlsFailed);
# some stats stuff
addToStats($dbh, 'fetch');
- $dbh->commit();
sayGreen "Update fetch timestamps done";
- $dbh->commit();
sayGreen "Update fetch failed done";
use Term::ANSIColor qw(:constants);
use lib './lib';
-use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed addToStats);
use open qw( :std :encoding(UTF-8) );
use DBI;
my $config = ConfigReader::Simple->new("config.txt");
die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
-## DB connection
+# DB connection
my %dbAttr = (
- PrintError=>0,# turn off error reporting via warn()
- RaiseError=>1, # turn on error reporting via die()
- AutoCommit=>0, # manually use transactions
+ PrintError=>0,# Turn off error reporting via warn()
+ RaiseError=>1, # Turn on error reporting via die()
+ AutoCommit=>0, # Manually use transactions
mysql_enable_utf8mb4 => 1
my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
die "failed to connect to MySQL database:DBI->errstr()" unless($dbh);
-## get the fetched files
+# Get the fetched files
my @results = glob("storage/*.result");
die "Nothing to parse. No files found." unless(@results);
-## build clean ids for query
+# Build clean ids for query
my @queryIds = @results;
foreach (@queryIds) {
$_ =~ s/.result//g;
$_ =~ s|storage/||g;
-# get the baseurls
+# Get the baseurls to create absolute links to insert while parsing the file
my %baseUrls;
my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
sayLog($queryStr) if $DEBUG;
$baseUrls{$row[0]} = $row[1];
-# get the string to ignore
+# Get the string to ignore
my @urlStringsToIgnore;
$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
sayLog($queryStr) if $DEBUG;
push(@urlStringsToIgnore, $row[0])
-## prepare linkExtor
+# Prepare linkExtor and its callback.
+# The callback extracts only a tags.
my @links = ();
-my @workingLinks = ();
sub leCallback {
my($tag, %attr) = @_;
return if $tag ne 'a'; # only <a ...> tags are of interest, every other tag is ignored
- push(@workingLinks, values %attr);
+ # Skip hrefs that are empty, fragment-only or just the root path
+ return if $attr{"href"} eq "";
+ return if rindex($attr{"href"}, "#", 0) != -1; # href starts with a hash sign (fragment-only link), skip it
+ return if $attr{"href"} eq "/";
+ push(@links, $attr{'href'});
my $le = HTML::LinkExtor->new(\&leCallback);
-## now parse each file and get the links
-my $counter = 0;
+# Now parse each file and get the links from it.
foreach my $resultFile (@results) {
sayYellow "Parsing file: $resultFile";
+ @links = ();
my $fileId = basename($resultFile,".result");
if (exists $baseUrls{$fileId}) {
sayYellow "Baseurl: $baseUrls{$fileId}";
+ my $origin = $baseUrls{$fileId};
- @workingLinks = map { $_ = url($_, $baseUrls{$fileId})->abs->as_string; } @workingLinks;
- push(@links,@workingLinks);
+ # Create absolute links with the help of the baseurl if the url is not already absolute
+ @links = map { $_ = url($_, $origin)->abs->as_string; } @links;
+ @links = cleanLinks(\@links, \@urlStringsToIgnore);
+ insertIntoDb($dbh, \@links, $origin);
- sayGreen "Parsing done: ".scalar @workingLinks;
+ sayGreen "Parsing done: ".scalar @links;
else {
sayRed "No entry found for file $resultFile";
- if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) {
- @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
- insertIntoDb($dbh, \@links);
- $counter = 0;
- @links = ();
- }
- @workingLinks = ();
- $counter++;
-@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
-insertIntoDb($dbh, \@links);
-$queryStr = "INSERT INTO `stats` SET `action` = 'parse', `value` = NOW()
-$query = $dbh->prepare($queryStr);
+addToStats($dbh, 'parse');
## cleanup the found links
sub cleanLinks {
- my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
+ my ($linkArray, $urlStringsToIgnore) = @_;
my @linkArray = @{ $linkArray };
- my @urlStringsToIgnore = @{ $urlStringsToIgnore };
+ my @urlsToIgnore = @{ $urlStringsToIgnore };
sayYellow "Clean found links: ".scalar @linkArray;
- foreach my $toSearch (@urlStringsToIgnore) {
+ foreach my $toSearch (@urlsToIgnore) {
sayYellow "Clean links from: ".$toSearch;
@linkArray = grep {!/$toSearch/i} @linkArray;
## update the DB with the new found links
sub insertIntoDb {
- my ($dbh, $links) = @_;
+ my ($dbh, $links, $origin) = @_;
my @links = @{ $links };
sayYellow "Insert links into DB: ".scalar @links;
`created` = NOW()";
sayLog $queryStr if $DEBUG;
$query = $dbh->prepare($queryStr);
+ my $queryOriginStr = "INSERT INTO `url_origin` SET
+ `origin` = ?,
+ `target` = ?,
+ `created` = NOW(),
+ `amount` = 1
+ ON DUPLICATE KEY UPDATE `amount` = `amount`+1";
+ sayLog $queryOriginStr if $DEBUG;
+ my $queryOrigin = $dbh->prepare($queryOriginStr);
my $md5 = Digest::MD5->new;
my $counter = 0;
my $allLinks = 0;
my $digest = $md5->hexdigest;
- $query->execute($digest, $link, $url->scheme."://".$url->host);
+ my $baseurl = $url->scheme."://".$url->host;
+ $query->execute($digest, $link, $baseurl);
+ # Record the origin -> target relation, but only when the link leaves the origin
+ $queryOrigin->execute($origin, $baseurl) if($origin ne $baseurl);
- if($counter >= 500) {
+ if($counter >= $config->get("PARSE_URLS_PER_PACKAGE")) {
$counter = 0;
- sayYellow "Commit counter of 500 reached. Commiting";
+ sayYellow "Commit counter of PARSE_URLS_PER_PACKAGE reached. Commiting";
- #sayLog $digest if ($DEBUG);
- #sayLog $url->scheme if ($DEBUG);
- #sayLog $url->host if ($DEBUG);
- #sayLog $query->{Statement} if ($DEBUG);
- #sayLog Dumper($query->{ParamValues}) if ($DEBUG);
- #sayLog "Inserted: $link" if($DEBUG);
- sayYellow "Final commit";
# stats stuff
- $queryStr = "INSERT INTO `stats` SET `action` = 'parsesuccess', `value` = '$allLinks'
- ON DUPLICATE KEY UPDATE `value` = '$allLinks'";
- $query = $dbh->prepare($queryStr);
- $query->execute();
- $queryStr = "INSERT INTO `stats` SET `action` = 'parsefailed', `value` = '$allFailedLinks'
- ON DUPLICATE KEY UPDATE `value` = '$allFailedLinks'";
- $query = $dbh->prepare($queryStr);
- $query->execute();
+ addToStats($dbh, 'parsesuccess', $allLinks, $allLinks);
+ addToStats($dbh, 'parsefailed', $allFailedLinks, $allFailedLinks);
+ sayYellow "Final commit";