From 5e69d05c08dd6ac03db1af1833f22c0201660cc4 Mon Sep 17 00:00:00 2001
From: Banana
Date: Tue, 11 Jan 2022 18:34:23 +0100
Subject: [PATCH 1/1] initial files

---
 README               |   0
 cleanup.pl           | 117 ++++++++++++++++++++++++++++
 config.txt           |  10 +++
 fetch.pl             | 132 +++++++++++++++++++++++++++++++
 lib/Aranea/Common.pm |  38 +++++++++
 parse-results.pl     | 181 +++++++++++++++++++++++++++++++++++++++++++
 setup.sql            | 151 ++++++++++++++++++++++++++++++++++++
 7 files changed, 629 insertions(+)
 create mode 100644 README
 create mode 100644 cleanup.pl
 create mode 100644 config.txt
 create mode 100644 fetch.pl
 create mode 100644 lib/Aranea/Common.pm
 create mode 100644 parse-results.pl
 create mode 100644 setup.sql

diff --git a/README b/README
new file mode 100644
index 0000000..e69de29
diff --git a/cleanup.pl b/cleanup.pl
new file mode 100644
index 0000000..9bbbce3
--- /dev/null
+++ b/cleanup.pl
@@ -0,0 +1,117 @@
+#!/usr/bin/perl -w
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Term::ANSIColor qw(:constants);
+use Data::Dumper;
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use DBI;
+use ConfigReader::Simple;
+use URI::URL;
+use Data::Validate::URI qw(is_uri);
+
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+    PrintError => 0, # turn off error reporting via warn()
+    RaiseError => 1  # turn on error reporting via die()
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn, $config->get("DB_USER"), $config->get("DB_PASS"), \%dbAttr);
+die "Failed to connect to MySQL database: ".DBI->errstr unless($dbh);
+
+
+# update the unique domains
+my $queryStr = "INSERT IGNORE INTO unique_domain (url) SELECT DISTINCT(baseurl) AS url FROM url_to_fetch WHERE fetch_failed = 0";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute();
+
+# now validate the unique ones
+$queryStr = "SELECT `id`, `url` FROM unique_domain";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+my @invalidUrls = ();
+while(my @row = $query->fetchrow_array) {
+    my $id = $row[0];
+    my $link = $row[1];
+    if(!is_uri($link)) {
+        sayYellow "Ignoring invalid URL: $link";
+        push(@invalidUrls, $id);
+        next;
+    }
+
+    my $url = url($link);
+    if(!defined($url->scheme) || index($url->scheme, "http") == -1) {
+        sayYellow "Ignoring URL because of its scheme: $link";
+        push(@invalidUrls, $id);
+        next;
+    }
+}
+$query->finish();
+
+sayYellow "Invalid URLs: ".scalar @invalidUrls;
+$queryStr = "DELETE FROM unique_domain WHERE `id` = ?";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+foreach my $invalidId (@invalidUrls) {
+    $query->execute($invalidId);
+    sayLog "Removed $invalidId from unique_domain" if $DEBUG;
+}
+$query->finish();
+sayGreen "Invalid URLs removed: ".scalar @invalidUrls;
+
+
+# remove urls from fetch since we have enough already
+my @toBeDeletedFromFetchAgain = ();
+$queryStr = "SELECT count(baseurl) AS amount, baseurl
+                FROM `url_to_fetch`
+                WHERE last_fetched IS NOT NULL
+                GROUP BY baseurl
+                HAVING amount > 40";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+    my $baseUrl = $row[1];
+    push(@toBeDeletedFromFetchAgain, $baseUrl);
+}
+$query->finish();
+sayYellow "Removing baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain;
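+# one prepared DELETE statement, executed once per collected baseurl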
+$queryStr = "DELETE FROM url_to_fetch WHERE `baseurl` = ?"; +sayLog($queryStr) if $DEBUG; +$query = $dbh->prepare($queryStr); +foreach my $baseUrl (@toBeDeletedFromFetchAgain) { + $query->execute($baseUrl); + $query->finish(); + sayLog "Removed $baseUrl from url_to_fetch" if $DEBUG; +} +sayGreen "Remove baseurls from url_to_fetch: ".scalar @toBeDeletedFromFetchAgain; + +# remove failed fetches +sayYellow "Remove fetch_failed"; +$queryStr = "DELETE FROM url_to_fetch WHERE fetch_failed = 1"; +$query = $dbh->prepare($queryStr); +$query->execute(); +sayGreen "Remove fetch_failed done"; + +sayYellow "Remove invalid urls which the is_uri check does let pass"; +$queryStr = "DELETE FROM unique_domain WHERE `url` NOT LIKE '%.%'"; +$query = $dbh->prepare($queryStr); +$query->execute(); +$queryStr = "SELECT * FROM `url_to_fetch` WHERE `baseurl` LIKE '% %'"; +$query = $dbh->prepare($queryStr); +$query->execute(); +sayYellow "Remove invalid urls done"; + + +sayGreen "Cleanup complete"; \ No newline at end of file diff --git a/config.txt b/config.txt new file mode 100644 index 0000000..df88968 --- /dev/null +++ b/config.txt @@ -0,0 +1,10 @@ +DB_HOST=localhost +DB_PORT=3306 +DB_NAME=aranea +DB_USER=user +DB_PASS=test + +UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0" +UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" +UA_LANG="en-US" +UA_CACHE="no-cache" \ No newline at end of file diff --git a/fetch.pl b/fetch.pl new file mode 100644 index 0000000..3c70501 --- /dev/null +++ b/fetch.pl @@ -0,0 +1,132 @@ +#!/usr/bin/perl -w +use 5.20.0; +use strict; +use warnings; +use utf8; +use Data::Dumper; +use Term::ANSIColor qw(:constants); + +use lib './lib'; +use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); + +use open qw( :std :encoding(UTF-8) ); +use DBI; +use ConfigReader::Simple; +use LWP::UserAgent; +use HTTP::Request; + + +my $DEBUG = 0; +my $config = ConfigReader::Simple->new("config.txt"); +die "Could not read config! 
+
+## DB connection
+my %dbAttr = (
+    PrintError => 0, # turn off error reporting via warn()
+    RaiseError => 1  # turn on error reporting via die()
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn, $config->get("DB_USER"), $config->get("DB_PASS"), \%dbAttr);
+die "Failed to connect to MySQL database: ".DBI->errstr unless($dbh);
+
+
+## fetch the urls to fetch from the table
+## NULL last_fetched means the url has never been fetched yet
+my %urlsToFetch;
+my $query = $dbh->prepare("SELECT `id`, `url`
+                            FROM `url_to_fetch`
+                            WHERE (`last_fetched` IS NULL OR `last_fetched` < NOW() - INTERVAL 1 WEEK)
+                                AND `fetch_failed` = 0");
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+    $urlsToFetch{$row[0]} = $row[1];
+}
+$query->finish();
+
+# successful and failed fetches of the current batch
+my @urlsFetched;
+my @urlsFailed;
+
+# configure the request headers for the user agent
+my $request_headers = [
+    'User-Agent'      => $config->get("UA_AGENT"),
+    'Accept'          => $config->get("UA_ACCEPT"),
+    'Accept-Language' => $config->get("UA_LANG"),
+    'Accept-Encoding' => HTTP::Message::decodable,
+    'Cache-Control'   => $config->get("UA_CACHE")
+];
+my $ua = LWP::UserAgent->new;
+
+## now loop over them and store the results
+my $counter = 0;
+while ( my ($id, $url) = each %urlsToFetch ) {
+    sayYellow "Fetching: $id $url";
+
+    my $req = HTTP::Request->new(GET => $url, $request_headers);
+    my $res = $ua->request($req);
+    if ($res->is_success) {
+        if(index($res->content_type, "text/html") == -1) {
+            sayYellow "Fetching: $id ignored. Not html";
+            push(@urlsFailed, $id);
+            next;
+        }
+        open(my $fh, '>', "storage/$id.result") or die "Could not open file 'storage/$id.result': $!";
+        print $fh $res->decoded_content();
+        close($fh);
+        push(@urlsFetched, $id);
+        sayGreen "Fetching: $id ok";
+    }
+    else {
+        sayRed "Fetching: $id failed: ".$res->status_line;
+        push(@urlsFailed, $id);
+    }
+
+    # write back to the DB and pause after every batch
+    if($counter >= 10) {
+        updateFetched($dbh, @urlsFetched);
+        updateFailed($dbh, @urlsFailed);
+        sleep(rand(7));
+
+        $counter = 0;
+        @urlsFetched = ();
+        @urlsFailed = ();
+    }
+
+    $counter++;
+}
+updateFetched($dbh, @urlsFetched);
+updateFailed($dbh, @urlsFailed);
+
+$dbh->disconnect();
+sayGreen "Fetch complete";
+
+
+
+## update last_fetched in the table
+sub updateFetched {
+    my ($dbh, @urls) = @_;
+
+    sayYellow "Update fetch timestamps: ".scalar @urls;
+    my $query = $dbh->prepare("UPDATE `url_to_fetch` SET `last_fetched` = NOW() WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch timestamp for: $idToUpdate" if($DEBUG);
+        $query->execute($idToUpdate);
+    }
+    $query->finish();
+    sayGreen "Update fetch timestamps done";
+}
+
+## update fetch_failed in the table
+sub updateFailed {
+    my ($dbh, @urls) = @_;
+
+    sayYellow "Update fetch failed: ".scalar @urls;
+    my $query = $dbh->prepare("UPDATE `url_to_fetch` SET `fetch_failed` = 1 WHERE `id` = ?");
+    foreach my $idToUpdate (@urls) {
+        sayLog "Update fetch failed for: $idToUpdate" if($DEBUG);
+        $query->execute($idToUpdate);
+    }
+    $query->finish();
+    sayGreen "Update fetch failed done";
+}
+
diff --git a/lib/Aranea/Common.pm b/lib/Aranea/Common.pm
new file mode 100644
index 0000000..4d8ec9a
--- /dev/null
+++ b/lib/Aranea/Common.pm
@@ -0,0 +1,38 @@
+package Aranea::Common;
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Term::ANSIColor qw(:constants);
+
+use DateTime;
+use Exporter qw(import);
+
+
+our @EXPORT_OK = qw(sayLog sayYellow sayGreen sayRed);
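+
+# Each helper prefixes its message with an ISO-8601 timestamp, e.g.
+# sayYellow("hello") prints "[2022-01-11T18:34:23] hello" in yellow.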
+
+sub sayLog {
+    my ($string) = @_;
+    my $dt = DateTime->now;
+    say "[".$dt->datetime."] DEBUG: ".$string;
+}
+
+sub sayYellow {
+    my ($string) = @_;
+    my $dt = DateTime->now;
+    say CLEAR,YELLOW, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayGreen {
+    my ($string) = @_;
+    my $dt = DateTime->now;
+    say CLEAR,GREEN, "[".$dt->datetime."] ".$string, RESET;
+}
+
+sub sayRed {
+    my ($string) = @_;
+    my $dt = DateTime->now;
+    say BOLD,RED, "[".$dt->datetime."] ".$string, RESET;
+}
+
+1;
\ No newline at end of file
diff --git a/parse-results.pl b/parse-results.pl
new file mode 100644
index 0000000..9d081fd
--- /dev/null
+++ b/parse-results.pl
@@ -0,0 +1,181 @@
+#!/usr/bin/perl -w
+use 5.20.0;
+use strict;
+use warnings;
+use utf8;
+use Term::ANSIColor qw(:constants);
+use Data::Dumper;
+
+use lib './lib';
+use Aranea::Common qw(sayLog sayYellow sayGreen sayRed);
+
+use open qw( :std :encoding(UTF-8) );
+use DBI;
+use ConfigReader::Simple;
+use HTML::LinkExtor;
+use URI::URL;
+use File::Basename;
+use Digest::MD5 qw(md5_hex);
+use Data::Validate::URI qw(is_uri);
+
+my $DEBUG = 0;
+my $config = ConfigReader::Simple->new("config.txt");
+die "Could not read config! $ConfigReader::Simple::ERROR\n" unless ref $config;
+
+## DB connection
+my %dbAttr = (
+    PrintError => 0, # turn off error reporting via warn()
+    RaiseError => 1  # turn on error reporting via die()
+);
+my $dbDsn = "DBI:mysql:database=".$config->get("DB_NAME").";host=".$config->get("DB_HOST").";port=".$config->get("DB_PORT");
+my $dbh = DBI->connect($dbDsn, $config->get("DB_USER"), $config->get("DB_PASS"), \%dbAttr);
+die "Failed to connect to MySQL database: ".DBI->errstr unless($dbh);
+
+
+## get the fetched files
+my @results = glob("storage/*.result");
+die "Nothing to parse. No files found.\n" unless(@results);
+
+## build clean ids for the query
+my @queryIds = @results;
+foreach (@queryIds) {
+    $_ =~ s/\.result$//;
+    $_ =~ s|storage/||;
+}
+
+# get the baseurls
+my %baseUrls;
+my $queryStr = "SELECT `id`, `baseurl` FROM `url_to_fetch` WHERE `id` IN (".join(', ', ('?') x @queryIds).")";
+sayLog($queryStr) if $DEBUG;
+my $query = $dbh->prepare($queryStr);
+$query->execute(@queryIds);
+while(my @row = $query->fetchrow_array) {
+    $baseUrls{$row[0]} = $row[1];
+}
+$query->finish();
+
+
+# get the strings to ignore
+my @urlStringsToIgnore;
+$queryStr = "SELECT `searchfor` FROM `url_to_ignore`";
+sayLog($queryStr) if $DEBUG;
+$query = $dbh->prepare($queryStr);
+$query->execute();
+while(my @row = $query->fetchrow_array) {
+    push(@urlStringsToIgnore, $row[0]);
+}
+$query->finish();
+
+
+## prepare the link extractor
+my @links = ();
+my @workingLinks = ();
+sub leCallback {
+    my($tag, %attr) = @_;
+    return if $tag ne 'a'; # we only look closer at <a ...> tags
+    push(@workingLinks, values %attr);
+}
+my $le = HTML::LinkExtor->new(\&leCallback);
+
+## now parse each file and get the links
+my $counter = 0;
+foreach my $resultFile (@results) {
+    sayYellow "Parsing file: $resultFile";
+
+    my $fileId = basename($resultFile, ".result");
+
+    if (exists $baseUrls{$fileId}) {
+        sayYellow "Baseurl: $baseUrls{$fileId}";
+
+        $le->parse_file($resultFile);
+        # make every extracted link absolute, relative to the file's baseurl
+        @workingLinks = map { url($_, $baseUrls{$fileId})->abs->as_string } @workingLinks;
+        push(@links, @workingLinks);
+
+        unlink($resultFile);
+        sayGreen "Parsing done: ".scalar @workingLinks;
+    }
+    else {
+        sayRed "No entry found for file $resultFile";
+    }
+
+    if($counter >= 50) {
+        @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+        insertIntoDb($dbh, \@links);
+
+        $counter = 0;
+        @links = ();
+    }
+
+    @workingLinks = ();
+    $counter++;
+}
+
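+# flush the links collected since the last full batch of 50 files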
+@links = cleanLinks($dbh, \@links, \@urlStringsToIgnore);
+insertIntoDb($dbh, \@links);
+
+
+$dbh->disconnect();
+say CLEAR,GREEN, "Parse complete", RESET;
+
+
+## clean up the found links
+sub cleanLinks {
+    my ($dbh, $linkArray, $urlStringsToIgnore) = @_;
+    my @linkArray = @{ $linkArray };
+    my @urlStringsToIgnore = @{ $urlStringsToIgnore };
+
+    sayYellow "Cleaning found links: ".scalar @linkArray;
+    foreach my $toSearch (@urlStringsToIgnore) {
+        sayYellow "Cleaning links from: ".$toSearch;
+        # \Q...\E so dots and colons in the search string match literally
+        @linkArray = grep {!/\Q$toSearch\E/i} @linkArray;
+    }
+    sayGreen "Cleaned found links: ".scalar @linkArray;
+
+    return @linkArray;
+}
+
+
+## update the DB with the newly found links
+sub insertIntoDb {
+    my ($dbh, $links) = @_;
+    my @links = @{ $links };
+
+    sayYellow "Inserting links into DB: ".scalar @links;
+    $queryStr = "INSERT IGNORE INTO `url_to_fetch` SET
+                    `id` = ?,
+                    `url` = ?,
+                    `baseurl` = ?,
+                    `created` = NOW()";
+    sayLog $queryStr if $DEBUG;
+    $query = $dbh->prepare($queryStr);
+    my $md5 = Digest::MD5->new;
+    foreach my $link (@links) {
+
+        if(!is_uri($link)) {
+            sayYellow "Ignoring invalid URL: $link";
+            next;
+        }
+
+        my $url = url($link);
+        if(!defined($url->scheme) || index($url->scheme, "http") == -1) {
+            sayYellow "Ignoring URL because of its scheme: $link";
+            next;
+        }
+
+        $md5->add($link);
+        my $digest = $md5->hexdigest;
+        $query->execute($digest, $link, $url->scheme."://".$url->host);
+        $md5->reset;
+
+        sayLog $link if ($DEBUG);
+        sayLog $digest if ($DEBUG);
+        sayLog $url->scheme if ($DEBUG);
+        sayLog $url->host if ($DEBUG);
+        sayLog $query->{Statement} if ($DEBUG);
+        sayLog Dumper($query->{ParamValues}) if ($DEBUG);
+
+        sayLog "Inserted: $link" if($DEBUG);
+    }
+    $query->finish();
+}
diff --git a/setup.sql b/setup.sql
new file mode 100644
index 0000000..93dd6d8
--- /dev/null
+++ b/setup.sql
@@ -0,0 +1,151 @@
+-- phpMyAdmin SQL Dump
+-- version 5.1.1
+-- https://www.phpmyadmin.net/
+--
+-- Host: 127.0.0.1
+-- Generation Time: Jan 11, 2022 at 05:33 PM
+-- Server version: 8.0.25
+-- PHP Version: 7.4.21
+
+SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
+START TRANSACTION;
+SET time_zone = "+00:00";
+
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+
+--
+-- Database: `aranea`
+--
+CREATE DATABASE IF NOT EXISTS `aranea` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+USE `aranea`;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `unique_domain`
+--
+
+DROP TABLE IF EXISTS `unique_domain`;
+CREATE TABLE `unique_domain` (
+  `id` int NOT NULL,
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_to_fetch`
+--
+
+DROP TABLE IF EXISTS `url_to_fetch`;
+CREATE TABLE `url_to_fetch` (
+  `id` char(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+  `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+  `baseurl` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL,
+  `last_fetched` datetime DEFAULT NULL,
+  `fetch_failed` tinyint(1) NOT NULL DEFAULT '0'
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+-- --------------------------------------------------------
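+
+-- Note: `url_to_fetch`.`id` is the MD5 hex digest of the url (see
+-- parse-results.pl), hence char(32).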
+
+--
+-- Table structure for table `url_to_ignore`
+--
+
+DROP TABLE IF EXISTS `url_to_ignore`;
+CREATE TABLE `url_to_ignore` (
+  `id` int NOT NULL,
+  `searchfor` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+  `created` datetime NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
+
+--
+-- Dumping data for table `url_to_ignore`
+--
+
+INSERT INTO `url_to_ignore` (`id`, `searchfor`, `created`) VALUES
+(1, 'mailto:', '2022-01-05 10:46:10'),
+(2, 'javascript:', '2022-01-05 10:46:10'),
+(3, 'google.', '2022-01-05 10:46:29'),
+(4, 'amazon.', '2022-01-05 10:46:29'),
+(5, 'youtube.', '2022-01-05 10:46:47'),
+(6, '.onion', '2022-01-05 17:21:45'),
+(7, 'instagram.', '2022-01-05 20:15:21'),
+(8, 'twitter.', '2022-01-05 20:16:31'),
+(9, 'facebook.', '2022-01-05 20:16:31'),
+(10, 'skype:', '2022-01-05 21:29:53'),
+(11, 'xmpp:', '2022-01-05 21:30:22'),
+(12, 'tel:', '2022-01-05 21:30:50'),
+(13, 'fax:', '2022-01-05 21:30:50'),
+(14, 'whatsapp:', '2022-01-05 21:31:24'),
+(15, 'intent:', '2022-01-05 21:31:24'),
+(16, 'ftp:', '2022-01-05 21:33:34'),
+(17, 'youtu.', '2022-01-05 21:50:26'),
+(18, 'pinterest.', '2022-01-05 21:51:31'),
+(19, 'microsoft.', '2022-01-05 21:52:30'),
+(20, 'apple.', '2022-01-05 21:52:30'),
+(21, 'xing.', '2022-01-05 22:03:07'),
+(22, 'linked.', '2022-01-05 22:03:07'),
+(26, 't.co', '2022-01-05 22:05:07'),
+(27, 'tinyurl.', '2022-01-05 22:07:03'),
+(28, 'bitly.', '2022-01-05 22:07:03'),
+(29, 'bit.ly', '2022-01-05 22:07:23'),
+(30, 'wikipedia.', '2022-01-06 09:58:46'),
+(31, 'gstatic.', '2022-01-06 09:59:47'),
+(32, 'wikimedia.', '2022-01-06 10:00:20'),
+(33, 'goo.', '2022-01-06 10:02:11'),
+(34, 'cdn.', '2022-01-06 10:02:59'),
+(35, 'flickr.', '2022-01-06 10:05:46'),
+(36, '.mp3', '2022-01-07 13:11:49');
+
+--
+-- Indexes for dumped tables
+--
+
+--
+-- Indexes for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+  ADD PRIMARY KEY (`id`),
+  ADD UNIQUE KEY `url` (`url`);
+
+--
+-- Indexes for table `url_to_fetch`
+--
+ALTER TABLE `url_to_fetch`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `baseurl` (`baseurl`);
+
+--
+-- Indexes for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+  ADD PRIMARY KEY (`id`),
+  ADD UNIQUE KEY `searchfor` (`searchfor`);
+
+--
+-- AUTO_INCREMENT for dumped tables
+--
+
+--
+-- AUTO_INCREMENT for table `unique_domain`
+--
+ALTER TABLE `unique_domain`
+  MODIFY `id` int NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `url_to_ignore`
+--
+ALTER TABLE `url_to_ignore`
+  MODIFY `id` int NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=38;
+COMMIT;
+
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
-- 
2.39.5