From: Banana Date: Sun, 16 Jan 2022 09:23:23 +0000 (+0100) Subject: cleanup of the code and some paperwork X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=17aef3b5ab59e592ee756006862f6a6cc9ed7846;p=aranea.git cleanup of the code and some paperwork --- diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..f3df3d7 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,2 @@ +aranea - 0.1 ++ initial release \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..67be0ac --- /dev/null +++ b/LICENSE @@ -0,0 +1,380 @@ +COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 + +1. Definitions. + +1.1. "Contributor" means each individual or entity that +creates or contributes to the creation of Modifications. + +1.2. "Contributor Version" means the combination of the +Original Software, prior Modifications used by a +Contributor (if any), and the Modifications made by that +particular Contributor. + +1.3. "Covered Software" means (a) the Original Software, or +(b) Modifications, or (c) the combination of files +containing Original Software with files containing +Modifications, in each case including portions thereof. + +1.4. "Executable" means the Covered Software in any form +other than Source Code. + +1.5. "Initial Developer" means the individual or entity +that first makes Original Software available under this +License. + +1.6. "Larger Work" means a work which combines Covered +Software or portions thereof with code not governed by the +terms of this License. + +1.7. "License" means this document. + +1.8. "Licensable" means having the right to grant, to the +maximum extent possible, whether at the time of the initial +grant or subsequently acquired, any and all of the rights +conveyed herein. + +1.9. "Modifications" means the Source Code and Executable +form of any of the following: + +A. 
Any file that results from an addition to, +deletion from or modification of the contents of a +file containing Original Software or previous +Modifications; + +B. Any new file that contains any part of the +Original Software or previous Modification; or + +C. Any new file that is contributed or otherwise made +available under the terms of this License. + +1.10. "Original Software" means the Source Code and +Executable form of computer software code that is +originally released under this License. + +1.11. "Patent Claims" means any patent claim(s), now owned +or hereafter acquired, including without limitation, +method, process, and apparatus claims, in any patent +Licensable by grantor. + +1.12. "Source Code" means (a) the common form of computer +software code in which modifications are made and (b) +associated documentation included in or with such code. + +1.13. "You" (or "Your") means an individual or a legal +entity exercising rights under, and complying with all of +the terms of, this License. For legal entities, "You" +includes any entity which controls, is controlled by, or is +under common control with You. For purposes of this +definition, "control" means (a) the power, direct or +indirect, to cause the direction or management of such +entity, whether by contract or otherwise, or (b) ownership +of more than fifty percent (50%) of the outstanding shares +or beneficial ownership of such entity. + +2. License Grants. + +2.1. The Initial Developer Grant. 
+ +Conditioned upon Your compliance with Section 3.1 below and +subject to third party intellectual property claims, the +Initial Developer hereby grants You a world-wide, +royalty-free, non-exclusive license: + +(a) under intellectual property rights (other than +patent or trademark) Licensable by Initial Developer, +to use, reproduce, modify, display, perform, +sublicense and distribute the Original Software (or +portions thereof), with or without Modifications, +and/or as part of a Larger Work; and + +(b) under Patent Claims infringed by the making, +using or selling of Original Software, to make, have +made, use, practice, sell, and offer for sale, and/or +otherwise dispose of the Original Software (or +portions thereof). + +(c) The licenses granted in Sections 2.1(a) and (b) +are effective on the date Initial Developer first +distributes or otherwise makes the Original Software +available to a third party under the terms of this +License. + +(d) Notwithstanding Section 2.1(b) above, no patent +license is granted: (1) for code that You delete from +the Original Software, or (2) for infringements +caused by: (i) the modification of the Original +Software, or (ii) the combination of the Original +Software with other software or devices. + +2.2. Contributor Grant. 
+ +Conditioned upon Your compliance with Section 3.1 below and +subject to third party intellectual property claims, each +Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than +patent or trademark) Licensable by Contributor to +use, reproduce, modify, display, perform, sublicense +and distribute the Modifications created by such +Contributor (or portions thereof), either on an +unmodified basis, with other Modifications, as +Covered Software and/or as part of a Larger Work; and + +(b) under Patent Claims infringed by the making, +using, or selling of Modifications made by that +Contributor either alone and/or in combination with +its Contributor Version (or portions of such +combination), to make, use, sell, offer for sale, +have made, and/or otherwise dispose of: (1) +Modifications made by that Contributor (or portions +thereof); and (2) the combination of Modifications +made by that Contributor with its Contributor Version +(or portions of such combination). + +(c) The licenses granted in Sections 2.2(a) and +2.2(b) are effective on the date Contributor first +distributes or otherwise makes the Modifications +available to a third party. + +(d) Notwithstanding Section 2.2(b) above, no patent +license is granted: (1) for any code that Contributor +has deleted from the Contributor Version; (2) for +infringements caused by: (i) third party +modifications of Contributor Version, or (ii) the +combination of Modifications made by that Contributor +with other software (except as part of the +Contributor Version) or other devices; or (3) under +Patent Claims infringed by Covered Software in the +absence of Modifications made by that Contributor. + +3. Distribution Obligations. + +3.1. Availability of Source Code. 
+ +Any Covered Software that You distribute or otherwise make +available in Executable form must also be made available in +Source Code form and that Source Code form must be +distributed only under the terms of this License. You must +include a copy of this License with every copy of the +Source Code form of the Covered Software You distribute or +otherwise make available. You must inform recipients of any +such Covered Software in Executable form as to how they can +obtain such Covered Software in Source Code form in a +reasonable manner on or through a medium customarily used +for software exchange. + +3.2. Modifications. + +The Modifications that You create or to which You +contribute are governed by the terms of this License. You +represent that You believe Your Modifications are Your +original creation(s) and/or You have sufficient rights to +grant the rights conveyed by this License. + +3.3. Required Notices. + +You must include a notice in each of Your Modifications +that identifies You as the Contributor of the Modification. +You may not remove or alter any copyright, patent or +trademark notices contained within the Covered Software, or +any notices of licensing or any descriptive text giving +attribution to any Contributor or the Initial Developer. + +3.4. Application of Additional Terms. + +You may not offer or impose any terms on any Covered +Software in Source Code form that alters or restricts the +applicable version of this License or the recipients' +rights hereunder. You may choose to offer, and to charge a +fee for, warranty, support, indemnity or liability +obligations to one or more recipients of Covered Software. +However, you may do so only on Your own behalf, and not on +behalf of the Initial Developer or any Contributor. 
You +must make it absolutely clear that any such warranty, +support, indemnity or liability obligation is offered by +You alone, and You hereby agree to indemnify the Initial +Developer and every Contributor for any liability incurred +by the Initial Developer or such Contributor as a result of +warranty, support, indemnity or liability terms You offer. + +3.5. Distribution of Executable Versions. + +You may distribute the Executable form of the Covered +Software under the terms of this License or under the terms +of a license of Your choice, which may contain terms +different from this License, provided that You are in +compliance with the terms of this License and that the +license for the Executable form does not attempt to limit +or alter the recipient's rights in the Source Code form +from the rights set forth in this License. If You +distribute the Covered Software in Executable form under a +different license, You must make it absolutely clear that +any terms which differ from this License are offered by You +alone, not by the Initial Developer or Contributor. You +hereby agree to indemnify the Initial Developer and every +Contributor for any liability incurred by the Initial +Developer or such Contributor as a result of any such terms +You offer. + +3.6. Larger Works. + +You may create a Larger Work by combining Covered Software +with other code not governed by the terms of this License +and distribute the Larger Work as a single product. In such +a case, You must make sure the requirements of this License +are fulfilled for the Covered Software. + +4. Versions of the License. + +4.1. New Versions. + +Sun Microsystems, Inc. is the initial license steward and +may publish revised and/or new versions of this License +from time to time. Each version will be given a +distinguishing version number. Except as provided in +Section 4.3, no one other than the license steward has the +right to modify this License. + +4.2. Effect of New Versions. 
+ +You may always continue to use, distribute or otherwise +make the Covered Software available under the terms of the +version of the License under which You originally received +the Covered Software. If the Initial Developer includes a +notice in the Original Software prohibiting it from being +distributed or otherwise made available under any +subsequent version of the License, You must distribute and +make the Covered Software available under the terms of the +version of the License under which You originally received +the Covered Software. Otherwise, You may also choose to +use, distribute or otherwise make the Covered Software +available under the terms of any subsequent version of the +License published by the license steward. + +4.3. Modified Versions. + +When You are an Initial Developer and You want to create a +new license for Your Original Software, You may create and +use a modified version of this License if You: (a) rename +the license and remove any references to the name of the +license steward (except to note that the license differs +from this License); and (b) otherwise make it clear that +the license contains terms which differ from this License. + +5. DISCLAIMER OF WARRANTY. + +COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" +BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED +SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR +PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY +COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE +INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF +ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF +WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF +ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS +DISCLAIMER. + +6. TERMINATION. + +6.1. 
This License and the rights granted hereunder will +terminate automatically if You fail to comply with terms +herein and fail to cure such breach within 30 days of +becoming aware of the breach. Provisions which, by their +nature, must remain in effect beyond the termination of +this License shall survive. + +6.2. If You assert a patent infringement claim (excluding +declaratory judgment actions) against Initial Developer or +a Contributor (the Initial Developer or Contributor against +whom You assert such claim is referred to as "Participant") +alleging that the Participant Software (meaning the +Contributor Version where the Participant is a Contributor +or the Original Software where the Participant is the +Initial Developer) directly or indirectly infringes any +patent, then any and all rights granted directly or +indirectly to You by such Participant, the Initial +Developer (if the Initial Developer is not the Participant) +and all Contributors under Sections 2.1 and/or 2.2 of this +License shall, upon 60 days notice from Participant +terminate prospectively and automatically at the expiration +of such 60 day notice period, unless if within such 60 day +period You withdraw Your claim with respect to the +Participant Software against such Participant either +unilaterally or pursuant to a written agreement with +Participant. + +6.3. In the event of termination under Sections 6.1 or 6.2 +above, all end user licenses that have been validly granted +by You or any distributor hereunder prior to termination +(excluding licenses granted to You by any distributor) +shall survive termination. + +7. LIMITATION OF LIABILITY. 
+ +UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT +(INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE +INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF +COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE +LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR +CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK +STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER +COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN +INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF +LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL +INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT +APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO +NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR +CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT +APPLY TO YOU. + +8. U.S. GOVERNMENT END USERS. + +The Covered Software is a "commercial item," as that term is +defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial +computer software" (as that term is defined at 48 C.F.R. § +252.227-7014(a)(1)) and "commercial computer software +documentation" as such terms are used in 48 C.F.R. 12.212 (Sept. +1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 +through 227.7202-4 (June 1995), all U.S. Government End Users +acquire Covered Software with only those rights set forth herein. +This U.S. Government Rights clause is in lieu of, and supersedes, +any other FAR, DFAR, or other clause or provision that addresses +Government rights in computer software under this License. + +9. MISCELLANEOUS. + +This License represents the complete agreement concerning subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the +extent necessary to make it enforceable.
This License shall be +governed by the law of the jurisdiction specified in a notice +contained within the Original Software (except to the extent +applicable law, if any, provides otherwise), excluding such +jurisdiction's conflict-of-law provisions. Any litigation +relating to this License shall be subject to the jurisdiction of +the courts located in the jurisdiction and venue specified in a +notice contained within the Original Software, with the losing +party responsible for costs, including, without limitation, court +costs and reasonable attorneys' fees and expenses. The +application of the United Nations Convention on Contracts for the +International Sale of Goods is expressly excluded. Any law or +regulation which provides that the language of a contract shall +be construed against the drafter shall not apply to this License. +You agree that You alone are responsible for compliance with the +United States export administration regulations (and the export +control laws and regulation of any other countries) when You use, +distribute or otherwise make available any Covered Software. + +10. RESPONSIBILITY FOR CLAIMS. + +As between Initial Developer and the Contributors, each party is +responsible for claims and damages arising, directly or +indirectly, out of its utilization of rights under this License +and You agree to work with Initial Developer and Contributors to +distribute such responsibility on an equitable basis. Nothing +herein is intended or shall be deemed to constitute any admission +of liability. \ No newline at end of file diff --git a/README b/README index e69de29..2f628fe 100644 --- a/README +++ b/README @@ -0,0 +1,15 @@ +A small web crawler named aranea (Latin for spider). +The aim is to gather unique domains to show what is out there. + +It starts with a given set of URL(s) and parses them for more +URLs. Stores them and fetches them too.
+-> fetch.pl + +Each URL result (Stored result from the call) will be parsed +for other URLs to follow. +-> parse.pl + +After a run, cleanup will gather all the unique domains into +a table. Removes URLs from the fetch table which are already +enough. +-> cleanup.pl diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..d782af1 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +aranea - 0.1 \ No newline at end of file diff --git a/cleanup.pl b/cleanup.pl index 9bbbce3..e28880e 100644 --- a/cleanup.pl +++ b/cleanup.pl @@ -1,10 +1,20 @@ #!/usr/bin/perl -w + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE +# +# You should have received a copy of the +# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 +# along with this program. If not, see http://www.sun.com/cddl/cddl.html +# +# 2022 https://www.bananas-playground.net + use 5.20.0; use strict; use warnings; use utf8; -use Term::ANSIColor qw(:constants); use Data::Dumper; +use Term::ANSIColor qw(:constants); use lib './lib'; use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); diff --git a/config.txt b/config.txt index df88968..5e5cccd 100644 --- a/config.txt +++ b/config.txt @@ -7,4 +7,7 @@ DB_PASS=test UA_AGENT="Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0" UA_ACCEPT="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" UA_LANG="en-US" -UA_CACHE="no-cache" \ No newline at end of file +UA_CACHE="no-cache" + +FETCH_URLS_PER_PACKAGE=30 +PARSE_FILES_PER_PACKAGE=50 \ No newline at end of file diff --git a/fetch.pl b/fetch.pl index 3c70501..a94c4ad 100644 --- a/fetch.pl +++ b/fetch.pl @@ -1,4 +1,15 @@ #!/usr/bin/perl -w + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE +# +# You should have received a copy of the +# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
(CDDL) Version 1.0 +# along with this program. If not, see http://www.sun.com/cddl/cddl.html +# +# 2022 https://www.bananas-playground.net + + use 5.20.0; use strict; use warnings; @@ -81,7 +92,7 @@ while ( my ($id, $url) = each %urlsToFetch ) { push(@urlsFailed, $id); } - if($counter >= 10) { + if($counter >= $config->get("FETCH_URLS_PER_PACKAGE")) { updateFetched($dbh, @urlsFetched); updateFailed($dbh, @urlsFailed); sleep(rand(7)); diff --git a/parse-results.pl b/parse-results.pl index 9d081fd..8325218 100644 --- a/parse-results.pl +++ b/parse-results.pl @@ -1,10 +1,20 @@ #!/usr/bin/perl -w + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE +# +# You should have received a copy of the +# COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 +# along with this program. If not, see http://www.sun.com/cddl/cddl.html +# +# 2022 https://www.bananas-playground.net + use 5.20.0; use strict; use warnings; use utf8; -use Term::ANSIColor qw(:constants); use Data::Dumper; +use Term::ANSIColor qw(:constants); use lib './lib'; use Aranea::Common qw(sayLog sayYellow sayGreen sayRed); @@ -98,7 +108,7 @@ foreach my $resultFile (@results) { sayRed "No entry found for file $resultFile"; } - if($counter >= 50) { + if($counter >= $config->get("PARSE_FILES_PER_PACKAGE")) { @links = cleanLinks($dbh, \@links, \@urlStringsToIgnore); insertIntoDb($dbh, \@links); @@ -152,13 +162,15 @@ sub insertIntoDb { my $md5 = Digest::MD5->new; foreach my $link (@links) { + sayLog $link if ($DEBUG); + if(!is_uri($link)) { sayYellow "Ignore URL it is invalid: $link"; next; } my $url = url($link); - if(!defined($url->scheme) || index($url->scheme,"http") == -1) { + if(!defined($url->scheme) || ($url->scheme ne "http" && $url->scheme ne "https")) { sayYellow "Ignore URL because of scheme: $link"; next; } @@ -168,14 +180,13 @@ sub insertIntoDb { $query->execute($digest, $link,
$url->scheme."://".$url->host); $md5->reset; - sayLog $link if ($DEBUG); - sayLog $digest if ($DEBUG); - sayLog $url->scheme if ($DEBUG); - sayLog $url->host if ($DEBUG); - sayLog $query->{Statement} if ($DEBUG); - sayLog Dumper($query->{ParamValues}) if ($DEBUG); + #sayLog $digest if ($DEBUG); + #sayLog $url->scheme if ($DEBUG); + #sayLog $url->host if ($DEBUG); + #sayLog $query->{Statement} if ($DEBUG); + #sayLog Dumper($query->{ParamValues}) if ($DEBUG); - sayLog "Inserted: $link" if($DEBUG); + #sayLog "Inserted: $link" if($DEBUG); } $query->finish(); }