From: Banana
Date: Sun, 13 Oct 2024 18:29:58 +0000 (+0200)
Subject: revert to lower charset in db because mariadb does not support the same as mysql....
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=f92b67ba2cb7ae47e4d1b74ba3d98ec0ada932bb;p=aranea.git

revert to lower charset in db because mariadb does not support the same as mysql....

Signed-off-by: Banana
---

Reviewer notes (free text before the first "diff --git" line is not part of
the patch):

* What changes: the collation utf8mb4_0900_ai_ci is replaced by
  utf8mb4_unicode_520_ci on every table and text column in setup.sql and in
  the connection collation set by webroot/index.php. The character set
  itself stays utf8mb4.
* setup.sql now issues DROP TABLE IF EXISTS before each CREATE TABLE, so
  re-running the script against an existing database discards the data in
  those tables.
* A new table `url_origin` is added with a unique key over
  (`origin`, `target`).
* FETCH_MAX_BYTES_PER_PAGE is raised from 5000000 to 10000000.
* NOTE(review): this copy of the patch had lost its line breaks. Line
  structure was restored so that every hunk matches the old/new line counts
  in its @@ header; the position of blank context lines and the two-space
  column indentation are reconstructed, not preserved - verify against the
  repository before applying.

diff --git a/crawler/config.default.txt b/crawler/config.default.txt
index 06916a2..3ccb66a 100644
--- a/crawler/config.default.txt
+++ b/crawler/config.default.txt
@@ -15,7 +15,7 @@ UA_TIMEOUT=5
 # Setting for fetch.pl
 FETCH_URLS_PER_RUN=5000
 FETCH_URLS_PER_PACKAGE=100
-FETCH_MAX_BYTES_PER_PAGE=5000000
+FETCH_MAX_BYTES_PER_PAGE=10000000
 
 # Settings for parse.pl
 PARSE_URLS_PER_PACKAGE=500
diff --git a/crawler/documentation/setup.sql b/crawler/documentation/setup.sql
index 2340b74..cf27c01 100644
--- a/crawler/documentation/setup.sql
+++ b/crawler/documentation/setup.sql
@@ -12,10 +12,11 @@ SET time_zone = "+00:00";
 -- Table structure for table `stats`
 --
 
+DROP TABLE IF EXISTS `stats`;
 CREATE TABLE `stats` (
-  `action` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
-  `value` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+  `action` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
+  `value` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci;
 
 -- --------------------------------------------------------
 
@@ -23,11 +24,26 @@ CREATE TABLE `stats` (
 -- Table structure for table `unique_domain`
 --
 
+DROP TABLE IF EXISTS `unique_domain`;
 CREATE TABLE `unique_domain` (
   `id` int NOT NULL,
-  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
   `created` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `url_origin`
+--
+
+DROP TABLE IF EXISTS `url_origin`;
+CREATE TABLE `url_origin` (
+  `origin` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
+  `target` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
+  `created` datetime NOT NULL,
+  `amount` int NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci;
 
 -- --------------------------------------------------------
 
@@ -35,14 +51,15 @@ CREATE TABLE `unique_domain` (
 -- Table structure for table `url_to_fetch`
 --
 
+DROP TABLE IF EXISTS `url_to_fetch`;
 CREATE TABLE `url_to_fetch` (
-  `id` char(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
-  `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
-  `baseurl` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
+  `id` char(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
+  `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
+  `baseurl` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
   `created` datetime NOT NULL,
   `last_fetched` datetime DEFAULT NULL,
   `fetch_failed` tinyint(1) NOT NULL DEFAULT '0'
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci;
 
 -- --------------------------------------------------------
 
@@ -50,11 +67,12 @@ CREATE TABLE `url_to_fetch` (
 -- Table structure for table `url_to_ignore`
 --
 
+DROP TABLE IF EXISTS `url_to_ignore`;
 CREATE TABLE `url_to_ignore` (
   `id` int NOT NULL,
-  `searchfor` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
+  `searchfor` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci NOT NULL,
   `created` datetime NOT NULL
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci;
 
 --
 -- Dumping data for table `url_to_ignore`
@@ -126,6 +144,12 @@ ALTER TABLE `unique_domain`
   ADD PRIMARY KEY (`id`),
   ADD UNIQUE KEY `url` (`url`);
 
+--
+-- Indexes for table `url_origin`
+--
+ALTER TABLE `url_origin`
+  ADD UNIQUE KEY `origin` (`origin`,`target`);
+
 --
 -- Indexes for table `url_to_fetch`
 --
diff --git a/webroot/index.php b/webroot/index.php
index d556a99..e23e677 100644
--- a/webroot/index.php
+++ b/webroot/index.php
@@ -69,7 +69,7 @@ if(isset($_GET['p']) && !empty($_GET['p'])) {
 $DB = new mysqli(DB_HOST, DB_USERNAME,DB_PASSWORD, DB_NAME);
 if ($DB->connect_errno) exit('Can not connect to MySQL Server');
 $DB->set_charset("utf8mb4");
-$DB->query("SET collation_connection = 'utf8mb4_0900_ai_ci'");
+$DB->query("SET collation_connection = 'utf8mb4_unicode_520_ci'");
 $driver = new mysqli_driver();
 $driver->report_mode = MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT;
 