From: Banana Date: Fri, 25 Dec 2020 10:37:03 +0000 (+0100) Subject: some improvements to the imdb dataset to mysql importer X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=9c7cf993e5ad6ed4c49c542f21885e99cda62842;p=dolphin.git some improvements to the imdb dataset to mysql importer --- diff --git a/.gitignore b/.gitignore index ddb9e52..96549dc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .project .settings/ .idea +*.log diff --git a/imdb-dataset-to-mysql/README b/imdb-dataset-to-mysql/README index 5a1db4b..a7d40da 100644 --- a/imdb-dataset-to-mysql/README +++ b/imdb-dataset-to-mysql/README @@ -8,8 +8,10 @@ and copyright/license and verify compliance. https://www.imdb.com/conditions This will import the imdb dataset tsv into your mysql database for further user. -Based on the dataset at feb. 2020 +Code based on the dataset at feb. 2020 There will be no relations or whatsoever. Just plain data into tables. +It also does not create any relation tables yet. Some tables have columns which have +strings separated by comma in them. As of march 2020 Title crew looks strange. The longest line is 16313 (wc -L title.crews.tsv) @@ -18,4 +20,11 @@ varchar. Do not know if this is an error or correct... This is not a good example to be written in PHP. But you can use it. -Don't execute it through a webserver. It is a CLI. \ No newline at end of file +Don't execute it through a webserver. It is a CLI script + +# Usage +Download and place the tsv files from https://www.imdb.com/interfaces/ into the datasets folder. +Decide which one do you need. Alter $filesToImport in import.php to match the files. +Decide if you need a full text search index. Needed if you want to use the api.php. +Adding the index after the initial import is not a good idea. It takes ages!! +Using the index will slow down the import. To use change BUILD_INDEX to true in import.php file diff --git a/imdb-dataset-to-mysql/TODO b/imdb-dataset-to-mysql/TODO new file mode 100644 index 0000000..0258252 --- /dev/null +++ b/imdb-dataset-to-mysql/TODO @@ -0,0 +1 @@ +Complete relation model. Resolve those command separated strings from some tables. diff --git a/imdb-dataset-to-mysql/api.php b/imdb-dataset-to-mysql/api.php new file mode 100644 index 0000000..3b500d7 --- /dev/null +++ b/imdb-dataset-to-mysql/api.php @@ -0,0 +1,96 @@ +connect_errno) exit("Can not connect to MySQL Server\n"); +$DB->set_charset("utf8mb4"); +$DB->query("SET collation_connection = 'utf8mb4_bin'"); +$driver = new mysqli_driver(); +$driver->report_mode = MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT; + +## defaults +$returnData = array(); +$returnStatusCode = 200; + +$_s = ''; +if(isset($_GET['s']) && !empty($_GET['s'])) { + $_s = Helper::validate($_GET['s']) ? trim($_GET['s']) : ''; + $_s = strtolower($_s); +} + +if(!empty($_s)) { + $queryStr = "SELECT `tconst`, `primaryTitle`, `originalTitle`, `startYear`, `runtimeMinutes`, `genres`, + MATCH (`primaryTitle`) + AGAINST ('".$DB->real_escape_string($_s)."' IN NATURAL LANGUAGE MODE) AS score + FROM `title_basics` + WHERE MATCH (`primaryTitle`) + AGAINST ('".$DB->real_escape_string($_s)."' IN NATURAL LANGUAGE MODE) + LIMIT 10"; + try { + $query = $DB->query($queryStr); + if ($query !== false && $query->num_rows > 0) { + while (($result = $query->fetch_assoc()) != false) { + $returnData[$result['tconst']] = $result; + } + } + + } catch (Exception $e) { + error_log("ERROR search query failed: ".$e->getMessage()); + error_log("ERROR search query: ".$queryStr); + } + +} + +header("Cache-Control: no-store, no-cache, must-revalidate, max-age=0"); +header("Cache-Control: post-check=0, pre-check=0", false); +header("Pragma: no-cache"); +header('Content-Type: application/json'); +if($returnStatusCode !== 200) { + http_response_code($returnStatusCode); +} +echo json_encode($returnData); diff --git a/imdb-dataset-to-mysql/import.php b/imdb-dataset-to-mysql/import.php index 2decf6b..8dc74de 100644 --- a/imdb-dataset-to-mysql/import.php +++ b/imdb-dataset-to-mysql/import.php @@ -16,6 +16,7 @@ /** * read and create mysql tables based on the tsv data from imdb * dataset format based of feb. 2020 + * See README for more details */ mb_http_output('UTF-8'); @@ -34,6 +35,10 @@ $filesToImport = array( 'NameBasics' => 'name.basics.tsv' ); +## create mysql fulltext index or not. +## Warning. It takes a very long time! +define('BUILD_INDEX',false); + ## database settings define('DB_HOST','localhost'); define('DB_USER','user'); diff --git a/imdb-dataset-to-mysql/lib/NameBasics.class.php b/imdb-dataset-to-mysql/lib/NameBasics.class.php index 8f70dbe..0834c1b 100644 --- a/imdb-dataset-to-mysql/lib/NameBasics.class.php +++ b/imdb-dataset-to-mysql/lib/NameBasics.class.php @@ -25,13 +25,18 @@ class NameBasics extends TSVImport { $this->_db_table_name = 'name_basics'; $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( `nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`primaryName` varchar(128) COLLATE utf8mb4_bin NOT NULL, +`primaryName` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `birthYear` year NOT NULL, `deathYear` year NOT NULL, -`primaryProfession` text COLLATE utf8mb4_bin NOT NULL, -`knownForTitles` text COLLATE utf8mb4_bin NOT NULL, +`primaryProfession` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`knownForTitles` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, UNIQUE KEY `nconst` (`nconst`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + + if($this->_createFulltext) { + $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`primaryName`)"; + $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`"; + } } /** @@ -57,4 +62,4 @@ UNIQUE KEY `nconst` (`nconst`) return $ret; } -} \ No newline at end of file +} diff --git a/imdb-dataset-to-mysql/lib/TitleAkas.class.php b/imdb-dataset-to-mysql/lib/TitleAkas.class.php index df5a9c3..575b9ff 100644 --- a/imdb-dataset-to-mysql/lib/TitleAkas.class.php +++ b/imdb-dataset-to-mysql/lib/TitleAkas.class.php @@ -32,6 +32,11 @@ class TitleAkas extends TSVImport { `isOriginalTitle` tinyint(1) NOT NULL, UNIQUE KEY `titleId` (`titleId`,`ordering`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + + if($this->_createFulltext) { + $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`title`)"; + $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`"; + } } public function queryValuePart($data) { @@ -55,4 +60,4 @@ UNIQUE KEY `titleId` (`titleId`,`ordering`) return $ret; } -} \ No newline at end of file +} diff --git a/imdb-dataset-to-mysql/lib/TitleBasics.class.php b/imdb-dataset-to-mysql/lib/TitleBasics.class.php index 50293e7..3af631c 100644 --- a/imdb-dataset-to-mysql/lib/TitleBasics.class.php +++ b/imdb-dataset-to-mysql/lib/TitleBasics.class.php @@ -25,17 +25,23 @@ class TitleBasics extends TSVImport { public function setup() { $this->_db_table_name = 'title_basics'; $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( -`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`titleType` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`primaryTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL, -`originalTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL, +`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`titleType` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`primaryTitle` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`originalTitle` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `isAdult` tinyint(1) NOT NULL, -`startYear` char(4) COLLATE utf8mb4_bin NOT NULL, -`endYear` char(4) COLLATE utf8mb4_bin NOT NULL, +`startYear` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`endYear` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `runtimeMinutes` int NOT NULL, -`genres` varchar(255) COLLATE utf8mb4_bin NOT NULL, +`genres` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, UNIQUE KEY `tconst` (`tconst`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + + if($this->_createFulltext) { + $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`primaryTitle`)"; + $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`originalTitle`)"; + $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`"; + } } /** diff --git a/imdb-dataset-to-mysql/lib/TitleEpisode.class.php b/imdb-dataset-to-mysql/lib/TitleEpisode.class.php index de8387a..2987dc4 100644 --- a/imdb-dataset-to-mysql/lib/TitleEpisode.class.php +++ b/imdb-dataset-to-mysql/lib/TitleEpisode.class.php @@ -24,8 +24,8 @@ class TitleEpisode extends TSVImport { public function setup() { $this->_db_table_name = 'title_episode'; $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( -`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`parentTconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`parentTconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `seasonNumber` int NOT NULL, `episodeNumber` int NOT NULL, UNIQUE KEY `tconst` (`tconst`) @@ -53,4 +53,4 @@ UNIQUE KEY `tconst` (`tconst`) return $ret; } -} \ No newline at end of file +} diff --git a/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php b/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php index 0ff2721..9e434f2 100644 --- a/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php +++ b/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php @@ -24,12 +24,12 @@ class TitlePrincipals extends TSVImport { public function setup() { $this->_db_table_name = 'title_principals'; $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( -`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `ordering` int NOT NULL, -`nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`category` varchar(128) COLLATE utf8mb4_bin NOT NULL, -`job` varchar(128) COLLATE utf8mb4_bin NOT NULL, -`characters` varchar(128) COLLATE utf8mb4_bin NOT NULL, +`nconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`category` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`job` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`characters` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, UNIQUE KEY `tconst` (`tconst`,`ordering`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; } @@ -57,4 +57,4 @@ UNIQUE KEY `tconst` (`tconst`,`ordering`) return $ret; } -} \ No newline at end of file +} diff --git a/imdb-dataset-to-mysql/lib/TitleRatings.class.php b/imdb-dataset-to-mysql/lib/TitleRatings.class.php index 25ac38a..3c59de3 100644 --- a/imdb-dataset-to-mysql/lib/TitleRatings.class.php +++ b/imdb-dataset-to-mysql/lib/TitleRatings.class.php @@ -24,8 +24,8 @@ class TitleRatings extends TSVImport { public function setup() { $this->_db_table_name = 'title_ratings'; $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( -`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, -`averageRating` varchar(8) COLLATE utf8mb4_bin NOT NULL, +`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`averageRating` varchar(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, `numVotes` int NOT NULL, UNIQUE KEY `tconst` (`tconst`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; diff --git a/imdb-dataset-to-mysql/lib/helper.class.php b/imdb-dataset-to-mysql/lib/helper.class.php new file mode 100644 index 0000000..564c755 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/helper.class.php @@ -0,0 +1,114 @@ +_db_table_after_import_query)) { + echo "Executing after import stuff\n"; + foreach ($this->_db_table_after_import_query as $k=>$v) { + echo " Running $v\n"; + $this->_DB->query($v); + echo " Done\n"; + } + echo "Done\n"; + } $ret = true; } } else { @@ -135,6 +155,7 @@ abstract class TSVImport { /** * check if needed DB Table is already there. * Otherwise create one. + * * @return bool */ protected function _checkTable() { @@ -161,6 +182,7 @@ abstract class TSVImport { * Creates the needed table für this import * If there are any changes to the table you need to * alter the insert queries too + * * @return bool */ protected function _createTable() { @@ -182,6 +204,7 @@ abstract class TSVImport { /** * Count the file lines. * Used for user info + * * @param $file * @return int */ @@ -191,4 +214,4 @@ abstract class TSVImport { return $file->key(); } -} \ No newline at end of file +}