From ac67136d7cda4e67321a22a08b2b877eee737ddb Mon Sep 17 00:00:00 2001 From: Banana Date: Sun, 22 Mar 2020 14:30:26 +0100 Subject: [PATCH] import imdb datasets into a mysql database and tables with PHP cli --- .gitignore | 1 + imdb-dataset-to-mysql/README | 20 ++ imdb-dataset-to-mysql/datasets/.gitignore | 2 + imdb-dataset-to-mysql/import.php | 64 ++++++ .../lib/NameBasics.class.php | 60 ++++++ imdb-dataset-to-mysql/lib/TitleAkas.class.php | 58 ++++++ .../lib/TitleBasics.class.php | 66 ++++++ imdb-dataset-to-mysql/lib/TitleCrew.class.php | 54 +++++ .../lib/TitleEpisode.class.php | 56 +++++ .../lib/TitlePrincipals.class.php | 60 ++++++ .../lib/TitleRatings.class.php | 54 +++++ .../lib/import.abstract.class.php | 194 ++++++++++++++++++ 12 files changed, 689 insertions(+) create mode 100644 imdb-dataset-to-mysql/README create mode 100644 imdb-dataset-to-mysql/datasets/.gitignore create mode 100644 imdb-dataset-to-mysql/import.php create mode 100644 imdb-dataset-to-mysql/lib/NameBasics.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitleAkas.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitleBasics.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitleCrew.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitleEpisode.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitlePrincipals.class.php create mode 100644 imdb-dataset-to-mysql/lib/TitleRatings.class.php create mode 100644 imdb-dataset-to-mysql/lib/import.abstract.class.php diff --git a/.gitignore b/.gitignore index baf6b9d..ddb9e52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .buildpath .project .settings/ +.idea diff --git a/imdb-dataset-to-mysql/README b/imdb-dataset-to-mysql/README new file mode 100644 index 0000000..97227d1 --- /dev/null +++ b/imdb-dataset-to-mysql/README @@ -0,0 +1,20 @@ +https://www.imdb.com/interfaces/ + +Subsets of IMDb data are available for access to customers for personal and non-commercial use. 
+You can hold local copies of this data, and it is subject to our terms and conditions. +Please refer to the Non-Commercial Licensing +https://help.imdb.com/article/imdb/general-information/can-i-use-imdb-data-in-my-software/G5JTRESSHJBBHTGX +and copyright/license and verify compliance. +https://www.imdb.com/conditions + +This will import the imdb dataset tsv into your mysql database for further use. +Based on the dataset as of Feb. 2020 + +As of March 2020 +Title crew looks strange. The longest line is 16313 (wc -L title.crew.tsv) +therefore the column directors and writers are defined as text and not +varchar. Do not know if this is an error or correct... + + +This is not a good example to be written in PHP. But you can use it. +Don't execute it through a webserver. It is a CLI. \ No newline at end of file diff --git a/imdb-dataset-to-mysql/datasets/.gitignore b/imdb-dataset-to-mysql/datasets/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/imdb-dataset-to-mysql/datasets/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/imdb-dataset-to-mysql/import.php b/imdb-dataset-to-mysql/import.php new file mode 100644 index 0000000..2decf6b --- /dev/null +++ b/imdb-dataset-to-mysql/import.php @@ -0,0 +1,64 @@ + 'title.akas.tsv', + 'TitleBasics' => 'title.basics.tsv', + 'TitleCrew' => 'title.crew.tsv', + 'TitleEpisode' => 'title.episode.tsv', + 'TitlePrincipals' => 'title.principals.tsv', + 'TitleRatings' => 'title.ratings.tsv', + 'NameBasics' => 'name.basics.tsv' +); + +## database settings +define('DB_HOST','localhost'); +define('DB_USER','user'); +define('DB_PASSWORD','test'); +define('DB_NAME','imdb'); + +## DB connection +$DB = new mysqli(DB_HOST, DB_USER,DB_PASSWORD, DB_NAME); +if ($DB->connect_errno) exit("Can not connect to MySQL Server\n"); +$DB->set_charset("utf8mb4"); +$DB->query("SET collation_connection = 'utf8mb4_bin'"); +$driver = new mysqli_driver(); +$driver->report_mode = MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT; + 
+require_once 'lib/import.abstract.class.php'; + +foreach($filesToImport as $key=>$file) { + $classFile = $key.'.class.php'; + $file = 'datasets/'.$file; + if(file_exists($file) && is_readable($file) && file_exists('lib/'.$classFile)) { + require_once 'lib/'.$classFile; + $obj = new $key($DB); + $obj->import($file); + } + else { + echo "Required file $file or import class $key not found\n"; + } +} diff --git a/imdb-dataset-to-mysql/lib/NameBasics.class.php b/imdb-dataset-to-mysql/lib/NameBasics.class.php new file mode 100644 index 0000000..8f70dbe --- /dev/null +++ b/imdb-dataset-to-mysql/lib/NameBasics.class.php @@ -0,0 +1,60 @@ +_db_table_name = 'name_basics'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`primaryName` varchar(128) COLLATE utf8mb4_bin NOT NULL, +`birthYear` year NOT NULL, +`deathYear` year NOT NULL, +`primaryProfession` text COLLATE utf8mb4_bin NOT NULL, +`knownForTitles` text COLLATE utf8mb4_bin NOT NULL, +UNIQUE KEY `nconst` (`nconst`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[5])) { + return $ret; + } + + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."', + '".$this->_DB->real_escape_string($data[3])."', + '".$this->_DB->real_escape_string($data[4])."', + '".$this->_DB->real_escape_string($data[5])."' + )"; + } + + return $ret; + } +} \ No newline at end of file diff --git a/imdb-dataset-to-mysql/lib/TitleAkas.class.php b/imdb-dataset-to-mysql/lib/TitleAkas.class.php new file mode 100644 index 0000000..df5a9c3 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitleAkas.class.php @@ -0,0 +1,58 @@ +_db_table_name = 'title_akas'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`titleId` 
varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`ordering` int NOT NULL, +`title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`region` varchar(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`language` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`types` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`attributes` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`isOriginalTitle` tinyint(1) NOT NULL, +UNIQUE KEY `titleId` (`titleId`,`ordering`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[7])) { + return $ret; + } + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."', + '".$this->_DB->real_escape_string($data[3])."', + '".$this->_DB->real_escape_string($data[4])."', + '".$this->_DB->real_escape_string($data[5])."', + '".$this->_DB->real_escape_string($data[6])."', + '".$this->_DB->real_escape_string($data[7])."' + )"; + } + + return $ret; + } +} \ No newline at end of file diff --git a/imdb-dataset-to-mysql/lib/TitleBasics.class.php b/imdb-dataset-to-mysql/lib/TitleBasics.class.php new file mode 100644 index 0000000..50293e7 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitleBasics.class.php @@ -0,0 +1,66 @@ +_db_table_name = 'title_basics'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`titleType` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`primaryTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL, +`originalTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL, +`isAdult` tinyint(1) NOT NULL, +`startYear` char(4) COLLATE utf8mb4_bin NOT NULL, +`endYear` char(4) COLLATE utf8mb4_bin NOT NULL, +`runtimeMinutes` int NOT NULL, +`genres` varchar(255) COLLATE utf8mb4_bin 
NOT NULL, +UNIQUE KEY `tconst` (`tconst`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[8])) { + return $ret; + } + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."', + '".$this->_DB->real_escape_string($data[3])."', + '".$this->_DB->real_escape_string($data[4])."', + '".$this->_DB->real_escape_string($data[5])."', + '".$this->_DB->real_escape_string($data[6])."', + '".$this->_DB->real_escape_string($data[7])."', + '".$this->_DB->real_escape_string($data[8])."' + )"; + } + + return $ret; + } +} diff --git a/imdb-dataset-to-mysql/lib/TitleCrew.class.php b/imdb-dataset-to-mysql/lib/TitleCrew.class.php new file mode 100644 index 0000000..58e8571 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitleCrew.class.php @@ -0,0 +1,54 @@ +_db_table_name = 'title_crew'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`directors` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +`writers` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL, +UNIQUE KEY `tconst` (`tconst`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[2])) { + return $ret; + } + + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."' + )"; + } + + return $ret; + } +} \ No newline at end of file diff --git a/imdb-dataset-to-mysql/lib/TitleEpisode.class.php b/imdb-dataset-to-mysql/lib/TitleEpisode.class.php new file mode 100644 index 0000000..de8387a --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitleEpisode.class.php @@ -0,0 
+1,56 @@ +_db_table_name = 'title_episode'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`parentTconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`seasonNumber` int NOT NULL, +`episodeNumber` int NOT NULL, +UNIQUE KEY `tconst` (`tconst`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[3])) { + return $ret; + } + + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."', + '".$this->_DB->real_escape_string($data[3])."' + )"; + } + + return $ret; + } +} \ No newline at end of file diff --git a/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php b/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php new file mode 100644 index 0000000..0ff2721 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitlePrincipals.class.php @@ -0,0 +1,60 @@ +_db_table_name = 'title_principals'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`ordering` int NOT NULL, +`nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`category` varchar(128) COLLATE utf8mb4_bin NOT NULL, +`job` varchar(128) COLLATE utf8mb4_bin NOT NULL, +`characters` varchar(128) COLLATE utf8mb4_bin NOT NULL, +UNIQUE KEY `tconst` (`tconst`,`ordering`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[5])) { + return $ret; + } + + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."', + '".$this->_DB->real_escape_string($data[3])."', + 
'".$this->_DB->real_escape_string($data[4])."', + '".$this->_DB->real_escape_string($data[5])."' + )"; + } + + return $ret; + } +} \ No newline at end of file diff --git a/imdb-dataset-to-mysql/lib/TitleRatings.class.php b/imdb-dataset-to-mysql/lib/TitleRatings.class.php new file mode 100644 index 0000000..25ac38a --- /dev/null +++ b/imdb-dataset-to-mysql/lib/TitleRatings.class.php @@ -0,0 +1,54 @@ +_db_table_name = 'title_ratings'; + $this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` ( +`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL, +`averageRating` varchar(8) COLLATE utf8mb4_bin NOT NULL, +`numVotes` int NOT NULL, +UNIQUE KEY `tconst` (`tconst`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin"; + } + + /** + * @inheritDoc + */ + public function queryValuePart($data) { + $ret = ''; + + if(!empty($data)) { + if(!isset($data[2])) { + return $ret; + } + + $ret .= "( + '".$this->_DB->real_escape_string($data[0])."', + '".$this->_DB->real_escape_string($data[1])."', + '".$this->_DB->real_escape_string($data[2])."' + )"; + } + + return $ret; + } +} diff --git a/imdb-dataset-to-mysql/lib/import.abstract.class.php b/imdb-dataset-to-mysql/lib/import.abstract.class.php new file mode 100644 index 0000000..32181c7 --- /dev/null +++ b/imdb-dataset-to-mysql/lib/import.abstract.class.php @@ -0,0 +1,194 @@ +_DB = $db; + $this->setup(); + } + + /** + * Set the table name into $_db_table_name + * Set the CREATE TABLE query into $_db_table_crate_str with the use + * of $_db_table_name + * + * set $_db_table_name $_db_table_crate_str + * @return void + */ + abstract public function setup(); + + /** + * Creates the values port of the insert query + * INSERT INTO ... 
VALUES (1,2,3),(1,2,3),(1,2,3) + * Where (1,2,3) is the result of this function + * + * @param array $data fields of one tsv line used to build the () + * for the values insert query + * @return string the (...) group for the insert query, '' for an invalid line + */ + abstract public function queryValuePart($data); + + /** + * @param string $file path of the TSV file to import + * @return bool true when the file was opened and processed, false otherwise + */ + public function import($file) { + $ret = false; + + echo "Starting to import $file with ".get_class($this)." class\n"; + + $check = $this->_checkTable(); + if($check == false) { + echo "Creating needed database table: $this->_db_table_name \n"; + $this->_createTable(); + } + else { + echo "Database table already exists: $this->_db_table_name \n"; + } + + if(!empty($file)) { + if (($handle = fopen($file, "r")) !== FALSE) { + // skip first line as it should be column names + // the length of the first line should be not so long as the others. + fgetcsv($handle, 4000, "\t"); + + $linesInFile = $this->_linesInFile($file); + + $queryStrStart = "INSERT IGNORE INTO `".$this->_db_table_name."` VALUES "; + $queryStr = ''; + $total=0; + // some files have very long lines... otherwise a length value would be perfect + while (($data = fgetcsv($handle, 0, "\t")) !== FALSE) { + // queryValuePart() returns '' for invalid lines; those are skipped. + $_p = $this->queryValuePart($data); + if(!empty($_p)) { + $queryStr .= $_p.","; + $total++; + + if(isset($queryStr[1000000])) { + try { + $this->_DB->query($queryStrStart . trim($queryStr, ",")); + $queryStr = ''; + } catch(Exception $e) { + echo "Failure in executing the query. ".$e->getMessage()."\n"; + var_dump($queryStr); + exit(); + return false; + } + } + + echo "Inserting: $total/$linesInFile\r"; + } + } + if(!empty($queryStr)) { + $this->_DB->query($queryStrStart.trim($queryStr,",")); + echo "\n"; + } + + fclose($handle); + echo "Import complete. Inserted $total rows\n"; + $ret = true; + } + } else { + echo "Filename empty\n"; + } + + return $ret; + } + + /** + * Check if the needed DB table is already there. + * Only checks; creating it is done by _createTable(). 
+ * @return bool true when the table exists in the DB_NAME schema, false otherwise (also when the query fails) + */ + protected function _checkTable() { + $ret = false; + + $queryStr = "SELECT count(*) AS amount + FROM information_schema.TABLES + WHERE (TABLE_SCHEMA = '".DB_NAME."') AND (TABLE_NAME = '".$this->_db_table_name."')"; + try { + $query = $this->_DB->query($queryStr); + $result = $query->fetch_assoc(); + if(!empty($result['amount'])) { + $ret = true; + } + } + catch (Exception $e) { + echo $e->getMessage(); + } + + return $ret; + } + + /** + * Creates the needed table for this import + * If there are any changes to the table you need to + * alter the insert queries too + * @return bool true when the CREATE TABLE query succeeded, false otherwise + */ + protected function _createTable() { + $ret = false; + + try { + $query = $this->_DB->query($this->_db_table_crate_str); + if($query) { + $ret = true; + } + } + catch (Exception $e) { + echo $e->getMessage(); + } + + return $ret; + } + + /** + * Count the file lines. + * Only used for the progress output of import() + * @param string $file path of the file to count + * @return int line key reported after seeking to the end of the file + */ + protected function _linesInFile($file) { + $file = new \SplFileObject($file, 'r'); + $file->seek(PHP_INT_MAX); + return $file->key(); + } + +} \ No newline at end of file -- 2.39.5