.project
.settings/
.idea
+*.log
https://www.imdb.com/conditions
This will import the imdb dataset tsv into your mysql database for further user.
-Based on the dataset at feb. 2020
+Code based on the dataset at feb. 2020
There will be no relations or whatsoever. Just plain data into tables.
+It also does not create any relation tables yet. Some tables have columns which
+contain comma-separated strings.
As of march 2020
Title crew looks strange. The longest line is 16313 (wc -L title.crews.tsv)
This is not a good example to be written in PHP. But you can use it.
-Don't execute it through a webserver. It is a CLI.
\ No newline at end of file
+Don't execute it through a webserver. It is a CLI script
+
+# Usage
+Download and place the tsv files from https://www.imdb.com/interfaces/ into the datasets folder.
+Decide which files you need. Alter $filesToImport in import.php to match them.
+Decide if you need a full text search index. Needed if you want to use the api.php.
+Adding the index after the initial import is not a good idea. It takes ages!!
+Building the index will slow down the import. To enable it, set BUILD_INDEX to true in import.php.
--- /dev/null
+Complete the relation model. Resolve the comma-separated strings stored in some tables.
--- /dev/null
+<?php
+/**
+ * dolphin. Collection of useful PHP skeletons.
+ * Copyright (C) 2013-2020 Johannes 'Banana' Keßler
+ *
+ * https://www.bananas-playground.net
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+ *
+ * You should have received a copy of the
+ * COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+ * along with this program. If not, see http://www.sun.com/cddl/cddl.html
+ */
+
+/**
+ * This is a very simple api to the dataset stored in the DB
+ * Use this as a base to extend
+ */
+
+
+mb_http_output('UTF-8');
+mb_internal_encoding('UTF-8');
+ini_set('error_reporting',-1); // -1 reports every error level
+date_default_timezone_set('Europe/Berlin');
+
+## check request
+# QUERY_STRING is not provided by every SAPI, so fall back to an empty string.
+# FILTER_SANITIZE_STRING is deprecated as of PHP 8.1. FILTER_UNSAFE_RAW combined
+# with FILTER_FLAG_STRIP_LOW still removes the characters below ASCII 32.
+$_rawQueryString = isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '';
+$_urlToParse = filter_var($_rawQueryString, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_LOW);
+if(!empty($_urlToParse)) {
+    # see http://de2.php.net/manual/en/regexp.reference.unicode.php
+    if(preg_match('/[\p{C}\p{M}\p{Sc}\p{Sk}\p{So}\p{Zl}\p{Zp}]/u',$_urlToParse) === 1) {
+        die('Malformed request. Make sure you know what you are doing.');
+    }
+}
+
+## set the error reporting
+ini_set('log_errors',true);
+ini_set('error_log','./error.log');
+
+require 'lib/helper.class.php';
+
+## database settings
+define('DB_HOST','localhost');
+define('DB_USER','user');
+define('DB_PASSWORD','test');
+define('DB_NAME','imdb');
+
+## DB connection
+$DB = new mysqli(DB_HOST, DB_USER,DB_PASSWORD, DB_NAME);
+if ($DB->connect_errno) exit("Can not connect to MySQL Server\n");
+$DB->set_charset("utf8mb4");
+$DB->query("SET collation_connection = 'utf8mb4_bin'");
+# from here on failing queries throw a mysqli_sql_exception
+$driver = new mysqli_driver();
+$driver->report_mode = MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT;
+
+## defaults
+$returnData = array();
+$returnStatusCode = 200;
+
+## search term
+# Only a plain string is accepted. A request like ?s[]=x delivers an array,
+# which must not reach trim().
+$_s = '';
+if(isset($_GET['s']) && is_string($_GET['s']) && !empty($_GET['s'])) {
+    if(Helper::validate($_GET['s'])) {
+        # multibyte aware lowercase. Uses the UTF-8 internal encoding set above
+        $_s = mb_strtolower(trim($_GET['s']));
+    }
+}
+
+if(!empty($_s)) {
+    # escape once, the value is used twice in the query
+    $_sEscaped = $DB->real_escape_string($_s);
+    $queryStr = "SELECT `tconst`, `primaryTitle`, `originalTitle`, `startYear`, `runtimeMinutes`, `genres`,
+                    MATCH (`primaryTitle`)
+                    AGAINST ('".$_sEscaped."' IN NATURAL LANGUAGE MODE) AS score
+                FROM `title_basics`
+                WHERE MATCH (`primaryTitle`)
+                    AGAINST ('".$_sEscaped."' IN NATURAL LANGUAGE MODE)
+                LIMIT 10";
+    try {
+        $query = $DB->query($queryStr);
+        if ($query instanceof mysqli_result) {
+            # result rows are keyed by their tconst
+            while (($result = $query->fetch_assoc()) != false) {
+                $returnData[$result['tconst']] = $result;
+            }
+            $query->free();
+        }
+        elseif ($query === false) {
+            # only reachable if the report mode does not throw
+            error_log("ERROR search query: ".$queryStr);
+            $returnStatusCode = 500;
+        }
+
+    } catch (Exception $e) {
+        error_log("ERROR search query failed: ".$e->getMessage());
+        error_log("ERROR search query: ".$queryStr);
+        # tell the client that the search failed instead of answering 200
+        $returnStatusCode = 500;
+    }
+
+}
+
+## build the response body before any header is sent
+$_responseBody = json_encode($returnData);
+if($_responseBody === false) {
+    error_log("ERROR json encode failed: ".json_last_error_msg());
+    $returnStatusCode = 500;
+    $_responseBody = '[]';
+}
+
+header("Cache-Control: no-store, no-cache, must-revalidate, max-age=0");
+header("Cache-Control: post-check=0, pre-check=0", false);
+header("Pragma: no-cache");
+header('Content-Type: application/json');
+if($returnStatusCode !== 200) {
+    http_response_code($returnStatusCode);
+}
+echo $_responseBody;
/**
* read and create mysql tables based on the tsv data from imdb
* dataset format based of feb. 2020
+ * See README for more details
*/
mb_http_output('UTF-8');
'NameBasics' => 'name.basics.tsv'
);
+## create mysql fulltext index or not.
+## Warning. It takes a very long time!
+define('BUILD_INDEX',false);
+
## database settings
define('DB_HOST','localhost');
define('DB_USER','user');
$this->_db_table_name = 'name_basics';
$this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` (
`nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`primaryName` varchar(128) COLLATE utf8mb4_bin NOT NULL,
+`primaryName` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`birthYear` year NOT NULL,
`deathYear` year NOT NULL,
-`primaryProfession` text COLLATE utf8mb4_bin NOT NULL,
-`knownForTitles` text COLLATE utf8mb4_bin NOT NULL,
+`primaryProfession` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`knownForTitles` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
UNIQUE KEY `nconst` (`nconst`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin";
+
+ if($this->_createFulltext) {
+ $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`primaryName`)";
+ $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`";
+ }
}
/**
return $ret;
}
-}
\ No newline at end of file
+}
`isOriginalTitle` tinyint(1) NOT NULL,
UNIQUE KEY `titleId` (`titleId`,`ordering`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin";
+
+ if($this->_createFulltext) {
+ $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`title`)";
+ $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`";
+ }
}
public function queryValuePart($data) {
return $ret;
}
-}
\ No newline at end of file
+}
public function setup() {
$this->_db_table_name = 'title_basics';
$this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` (
-`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`titleType` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`primaryTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL,
-`originalTitle` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`titleType` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`primaryTitle` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`originalTitle` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`isAdult` tinyint(1) NOT NULL,
-`startYear` char(4) COLLATE utf8mb4_bin NOT NULL,
-`endYear` char(4) COLLATE utf8mb4_bin NOT NULL,
+`startYear` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`endYear` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`runtimeMinutes` int NOT NULL,
-`genres` varchar(255) COLLATE utf8mb4_bin NOT NULL,
+`genres` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
UNIQUE KEY `tconst` (`tconst`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin";
+
+ if($this->_createFulltext) {
+ $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`primaryTitle`)";
+ $this->_db_table_after_import_query[] = "ALTER TABLE `" . $this->_db_table_name . "` ADD FULLTEXT (`originalTitle`)";
+ $this->_db_table_after_import_query[] = "OPTIMIZE TABLE `" . $this->_db_table_name . "`";
+ }
}
/**
public function setup() {
$this->_db_table_name = 'title_episode';
$this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` (
-`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`parentTconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
+`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`parentTconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`seasonNumber` int NOT NULL,
`episodeNumber` int NOT NULL,
UNIQUE KEY `tconst` (`tconst`)
return $ret;
}
-}
\ No newline at end of file
+}
public function setup() {
$this->_db_table_name = 'title_principals';
$this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` (
-`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
+`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`ordering` int NOT NULL,
-`nconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`category` varchar(128) COLLATE utf8mb4_bin NOT NULL,
-`job` varchar(128) COLLATE utf8mb4_bin NOT NULL,
-`characters` varchar(128) COLLATE utf8mb4_bin NOT NULL,
+`nconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`category` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`job` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`characters` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
UNIQUE KEY `tconst` (`tconst`,`ordering`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin";
}
return $ret;
}
-}
\ No newline at end of file
+}
public function setup() {
$this->_db_table_name = 'title_ratings';
$this->_db_table_crate_str = "CREATE TABLE `".$this->_db_table_name."` (
-`tconst` varchar(16) COLLATE utf8mb4_bin NOT NULL,
-`averageRating` varchar(8) COLLATE utf8mb4_bin NOT NULL,
+`tconst` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+`averageRating` varchar(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`numVotes` int NOT NULL,
UNIQUE KEY `tconst` (`tconst`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin";
--- /dev/null
+<?php
+/**
+ * dolphin. Collection of useful PHP skeletons.
+ * Copyright (C) 2013-2020 Johannes 'Banana' Keßler
+ *
+ * https://www.bananas-playground.net
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
+ *
+ * You should have received a copy of the
+ * COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
+ * along with this program. If not, see http://www.sun.com/cddl/cddl.html
+ */
+
+/**
+ * Static helper class
+ *
+ */
+class Helper {
+
+    /**
+     * Validate the given string against the character set of the given mode.
+     * Optionally check the string length.
+     *
+     * @see http://de.php.net/manual/en/regexp.reference.unicode.php
+     * http://www.sql-und-xml.de/unicode-database/#pc
+     *
+     * The pattern of a mode removes every allowed character. The result
+     * after the replace has to be empty, otherwise there are characters
+     * which are not allowed.
+     *
+     * The modes 'mail' and 'url' are checked with filter_var and return
+     * right away, $limit is not applied to them.
+     *
+     * @param string $input The string to check. Non scalar values are invalid
+     * @param string $mode How the string should be checked. One of mail, url,
+     *  nospace, nospaceP, digit, pageTitle, filename, text. Unknown modes
+     *  are treated as text
+     * @param mixed $limit If a number is given it is the maximum allowed
+     *  length in characters (not bytes)
+     *
+     * @return bool
+     */
+    static function validate($input,$mode='text',$limit=false) {
+        // arrays, objects and null can not be a valid string
+        if(!is_scalar($input)) return false;
+
+        // check if we have input
+        $input = trim((string)$input);
+        if($input === "") return false;
+
+        switch ($mode) {
+            case 'mail':
+                return filter_var($input,FILTER_VALIDATE_EMAIL) === $input;
+
+            case 'url':
+                return filter_var($input,FILTER_VALIDATE_URL) === $input;
+
+            case 'nospace':
+                // text without any whitespace and special chars
+                $pattern = '/[\p{L}\p{N}]/u';
+                break;
+
+            case 'nospaceP':
+                // text without any whitespace and special chars
+                // but with Punctuation
+                $pattern = '/[\p{L}\p{N}\p{Po}]/u';
+                break;
+
+            case 'digit':
+                // only decimal digits. The u modifier is required, otherwise
+                // the subject is matched byte by byte and digits outside of
+                // ASCII are never recognized
+                $pattern = '/[\p{Nd}]/u';
+                break;
+
+            case 'pageTitle':
+                // text with whitespace and without special chars
+                // but with Punctuation
+                $pattern = '/[\p{L}\p{N}\p{Po}\p{Z}\s]/u';
+                break;
+
+            case 'filename':
+                // \p{M} is needed here. Presumably because names in a
+                // decomposed unicode form carry combining marks (to be confirmed)
+                $pattern = '/[\p{L}\p{N}\p{M}\-_\.\p{Zs}]/u';
+                break;
+
+            case 'text':
+            default:
+                $pattern = '/[\p{L}\p{N}\p{P}\p{S}\p{Z}\p{M}\s]/u';
+        }
+
+        // preg_replace returns null on error (eg. invalid UTF-8), which is
+        // not an empty string and therefore counts as invalid
+        $value = preg_replace($pattern, '', $input);
+        if($value !== "") {
+            return false;
+        }
+
+        // count characters instead of bytes, otherwise multibyte text
+        // hits the limit too early
+        if(!empty($limit) && is_numeric($limit)) {
+            if(mb_strlen($input, 'UTF-8') > (int)$limit) {
+                # too long
+                return false;
+            }
+        }
+
+        return true;
+    }
+}
abstract class TSVImport {
/**
- * @var $_DB database object
+ * @var $_DB object database
*/
protected $_DB;
/**
- * @var $_db_table_name Tablename
+ * @var $_db_table_name string Tablename
*/
protected $_db_table_name;
/**
- * @var $_db_table_crate_str Creation SQL for this table
+ * @var $_db_table_crate_str string Creation SQL for this table
*/
protected $_db_table_crate_str;
+ /**
+ * @var bool Create fulltext index or not
+ */
+ protected $_createFulltext = BUILD_INDEX;
+
+ /**
+ * @var array Queries to be run after the import
+ */
+ protected $_db_table_after_import_query = array();
+
/**
* TSVImport constructor.
+ *
* @param $db Mysqli database object
*/
public function __construct($db) {
fclose($handle);
echo "Import complete. Inserted $total rows\n";
+ if(!empty($this->_db_table_after_import_query)) {
+ echo "Executing after import stuff\n";
+ foreach ($this->_db_table_after_import_query as $k=>$v) {
+ echo " Running $v\n";
+ $this->_DB->query($v);
+ echo " Done\n";
+ }
+ echo "Done\n";
+ }
$ret = true;
}
} else {
/**
* check if needed DB Table is already there.
* Otherwise create one.
+ *
* @return bool
*/
protected function _checkTable() {
* Creates the needed table für this import
* If there are any changes to the table you need to
* alter the insert queries too
+ *
* @return bool
*/
protected function _createTable() {
/**
* Count the file lines.
* Used for user info
+ *
* @param $file
* @return int
*/
return $file->key();
}
-}
\ No newline at end of file
+}