diff --git a/db_converter.py b/db_converter.py index 439957b..0435542 100644 --- a/db_converter.py +++ b/db_converter.py @@ -16,7 +16,7 @@ def parse(input_filename, output_filename): - "Feed it a file, and it'll output a fixed one" + """Feed it a file, and it'll output a fixed one""" # State storage if input_filename == "-": @@ -38,16 +38,15 @@ def parse(input_filename, output_filename): # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU. if output_filename == "-": output = sys.stdout - logging = open(os.devnull, "w") + logging = open(os.devnull, "w", encoding='utf-8') else: - output = open(output_filename, "w") + output = open(output_filename, "w", encoding='utf-8') logging = sys.stdout if input_filename == "-": input_fh = sys.stdin else: - input_fh = open(input_filename) - + input_fh = open(input_filename, encoding='utf-8') output.write("-- Converted by db_converter\n") output.write("START TRANSACTION;\n") @@ -57,21 +56,26 @@ def parse(input_filename, output_filename): for i, line in enumerate(input_fh): time_taken = time.time() - started - percentage_done = (i+1) / float(num_lines) + percentage_done = (i + 1) / float(num_lines) secs_left = (time_taken / percentage_done) - time_taken logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % ( i + 1, num_lines, - ((i+1)/float(num_lines))*100, + ((i + 1) / float(num_lines)) * 100, len(tables), num_inserts, secs_left // 60, secs_left % 60, )) logging.flush() - line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\") + line = (line.strip() + .replace(r"\\", "WUBWUBREALSLASHWUB") + .replace(r"\'", "''") + .replace("WUBWUBREALSLASHWUB", r"\\")) # Ignore comment lines - if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line: + if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES")\ + or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") \ + or not line: continue # Outside of anything handling @@ -83,26 +87,29 @@ def parse(input_filename, output_filename): creation_lines = [] # Inserting data into a table? elif line.startswith("INSERT INTO"): - output.write(line.encode("utf8").replace("'0000-00-00 00:00:00'", "NULL") + "\n") + line = line.replace("\\0", "") + line = line.replace("'0000-00-00'", "'0001-01-01'") + line = line.replace("'0000-00-00 00:00:00'", "'0001-01-01 00:00:00'") + output.write(line + "\n") num_inserts += 1 # ??? else: - print "\n ! Unknown line in main body: %s" % line + print("\n ! Unknown line in main body: %s" % line) # Inside-create-statement handling else: # Is it a column? if line.startswith('"'): - useless, name, definition = line.strip(",").split('"',2) + useless, name, definition = line.strip(",").split('"', 2) try: - type, extra = definition.strip().split(" ", 1) + db_type, extra = definition.strip().split(" ", 1) # This must be a tricky enum if ')' in extra: - type, extra = definition.strip().split(")") + db_type, extra = definition.strip().split(")") except ValueError: - type = definition.strip() + db_type = definition.strip() extra = "" extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", "")) extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", "")) @@ -110,37 +117,39 @@ def parse(input_filename, output_filename): # See if it needs type conversion final_type = None set_sequence = None - if type == "tinyint(1)": - type = "int4" + if db_type.startswith("tinyint("): + db_type = "int2" + set_sequence = True + elif db_type.startswith("mediumint("): + db_type = "integer" set_sequence = True - final_type = "boolean" - elif type.startswith("int("): - type = "integer" + elif db_type.startswith("int("): + db_type = "integer" set_sequence = True - elif type.startswith("bigint("): - type = "bigint" + elif db_type.startswith("bigint("): + db_type = "bigint" set_sequence = True - elif type == "longtext": - type = "text" - elif type == "mediumtext": - type = "text" - elif type == "tinytext": - type = "text" - elif type.startswith("varchar("): - size = int(type.split("(")[1].rstrip(")")) - type = "varchar(%s)" % (size * 2) - elif type.startswith("smallint("): - type = "int2" + elif db_type == "longtext": + db_type = "text" + elif db_type == "mediumtext": + db_type = "text" + elif db_type == "tinytext": + db_type = "text" + elif db_type.startswith("varchar("): + size = int(db_type.split("(")[1].rstrip(")")) + db_type = "varchar(%s)" % (size * 2) + elif db_type.startswith("smallint("): + db_type = "int2" set_sequence = True - elif type == "datetime": - type = "timestamp with time zone" - elif type == "double": - type = "double precision" - elif type == "blob": - type = "bytea" - elif type.startswith("enum(") or type.startswith("set("): - - types_str = type.split("(")[1].rstrip(")").rstrip('"') + elif db_type == "datetime": + db_type = "timestamp with time zone" + elif db_type == "double": + db_type = "double precision" + elif db_type == "blob": + db_type = "bytea" + elif db_type.startswith("enum(") or db_type.startswith("set("): + + types_str = db_type.split("(")[1].rstrip(")").rstrip('"') types_arr = [type_str.strip('\'') for type_str in types_str.split(",")] # Considered using values to make a name, but its dodgy @@ -148,33 +157,43 @@ def parse(input_filename, output_filename): enum_name = "{0}_{1}".format(current_table, name) if enum_name not in enum_types: - output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str)); + output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str)) enum_types.append(enum_name) - type = enum_name + db_type = enum_name if final_type: - cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT, ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, name, final_type, name, final_type)) + cast_lines.append("ALTER TABLE \"%s\" " + "ALTER COLUMN \"%s\" " + "DROP DEFAULT, " + "ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % + (current_table, name, name, final_type, name, final_type)) # ID fields need sequences [if they are integers?] if name == "id" and set_sequence is True: - sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table)) - sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table)) - sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table)) + sequence_lines.append("CREATE SEQUENCE %s_id_seq" % current_table) + sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % + (current_table, current_table)) + sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % + (current_table, current_table)) # Record it - creation_lines.append('"%s" %s %s' % (name, type, extra)) - tables[current_table]['columns'].append((name, type, extra)) + creation_lines.append('"%s" %s %s' % (name, db_type, extra)) + tables[current_table]['columns'].append((name, db_type, extra)) # Is it a constraint or something? elif line.startswith("PRIMARY KEY"): creation_lines.append(line.rstrip(",")) elif line.startswith("CONSTRAINT"): - foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(","))) - foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(","))) + foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % + (current_table, line.split("CONSTRAINT")[1].strip().rstrip(","))) + foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % + (current_table, + line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(","))) elif line.startswith("UNIQUE KEY"): creation_lines.append("UNIQUE (%s)" % line.split("(")[1].split(")")[0]) elif line.startswith("FULLTEXT KEY"): - fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') ) - fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys)) + fulltext_keys = " || ' ' || ".join(line.split('(')[-1].split(')')[0].replace('"', '').split(',')) + fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % + (current_table, fulltext_keys)) elif line.startswith("KEY"): pass @@ -182,13 +201,16 @@ def parse(input_filename, output_filename): elif line == ");": output.write("CREATE TABLE \"%s\" (\n" % current_table) for i, line in enumerate(creation_lines): - output.write(" %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "")) + line = " %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "") + line = line.replace("'0000-00-00'", "'1000-01-01'") + line = line.replace("'0000-00-00 00:00:00'", "'1000-01-01 00:00:00'") + line = re.sub(r"COMMENT\s+'(.*)'.*$", r", /* \1 */", line) + output.write(line) output.write(');\n\n') current_table = None # ??? else: - print "\n ! Unknown line inside table creation: %s" % line - + print("\n ! Unknown line inside table creation: %s" % line) # Finish file output.write("\n-- Post-data save --\n") @@ -218,7 +240,7 @@ def parse(input_filename, output_filename): # Finish file output.write("\n") output.write("COMMIT;\n") - print "" + print("") if __name__ == "__main__":