-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubstripper.py
More file actions
91 lines (64 loc) · 2.37 KB
/
substripper.py
File metadata and controls
91 lines (64 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
Small utility for pulling out a list of start and end times, without anything else, from multiple subtitle formats
Made for dialogueify
"""
# Built-in modules
import os
import re
import json
from datetime import timedelta
#Custom modules
from exceptions import *
# Global variables
# Maps a file extension (without the leading dot) to the parser function
# that handles it. It starts out empty here and is reassigned to the real
# table at the bottom of the module, after the parser functions are defined.
parsers = {}
# Main function
def sub_parse(file):
    """Parse a subtitle file with the parser registered for its extension.

    The extension is the text after the last "." in ``file``. It is looked
    up in the module-level ``parsers`` table, first exactly as written and
    then lower-cased, so "movie.SRT" is handled like "movie.srt".

    Args:
        file: Path of the subtitle file to parse.

    Returns:
        Whatever the selected parser returns for ``file``.

    Raises:
        UnsupportedSubFormat: If ``file`` has no extension, or no parser is
            registered for that extension.
    """
    # rpartition returns an empty separator when there is no "." at all; in
    # that case there is no extension to dispatch on.
    _, dot, extension = file.rpartition(".")
    parser = None
    if dot:
        parser = parsers.get(extension, parsers.get(extension.lower()))
    if parser is None:
        raise UnsupportedSubFormat
    # The parser is called outside of any KeyError handling on purpose: a
    # KeyError raised while parsing (for example a JSON file that lacks an
    # expected key) must not be misreported as an unsupported format.
    return parser(file)
# Parsers
def parse_srt(file):
    """Extract the start and end time of every cue in an .srt file.

    Only the timing lines (those containing "-->") are interpreted; counter
    lines, subtitle text and blank lines are skipped. Each timing line
    yields exactly one entry, so the last cue is kept even when the file
    does not end with a blank line.

    Args:
        file: Path of the .srt file; it is read as UTF-8.

    Returns:
        A list of dicts in file order, each with the keys "start_time" and
        "end_time" holding the values returned by ``parse_hhmmssms``.

    Raises:
        ValueError: If a timing line does not split into exactly two parts
            around "-->".
    """
    subtitles = []
    with open(file, "r", encoding="utf-8") as srt_file:
        for line in srt_file:
            line = line.strip()
            if "-->" not in line:
                continue
            # A cue is fully described by its timing line, so it is recorded
            # right away instead of waiting for a terminating blank line
            # (which a file's final cue may not have).
            start_time, end_time = line.split("-->")
            subtitles.append({
                "start_time": parse_hhmmssms(start_time.strip()),
                "end_time": parse_hhmmssms(end_time.strip()),
            })
    return subtitles
def parse_json(file):
    """Extract start and end times from a JSON subtitle file.

    The file must contain a top-level "events" list whose items each carry
    "tStartMs" (start offset) and "dDurationMs" (duration), both read as
    milliseconds.

    Args:
        file: Path of the JSON file; it is read as UTF-8.

    Returns:
        A list of dicts in event order, each with the keys "start_time" and
        "end_time" holding ``timedelta`` values; the end time is the start
        time plus the duration.
    """
    with open(file, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    def to_span(event):
        # Convert one event into its start/end pair.
        begin = timedelta(milliseconds=event["tStartMs"])
        length = timedelta(milliseconds=event["dDurationMs"])
        return {"start_time": begin, "end_time": begin + length}

    return [to_span(event) for event in payload["events"]]
def parse_vtt(file):
    """Placeholder parser for .vtt files; the format is not handled yet.

    Args:
        file: Path of the .vtt file (unused).

    Raises:
        UnsupportedSubFormat: Always.
    """
    raise UnsupportedSubFormat()
# Utilities
def parse_hhmmssms(timecode):
    """Convert an "HH:MM:SS,mmm" style timecode into a ``timedelta``.

    The fractional separator may be "," or ".". The fraction is treated as
    a decimal fraction of a second, so "05.5" means 5.5 seconds; digits
    beyond microsecond precision are dropped. The fraction is optional, and
    the hours field may be omitted ("MM:SS.mmm").

    Args:
        timecode: The timecode string; surrounding whitespace is ignored.

    Returns:
        A ``timedelta`` representing the timecode.

    Raises:
        ValueError: If the timecode does not have two or three
            colon-separated fields, or a field is not a valid number.
    """
    fields = timecode.strip().split(":")
    if not 2 <= len(fields) <= 3:
        raise ValueError(f"Invalid timecode: {timecode!r}")
    *leading, seconds_field = fields
    whole, _, fraction = seconds_field.replace(",", ".").partition(".")
    if fraction and not fraction.isdigit():
        raise ValueError(f"Invalid timecode: {timecode!r}")
    # Right-pad to six digits so the fraction is scaled by position
    # ("5" -> 500000 us, "456" -> 456000 us) rather than being read as a
    # raw millisecond count.
    microseconds = int(fraction[:6].ljust(6, "0")) if fraction else 0
    hours = int(leading[0]) if len(leading) == 2 else 0
    minutes = int(leading[-1])
    return timedelta(
        hours=hours,
        minutes=minutes,
        seconds=int(whole),
        microseconds=microseconds,
    )
# Extension -> parser table consulted by sub_parse. It is assigned here, at
# the end of the module, because the parser functions must already exist.
parsers = {
    "srt": parse_srt,
    # Format used by youtube's subtitles
    "json3": parse_json,
    "json": parse_json,
    "vtt": parse_vtt,
}