forked from zayn303/tv-program-web-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: load_tv_programs_sqlite.py
More file actions
145 lines (129 loc) · 4.37 KB
/
load_tv_programs_sqlite.py
File metadata and controls
145 lines (129 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import re
from datetime import datetime
from pathlib import Path
import sqlite3
# SQLite database file, created in the current working directory on first run.
DB_PATH = "tvguide.db"
# Scraper output files to ingest; files that do not exist are skipped with a warning.
INPUT_FILES = ["tv_programs_BBC.txt","tv_programs_Disc.txt","tv_programs_NatGeo.txt"]
# Record separator: a line of three or more dashes (optionally trailed by whitespace).
RECORD_SEP = re.compile(r"^-{3,}\s*$")
# "Key: value" line; the key is everything before the FIRST colon, so values
# containing colons (e.g. URLs) stay intact in group 2.
KV_LINE = re.compile(r"^\s*([^:]+)\s*:\s*(.*)\s*$")
# Schema + pragmas. Dates/times are stored as ISO-formatted TEXT (SQLite has no
# native date type); the unique index backs the ON CONFLICT clause in UPSERT below.
DDL = """
PRAGMA journal_mode=WAL;
PRAGMA foreign_keys=ON;
CREATE TABLE IF NOT EXISTS programs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
day_name TEXT,
air_date TEXT, -- store ISO date string 'YYYY-MM-DD'
start_time TEXT, -- store ISO time string 'HH:MM:SS'
end_time TEXT,
duration_min INTEGER,
channel TEXT,
link TEXT,
original_name TEXT,
prod_year INTEGER,
description TEXT,
score_pct INTEGER,
genre TEXT,
source_file TEXT
);
CREATE UNIQUE INDEX IF NOT EXISTS ux_program
ON programs(channel, air_date, start_time, title);
"""
# Upsert keyed on ux_program: re-running the loader updates existing rows
# instead of inserting duplicates. Uses named-parameter binding (:name).
UPSERT = """
INSERT INTO programs
(title, day_name, air_date, start_time, end_time, duration_min,
channel, link, original_name, prod_year, description, score_pct, genre, source_file)
VALUES
(:title, :day_name, :air_date, :start_time, :end_time, :duration_min,
:channel, :link, :original_name, :prod_year, :description, :score_pct, :genre, :source_file)
ON CONFLICT(channel, air_date, start_time, title) DO UPDATE SET
day_name=excluded.day_name,
end_time=excluded.end_time,
duration_min=excluded.duration_min,
link=excluded.link,
original_name=excluded.original_name,
prod_year=excluded.prod_year,
description=excluded.description,
score_pct=excluded.score_pct,
genre=excluded.genre,
source_file=excluded.source_file;
"""
def parse_file(path: Path):
    """Read *path* and return a list of row dicts, one per dash-separated record."""
    records = []

    def flush(buffer):
        # Finalize the pending record (if any) and hand back an empty buffer.
        if buffer:
            row = parse_block(buffer, path.name)
            if row:
                records.append(row)
        return []

    with path.open("r", encoding="utf-8") as fh:
        buffer = []
        for raw in fh:
            if RECORD_SEP.match(raw):
                buffer = flush(buffer)
            elif raw.strip():
                # Keep non-blank content lines, minus the trailing newline.
                buffer.append(raw.rstrip("\n"))
        # A file need not end with a separator; emit the trailing record too.
        flush(buffer)
    return records
def parse_block(lines, source_file):
    """Convert one record's "Key: Value" lines into a row dict for UPSERT.

    Unknown keys are ignored; missing keys yield None so every row carries the
    full set of named parameters the UPSERT statement binds.

    Args:
        lines: non-blank text lines of a single record.
        source_file: name of the file the record came from (stored per row).

    Returns:
        dict with one entry per programs-table column (minus the id).
    """
    fields = {}
    for ln in lines:
        # Split on the FIRST colon only, so values containing colons
        # (e.g. "Link: https://...") keep the remainder intact.
        key, sep, val = ln.partition(":")
        key = key.strip()
        if sep and key:
            fields[key] = val.strip()

    def to_date_str(s):
        """'DD.MM.YYYY' (or already-ISO 'YYYY-MM-DD') -> 'YYYY-MM-DD'; None if unparseable."""
        if not s:
            return None
        for fmt in ("%d.%m.%Y", "%Y-%m-%d"):
            try:
                return datetime.strptime(s, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue
        return None

    def to_time_str(s):
        """'HH:MM' or 'HH:MM:SS' -> normalized 'HH:MM:SS'; None if unparseable."""
        if not s:
            return None
        for fmt in ("%H:%M", "%H:%M:%S"):
            try:
                return datetime.strptime(s, fmt).strftime("%H:%M:%S")
            except ValueError:
                continue
        return None

    def first_int(s):
        """First run of digits in *s* as an int (e.g. '120 min' -> 120); None if absent."""
        if not s:
            return None
        m = re.search(r"\d+", s)
        return int(m.group(0)) if m else None

    # `or None` normalizes empty-string values to NULL in the database.
    return {
        "title": fields.get("Title") or None,
        "day_name": fields.get("Day") or None,
        "air_date": to_date_str(fields.get("Date")),          # TEXT 'YYYY-MM-DD'
        "start_time": to_time_str(fields.get("Start Time")),  # TEXT 'HH:MM:SS'
        "end_time": to_time_str(fields.get("End Time")),      # TEXT 'HH:MM:SS'
        "duration_min": first_int(fields.get("Duration")),
        "channel": fields.get("Channel") or None,
        "link": fields.get("Link") or None,
        "original_name": fields.get("Original Name") or None,
        "prod_year": first_int(fields.get("Year")),
        "description": fields.get("Description") or None,
        "score_pct": first_int(fields.get("Score")),
        "genre": fields.get("Genre") or None,
        "source_file": source_file,
    }
def main():
    """Parse every input file and upsert the rows into the SQLite database.

    Missing input files are skipped with a warning; if nothing parses, the
    database is left untouched (beyond schema creation).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA synchronous=NORMAL;")
        # executescript runs the whole multi-statement DDL in one call; the
        # previous manual split-on-';' would break if a literal contained ';'.
        conn.executescript(DDL)

        all_rows = []
        for fname in INPUT_FILES:
            p = Path(fname)
            if not p.exists():
                print(f"WARNING: {fname} not found, skipping.")
                continue
            all_rows.extend(parse_file(p))

        if not all_rows:
            print("No rows parsed. Nothing to insert.")
            return

        # `with conn` wraps the batch in one transaction: commit on success,
        # rollback on error.
        with conn:
            conn.executemany(UPSERT, all_rows)
        print(f"Inserted/updated {len(all_rows)} rows into {DB_PATH}.")
    finally:
        # Always release the connection, even when parsing/inserting raises
        # (the original leaked it on any exception).
        conn.close()

if __name__ == "__main__":
    main()