covid_data_dash.py
import datetime
import sys
import time
import numpy as np
import pandas as pd
import tableauscraper
from bs4 import BeautifulSoup
from dateutil.parser import parse as d
from dateutil.relativedelta import relativedelta
import covid_plot_cases
import covid_plot_deaths
import covid_plot_vacs
from utils_pandas import cum2daily
from utils_pandas import export
from utils_pandas import import_csv
from utils_pandas import weekly2daily
from utils_pandas import weeks_to_end_date
from utils_scraping import any_in
from utils_scraping import logger
from utils_scraping import USE_CACHE_DATA
from utils_scraping import web_files
from utils_scraping_tableau import force_setParameter
from utils_scraping_tableau import workbook_explore
from utils_scraping_tableau import workbook_iterate
from utils_scraping_tableau import workbook_series
from utils_scraping_tableau import workbook_value
from utils_thai import get_province
from utils_thai import today
########################
# Dashboards
########################
# # all province case numbers in one go but we already have this data
# url = "https://ddc.moph.go.th/covid19-dashboard/index.php?dashboard=scoreboard"
# # 5 kinds cases stats for last 30 days but we already have I think
# url = "https://ddc.moph.go.th/covid19-dashboard/index.php?dashboard=30-days"
def todays_data():
url = "https://public.tableau.com/views/SATCOVIDDashboard/1-dash-tiles"
    # a new day starts when new info comes in
while True:
# dates = reversed(pd.date_range("2021-01-24", today() - relativedelta(hours=7.5)).to_pydatetime())
# get_wb, date = next(workbook_iterate(url, param_date=dates))
# date = next(iter(date))
# if (wb := get_wb()) is None:
# continue
ts = tableauscraper.TableauScraper(verify=False)
ts.loads(url)
wb = ts.getWorkbook()
last_update = wb.getWorksheet("D_UpdateTime (2)").data
if last_update.empty:
continue
last_update = pd.to_datetime(last_update['max_update_date-alias'], dayfirst=False).iloc[0]
if last_update >= today().date():
return True
        # We got today's data too early
print("z", end="")
time.sleep(60)
def dash_daily():
df = import_csv("moph_dashboard", ["Date"], False, dir="inputs/json") # so we cache it
    return df  # dashboard no longer updated; scraping below left disabled
# remove crap from bad pivot
df = df.drop(columns=[c for c in df.columns if "Vac Given" in c and not any_in(c, "Cum",)])
    # somehow we got some dodgy rows; there should be no days with 0 cases in 2021
if 'Cases' in df.columns:
df = df.drop(df[df['Cases'] == 0.0].index)
# Fix spelling mistake
if 'Postitive Rate Dash' in df.columns:
df = df.drop(columns=['Postitive Rate Dash'])
all_atk_reg = pd.DataFrame()
allow_na = {
"ATK": [[d("2021-07-31"), d("2022-07-05")], [d("2021-04-01"), d("2021-07-30"), 0.0, 0.0]],
"Cases Area Prison": d("2021-05-12"),
"Positive Rate Dash": (d("2021-07-01"), today() - relativedelta(days=14)),
"Tests": today(), # it's no longer there
'Hospitalized Field HICI': d("2021-08-08"),
'Hospitalized Field Hospitel': [[d("2021-08-04"), today(), 0.0], [d("2021-04-01"), d("2021-08-03"), 0.0, 0.0]],
'Hospitalized Field Other': d("2021-08-08"),
'Vac Given 1 Cum': (d("2021-08-01"), today() - relativedelta(days=4)),
'Vac Given 2 Cum': (d("2021-08-01"), today() - relativedelta(days=4)),
"Vac Given 3 Cum": (d("2021-08-01"), today() - relativedelta(days=4)),
'Hospitalized Field': (d('2021-04-20'), today(), 100),
'Hospitalized Respirator': (d("2021-03-25"), today(), 1), # patchy before this
'Hospitalized Severe': (d("2021-04-01"), today(), 10), # try and fix bad values
'Hospitalized Hospital': (d("2021-01-27"), today(), 1),
'Recovered': (d('2021-01-01'), today(), 1),
'Cases Walkin': (d('2021-01-01'), today(), 1),
# 'Infections Non-Hospital Cum': (d("2022-04-08"), d("2022-06-12"), 800000), # Redo older rows because cumsum cal was off
}
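    # Rule formats above, as interpreted by skip_valid()/is_valid() below
    # (illustrative summary):
    #   col: date                           -> value only required from this date onwards
    #   col: (mindate, maxdate)             -> value only required between these dates
    #   col: (mindate, maxdate, min[, max]) -> also re-fetch if the stored value falls
    #                                          outside [min, max]
    #   col: [rule, rule, ...]              -> every rule must hold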
atk_days = set([pd.DateOffset(week * 7) + d("2022-01-01") for week in range(15, 40)])
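    # Week n maps to 2022-01-01 + 7*n days, e.g. week 15 -> 2022-04-16 (illustrative)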
def valid_atk(df, date):
if date not in atk_days:
return True
# Get rid of cumulative values. Just store weekly now. # TODO: change the col name
if 800000 > df.loc[date]['Infections Non-Hospital Cum'] > 50000:
return True
return False
url = "https://public.tableau.com/views/SATCOVIDDashboard/1-dash-tiles"
    # a new day starts when new info comes in
dates = reversed(pd.date_range("2021-01-24", today() - relativedelta(hours=7.5)).to_pydatetime())
# for get_wb, date in workbook_iterate(url, D_NewTL="DAY(txn_date)"):
for get_wb, date in workbook_iterate(url, inc_no_param=True, param_date=dates):
if date is None:
# initial one which is today
date = today()
else:
date = next(iter(date), None)
if type(date) == str:
date = d(date)
if skip_valid(df, date, allow_na) and valid_atk(df, date):
continue
if (wb := get_wb()) is None:
continue
row = pd.DataFrame()
row = row.combine_first(workbook_series(wb, "D_NewTL", {
"SUM(case_new)-value": "Cases",
"DAY(txn_date)-value": "Date"
}))
# last_update = workbook_value(wb, None, "D_UpdateTime (2)", "Date", is_date=True)
last_update = wb.getWorksheet("D_UpdateTime (2)").data
if not last_update.empty:
last_update = pd.to_datetime(
last_update['max_update_date-alias'].str.replace("2565", "2022"), dayfirst=False).iloc[0]
if last_update.normalize() < row.index.max() or date.date() > row.index.max().date():
                # We got today's data too early
continue
else:
last_update = None
row = row.combine_first(workbook_value(wb, date, "D_New", "Cases"))
row = row.combine_first(workbook_value(wb, date, "D_Walkin", "Cases Walkin"))
row = row.combine_first(workbook_value(wb, date, "D_Proact", "Cases Proactive"))
row = row.combine_first(workbook_value(wb, date, "D_NonThai", "Cases Imported"))
row = row.combine_first(workbook_value(wb, date, "D_Prison", "Cases Area Prison"))
row = row.combine_first(workbook_value(wb, date, "D_Hospital", "Hospitalized Hospital"))
row = row.combine_first(workbook_value(wb, date, "D_Severe", "Hospitalized Severe", np.nan))
row = row.combine_first(workbook_value(wb, date, "D_SevereTube", "Hospitalized Respirator", np.nan))
row = row.combine_first(workbook_value(wb, date, "D_Medic", "Hospitalized"))
row = row.combine_first(workbook_value(wb, date, "D_Recov", "Recovered"))
row = row.combine_first(workbook_value(wb, date, "D_Death", "Deaths"))
row = row.combine_first(workbook_value(wb, date, "D_ATK", "ATK", np.nan))
row = row.combine_first(workbook_value(wb, date, "D_HospitalField", "Hospitalized Field"))
row = row.combine_first(workbook_value(wb, date, "D_Hospitel", "Hospitalized Field Hospitel"))
row = row.combine_first(workbook_value(wb, date, "D_HICI", "Hospitalized Field HICI"))
row = row.combine_first(workbook_value(wb, date, "D_HFieldOth", "Hospitalized Field Other"))
row = row.combine_first(workbook_series(wb, "D_Lab2", {
"AGG(% ติดเฉลี่ย)-value": "Positive Rate Dash",
"DAY(txn_date)-value": "Date",
}, np.nan))
row = row.combine_first(workbook_series(wb, "D_Lab", {
"AGG(% ติดเฉลี่ย)-alias": "Positive Rate Dash",
"ATTR(txn_date)-alias": "Date",
}, np.nan))
row = row.combine_first(workbook_series(wb, "D_DeathTL", {
"SUM(death_new)-value": "Deaths",
"DAY(txn_date)-value": "Date"
}))
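        # The nested "vaccine_plan_group-alias" mapping below apparently fans the
        # "Vac Given" measure out into per-dose columns, e.g. plan group "2" lands
        # in "Vac Given 2 Cum" (matching the columns used in allow_na above).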
row = row.combine_first(workbook_series(wb, "D_Vac_Stack", {
"DAY(txn_date)-value": "Date",
"vaccine_plan_group-alias": {
"1": "1 Cum",
"2": "2 Cum",
"3": "3 Cum",
},
"SUM(vaccine_total_acm)-value": "Vac Given",
}))
row = row.combine_first(workbook_series(wb, "D_RecovL", {
"DAY(txn_date)-value": "Date",
"SUM(recovered_new)-value": "Recovered"
}))
if row.empty:
break
# wb.getWorksheet("D_UpdateTime").data.iloc[0]
assert date >= row.index.max() # might be something broken with setParam for date
row["Source Cases"] = "https://ddc.moph.go.th/covid19-dashboard/index.php?dashboard=main"
if date < today() - relativedelta(days=30): # TODO: should use skip_valid rules to work which are delayed rather than 0?
row.loc[date] = row.loc[date].fillna(0.0) # ATK and HICI etc are null to mean 0.0
# Not date indexed as it's weekly
atk_reg = wb.getWorksheet("ATK+WEEK_line_Total (1)").data
if not atk_reg.empty:
            # It's the same value for all dates so it's only needed on the first iteration
col = "Infections Non-Hospital Cum" # ATK+? no real explanation for this number
atk_reg = atk_reg.rename(columns={"Week-value": "Week", 'SUM(#SETDATE_WEEK_CNT)-value': col})[["Week", col]]
atk_reg['Date'] = (pd.to_numeric(atk_reg['Week']) * 7).apply(lambda x: pd.DateOffset(x) + d("2022-01-01"))
atk_reg = atk_reg.set_index("Date")[[col]]
# atk_reg = atk_reg.cumsum()
all_atk_reg = all_atk_reg.combine_first(atk_reg)
df = row.combine_first(df) # prefer any updated info that might come in. Only applies to backdated series though
logger.info("{} MOPH Dashboard {}", date, row.loc[row.last_valid_index():].to_string(index=False, header=False))
# We get negative values for field hospital before April
assert df[df['Recovered'] == 0.0].loc["2021-03-05":].empty
df.loc[:"2021-03-31", 'Hospitalized Field'] = np.nan
# 2022-05-07 and 03 got 0.0 by mistake
df['Hospitalized Respirator'] = df['Hospitalized Respirator'].replace(0.0, np.nan)
df["Hospitalized Severe"] = df["Hospitalized Severe"].replace(0.0, np.nan)
df = all_atk_reg.combine_first(df)
export(df, "moph_dashboard", csv_only=True, dir="inputs/json")
return df
def dash_weekly(file="moph_dash_weekly"):
df = import_csv(file, ["Date"], False, dir="inputs/json") # so we cache it
allow_na = {
'Vac Given 1 Cum': (d("2021-08-01"), d("2023-05-23")),
'Vac Given 2 Cum': (d("2021-08-01"), d("2023-05-23")),
"Vac Given 3 Cum": (d("2021-08-01"), d("2023-05-23")),
'Hospitalized Respirator': (d("2021-03-25"), today(), 1), # patchy before this
'Hospitalized Severe': (d("2021-04-01"), today(), 10), # try and fix bad values
'Cases Cum': (d("2022-09-17"), today(), 4625384),
'Deaths Age 0-4': (d("2024-01-01"), today(), 0),
}
url = "https://public.tableau.com/views/SATCOVIDDashboard_WEEK/1-dash-week"
# aggregated for week ending on sat
dates = reversed(pd.date_range("2022-09-25", today() - relativedelta(days=1, hours=7.5), freq='W-SAT').to_pydatetime())
# latest = next(dates, None)
# logger.info("{} MOPH Dashboard: checking", latest)
for get_wb, this_index in workbook_iterate(url, inc_no_param=False, param_date_weekend=list(dates), param_wave=["ตั้งแต่เริ่มระบาด"]):
# date, wave = this_index
date = this_index[0]
# logger.info("{} MOPH Dashboard: trying {}", date, this_index)
# date = date if date is not None else latest
if skip_valid(df, date, allow_na):
# print("s", end="")
row = str(dict(df.loc[date]))
logger.info("{} MOPH Dashboard: skipping {}", date, row)
continue
else:
logger.info("{} MOPH Dashboard: reading workbook for {}", date, this_index)
if (wb := get_wb()) is None:
logger.warning("{} MOPH Dashboard: workbook is None", date)
continue
# end_date = workbook_value(wb, None, "D_UpdateTime (2)", "Date", is_date=True)
# if end_date.date() != date.date():
# # we have a problem. not setting the date right
# continue
        # TODO: should be part of workbook_iterate so it's done once.
# row_since2023 = row = extract_basics(wb, date)
# if row_since2023.empty:
# logger.warning("{} MOPH Dashboard: wrong date: skip", date)
# continue
# wb = force_setParameter(wb, "param_wave", "ตั้งแต่เริ่มระบาด")
        # We miss data not affected by wave
row_update = extract_basics(wb, date, check_date=False)
assert not row_update.empty
assert np.nan not in row_update.index and pd.NaT not in row_update.index
# row = row_update.combine_first(row_since2023)
row = row_update
df = row.combine_first(df) # prefer any updated info that might come in. Only applies to backdated series though
logger.info("{} MOPH Dashboard {}", date, row.loc[row.last_valid_index():].to_string(index=False, header=False))
print()
export(df, file, csv_only=True, dir="inputs/json")
return df
def closest(sub, repl):
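    # Returns a lookup that resolves a renamed worksheet, e.g. (illustrative)
    #   closest("D2_", "D_")(wb, "D_NewTL") -> "D2_NewTL"
    # when the workbook contains a sheet actually named "D2_NewTL".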
def get_name(wb, name):
return {s.name.replace(sub, repl): s.name for s in wb.worksheets}.get(name)
return get_name
def dash_province_weekly(file="moph_province_weekly"):
df = import_csv(file, ["Date", "Province"], False, dir="inputs/json") # so we cache it
    # Remove any dips in cumulative values; these can be caused by getting daily values instead
# lambda mydf: mydf.loc[mydf['Cases Cum'].ffill() < mydf['Cases Cum'].cummax().ffill(), 'Cases Cum'] = np.nan
# decresed = df[(df[[c for c in df.columns if " Cum" in c]].groupby("Province").diff() < 0).any(axis=1)]
contiguous = df[["Cases Cum", "Deaths Cum"]].dropna(how="any")
    dec1 = contiguous[(contiguous.groupby("Province").diff() < 0).any(axis=1)]  # in case it's the drop that's wrong
    dec2 = contiguous[(contiguous.groupby("Province").diff(-1) > 0).any(axis=1)]  # in case it's a spike that's wrong
contiguous = df[["Vac Given 1 Cum", "Vac Given 2 Cum", "Vac Given 3 Cum"]].dropna()
dec3 = contiguous[(contiguous.groupby("Province").diff() < 0).any(axis=1)]
dec4 = contiguous[(contiguous.groupby("Province").diff(-1) > 0).any(axis=1)]
decreased = dec1.combine_first(dec2).combine_first(dec3).combine_first(dec4)
# Just remove bad rows
logger.warning("Dropping decreasing values {}", decreased.index)
df = df.drop(index=decreased.index)
valid = {
# "Deaths Cum": (d("2022-12-11"), today(), 1),
"Cases Cum": (today() - relativedelta(days=22), today(), 150), # TODO: need better way to reject this year cum values
# 'Vac Given 1 Cum': (today() - relativedelta(days=22), today() - relativedelta(days=4)),
# 'Vac Given 1 Cum': (d("2021-08-01"), d("2023-05-23")),
}
url = "https://public.tableau.com/views/SATCOVIDDashboard_WEEK/2-dash-week-province"
dates = reversed(pd.date_range("2022-01-01", today() - relativedelta(hours=7.5), freq='W-SAT').to_pydatetime())
# dates = iter([d.strftime("%m/%d/%Y") for d in dates])
# latest = next(dates, None)
# ts = tableauscraper.TableauScraper()
# try:
# ts.loads(url)
# except AttributeError as e:
    # # Something is messed up about the page returned to scrape
# logger.error("{} MOPH Dashboard: Can't scrape {}", e)
# return df
# provs = ts.getWorkbook().getWorksheet("D2_Province (2)").getSelectableValues("province")
_, content, _ = next(web_files("https://ddc.moph.go.th/covid19-dashboard/?dashboard=province", proxy=False, check=True))
if content is None:
return df
soup = BeautifulSoup(content, 'html.parser')
# soup = parse_file(file, html=True, paged=False)
provs = [p.get("value") for p in soup.select("#sel-province")[0].find_all("option") if p.get("value")]
for get_wb, idx_value in workbook_iterate(url, inc_no_param=False, param_date_weekend=list(dates), param_wave=["ตั้งแต่เริ่มระบาด"], filters=dict(province=provs), verify=False):
# for get_wb, idx_value in workbook_iterate(url, inc_no_param=False, param_date=list(dates), D2_Province="province", verify=False):
date, wave, province = idx_value
# if date is None:
# date = latest
# date = d(date, dayfirst=False)
if province is None:
continue
province = get_province(province)
# TODO: make invalid not inc Cum values
if skip_valid(df, (date, province), valid) and (date, province) not in decreased.index:
print("s", end="")
continue
if (wb := get_wb()) is None:
continue
"D2_Update (2)"
row = extract_basics(wb, date)
if row.empty:
logger.warning("{} MOPH Dashboard: wrong date: skip {}", date, province)
continue # Not getting latest data yet
# wb = force_setParameter(wb, "param_wave", "ตั้งแต่เริ่มระบาด")
        # We miss data not affected by wave
# row_update = extract_basics(wb, date, check_date=False, base_df=row)
# assert not row_update.empty
# row = row_update.combine_first(row)
row['Province'] = province
row = row.reset_index().set_index(["Date", "Province"])
combined = row.combine_first(df)
# Test if this creates a dip
contiguous = combined[["Cases Cum", "Deaths Cum"]]
dec1 = contiguous[(contiguous.groupby("Province").diff() < 0).any(axis=1)] # in case its the drop thats wrong
bad_rows = dec1.index.intersection(row.index)
if False and len(bad_rows) > 0:
logger.info("{} MOPH dash, dropping invalid row. cum value not inc. {}", bad_rows,
row.loc[bad_rows].to_string(index=False, header=False))
df = combined.drop(bad_rows)
# TODO: Some rows don't seem to show a drop
# TODO: We should drop the row before as well as that might be the source of the bad data.
else:
df = combined
logger.info("{} MOPH Dashboard {}", row.index.max(),
row.loc[row.last_valid_index():].to_string(index=False, header=False))
print()
export(df, file, csv_only=True, dir="inputs/json")
# Vac Given 3 Cum seems to be 3+4+5+6 which is wrong
df = df.drop(columns=["Vac Given 3 Cum"])
return df
def extract_basics(wb, date, check_date=True, base_df=None):
row = pd.DataFrame()
maxdate = None
# D_CaseNew_/7 - daily avg cases
# D_DeathNew_/7 - daily avg deaths
# D_Death (2)
def to_cum(cum, periodic, name):
""" take a single cum value and daily changes and return a cum series going backwards """
cum_name = name + " Cum"
if periodic.empty or not cum[cum_name].last_valid_index():
return pd.DataFrame()
periodic = periodic[name]
cum = cum[cum_name]
periodic = periodic.replace(np.nan, 0)
assert cum.last_valid_index()
combined = periodic.combine_first(cum) # TODO: this might go back too far. should really be min of periodic?
df = cum.reindex(combined.index).bfill().subtract(periodic.reindex(
combined.index)[::-1].cumsum()[::-1].shift(-1), fill_value=0)
assert (df > 0).any()
return df.to_frame(cum_name).ffill()
vacs = workbook_series(wb, ["D_Vac2Table"], {
"vaccine_plan_group-alias": {
"1": "1 Cum",
"2": "2 Cum",
"3": "3 Cum",
},
"SUM(vaccine_total_acm)-alias": "Vac Given",
}, index_date=False, index_col="Date", index_value=date)
vacs_dates = workbook_series(wb, ["D_Vac_Stack (2)", "D2_Vac_Stack (2)"], {
"DAY(txn_date)-value": "Date",
"vaccine_plan_group-alias": {
"1": "1 Cum",
"2": "2 Cum",
"3": "3 Cum",
},
"SUM(vaccine_total_acm)-value": "Vac Given",
# "SUM(vaccine_total_acm)-alias": "Vac Given",
}, index_date=True)
cases = workbook_series(wb, ["D_NewTL (2)", "D2_NewTL (2)"], {
"SUM(case_new)-value": "Cases",
"AGG(stat_count)-value": "Cases",
"AGG(STAT_COUNT)-value": "Cases",
"ATTR(week)-alias": "Week"
}, index_col="Week", index_date=False)
cases = weeks_to_end_date(cases, year_col="Year", week_col="Week", offset=0, date=date)
if cases.empty and base_df is not None and 'Cases' in base_df.columns:
cases = base_df[['Cases']]
maxdate = cases.index.max()
deaths = workbook_series(wb, ["D_DeathTL (2)", "D2_DeathTL (2)"], {
"SUM(death_new)-value": "Deaths",
"AGG(# NUM_DEATH)-value": "Deaths",
"ATTR(week)-alias": "Week"
}, index_col="Week", index_date=False)
deaths = weeks_to_end_date(deaths, year_col="Year", week_col="Week", offset=0, date=date)
    if deaths.empty and base_df is not None and 'Deaths' in base_df.columns:
deaths = base_df[['Deaths']]
maxdate = deaths.index.max()
    assert maxdate is not pd.NaT and maxdate is not np.nan
    # There is no date in the data to tell us that it's returning the correct data, except for
    # the deaths and cases, so just check whether we got the latest instead.
if check_date and ((not deaths.empty and date < deaths.index.max()) or (not cases.empty and date < cases.index.max())):
return row
# date = cases.index.max() # We can't get update date always so use lastest cases date
cases_cum = workbook_value(wb, date, ["D_NewACM (2)", "D2_NewACM (2)"], "Cases Cum", default=np.nan)
deaths_cum = workbook_value(wb, date, ["D_DeathACM (2)", "D2_DeathACM (2)"], "Deaths Cum", default=np.nan)
row = row.combine_first(cases_cum)
row = row.combine_first(to_cum(row, cases, "Cases")).combine_first(cases)
row = row.combine_first(deaths_cum)
if not deaths.empty:
row = row.combine_first(to_cum(row, deaths, "Deaths")).combine_first(deaths)
row = row.combine_first(workbook_value(wb, date, "D_Severe (2)", "Hospitalized Severe", None))
row = row.combine_first(workbook_value(wb, date, "D_SevereTube (2)", "Hospitalized Respirator", None))
ages = workbook_series(wb, 'cvd_agegroup', {'Measure Values-value': 'Deaths',
"Measure Names-alias": "Age Group"}, index_col="Age Group", index_date=False)
gender = workbook_series(wb, 'cvd_gender', {'Measure Values-value': 'Deaths',
"Measure Names-alias": "Gender"}, index_col="Gender", index_date=False)
if not ages.empty:
ages['Date'] = date
ages = ages.reset_index().pivot(columns=['Age Group'], values=["Deaths"], index=["Date"])
ages.columns = [f"Deaths Age {a}" for a in ["0-4", "10-19", "20-49", "5-9", "50-59", "60-69", "70+"]]
row = row.combine_first(ages)
if not gender.empty:
gender['Date'] = date
gender = gender.reset_index().pivot(columns=['Gender'], values=["Deaths"], index=["Date"])
gender.columns = [f"Deaths {g}" for g in ["Male", "Female"]]
row = row.combine_first(gender)
# TODO: should switch from weekly?
row = row.combine_first(vacs).combine_first(vacs_dates)
return row
def dash_ages():
df = import_csv("moph_dashboard_ages", ["Date"], False, dir="inputs/json") # so we cache it
return df # no longer there
# Fix mistake in column name
df.columns = [c.replace("Hospitalized Severe", "Cases Proactive") for c in df.columns]
# Get deaths by prov, date. and other stats - timeline
#
# ['< 10 ปี', '10-19 ปี', '20-29 ปี', '30-39 ปี', '40-49 ปี', '50-59 ปี', '60-69 ปี', '>= 70 ปี', 'ไม่ระบุ']
# D4_TREND
# cases = AGG(stat_count)-alias
# proactive = AGG(ผู้ติดเชื้อรายใหม่เชิงรุก)-alias,
# deaths = AGG(ผู้เสียชีวิต)-alias (and AGG(ผู้เสียชีวิต (รวมทุกกลุ่มผู้ป่วย))-value all patient groups)
# cum cases = AGG(stat_accum)-alias
# date = DAY(date)-alias, DAY(date)-value
url = "https://ddc.moph.go.th/covid19-dashboard/index.php?dashboard=select-trend-line"
url = "https://dvis3.ddc.moph.go.th/t/sat-covid/views/SATCOVIDDashboard/4-dash-trend-w"
url = "https://dvis3.ddc.moph.go.th/t/sat-covid/views/SATCOVIDDashboard/4-dash-trend?:isGuestRedirectFromVizportal=y&:embed=y" # noqa
    def range2eng(age_range):
        return age_range.replace(" ปี", "").replace('ไม่ระบุ', "Unknown").replace(">= 70", "70+").replace("< 10", "0-9")
for get_wb, idx_value in workbook_iterate(url, D4_CHART="age_range", verify=False):
age_group = next(iter(idx_value), None)
age_group = range2eng(age_group)
skip = not pd.isna(df[f"Cases Age {age_group}"].get(str(today().date())))
if skip or (wb := get_wb()) is None:
continue
row = workbook_series(
wb,
"D4_TREND", {
"DAY(date)-value": "Date",
"AGG(ผู้เสียชีวิต (รวมทุกกลุ่มผู้ป่วย))-value": "Deaths",
"AGG(stat_count)-alias": "Cases",
"AGG(ผู้ติดเชื้อรายใหม่เชิงรุก)-alias": "Cases Proactive",
},
index_col="Date"
)
if row.empty:
continue
row['Age'] = age_group
row = row.pivot(values=["Deaths", "Cases", "Cases Proactive"], columns="Age")
row.columns = [f"{n} Age {v}" for n, v in row.columns]
df = row.combine_first(df)
logger.info("{} MOPH Ages {} {}", row.last_valid_index(), range2eng(age_group),
row.loc[row.last_valid_index():].to_string(index=False, header=False))
df = df.loc[:, ~df.columns.duplicated()] # remove duplicate columns
export(df, "moph_dashboard_ages", csv_only=True, dir="inputs/json")
return df
def dash_trends_prov():
df = import_csv("moph_dashboard_prov_trends", ["Date", "Province"], False, dir="inputs/json") # so we cache it
url = "https://dvis3.ddc.moph.go.th/t/sat-covid/views/SATCOVIDDashboard/4-dash-trend"
for get_wb, idx_value in workbook_iterate(url, D4_CHART="province", verify=False):
province = get_province(next(iter(idx_value), None))
date = str(today().date())
try:
df.loc[(date, province)]
continue
except KeyError:
pass
if (wb := get_wb()) is None:
continue
row = workbook_series(
wb,
'D4_TREND',
{
"DAY(date)-value": "Date",
"AGG(ผู้เสียชีวิต (รวมทุกกลุ่มผู้ป่วย))-value": "Deaths",
"AGG(stat_count)-alias": "Cases",
"AGG(ผู้ติดเชื้อรายใหม่เชิงรุก)-alias": "Cases Proactive",
},
)
if row.empty or province is None:
continue
row['Province'] = province
df = row.reset_index("Date").set_index(["Date", "Province"]).combine_first(df)
logger.info("{} DASH Trend prov {}", row.last_valid_index(),
row.loc[row.last_valid_index():].to_string(index=False, header=False))
df = df.loc[:, ~df.columns.duplicated()] # remove duplicate columns
export(df, "moph_dashboard_prov_trends", csv_only=True, dir="inputs/json") # Speeds up things locally
return df
def dash_by_province():
df = import_csv("moph_dashboard_prov", ["Date", "Province"], False, dir="inputs/json") # so we cache it
# Vac Given 3 Cum seems to be 3+4+5+6 which is wrong
df = df.drop(columns=["Vac Given 3 Cum"])
    return df  # province dashboard no longer updated; scraping below left disabled
url = "https://public.tableau.com/views/SATCOVIDDashboard/2-dash-tiles-province-w"
url = "https://public.tableau.com/views/SATCOVIDDashboard/2-dash-tiles-province-w?:size=1200,1050&:embed=y&:showVizHome=n&:bootstrapWhenNotified=y&:tabs=n&:toolbar=n&:apiID=host0"
# Fix spelling mistake
if 'Postitive Rate Dash' in df.columns:
df = df.drop(columns=['Postitive Rate Dash'])
if "Hospitalized Severe" in df.columns:
df = df.drop(columns=["Hospitalized Severe"]) # was actually proactive
last_pos_rate = max(df["Positive Rate Dash"].last_valid_index()[0], today() - relativedelta(days=31))
valid = {
# shouldn't be 0 pos rate. Maybe set to min 0.001 again later?
"Positive Rate Dash": (d("2021-05-20"), last_pos_rate, 0.0, 2), # Might have to remove it completely.
"Tests": today(), # It's no longer there
"Vac Given 1 Cum": (d("2021-08-01"), today() - relativedelta(days=2), 1),
"Vac Given 2 Cum": (d("2021-08-01"), today() - relativedelta(days=2)),
"Vac Given 3 Cum": (d("2021-08-01"), today() - relativedelta(days=2)),
# all the non-series will take too long to get historically
"Cases Walkin": (d("2021-07-01"), today() - relativedelta(days=5)),
"Cases Proactive": (d("2021-08-01"), today() - relativedelta(days=5)),
"Cases Area Prison": (d("2021-08-01"), today() - relativedelta(days=5)),
"Cases Imported": (d("2021-08-01"), today() - relativedelta(days=5)),
"Deaths": d("2021-07-12"), # Not sure why but Lamphun seems to be missing death data before here?
"Cases": d("2021-06-28"), # Only Lampang?
}
    # Dates with no data that don't seem to change
skip = [
(d("2021-10-12"), "Mae Hong Son"),
(d('2021-10-09'), 'Phayao'),
(d('2021-10-09'), 'Nan'),
(d('2021-10-07'), 'Nan'),
(d("2021-10-04"), "Amnat Charoen"),
(d('2021-10-04'), 'Nan'),
(d('2021-10-03'), 'Nan'),
(d("2021-10-02"), "Phrae"),
(d("2021-10-02"), "Phayao"),
(d('2021-10-02'), 'Nan'),
(d("2021-10-01"), "Amnat Charoen"),
(d('2021-10-01'), 'Chai Nat'),
(d("2021-09-28"), "Phayao"),
(d('2021-09-28'), 'Nan'),
(d("2021-09-27"), "Phayao"),
(d("2021-09-26"), "Mukdahan"),
(d('2021-09-26'), 'Bueng Kan'),
(d('2021-09-26'), 'Nan'),
(d("2021-09-23"), 'Phayao'),
(d("2021-09-22"), 'Nakhon Phanom'),
(d('2021-09-21'), 'Nan'),
]
dates = reversed(pd.date_range("2021-02-01", today() - relativedelta(hours=7.5)).to_pydatetime())
#dates = [d.strftime('%m/%d/%Y') for d in dates]
# Add in None for today as selecting today doesn't give us new data anymore. TODO: fix TableauScraper to remember the last data it had
# prov_values = ['อุบลราชธานี', 'อุทัยธานี', 'อุตรดิตถ์', 'อุดรธานี', 'อำนาจเจริญ', 'อ่างทอง', 'หนองคาย', 'สุรินทร์', 'สุราษฎร์ธานี', 'สุพรรณบุรี', 'สระบุรี', 'สระแก้ว', 'สมุทรสาคร', 'สมุทรสงคราม', 'สมุทรปราการ', 'สตูล', 'สงขลา', 'สกลนคร', 'ศรีสะเกษ', 'เลย', 'ลำพูน', 'ลำปาง', 'ลพบุรี', 'ราชบุรี', 'ระยอง', 'ร้อยเอ็ด', 'ยะลา', 'ยโสธร', 'แม่ฮ่องสอน', 'มหาสารคาม', 'ภูเก็ต', 'แพร่',
# 'เพชรบูรณ์', 'เพชรบุรี', 'พิษณุโลก', 'พิจิตร', 'ปัตตานี', 'ปราจีนบุรี', 'ประจวบคีรีขันธ์', 'ปทุมธานี', 'บุรีรัมย์', 'นนทบุรี', 'นครสวรรค์', 'นครศรีธรรมราช', 'นครราชสีมา', 'นครพนม', 'นครปฐม', 'นครนายก', 'ตาก', 'ตราด', 'ตรัง', 'เชียงใหม่', 'เชียงราย', 'ชุมพร', 'ชัยภูมิ', 'ชลบุรี', 'ฉะเชิงเทรา', 'จันทบุรี', 'ขอนแก่น', 'กาฬสินธุ์', 'กาญจนบุรี', 'กรุงเทพมหานคร', 'กระบี่']
for get_wb, idx_value in workbook_iterate(url, inc_no_param=False, param_date=[None] + list(dates), D2_Province="province", verify=False):
# for get_wb, idx_value in workbook_iterate(url, inc_no_param=False, param_date=[None] + list(dates), param_province=prov_values, verify=False):
date, province = idx_value
#date = d(date, dayfirst=False)
if date is None:
date = (today() - relativedelta(hours=7.5))
if province is None:
continue
province = get_province(province)
if (date, province) in skip or skip_valid(df, (date, province), valid):
print("s", end="")
continue
if (wb := get_wb("D2_NewTL (2)")) is None:
continue
row = pd.DataFrame()
row = row.combine_first(workbook_series(wb, "D2_NewTL", {
"AGG(stat_count)-alias": "Cases",
"DAY(txn_date)-value": "Date"
}))
last_update = row.last_valid_index()
        # TODO: ensure we are looking at the right province; can't seem to get the current selection from wb.getWorksheet("D2_Province")
        # Need to work out if the data has been updated yet. If it has, the last deaths should be today.
if date.date() > last_update.date():
# the date we are trying to get isn't the last deaths we know about. No new data yet
logger.warning("{} MOPH Dashboard {}", date.date(), f"Skipping {province} as data update={last_update}")
continue
row = row.combine_first(workbook_value(wb, date, "D2_Walkin", "Cases Walkin"))
row = row.combine_first(workbook_value(wb, date, "D2_Proact", "Cases Proactive"))
row = row.combine_first(workbook_value(wb, date, "D2_Prison", "Cases Area Prison"))
row = row.combine_first(workbook_value(wb, date, "D2_NonThai", "Cases Imported"))
row = row.combine_first(workbook_value(wb, date, "D2_New", "Cases"))
row = row.combine_first(workbook_value(wb, date, "D2_Death", "Deaths"))
row = row.combine_first(workbook_series(wb, "D2_DeathTL", {
"AGG(num_death)-value": "Deaths",
"DAY(txn_date)-value": "Date"
}, end=date))
row = row.combine_first(workbook_series(wb, "D_Lab2", {
"AGG(% ติดเฉลี่ย)-value": "Positive Rate Dash",
"DAY(txn_date)-value": "Date",
}, np.nan))
row = row.combine_first(workbook_series(wb, "D_Lab", {
"AGG(% ติดเฉลี่ย)-value": "Positive Rate Dash",
"DAY(txn_date)-value": "Date",
}, np.nan))
row = row.combine_first(workbook_series(wb, "D2_Vac_Stack", {
"DAY(txn_date)-value": "Date",
"vaccine_plan_group-alias": {
"1": "1 Cum",
"2": "2 Cum",
"3": "3 Cum",
},
"SUM(vaccine_total_acm)-value": "Vac Given",
}))
row['Province'] = province
df = row.reset_index("Date").set_index(["Date", "Province"]).combine_first(df)
logger.info("{} MOPH Dashboard {}", date.date(),
row.loc[row.last_valid_index():].to_string(index=False, header=False))
if USE_CACHE_DATA:
# Save as we go to help debugging
export(df, "moph_dashboard_prov", csv_only=True, dir="inputs/json")
export(df, "moph_dashboard_prov", csv_only=True, dir="inputs/json") # Speeds up things locally
return df
def skip_valid(df, idx_value, allow_na={}):
"return true if we have a value already for this index or the value isn't in range"
if type(idx_value) == tuple:
date, prov = idx_value
idx_value = (str(date.date()) if date else None, prov)
else:
date = idx_value
idx_value = str(date.date()) if date else None
# Assume index of df is in the same order as params
if df.empty:
return False
def is_valid(column, date, idx_value, limits=None):
if limits is None:
limits = allow_na.get(column, None)
if limits is None:
return True
elif type(limits) in (list, tuple) and type(limits[0]) in (list, tuple):
# multiple rules. recurse
return all([is_valid(column, date, idx_value, limits=rule) for rule in limits])
maxdate = today()
mins = []
if type(limits) in [tuple, list]:
mindate, *limits = limits
if limits:
maxdate, *mins = limits
elif limits is None:
mindate = d("1975-1-1")
else:
mindate = limits
        if not (date is None or mindate <= date <= maxdate):
return True
try:
val = df.loc[idx_value][column]
except KeyError:
return False
if pd.isna(val):
return False
if not mins:
return True
min_val, *max_val = mins
if type(val) == str:
return False
elif min_val > val:
return False
elif max_val and val > max_val[0]:
return False
else:
return True
# allow certain fields null if before set date
nulls = [c for c in set(list(allow_na.keys()) + list(df.columns)) if not is_valid(c, date, idx_value)]
if not nulls:
return True
else:
        logger.info("{} MOPH Dashboard: missing data at {} for {}. Retry", date, idx_value, nulls)
return False
def check_dash_ready():
print("Waiting for today's dashboard update ")
gottoday = todays_data()
print(gottoday)
sys.exit(0 if gottoday else 1)
if __name__ == '__main__':
# check_dash_ready()
# dash_ages_df = dash_ages()
# This doesn't add any more info since severe cases was a mistake
# dash_trends_prov_df = dash_trends_prov()
df = import_csv("combined", index=["Date"], date_cols=["Date"])
prov = import_csv("cases_by_province", index=["Date", "Province"], date_cols=["Date"])
dash_by_province_df = dash_province_weekly()
# df = dash.combine(df, lambda s1, s2: s1)
# df = briefings.combine(df, lambda s1, s2: s1)
    vaccols = [f"Vac Given {i} Cum" for i in range(1, 5)]
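    # cum2daily (from utils_pandas) appears to diff the cumulative "* Cum" columns
    # into daily values; the vaccine dose totals are excluded so they stay cumulative.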
daily_prov = cum2daily(dash_by_province_df, exclude=vaccols)
dash_by_province_daily = dash_by_province()
prov = dash_by_province_daily.combine_first(daily_prov).combine_first(prov)
# Write this one as it's imported
export(prov, "cases_by_province", csv_only=True)
    dash_weekly_df = dash_weekly()
    dash_weekly_df = dash_weekly_df.loc[dash_weekly_df.index.notnull()]  # remove some bad data
    daily = cum2daily(dash_weekly_df, exclude=vaccols)
    # convert the weekly death breakdowns back to a daily series
    daily_deaths = weekly2daily(dash_weekly_df[[c for c in dash_weekly_df.columns if "Deaths " in c]])
df = daily.combine_first(df).combine_first(daily_deaths)
export(df, "combined", csv_only=True)
covid_plot_deaths.save_deaths_plots(df)
covid_plot_cases.save_cases_plots(df)
covid_plot_vacs.save_vacs_prov_plots(df)
covid_plot_vacs.save_vacs_plots(df)