-
Notifications
You must be signed in to change notification settings - Fork 15
/
covid_data_vac.py
1253 lines (1131 loc) · 64.8 KB
/
covid_data_vac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import copy
import datetime
import functools
import json
import os
import re
import time
import numpy as np
import pandas as pd
import requests
from dateutil.parser import parse as d
import covid_plot_vacs
from utils_pandas import daily2cum
from utils_pandas import export
from utils_pandas import import_csv
from utils_scraping import any_in
from utils_scraping import camelot_cache
from utils_scraping import get_next_number
from utils_scraping import get_next_numbers
from utils_scraping import logger
from utils_scraping import MAX_DAYS
from utils_scraping import NUM_OR_DASH
from utils_scraping import pairwise
from utils_scraping import parse_file
from utils_scraping import parse_numbers
from utils_scraping import replace_matcher
from utils_scraping import split
from utils_scraping import USE_CACHE_DATA
from utils_scraping import web_files
from utils_scraping import web_links
from utils_thai import area_crosstab
from utils_thai import file2date
from utils_thai import find_thai_date
from utils_thai import get_province
from utils_thai import join_provinces
from utils_thai import today
pd.set_option('future.no_silent_downcasting', True)
################################
# Vaccination reports
################################
# def get_vaccination_coldchain(request_json, join_prov=False):
# logger.info("Requesting coldchain: {}", request_json)
# if join_prov:
# df_codes = pd.read_html("https://en.wikipedia.org/wiki/ISO_3166-2:TH")[0]
# codes = [code for code, prov, ptype in df_codes.itertuples(index=False) if "special" not in ptype]
# provinces = [
# prov.split("(")[0] for code, prov, ptype in df_codes.itertuples(index=False) if "special" not in ptype
# ]
# provinces = [get_province(prov) for prov in provinces]
# else:
# provinces = codes = [None]
# url = "https://datastudio.google.com/batchedDataV2?appVersion=20210506_00020034"
# with open(request_json) as fp:
# post = json.load(fp)
# specs = post['dataRequest']
# post['dataRequest'] = []
# def set_filter(filters, field, value):
# for filter in filters:
# if filter['filterDefinition']['filterExpression']['queryTimeTransformation']['dataTransformation'][
# 'sourceFieldName'] == field:
# filter['filterDefinition']['filterExpression']['stringValues'] = value
# return filters
# def make_request(post, codes):
# for code in codes:
# for spec in specs:
# pspec = copy.deepcopy(spec)
# if code:
# set_filter(pspec['datasetSpec']['filters'], "_hospital_province_code_", [code])
# post['dataRequest'].append(pspec)
# try:
# r = requests.post(url, json=post, timeout=120)
# except requests.exceptions.ReadTimeout:
# logger.info("Timeout so using cached {}", request_json)
# with open(os.path.join("inputs", "json", request_json), ) as fp:
# data = fp.read()
# else:
# _, _, data = r.text.split("\n")
# data = json.loads(data)
# if any(resp for resp in data['dataResponse'] if 'errorStatus' in resp):
# # raise Exception(resp['errorStatus']['reasonStr'])
# # read from cache if possible
# with open(os.path.join("inputs", "json", request_json)) as fp:
# data = json.load(fp)
# else:
# with open(os.path.join("inputs", "json", request_json), "w") as fp:
# fp.write(data)
# for resp in (resp for resp in data['dataResponse'] if 'errorStatus' in resp):
# # raise Exception(resp['errorStatus']['reasonStr'])
# pass
# for resp in (resp for resp in data['dataResponse'] if 'errorStatus' not in resp):
# yield resp
# if join_prov:
# dfall = pd.DataFrame(columns=["Date", "Province", "Vaccine"]).set_index(["Date", "Province", "Vaccine"])
# else:
# dfall = pd.DataFrame(columns=["Date"]).set_index(["Date"])
# for prov_spec, data in zip([(p, s) for p in provinces for s in specs], make_request(post, codes)):
# prov, spec = prov_spec
# fields = [(f['name'], f['dataTransformation']['sourceFieldName']) for f in spec['datasetSpec']['queryFields']]
# for datasubset in data['dataSubset']:
# colmuns = datasubset['dataset']['tableDataset']['column']
# df_cols = {}
# date_col = None
# for field, column in zip(fields, colmuns):
# fieldname = dict(_vaccinated_on_='Date',
# _manuf_name_='Vaccine',
# datastudio_record_count_system_field_id_98323387='Vac Given').get(field[1], field[1])
# nullIndex = column['nullIndex']
# del column['nullIndex']
# if column:
# field_type = next(iter(column.keys()))
# conv = dict(dateColumn=d, datetimeColumn=d, longColumn=int, doubleColumn=float,
# stringColumn=str)[field_type]
# values = [conv(i) for i in column[field_type]['values']]
# if conv == d:
# date_col = fieldname
# else:
# values = []
# # datastudio_record_count_system_field_id_98323387 = supply?
# for i in nullIndex: # TODO check we are doing this right
# values.insert(i, None)
# df_cols[fieldname] = values
# df = pd.DataFrame(df_cols)
# if not date_col:
# df['Date'] = today()
# else:
# df['Date'] = df[date_col]
# if prov:
# df['Province'] = prov
# df = df.set_index(["Date", "Province", "Vaccine"])
# else:
# df = df.set_index(['Date'])
# dfall = dfall.combine_first(df)
# return dfall
# vac given table
# <p>สรุปการฉดีวัคซีนโควิด 19 ตัง้แตว่ันที่ 7 มิถุนายน 2564
# ผลการใหบ้ริการ ณ วนัที ่23 มิถุนายน 2564 เวลา 18.00 น.
def vac_problem(daily, date, file, page):
    """Merge post-vaccination adverse-event counts from one report page into ``daily``.

    Pages that mention "Anaphylaxis" carry cumulative counts of the two
    tracked adverse events; any other page is passed through untouched.

    :param daily: DataFrame indexed by Date, accumulated across reports.
    :param date: report date used as the row index.
    :param file: source file path (unused here, kept for the common handler signature).
    :param page: extracted text of one PDF page.
    :return: ``daily`` with the new row combined in (existing values win).
    """
    if "Anaphylaxis" not in page:
        return daily
    anaphylaxis, rest = get_next_number(page, "Anaphylaxis")
    polyneuropathy, rest = get_next_number(page, "Polyneuropathy")
    record = [date, anaphylaxis, polyneuropathy]
    # NOTE(review): this tests whether `record` equals the *string* 'None',
    # which can never happen, so the assert is effectively a no-op. It was
    # presumably meant as any_in(record, None) — confirm before tightening.
    assert not any_in(['None'], record)
    frame = pd.DataFrame(
        [record],
        columns=[
            "Date",
            "Vac Problem Anaphylaxis",
            "Vac Problem Polyneuropathy",
        ],
    ).set_index("Date")
    return daily.combine_first(frame)
def vaccination_daily(daily, file_date, file, page):
    """Parse one page of the daily MOPH vaccination report into ``daily``.

    Extracts the report date, vaccine allocations (Sinovac/AstraZeneca) and
    cumulative doses given per priority group (medical staff, volunteers,
    frontline, over-60s, chronic disease, pregnant, general population,
    students, kids) for doses 1-3. The report layout changed repeatedly over
    2021-2022, hence the many date-specific special cases below.

    :param daily: DataFrame indexed by Date, accumulated across reports.
    :param file_date: date inferred from the file name (used for one fix-up).
    :param file: source file path, for error messages.
    :param page: extracted text of one PDF page.
    :return: ``daily`` with parsed rows combined in (existing values win).
    """
    # Skip pages that are not the vaccination summary page.
    if not re.search(r"(ให้หน่วยบริกำร|ใหห้นว่ยบริกำร|วคัซนีโควดิ 19|ริการวัคซีนโควิด 19|ผู้ได้รับวัคซีนเข็มที่ 1)", page):  # noqa
        return daily
    elif "ายหลังได้รับวัคซีน" in page:
        # Adverse-events page — handled by vac_problem instead.
        return daily
    first_line, *_, last_line = page.split("\n\n")
    date = find_thai_date(first_line, all=True) or find_thai_date(last_line, all=True)  # 2021-01-11 has date range
    if date:
        date = date[-1]
    else:
        # prior to 2022 we could just get the first date of the page
        date = max(find_thai_date(page, all=True))
        # page.split("ณ วันที่")[-1]
    assert date, f"No date found in {file}: {page}"
    # if date in [d("2022-01-20")]:
    #     pass
    # else:
    #     assert docdate == date
    # One report carried a stale date; trust the file name in that case.
    if date == d("2022-09-30") and date != file_date:
        date = d("2022-10-07")
    # fix numbers with spaces in them
    page = re.sub(r"(\d) (,\d)", r"\1\2", page)
    if date == d("2021-05-06"):
        page = re.sub(r",5 9 รำย", "", page)  # glitch on 2021-05-6
    # dose1_total, rest1 = get_next_number(page, "ได้รับวัคซีนเข็มที่ 1", until="โดส")
    # dose2_total, rest2 = get_next_number(page, "ได้รับวัคซีน 2 เข็ม", until="โดส")
    alloc_sv, rest = get_next_number(page, "Sinovac", until="โดส")
    alloc_az, rest = get_next_number(page, "AstraZeneca", until="โดส")
    # numbers, _ = get_next_numbers(page, "2 (รำย) รวม (โดส)")
    # if numbers:
    #     given1, given2, given_total, *_ = numbers
    # alloc_total, rest = get_next_number(page, "รวมกำรจัดสรรวัคซีนทั้งหมด", "รวมกำรจดัสรรวคัซนีทัง้หมด", until="โดส")
    # assert alloc_total == alloc_sv + alloc_az
    row = [date, alloc_sv, alloc_az]
    # assert not any_in(row, None)
    df = pd.DataFrame([row], columns=[
        "Date",
        "Vac Allocated Sinovac",
        "Vac Allocated AstraZeneca",
    ]).set_index("Date").infer_objects(copy=False).fillna(value=np.nan)
    # TODO: until make more specific to only reports for allocations
    daily = daily.combine_first(df)
    # If no priority-group breakdown appears, allocations were all this page had.
    if not re.search(r"(ากรทางการแพท|บุคคลที่มีโรคประจ|ากรทางการแพทย|กรทำงกำรแพทย์)", page):
        # logger.info("{} Vac Sum (Missing groups) {} {}", date.date(), df.to_string(header=False, index=False), file)
        assert not d("2021-07-12") < date < d("2022-12-02")
        return daily

    def clean_num(numbers):
        # Strip label numbers (ages, "7 diseases" etc.) that leak in with the data.
        if len(numbers) > 8:
            return [n for n in numbers if n not in (60, 17, 12, 7, 3, 5, 11)]
        else:
            return [n for n in numbers if n not in (60, 7)]

    page = re.sub("ผัสผู้ป่วย 1,022", "", page)  # 2021-05-06
    # Daily totals at the bottom often make it harder to get the right numbers
    # ส ำหรับรำยงำนจ ำนวนผู้ได้รับวัคซีนโควิด 19 เพิ่มขึ้นในวันที่ 17 ตุลำคม 2564 มีผู้ได้รับวัคซีนทั้งหมด
    gtext, *_ = re.split("(?:หรับรำยงำนจ|าหรับรายงานจ)", page)
    # Per-dose number runs: the text between the dose-1 and dose-2 headings etc.
    d1_num, rest1 = get_next_numbers(gtext,
                                     r"1\s*(?:จํานวน|จำนวน|จ ำนวน)",
                                     r"เข็ม(?:ท่ี|ที่) 1 จํานวน",
                                     r"นเข็มที่ 1 จ", r"ซนีเขม็ที่ 1 จ",
                                     until=r"(?:2 เข็ม)", return_until=True, require_until=True)
    d2_num, rest2 = get_next_numbers(gtext,
                                     r"ได้รับ\s*วัคซีน\s+2\s+เข็ม",
                                     r"ไดรับวัคซีน\s+2\s+เข็ม",
                                     r"วัคซีน 2 เข็ม",
                                     until=r"(?:ดังรูป|โควิด 19|จังหวัดที่|\(Booster\s*dose\))", return_until=True, require_until=True)
    d3_num, rest3 = get_next_numbers(gtext, r"\(Booster dose\)", until="ดังรูป", return_until=True)
    if not len(clean_num(d1_num)) == len(clean_num(d2_num)):
        if date > d("2021-04-24"):
            ld1, ld2 = len(clean_num(d1_num)), len(clean_num(d2_num))
            # NOTE(review): trailing comma makes `error` a 1-tuple, not a str.
            error = f"ERROR groups first dose1=({ld1}). groups for dose2=({ld2}) in {file} for {date}",
            logger.error(error)
            assert False, error
        else:
            logger.info("{} Vac Sum (Error groups) {} {}", date.date(), df.to_string(header=False, index=False), file)
            return daily
    # assert len(d3_num) == 0 or len(d3_num) == len(d2_num)
    is_risks = re.compile(
        r"(บุคคลที่มีโรคประจ|บุคคลท่ีมีโรคประจําตัว|ผู้ที่มีอายุตั้งแต่ 60|ผู้ท่ีมีอายุต้ังแต่ 60|จำนวน|ได้รับวัคซีน 2|7 กลุ)")
    # `row` carries the previous dose's group values across loop iterations
    # (row[0] = previous dose's medical count, used in a sanity fix below).
    row = [None] * 9
    for dose, numbers, rest in [(1, d1_num, rest1), (2, d2_num, rest2), (3, d3_num, rest3)]:
        cols = [
            "Date",
            f"Vac Given {dose} Cum",
            f"Vac Group Medical All {dose} Cum",
            f"Vac Group Medical Staff {dose} Cum",
            f"Vac Group Health Volunteer {dose} Cum",
            f"Vac Group Other Frontline Staff {dose} Cum",
            f"Vac Group Over 60 {dose} Cum",
            f"Vac Group Risk: Disease {dose} Cum",
            f"Vac Group Risk: Pregnant {dose} Cum",
            f"Vac Group Risk: Location {dose} Cum",
            f"Vac Group Student {dose} Cum",
            f"Vac Group Kids {dose} Cum",
        ]
        numbers = clean_num(numbers)  # remove 7 chronic diseases and over 60 from numbers
        if (num_len := len(numbers)) in (6, 7, 8, 9) and is_risks.search(rest):
            if num_len < 7 and date < d("2022-01-18"):
                # Early fixed-order layout: 6 values in a known sequence.
                total, med_all, frontline, over60, chronic, area = numbers
                pregnant = volunteer = medical = student = kids = None
            else:
                # They changed around the order too much. have to switch to picking per category
                total, *_ = numbers
                medical = get_next_number(rest, r"างการแพท", r"งกำรแพท", until="(?:ราย|รำย)",
                                          return_rest=False, thainorm=True, asserted=True)
                # 2021-01-03 dropped frontline
                frontline = get_next_number(rest, r"นหน้ำ", r"านหน้า", r"านหนา", until="(?:ราย|รำย)",
                                            return_rest=False, thainorm=True, asserted=False)
                volunteer = get_next_number(rest, r"อาสาสมัคร", r"อำสำสมัคร", until="(?:ราย|รำย)",
                                            return_rest=False, thainorm=True, asserted=True)
                over60 = get_next_number(rest, r"60 *(?:ปี|ป)\s*?\s*(?:ขึ|ปี|ข้ึ)",
                                         until="(?:ราย|รำย)", return_rest=False, asserted=True)
                d7, chronic, *_ = get_next_numbers(rest, r"โรค", until="(?:ราย|รำย)",
                                                   return_rest=False, thainorm=True, asserted=True)
                assert d7 == 7
                pregnant = get_next_number(rest, r"งครร(?:ภ์|ภ)", until="(?:ราย|รำย)",
                                           return_rest=False, thainorm=True, asserted=False)
                area = get_next_number(rest, r"าชนทั่วไป", r"ประชาชน", r"ประชำชน", until="(?:ราย|รำย)",
                                       return_rest=False, thainorm=True, asserted=True)
                student = get_next_numbers(rest, r"ราย อายุ", r"นักเรียน", until="(?:ราย|รำย)",
                                           return_rest=False, thainorm=True, asserted=False)
                kids = get_next_number(rest, r"(?:อายุ|เด็กอำยุ) 5 ?- ?11 ปี", until="(?:ราย|รำย)",
                                       return_rest=False, thainorm=True, asserted=False)
                # `student` may capture the 12-17 age-range labels along with the count.
                if len(student) == 3:
                    d12, d17, student = student
                    assert (d12, d17) == (12, 17)
                elif len(student) == 0:
                    student = np.nan
                else:
                    # if something was captured into *student then hope it was the addition of students on 2021-10-06 or else...
                    raise Exception("Unexpected excess vaccination values found on {} in {}: {}", date, file, student)
                if row and row[0] and row[0] < medical:
                    # Dose 3 seems to be already adding volunteer to the medical number since 2022ish
                    medical = medical - volunteer
                med_all = medical + volunteer
            if date in [d("2021-08-11")] and dose == 2:
                frontline = None  # Wrong value for dose2
            row = [medical, volunteer, frontline, over60, chronic, pregnant, area, student, kids]
            if date in [d("2021-08-11")]:
                pass
            else:
                assert not any_in([None, np.nan], medical or med_all, over60, chronic, area)
            # Cross-check: group values should roughly sum to the dose total.
            total_row = [medical or med_all, volunteer, frontline, over60, chronic, pregnant, area, student, kids]
            total_row = sum(i for i in total_row if i and not pd.isna(i))
            if date in [d("2022-06-19"), d("2022-10-21")]:
                # 2022-06-19 - not sure which number is out?
                # 2022-10-21 - 3rd dose is 26622061 but doc says adds up to 32326428 ??
                pass
            else:
                assert 0.94 <= (total_row / total) <= 1.01
            df = pd.DataFrame([[date, total, med_all] + row], columns=cols).set_index("Date")
        elif dose == 3:
            # Booster breakdown is often partial; pad to the full column set.
            if len(numbers) == 2:
                numbers = numbers + [np.nan] * 9
            elif len(numbers) == 0:
                numbers = [np.nan] * 11
            df = pd.DataFrame([[date] + numbers], columns=cols).set_index("Date")
        elif numbers:
            # Only a total available — acceptable for early reports.
            assert date < d("2021-07-12"), f"{date} {file} can't find vac groups for dose {dose} in {gtext}"
            total, *_ = numbers
            df = pd.DataFrame([[date, total]], columns=[
                "Date",
                f"Vac Given {dose} Cum",
            ]).set_index("Date")
        else:
            assert date < d("2021-07-12"), f"{date} {file} can't find vac groups for dose {dose} in {gtext}"
            continue
        daily = daily.combine_first(df)
    daily = daily.fillna(value=np.nan)
    # logger.info("{} Vac Sum {} {}", date.date(), daily.loc[date:date].to_string(header=False, index=False), file)
    return daily
# Column layouts shared by the per-province vaccination table parsers below.
# Doses given (cumulative count + % of population) for 2 doses...
givencols = [
    "Date",
    "Province",
    "Vac Given 1 Cum",
    "Vac Given 1 %",
    "Vac Given 2 Cum",
    "Vac Given 2 %",
]
# ...extended to 3, 4 and 6 doses as boosters were added over time.
givencols3 = givencols + [
    "Vac Given 3 Cum",
    "Vac Given 3 %",
]
givencols4 = givencols3 + [
    "Vac Given 4 Cum",
    "Vac Given 4 %",
]
givencols6 = givencols4 + [
    "Vac Given 5 Cum",
    "Vac Given 5 %",
    "Vac Given 6 Cum",
    "Vac Given 6 %",
]
# 8 priority groups x 3 doses (note: `d` here is the comprehension-local dose
# index, not the dateutil `d` imported above — it does not leak out).
vaccols8x3 = givencols3 + [
    f"Vac Group {g} {d} Cum" for g in [
        "Medical Staff", "Health Volunteer", "Other Frontline Staff", "Over 60", "Risk: Disease", "Risk: Pregnant",
        "Risk: Location", "Student"
    ] for d in range(1, 4)
]
# Student vaccination figures did not exist prior to 2021-10-06
vaccols7x3 = [col for col in vaccols8x3 if "Student" not in col]
# Older report layouts: fewer groups and only 2 doses.
vaccols6x2 = [col for col in vaccols7x3 if " 3 " not in col and "Pregnant" not in col]
vaccols5x2 = [col for col in vaccols6x2 if "Volunteer" not in col]
# Single-group breakdowns used by the "% of population" tables.
vaccols_60s = ["Date", "Province"] + [f"Vac Group {g} {d} Cum" for g in ["Over 60"] for d in range(1, 4)]
vaccols_disease = ["Date", "Province"] + [f"Vac Group {g} {d} Cum" for g in ["Risk: Disease"] for d in range(1, 4)]
vaccols_medical = ["Date", "Province"] + [f"Vac Group {g} {d} Cum" for g in ["Medical Staff"] for d in range(1, 4)]
# Allocation tables: per-dose and per-manufacturer variants.
alloc2_doses = [
    "Date",
    "Province",
    "Vac Allocated Sinovac 1",
    "Vac Allocated Sinovac 2",
    "Vac Allocated AstraZeneca 1",
    "Vac Allocated AstraZeneca 2",
    "Vac Allocated Sinovac",
    "Vac Allocated AstraZeneca",
]
alloc2 = [
    "Date",
    "Province",
    "Vac Allocated Sinovac",
    "Vac Allocated AstraZeneca",
]
alloc4 = alloc2 + ["Vac Allocated Sinopharm", "Vac Allocated Pfizer", "Vac Allocated Moderna", ]
def vaccination_tables(df, _, page, file):
    """Parse per-province vaccination tables from one report page into ``df``.

    Detects which of the many historical table layouts the page uses
    ("alloc", "given", "old_given", "new_given", "july", "percent", "shot6")
    from the headings, then decodes each province row according to that
    layout. Layouts are keyed to the number of columns and report date.

    :param df: DataFrame indexed by (Date, Province), accumulated so far.
    :param _: unused date placeholder (kept for the common handler signature).
    :param page: extracted text of one PDF page.
    :param file: source file path, for error messages.
    :return: ``df`` with parsed rows combined in (existing values win).
    """
    page = page.replace("( ร้อยละ )", "( ร้อยละ )\n")  # Ensure heading on it's own line
    lines = [line.strip() for line in page.split('\n') if line.strip()]
    first_line = page.split("\n\n")[0]
    # 28 กมุภำพันธ์ 2564 – 6 กุมภำพันธ ์2565
    first_line = first_line.replace(" ์", " ")  # TODO: generalise to remove stray that accents?
    date = find_thai_date(first_line, all=True)  # 2021-01-11 has date range
    if not date:
        return df
    else:
        date = date[-1]
    rows = {}

    def add(prov, numbers, cols):
        # Merge a partial row into rows[(date, prov)]; earlier values win.
        assert rows.get((date, prov), None) is None or rows.get((date, prov), None).keys() != cols
        rows[(date, prov)] = {c: n for c, n in zip(cols, [date, prov] + numbers)} | rows.get((date, prov), {})

    # Heading patterns used to recognise which table follows.
    shots = re.compile(r"(เข็ม(?:ที|ที่|ท่ี)\s.?(?:1|2)\s*)")
    july = re.compile(r"\( *(?:ร้อยละ|รอ้ยละ) *\)", re.DOTALL)
    oldhead = re.compile(r"(เข็มที่ 1 วัคซีน|เข็มท่ี 1 และ|เข็มที ่1 และ)")

    def in_heading(pat):
        # Max number of pattern hits across the current heading lines.
        return max(len(pat.findall(h)) for h in headings)

    # Split the page into (headings, body) chunks at each table heading.
    _, *rest = split(lines, lambda x: (july.search(x) or shots.search(x) or oldhead.search(x)) and '2564' not in x)
    for headings, lines in pairwise(rest):  # NOTE: rebinds `lines` to each table body
        shot_count = in_heading(shots)
        table = {12: "new_given", 10: "given", 6: "alloc", 14: "july", 16: "july"}.get(shot_count)
        shot_nums = [int(t.strip()[-1]) for t in re.findall(r"(เข็(?:ม|ม)\s?(?:ที|ที่|ท่ี|ท่ี)\s.?[1-6]\s*)", page)]
        if max(shot_nums) == 6:
            table = "shot6"
        elif not table and in_heading(oldhead):
            table = "old_given"
        elif not table and in_heading(july) and in_heading(re.compile(r"(?:ร้อยละ|รอ้ยละ)")) and date > d("2021-08-01"):  # new % table
            table = "percent"
        elif not table and in_heading(july):
            table = "july"
        elif not table:
            continue
        added = None  # None until we see the first data row; then a row count
        for line in lines:
            # fix some number broken in the middle
            line = re.sub(r"(\d+ ,\d+)", lambda x: x.group(0).replace(" ", ""), line)
            area, *rest = line.split(' ', 1)
            if area in ["เข็มที่", "และ", "จ", "ควำมครอบคลุม", 'ตั้งแต่วันที่', 'หมายเหตุ', 'เขต', 'เข็ม']:  # Extra heading
                continue
            if area == "รวม" or not rest:
                continue  # previously meant end of table. Now can be part of header. 2021-08-14
            cols = [c.strip() for c in NUM_OR_DASH.split(rest[0]) if c.strip()]
            if len(cols) < 5:
                break
            if area in ["ผู้ท่ีมีอายุต้ังแต่", 'เข็มที'] and len(cols) < 6:
                # 2021-12-21, 2021-01-03 - header down the bottom
                continue
            if added is None:
                added = 0
            if NUM_OR_DASH.match(area):
                thaiprov, *cols = cols
            else:
                thaiprov = area
            prov = get_province(thaiprov)
            numbers = parse_numbers(cols)
            added += 1
            # Decode the row according to the detected table layout.
            if table == "alloc":
                sv1, sv2, az1, az2 = numbers[3:7]
                add(prov, [sv1, sv2, az1, az2, sv1 + sv2, az1 + az2], alloc2_doses)
            elif table == "given":
                if len(numbers) == 16:
                    alloc_sv, alloc_az, *numbers = numbers
                    add(prov, [alloc_sv, alloc_az], alloc2)
                assert len(numbers) == 14
                add(prov, numbers, vaccols5x2)
            elif table == "old_given":
                alloc, target_num, given, perc, *rest = numbers
                medical, frontline, disease, elders, riskarea, *rest = rest
                # TODO: #อยู่ระหว่ำง ระบุ กลุ่มเป้ำหมำย - In the process of specifying the target group
                # unknown = sum(rest)
                row = [given, perc, 0, 0] + [medical, 0, frontline, 0, disease, 0, elders, 0, riskarea, 0]
                add(prov, row, vaccols5x2)
                add(prov, [alloc, 0, 0, 0, alloc, 0], alloc2_doses)
            elif table == "new_given" and len(numbers) == 12:  # e.g. vaccinations/Daily report 2021-05-11.pdf
                dose1, dose2, *groups = numbers
                add(prov, [dose1, np.nan, dose2, np.nan] + groups, vaccols5x2)
            elif table == "new_given" and len(numbers) == 21:  # from 2021-07-20
                # Actually cumulative totals
                pop, alloc, givens, groups = numbers[0], numbers[1:4], numbers[4:8], numbers[9:21]
                sv, az, total_alloc = alloc
                add(prov, givens + groups + [pop], vaccols6x2 + ["Vac Population"])
                add(prov, [sv, az], alloc2)
            elif table == "july" and len(numbers) == 5:
                pop, given1, perc1, given2, perc2, = numbers
                row = [given1, perc1, given2, perc2]
                add(prov, row, givencols)
            elif table == "july" and len(numbers) in [31, 32, 33, 27, 21, 22, 17]:  # from 2021-08-05
                # Actually cumulative totals
                if len(numbers) == 21:
                    # Givens is a single total only 2021-08-16
                    pop, alloc, givens, groups = numbers[0], numbers[1:5], numbers[5:6], numbers[6:]
                    givens = [np.nan] * 6  # We don't use the total
                elif len(numbers) == 22 and date < datetime.datetime(2021, 10, 5):
                    # Givens has sinopharm in it too. 2021-08-15
                    pop, alloc, givens, groups = numbers[0], numbers[1:6], numbers[6:7], numbers[7:]
                    givens = [np.nan] * 6  # We don't use the total
                elif len(numbers) == 17:
                    # No allocations or givens 2021-08-10
                    pop, givens, groups = numbers[0], numbers[1:2], numbers[2:]
                    givens = [np.nan] * 6
                    alloc = [np.nan] * 4
                elif len(numbers) in [27, 33]:  # 2021-08-06, # 2021-08-05
                    pop, alloc, givens, groups = numbers[0], numbers[1:5], numbers[5:11], numbers[12:]
                elif len(numbers) == 31:  # 2021-10-05
                    pop, alloc, groups = numbers[0], numbers[1:6], numbers[7:]
                    # TODO: put in manuf given per province?
                    givens = [np.nan] * 6
                elif len(numbers) == 32:  # 2021-11-17
                    pop, alloc, groups = numbers[0], numbers[1:7], numbers[8:]
                    # TODO: put in manuf given per province?
                    givens = [np.nan] * 6
                else:
                    assert False
                sp = md = np.nan
                # Allocation column count grew as manufacturers were added.
                if len(alloc) == 4:  # 2021-08-06
                    sv, az, pf, total_alloc = alloc
                elif len(alloc) == 5:  # 2021-08-15
                    sv, az, sp, pf, total_alloc = alloc
                elif len(alloc) == 6:  # 2021-08-15
                    sv, az, sp, pf, md, total_alloc = alloc
                else:
                    assert False
                if date >= d("2021-12-05"):
                    # seems like they stopped having columns for moderna so total always more
                    pass
                elif not pd.isna(total_alloc):
                    assert sum([m for m in [sv, az, pf, sp, md] if not pd.isna(m)]) == total_alloc, f"Error{date} {file}"
                if len(groups) == 15:  # 2021-08-06
                    # medical has 3 doses, rest 2, so insert some Nones
                    for i in range(5, len(groups) + 6, 3):
                        groups.insert(i, np.nan)
                if len(groups) < 24:
                    groups = groups + [np.nan] * 3  # students
                add(prov, givens + groups + [pop], vaccols8x3 + ["Vac Population"])
                add(prov, [sv, az, sp, pf, md], alloc4)
            elif table == "percent" and len(numbers) in [13]:  # 2021-08-10
                # extra table with % per population for over 60s and totals
                pop, d1, d1p, d2, d2p, d3, d3p, total, pop60, d60_1, d60_1p, d60_2, d60_2p = numbers
                add(prov, [d1, d1p, d2, d2p, d3, d3p], givencols3)
            elif table == "percent" and len(numbers) in [18, 22, 15, 14]:
                # extra table with % per population for over 60s and totals - 2021-09-09, 2021-10-05
                if len(numbers) == 14:  # 2022-02-06
                    pop, d1, d1p, d2, d2p, d3, d3p, *numbers2 = numbers
                else:
                    pop, d1, d1p, d2, d2p, d3, d3p, total, *numbers2 = numbers
                add(prov, [d1, d1p, d2, d2p, d3, d3p, pop], givencols3 + ["Vac Population"])
                # Over 60s
                cols = vaccols_60s + ["Vac Population Over 60s"]
                if len(numbers) == 22:
                    pop, d1, d1p, d2, d2p, d3, d3p, *numbers3 = numbers2
                elif len(numbers) in [15, 14]:  # 2022-03-27 # 2022-02-06
                    pop, d1, d1p, d2, d2p, d3, d3p, *numbers3 = numbers2
                    if "ในกลุ่มบุคลำกรทำงกำรแพทย์และสำธำรณสุข" in page:
                        cols = vaccols_medical + ["Vac Population Medical Staff"]
                else:
                    pop, d1, d1p, d2, d2p, *numbers3 = numbers2
                    d3, d3p = [np.nan] * 2
                add(prov, [d1, d2, d3, pop], cols)
                # Disease
                if len(numbers) == 22:
                    pop, d1, d1p, d2, d2p, d3, d3p, *numbers4 = numbers3
                elif len(numbers) in [15, 14]:
                    pop, d1, d1p, d2, d2p, d3, d3p, *numbers4 = [np.nan] * 7
                else:
                    pop, d1, d1p, d2, d2p, *numbers4 = numbers3
                    d3, d3p = [np.nan] * 2
                add(prov, [d1, d2, d3, pop], vaccols_disease + ["Vac Population Risk: Disease"])
                assert not numbers4
            elif table == "percent" and len(numbers) in [7, 8]:  # 2022-02 changed to simpler table
                pop, d1, d1p, d2, d2p, d3, d3p, *total = numbers
                add(prov, [d1, d1p, d2, d2p, d3, d3p, pop], givencols3 + ["Vac Population"])
            elif table == "percent" and len(numbers) in [9, 10]:  # 2022-07 gen pop and 4 doses
                pop, d1, d1p, d2, d2p, d3, d3p, d4, d4p, *total = numbers
                add(prov, [d1, d1p, d2, d2p, d3, d3p, d4, d4p, pop], givencols4 + ["Vac Population"])
            elif table == "shot6":
                pop, d1, d1p, d2, d2p, d3, d3p, d4, d4p, d5, d5p, d6, d6p, *total = numbers
                add(prov, [d1, d1p, d2, d2p, d3, d3p, d4, d4p, d5, d5p, d6, d6p, pop], givencols6 + ["Vac Population"])
            else:
                assert False, f"No vac table format match for {len(numbers)} cols in {file} {str(date)}"
        # A recognised table should have yielded most provinces.
        assert added is None or added > 7
    rows = pd.DataFrame.from_dict(rows, orient='index')
    rows = rows.set_index(["Date", "Province"]).fillna(np.nan) if not rows.empty else rows
    # Sanity check: percentage columns should be plausible percentages.
    percents = rows[list(rows.columns.intersection([f'Vac Given {i} %' for i in range(1, 5)]))].fillna(0)
    assert (percents < 500).all().all(), f"{file} {date}: wrong allocations"
    # if 'Vac Given 1 Cum' in rows.columns:
    #     rows['Vac Given Cum'] = rows[list(rows.columns.intersection([f'Vac Given {i} Cum' for i in range(1, 5)]))].sum(axis=1)
    return df.combine_first(rows) if not rows.empty else df
# def vaccination_reports_files():
# # also from https://ddc.moph.go.th/vaccine-covid19/diaryReportMonth/08/9/2021
# folders = web_links("https://ddc.moph.go.th/dcd/pagecontent.php?page=643&dept=dcd",
# ext=None, match=re.compile("2564"))
# links = (link for f in folders for link in web_links(f, ext=".pdf"))
# url = "https://ddc.moph.go.th/uploads/ckeditor2/files/Daily report "
# gen_links = (f"{url}{f.year}-{f.month:02}-{f.day:02}.pdf"
# for f in reversed(list(daterange(d("2021-05-20"), today(), 1))))
# links = unique_values(chain(links, gen_links)) # Some were not listed on the site so we guess
# links = sorted(links, key=lambda f: date if (date := file2date(f)) is not None else d("2020-01-01"), reverse=True)
# for link in links:
# date = file2date(link)
# if not date or date <= d("2021-02-27"):
# continue
# date = date - datetime.timedelta(days=1) # TODO: get actual date from titles. maybe not always be 1 day delay
# if USE_CACHE_DATA and date < today() - datetime.timedelta(days=MAX_DAYS - 1):
# break
# def get_file(link=link):
# try:
# file, _, _ = next(iter(web_files(link, dir="vaccinations")))
# except StopIteration:
# return None
# return file
# yield link, date, get_file
# def vac_slides_files(check=True):
# # Also at https://ddc.moph.go.th/dcd/pagecontent.php?page=648&dept=dcd
# base = "https://ddc.moph.go.th/vaccine-covid19/diaryPresentMonth"
# folders = [f"{base}/{m}/10/{y}" for y in range(2021, 2023) for m in range(1, 13)]
# links = sorted((link for f in folders for link in web_links(f, ext=".pdf", check=check)), reverse=True)
# count = 0
# for link in links:
# if USE_CACHE_DATA and count > MAX_DAYS:
# break
# count += 1
# def dl_file(link=link):
# file, _, _ = next(iter(web_files(link, dir="inputs/vaccinations")))
# return file
# yield link, None, dl_file
def vac_slides_files(check=0):
    """Locate the DDC daily vaccination *slide* PDFs.

    Thin wrapper around :func:`vaccination_reports_files2` pointed at the
    slide-deck pages instead of the default daily-report pages.

    :param check: number of most-recent links to re-verify (passed through).
    :return: list of ``(link, None, get_file)`` tuples.
    """
    slide_sources = dict(
        base1="https://ddc.moph.go.th/dcd/pagecontent.php?page=648&dept=dcd",
        base2="https://ddc.moph.go.th/vaccine-covid19/diaryPresentMonth/{m:02}/10/2021",
        ext=".pdf",
    )
    return vaccination_reports_files2(check=check, **slide_sources)
@functools.lru_cache  # results cached per argument tuple for the process lifetime
def vaccination_reports_files2(check=0,
                               base1="https://ddc.moph.go.th/dcd/pagecontent.php?page=643&dept=dcd",
                               base2="https://ddc.moph.go.th/vaccine-covid19/diaryReportMonth/{m:02}/9/2021",
                               ext=".pdf",
                               timeout=90,
                               ):
    """Discover downloadable daily vaccination report PDFs from the DDC site.

    Crawls ``base1`` (year pages -> month pages -> PDF links) and the older
    ``base2`` month folders, returning lazy download callables so files are
    only fetched when needed.

    :param check: how many of the first links should be re-verified on download.
    :param base1: index page listing report folders by year.
    :param base2: URL template for the legacy per-month folder listing.
    :param ext: file extension to collect from ``base2`` folders.
    :param timeout: per-request timeout in seconds.
    :return: list of ``(link, None, get_file)`` tuples, where ``get_file()``
             downloads the PDF and returns its local path (or None on failure).
    """
    # https://ddc.moph.go.th/vaccine-covid19/diaryReport
    # or https://ddc.moph.go.th/dcd/pagecontent.php?page=643&dept=dcd
    # more reliable from dec 2021 and updated quicker
    # Matches Buddhist-era years 2564-2567 not preceded by a dot (avoids versions).
    hasyear = re.compile(r"(?<!\.) *256[4-7]")
    # re.compile(r"(ประจำปีงบประมาณ|งบประมาณ)\w*25")
    # if not check:
    #     use_proxy = False
    # else:
    #     try:
    #         use_proxy = requests.head("https://ddc.moph.go.th", timeout=80).status_code >= 400
    #     except requests.exceptions.RequestException:
    #         use_proxy = True
    use_proxy = False

    def avoid_redirect(links):
        # Force https to skip the http->https redirect round-trip.
        return (url.replace("http://", "https://") for url in links)

    link_check = 2 if check else 0
    years = list(web_links(base1, ext="dept=dcd", match=hasyear, check=link_check, proxy=use_proxy, timeout=timeout))
    months = [link for link in web_links(*avoid_redirect(years), ext="dept=dcd",
                                         match=hasyear, check=link_check, proxy=use_proxy, timeout=timeout)]
    # Keep PDFs that either have no parseable date but look like weekly reports,
    # or have a date from 2021-09-01 onwards.
    links1 = [link for link in web_links(*avoid_redirect(months), ext=".pdf", check=link_check, proxy=use_proxy, timeout=timeout) if (
        date := file2date(link)) is None and (any_in(link.lower(), *['wk', "week", "รง.", "rep"])) or date is not None and date >= d("2021-09-01")]
    # รง.สรุปวัคซีนประจำสัปดาห์ที่ 23 (6 -9 มิ.ย.66).pdf  (example weekly-report filename)
    # text = รายงานสรุปวัคซีน
    # this set was more reliable for awhile. Need to match tests
    folders = [base2.format(m=m) for m in range(11, 2, -1)]
    links2 = (link for f in folders for link in reversed(
        list(web_links(f, ext=ext, check=0, proxy=use_proxy, timeout=timeout, ignore_errors=True))))
    links = list(links1) + list(links2)
    count = 0
    res = []
    # Only the first `check` links get re-verified when downloaded.
    checks = [True] * check + [False] * (len(links) - check)
    for link, check in zip(links, checks):  # NOTE: deliberately rebinds the `check` parameter
        if any_in(link, "1638863771691.pdf", '1639206014644.pdf', "Slide") and "Report" in base2:
            # it's really slides
            continue

        def get_file(link=link, check=check):
            # Lazy downloader; defaults bind the current link/check values.
            try:
                file, _, _ = next(iter(web_files(link, dir="inputs/vaccinations",
                                                 proxy=use_proxy, timeout=timeout, check=check)))
            except StopIteration:
                return None
            return file
        count += 1
        if USE_CACHE_DATA and count > MAX_DAYS:
            break
        res.append((link, None, get_file))
    return res
def vaccination_reports():
    """Download and parse the DDC daily/weekly vaccination PDF reports.

    Iterates over the report files discovered by vaccination_reports_files2,
    extracting per-day national totals (via vaccination_daily / vac_problem)
    and the per-province tables (via vaccination_tables).

    Returns:
        (vac_daily, vac_prov_reports): two DataFrames indexed by
        ["Date"] and ["Date", "Province"] respectively.
    """
    vac_daily = pd.DataFrame(columns=['Date']).set_index("Date")
    vac_prov_reports = pd.DataFrame(columns=['Date', 'Province']).set_index(["Date", "Province"])
    # add in newer https://ddc.moph.go.th/uploads/ckeditor2//files/Daily%20report%202021-06-04.pdf
    # Just need the latest
    for link, date, dl in vaccination_reports_files2(check=0):
        # dl is a lazy downloader; None means the file couldn't be fetched.
        if (file := dl()) is None:
            continue
        table = pd.DataFrame(columns=["Date", "Province"]).set_index(["Date", "Province"])
        for page in parse_file(file):
            if date is None:
                # The filename had no date; pull it from the page text after
                # the Thai word "ข้อมูล" ("data [as of] ...").
                *rest, with_date = page.split("ข้อมูล", 2)
                if rest:
                    # could be 2 dates on the front page; take the latest
                    date = max(find_thai_date(with_date, all=True))
                else:
                    date = find_thai_date(page)
            table = vaccination_tables(table, date, page, file)
            vac_daily = vaccination_daily(vac_daily, date, file, page)
            vac_daily = vac_problem(vac_daily, date, file, page)
        # Column counts used only for the progress log line below:
        # gcount/dcount = groups/doses found in the province table,
        # sumgroups/sumdoses = groups/doses found in the daily totals.
        gcount = len([c for c in table.columns if 'Population' in c]) - 1
        dcount = int(len([c for c in table.columns if 'Vac Given' in c]) / 2)
        sumdoses = len([c for c in vac_daily.columns if 'Vac Given' in c])
        sumgroups = int(len([c for c in vac_daily.columns if 'Vac Group' in c]) / sumdoses) if sumdoses else 0
        logger.info(
            f"Vac Report: {date} totals= {sumgroups}g * {sumdoses}d, Table= {len(table)}p * {gcount}g * {dcount}d : {file}")
        # Sanity-check the province table before merging it in; known-bad
        # report dates are dropped or whitelisted explicitly.
        # TODO: move this into vaccination_tables so can be tested
        if date in [d("2021-12-11"), d("2022-01-06")] and table.empty:
            logger.info("{} doc has slides instead of report", date)
            continue
        elif d("2021-05-04") <= date <= d("2021-09-23") and len(table) < 77:
            logger.warning("{} Dropping table: too few provinces", date)
            continue
        elif d("2021-04-09") <= date <= d("2021-05-03") and table.groupby("Date").count().iloc[0]['Vac Group Risk: Location 1 Cum'] != 77:
            # counts = table.groupby("Date").count()
            # missing_data = counts[counts['Vac Allocated AstraZeneca'] > counts['Vac Group Risk: Location 2 Cum']]
            # if not missing_data.empty:
            logger.warning("{} Dropping table: alloc doesn't match prov", date)
            continue
        elif date in [d("2022-02-06")]:
            # TODO: how to fix this?
            logger.warning("{} Dropping table: missing headers on extra pages", date)
            continue
        elif date < d("2021-08-01") or date in [d("2022-03-27"), d("2022-07-10")]:
            # TODO: 2022-03-27: work out why only 76 prov
            pass
        elif date in [d("2023-01-20")]:
            # locally it has 77 but on the github 74. Why??
            pass
        else:
            # Thailand has 77 provinces; anything else means a parse problem.
            assert len(table) == 77
        vac_prov_reports = vac_prov_reports.combine_first(table)

        # Cross-check that daily totals were also extracted for this date;
        # several report dates are known to mislabel the date at the top.
        if date < d("2021-12-11"):
            # TODO: find days where day is yesterdays and fix them, or fix the check
            pass
        elif date in [d("2022-01-21"), d("2022-01-04"), d("2021-12-11"), d("2022-10-23"), d("2022-11-27")]:
            # "2022-01-21": is actually "2022-01-20"
            # TODO: maybe just stop trusting the date at the top of the doc?
            pass
        elif date >= d("2022-12-02"):
            # as of Report weekly 48 28-11-2-12-2022.pdf" - no longer has the groups
            pass
        else:
            assert date in vac_daily.index, "either missing daily data or got wrong date"

    # Do cross check we got the same number of allocations to vaccination
    # if not vac_prov_reports.empty:
    #     # counts = vac_prov_reports.groupby("Date").count()
    #     # missing_data = counts[counts['Vac Allocated AstraZeneca'] > counts['Vac Group Risk: Location 2 Cum']]
    #     # # 2021-04-08 2021-04-06 2021-04-05- 03-02 just not enough given yet
    #     # missing_data = missing_data["2021-04-09": "2021-05-03"]
    #     # # 2021-05-02 2021-05-01 - use images for just one table??
    #     # # We will just remove this days
    #     # vac_prov_reports = vac_prov_reports.drop(index=missing_data.index)
    #     # # After 2021-05-08 they stopped using allocation table. But cum should now always have 77 provinces
    #     # # TODO: only have 76 prov? something going on
    #     # missing_data = counts[counts['Vac Given 1 Cum'] < 77]["2021-05-04":]
    #     # vac_prov_reports = vac_prov_reports.drop(index=missing_data.index)
    # # Just in case coldchain data not working

    # it's possible we got the same data twice from the two different reports;
    # keep the first occurrence of each index entry.
    vac_daily = vac_daily[~vac_daily.index.duplicated(keep='first')]
    vac_prov_reports = vac_prov_reports[~vac_prov_reports.index.duplicated(keep='first')]
    return vac_daily, vac_prov_reports
# def get_vac_coldchain():
# vac_import = get_vaccination_coldchain("vac_request_imports.json", join_prov=False)
# if not vac_import.empty:
# vac_import["_vaccine_name_"] = vac_import["_vaccine_name_"].apply(replace_matcher(["Astrazeneca", "Sinovac"]))
# vac_import = vac_import.drop(columns=['_arrive_at_transporter_']).pivot(columns="_vaccine_name_",
# values="_quantity_")
# vac_import.columns = [f"Vac Imported {c}" for c in vac_import.columns]
# vac_import = vac_import.fillna(0)
# vac_import['Vac Imported'] = vac_import.sum(axis=1)
# vac_import = vac_import.combine_first(daily2cum(vac_import))
# # Delivered Vac data from coldchain
# vac_delivered = get_vaccination_coldchain("vac_request_delivery.json", join_prov=False)
# vac_delivered = join_provinces(vac_delivered, '_hospital_province_')
# # TODO: save delivered by prov somewhere. note some hospitals unknown prov
# vac_delivered = vac_delivered.reset_index()
# vac_delivered['Date'] = vac_delivered['Date'].dt.floor('d')
# vac_delivered = vac_delivered[['Date', '_quantity_']].groupby('Date').sum()
# vac_delivered = vac_delivered.rename(columns=dict(_quantity_='Vac Delivered'))
# vac_delivered['Vac Delivered Cum'] = vac_delivered['Vac Delivered'].fillna(0).cumsum()
# # per prov given from coldchain
# vacct = get_vaccination_coldchain("vac_request_givenprov.json", join_prov=True)
# vacct = vacct.reset_index().set_index("Date").loc['2021-02-28':].reset_index().set_index(['Date', 'Province'])
# vacct = vacct.reset_index().pivot(index=["Date", "Province"], columns=["Vaccine"]).fillna(0)
# vacct.columns = [" ".join(c).replace("Sinovac Life Sciences", "Sinovac") for c in vacct.columns]
# vacct['Vac Given'] = vacct.sum(axis=1, skipna=False)
# vacct = vacct.loc[:today() - datetime.timedelta(days=1)] # Today's data is incomplete
# vacct = vacct.fillna(0)
# vaccum = vacct.groupby(level="Province", as_index=False, group_keys=False).apply(daily2cum)
# vacct = vacct.combine_first(vaccum)
# # Their data can have some prov on the last day missing data
# # Need the last day we have a full set of data since some provinces can come in late in vac tracker data
# # TODO: could add unknowns
# counts1 = vacct['Vac Given'].groupby("Date").count()
# counts2 = vacct['Vac Given Cum'].groupby("Date").count()
# last_valid = max([counts2[counts1 > 76].last_valid_index(), counts2[counts2 > 76].last_valid_index()])
# vacct = vacct.loc[:last_valid]
# return vac_import, vac_delivered, vacct
def export_vaccinations(vac_reports, vac_reports_prov, vac_slides_data, do_export=not USE_CACHE_DATA):
    """Merge vaccination data sources and export the combined CSVs.

    Args:
        vac_reports: daily national totals (indexed by Date).
        vac_reports_prov: per-province report data (indexed by Date, Province).
        vac_slides_data: data extracted from the slide decks (indexed by Date).
        do_export: when True, write the "vaccinations" and "vac_timeline"
            exports; when False, only compute and return the merged timeline.

    Returns:
        The merged vac_timeline DataFrame.
    """
    # TODO: replace the vacct per prov data with the dashboard data
    # TODO: replace the import/delivered data with?
    # vac_import, vac_delivered, vacct = get_vac_coldchain()

    # vac_reports_prov.drop(columns=["Vac Given 1 %", "Vac Given 1 %"], inplace=True)

    # Not currently used as it is too likely to result in missing numbers
    # vac_prov_sum = vac_reports_prov.groupby("Date").sum()

    vac_prov = import_csv("vaccinations", ["Date", "Province"], not USE_CACHE_DATA)
    vac_prov = vac_prov.combine_first(vac_reports_prov)  # .combine_first(vacct)
    if do_export:
        export(vac_prov, "vaccinations", csv_only=True)
    # vac_prov = vac_prov.combine_first(vacct)

    # Add totals if they are missing
    # given = vac_prov[[f"Vac Given {d}" for d in range(1, 4)]].sum(axis=1).to_frame("Vac Given")
    # vac_prov = vac_prov.combine_first(given)
    given_cum = vac_prov[[f"Vac Given {d} Cum" for d in range(1, 4)]].sum(axis=1).to_frame("Vac Given Cum")
    vac_prov = vac_prov.combine_first(given_cum)

    # Get vaccinations by district # TODO: move this to plot
    vac_prov = join_provinces(vac_prov, "Province")
    given_by_area_1 = area_crosstab(vac_prov, 'Vac Given 1', ' Cum')
    given_by_area_2 = area_crosstab(vac_prov, 'Vac Given 2', ' Cum')
    given_by_area_both = area_crosstab(vac_prov, 'Vac Given', ' Cum')

    vac_timeline = import_csv("vac_timeline", ["Date"], not USE_CACHE_DATA)
    vac_timeline = vac_timeline.combine_first(
        vac_reports).combine_first(
        vac_slides_data).combine_first(
        # vac_delivered).combine_first(
        # vac_import).combine_first(
        given_by_area_1).combine_first(
        given_by_area_2).combine_first(
        given_by_area_both)
    # BUG FIX: was `if export:`, which tested the imported `export` function
    # itself (always truthy) and thus exported even when do_export=False.
    if do_export:
        export(vac_timeline, "vac_timeline")
    return vac_timeline
def vac_manuf_given_blue(page, file, page_num, url):
    """Parse cumulative doses given per manufacturer from a "blue" slide page.

    Extracts the per-manufacturer, per-dose cumulative totals table from one
    PDF page using camelot, and returns a one-row DataFrame indexed by Date
    with columns like "Vac Given AstraZeneca 1 Cum". Returns an empty
    DataFrame whenever the page does not contain the expected table.

    Args:
        page: extracted text of the PDF page.
        file: path to the PDF file (filename used to skip known-old files).
        page_num: 1-based page number within the PDF.
        url: source URL of the file (unused here).
    """
    # if not re.search(r"(ผลการฉีดวคัซีนสะสมจ|ผลการฉีดวัคซีนสะสมจ|านวนผู้ได้รับวัคซีน|านวนการได้รับวัคซีนสะสม|านวนผูไ้ดร้บัวคัซนี|วัคซีนโควิด)", page): # noqa
    #     return df
    fname = os.path.splitext(os.path.basename(file))[0]
    df = pd.DataFrame()
    # Numeric filenames are upload timestamps; files at or before this
    # timestamp (2021-03-21) predate the manufacturer breakdown.
    if "AstraZeneca" not in page or fname.isnumeric() and int(fname) <= 1620104912165:  # 2021-03-21
        return df
    if any_in(page, "รำยจังหวัด"):
        # it's province table in wrong file ("รำยจังหวัด" = "by province")
        return df
    table = camelot_cache(file, page_num, process_background=True)
    if table is None:
        # 1623224536253.pdf. very simple table
        # Slide weekly 2023-01-20.pdf and Slide wk 1 2023-01-06.pdf'
        return df
    # should be just one col. sometimes there are extra empty ones. 2021-08-03
    table = table.infer_objects(copy=False).replace('', np.nan).dropna(how="all", axis=1).replace(np.nan, '')
    manuf = ["Sinovac", "AstraZeneca", "Sinopharm", "Pfizer", "Moderna"]
    if len(table.columns) < 3:
        return df
    elif table.empty or len(table.columns) > 5:
        # 'inputs/vaccinations/Slide 2022-05-02.pdf'
        # 'inputs/vaccinations/1620105431806.pdf' has astra per province?
        return df
    assert len(table.columns) >= 3
    cols = table.columns
    # get biggest date on page (header row may carry its own date; don't
    # trust a header date later than the page date)
    header_dates = [find_thai_date(r) for r in table.iloc[0] if find_thai_date(r)]
    date = min(max(header_dates), find_thai_date(page)) if header_dates else find_thai_date(page)
    assert date
    # Throw away daily and manuf totals
    if len(table.columns) == 3:
        # 3-col layout: [labels, daily, cumulative] — drop the daily column.
        table = table.drop(columns=[cols[1]])  # .drop(index=[0, 1, 3, 5, 7])
    else:
        # Wider layouts: find the label column (contains manufacturer names)
        # and the cumulative column (contains "สะสม" = "cumulative").
        labels = [c for c in table.columns if table[c].str.contains("AstraZeneca", regex=True).any()][0]
        vals = [c for c in table.columns if table[c].str.contains("สะสม", regex=True).any()]
        if not vals:
            # probably one of the prov tables
            return df
        else:
            vals = vals[-1]  # get rid of the daily col
        table = table[[labels, vals]]
    labels, vals = table.columns
    # Keep only rows that list manufacturers (one row per dose number).
    table = table[table[labels].str.contains("AstraZeneca")]
    if table[vals].str.contains("AstraZeneca").any():
        # Summary col got mixed in. Can't really untangle
        # Slide 2022-01-02.pdf
        return df

    def clean_labels(label):
        # Sometimes numbers can get mixed in there; strip digits/commas and
        # split the multi-line cell into one manufacturer name per entry.
        return [s for s in re.sub(r"[,\d]*", "", label).split("\n") if s]

    # One dict per dose row: {manufacturer: cumulative count}. Labels and
    # numbers are zipped reversed since the cell bottoms align.
    doses = [dict(zip(reversed(clean_labels(table.iloc[dose][labels])),
                      reversed(get_next_numbers(table.iloc[dose][vals], return_rest=False)))) for dose in range(len(table))]
    # Sanity check: cumulative totals must be non-increasing by dose number.
    for d0, d1 in zip(doses[:-1], doses[1:]):
        assert sum([v for m, v in d0.items() if m in manuf]) >= sum([v for m, v in d1.items() if m in manuf])
    # TODO: add asserts to compare totals

    def clean_manuf(m):
        # Normalise OCR casing, e.g. "ASTRAZENECA" -> "AstraZeneca".
        return m.capitalize().replace("zeneca", "Zeneca")

    # NOTE: the comprehension variable `d` shadows the module-level date
    # helper only inside the comprehension scope; `d("2021-11-23")` below
    # still refers to the helper.
    row = pd.DataFrame([{f"Vac Given {clean_manuf(m)} {d + 1} Cum": num for d in range(len(table))
                         for m, num in doses[d].items() if clean_manuf(m) in manuf} | {"Date": date}]).set_index("Date")
    if date > d("2021-11-23"):
        # By this date reports list at least 4 manufacturers x 3 doses.
        assert len(row.columns) >= 12
    # logger.debug("{} Vac slides {} {}", date.date(), file, row.to_string(header=False, index=False))
    return row
def vac_manuf_given_brown(page, file, page_num, url):
# if not re.search(r"(ผลการฉีดวคัซีนสะสมจ|ผลการฉีดวัคซีนสะสมจ|านวนผู้ได้รับวัคซีน|านวนการได้รับวัคซีนสะสม|านวนผูไ้ดร้บัวคัซนี|วัคซีนโควิด)", page): # noqa
# return df
fname = os.path.splitext(os.path.basename(file))[0]
df = pd.DataFrame()
if "AstraZeneca" not in page or fname.isnumeric() and int(fname) <= 1620104912165: # 2021-03-21
return df
if any_in(page, "รำยจังหวัด"):
# it's province table in wrong file
return df
table = camelot_cache(file, page_num, process_background=True)
if table is None:
# 1623224536253.pdf. very simple table
# get with Slide weekly 2023-01-20.pdf
return df
# should be just one col. sometimes there are extra empty ones. 2021-08-03
table = table.infer_objects(copy=False).replace('', np.nan).dropna(how="all", axis=1).replace(np.nan, '')
manuf = ["Sinovac", "AstraZeneca", "Sinopharm", "Pfizer", "Moderna"]
if len(table.columns) != 1:
return df
title1, daily, title2, doses, *rest = [cell for cell in table[table.columns[0]]
if cell.strip()] # + title3, totals + extras
date = find_thai_date(title1)
# Sometimes header and cell are split into different rows 'vaccinations/1629345010875.pdf'
if len(rest) == 3 and date < d("2021-10-14"):
# TODO: need better way to detect this case
doses = rest[0] # Assumes header is doses cell
# Sometimes there is an extra date thrown in inside brackets on the subheadings
# e.g. vaccinations/1624968183817.pdf
_, doses = find_thai_date(doses, remove=True)
numbers = get_next_numbers(doses, return_rest=False)
numbers = [n for n in numbers if n not in [1, 2, 3]] # these are in subtitles and seem to switch positions