@@ -1,3 +1,4 @@
+import abc
 import dataclasses
 import json
 import logging
@@ -13,7 +14,7 @@

 logger = logging.getLogger(__name__)

-ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+ZARR_SCHEMA_FORMAT_VERSION = "0.5"
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

 _fixed_field_descriptions = {
@@ -28,6 +29,62 @@
 }


+class Source(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def path(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_records(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def num_samples(self):
+        pass
+
+    @property
+    @abc.abstractmethod
+    def samples(self):
+        pass
+
+    @property
+    def contigs(self):
+        return None
+
+    @property
+    def filters(self):
+        return None
+
+    @property
+    def root_attrs(self):
+        return {}
+
+    @abc.abstractmethod
+    def iter_alleles(self, start, stop, num_alleles):
+        pass
+
+    @abc.abstractmethod
+    def iter_genotypes(self, start, stop, num_alleles):
+        pass
+
+    def iter_id(self, start, stop):
+        return
+
+    def iter_contig(self, start, stop):
+        return
+
+    @abc.abstractmethod
+    def iter_field(self, field_name, shape, start, stop):
+        pass
+
+    @abc.abstractmethod
+    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
+        pass
+
+
 @dataclasses.dataclass
 class ZarrArraySpec:
     name: str
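The `Source` ABC above decouples the Zarr writer from any particular input format: the writer only needs something that reports samples (and optionally contigs, filters, and root attributes) and can iterate alleles, genotypes, and fields over a record range. As a rough illustration only (not part of this commit; the class name and toy data are hypothetical), a minimal in-memory source might look like this:

```python
# Hypothetical sketch: a minimal concrete Source, assuming the Source ABC
# from this diff is importable. The record layout is made up for
# illustration; real sources (e.g. the ICF reader) are more involved.
class ToyListSource(Source):
    def __init__(self, samples, records):
        self._samples = samples  # objects exposing an .id attribute
        self._records = records  # one dict per variant (toy representation)

    @property
    def path(self):
        return None  # no backing file in this toy example

    @property
    def num_records(self):
        return len(self._records)

    @property
    def num_samples(self):
        return len(self._samples)

    @property
    def samples(self):
        return self._samples

    # contigs and filters inherit the default None, so init() will skip
    # encode_contigs() and encode_filter_id() for this source.

    def iter_alleles(self, start, stop, num_alleles):
        for record in self._records[start:stop]:
            yield record["alleles"][:num_alleles]

    def iter_genotypes(self, start, stop, num_alleles):
        for record in self._records[start:stop]:
            yield record["genotypes"]

    def iter_field(self, field_name, shape, start, stop):
        for record in self._records[start:stop]:
            yield record.get(field_name)

    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        raise NotImplementedError("schema generation is source-specific")
```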
@@ -182,25 +239,16 @@ class VcfZarrSchema(core.JsonDataclass):
     format_version: str
     samples_chunk_size: int
     variants_chunk_size: int
-    samples: list
-    contigs: list
-    filters: list
     fields: list

     def __init__(
         self,
         format_version: str,
-        samples: list,
-        contigs: list,
-        filters: list,
         fields: list,
         variants_chunk_size: int = None,
         samples_chunk_size: int = None,
     ):
         self.format_version = format_version
-        self.samples = samples
-        self.contigs = contigs
-        self.filters = filters
         self.fields = fields
         if variants_chunk_size is None:
             variants_chunk_size = 1000
@@ -238,9 +286,6 @@ def fromdict(d):
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
         ret = VcfZarrSchema(**d)
-        ret.samples = [Sample(**sd) for sd in d["samples"]]
-        ret.contigs = [Contig(**sd) for sd in d["contigs"]]
-        ret.filters = [Filter(**sd) for sd in d["filters"]]
         ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
         return ret

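Dropping samples, contigs, and filters from `VcfZarrSchema` means a serialized schema dict now carries only the format version, chunk sizes, and field specs, and `fromdict` no longer rebuilds `Sample`/`Contig`/`Filter` objects. A rough round-trip sketch under those assumptions (field entries elided; `fromdict` assumed callable as a staticmethod, as the hunk above suggests):

```python
# Rough sketch only: the trimmed schema round-trips through a plain dict.
d = {
    "format_version": ZARR_SCHEMA_FORMAT_VERSION,
    "variants_chunk_size": 10_000,
    "samples_chunk_size": 1_000,
    "fields": [],  # serialized ZarrArraySpec dicts would normally go here
}
schema = VcfZarrSchema.fromdict(d)
assert schema.samples_chunk_size == 1_000
assert schema.fields == []
```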
@@ -474,8 +519,10 @@ def init(

         # Doing this synchronously - this is fine surely
         self.encode_samples(root)
-        self.encode_filter_id(root)
-        self.encode_contig_id(root)
+        if self.source.filters is not None:
+            self.encode_filter_id(root)
+        if self.source.contigs is not None:
+            self.encode_contigs(root)

         self.wip_path.mkdir()
         self.arrays_path.mkdir()
@@ -502,33 +549,33 @@ def init(
         )

     def encode_samples(self, root):
-        if [s.id for s in self.schema.samples] != self.source.samples:
-            raise ValueError("Subsetting or reordering samples not supported currently")
+        samples = self.source.samples
         array = root.array(
             "sample_id",
-            data=[sample.id for sample in self.schema.samples],
-            shape=len(self.schema.samples),
+            data=[sample.id for sample in samples],
+            shape=len(samples),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")

-    def encode_contig_id(self, root):
+    def encode_contigs(self, root):
+        contigs = self.source.contigs
         array = root.array(
             "contig_id",
-            data=[contig.id for contig in self.schema.contigs],
-            shape=len(self.schema.contigs),
+            data=[contig.id for contig in contigs],
+            shape=len(contigs),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if all(contig.length is not None for contig in self.schema.contigs):
+        if all(contig.length is not None for contig in contigs):
             array = root.array(
                 "contig_length",
-                data=[contig.length for contig in self.schema.contigs],
-                shape=len(self.schema.contigs),
+                data=[contig.length for contig in contigs],
+                shape=len(contigs),
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
             )
@@ -537,10 +584,11 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+        filters = self.source.filters
         array = root.array(
             "filter_id",
-            data=[filt.id for filt in self.schema.filters],
-            shape=len(self.schema.filters),
+            data=[filt.id for filt in filters],
+            shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
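With these changes the encoder reads contig and filter metadata directly from the source rather than from the schema, and it only touches a handful of attributes: `contig.id`, `contig.length`, and `filt.id`. Speculatively, a source's `contigs`/`filters` properties could therefore return containers as simple as the hypothetical ones below:

```python
# Hypothetical metadata containers; only the attributes the encoder
# actually uses above (.id and, for contigs, .length) are included.
import dataclasses

@dataclasses.dataclass
class ToyContig:
    id: str
    length: int = None  # contig_length is only written when every length is set

@dataclasses.dataclass
class ToyFilter:
    id: str
```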