Skip to content

Commit

Permalink
Add support for matching VCF lines by ID
Browse files Browse the repository at this point in the history
See #1739
  • Loading branch information
pd3 committed Oct 22, 2024
1 parent a79ca97 commit 3d994d3
Show file tree
Hide file tree
Showing 12 changed files with 105 additions and 8 deletions.
12 changes: 12 additions & 0 deletions test/annotate35.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=src,Number=1,Type=String,Description="">
##INFO=<ID=dst,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID6 N <INS> . PASS src=ID6
chr1 902 ID5 N <INS> . PASS src=ID5
chr1 902 ID4 N <INS> . PASS src=ID4
chr1 902 ID3 N <INS> . PASS src=ID3
chr1 902 ID2 N <INS> . PASS src=ID2
chr1 902 ID1 N <INS> . PASS src=ID1
12 changes: 12 additions & 0 deletions test/annotate35.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=src,Number=1,Type=String,Description="">
##INFO=<ID=dst,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID6 N <INS> . PASS src=ID6;dst=ID6
chr1 902 ID5 N <INS> . PASS src=ID5;dst=ID5
chr1 902 ID4 N <INS> . PASS src=ID4;dst=ID4
chr1 902 ID3 N <INS> . PASS src=ID3;dst=ID3
chr1 902 ID2 N <INS> . PASS src=ID2;dst=ID2
chr1 902 ID1 N <INS> . PASS src=ID1;dst=ID1
11 changes: 11 additions & 0 deletions test/annotate35.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
##fileformat=VCFv4.2
##INFO=<ID=src,Number=1,Type=String,Description="">
##INFO=<ID=dst,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID6 N <INS> . PASS src=ID6
chr1 902 ID5 N <INS> . PASS src=ID5
chr1 902 ID4 N <INS> . PASS src=ID4
chr1 902 ID3 N <INS> . PASS src=ID3
chr1 902 ID2 N <INS> . PASS src=ID2
chr1 902 ID1 N <INS> . PASS src=ID1
6 changes: 6 additions & 0 deletions test/annots35.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
chr1 902 ID4 N <INS> ID4
chr1 902 ID2 N <INS> ID2
chr1 902 ID6 N <INS> ID6
chr1 902 ID1 N <INS> ID1
chr1 902 ID3 N <INS> ID3
chr1 902 ID5 N <INS> ID5
11 changes: 11 additions & 0 deletions test/annots35.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
##fileformat=VCFv4.2
##INFO=<ID=src,Number=1,Type=String,Description="">
##INFO=<ID=dst,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID4 N <INS> . PASS src=ID4
chr1 902 ID2 N <INS> . PASS src=ID2
chr1 902 ID6 N <INS> . PASS src=ID6
chr1 902 ID3 N <INS> . PASS src=ID3
chr1 902 ID1 N <INS> . PASS src=ID1
chr1 902 ID5 N <INS> . PASS src=ID5
7 changes: 7 additions & 0 deletions test/isec.match-id.1.1.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##INFO=<ID=src,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID1 A C . . .
chr1 902 ID2 A C . . .
chr1 902 ID3 A C . . .
7 changes: 7 additions & 0 deletions test/isec.match-id.1.2.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##INFO=<ID=src,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID3 A C . . .
chr1 902 ID2 A C . . .
chr1 902 ID1 A C . . .
8 changes: 8 additions & 0 deletions test/isec.match-id.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=src,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID3 A C . . .
chr1 902 ID2 A C . . .
chr1 902 ID1 A C . . .
8 changes: 8 additions & 0 deletions test/isec.match-id.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=src,Number=1,Type=String,Description="">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 902 ID1 A C . . .
chr1 902 ID2 A C . . .
chr1 902 ID3 A C . . .
8 changes: 7 additions & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -');
run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.2.chk',args=>q[-s - -i 'type="snp"']);
run_test(\&test_vcf_stats,$opts,in=>['stats.vaf'],out=>'stats.vaf.1.chk',args=>q[-s -]);
run_test(\&test_vcf_isec,$opts,in=>['isec.match-id.1.1','isec.match-id.1.2'],out=>'isec.match-id.1.out',args=>'-n =2 -w 2 --no-version');
run_test(\&test_vcf_isec,$opts,in=>['isec.match-id.1.1','isec.match-id.1.2'],out=>'isec.match-id.2.out',args=>'-n =2 -w 2 -c id --no-version');
run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both');
Expand Down Expand Up @@ -524,6 +526,8 @@
run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n');
run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n');
run_test(\&test_vcf_regions,$opts,in=>'regions');
run_test(\&test_vcf_annotate,$opts,in=>'annotate35',vcf=>'annots35',out=>'annotate35.1.out',args=>q[-c CHROM,POS,~ID,REF,ALT,INFO/src]);
run_test(\&test_vcf_annotate,$opts,in=>'annotate35',tab=>'annots35',out=>'annotate35.2.out',args=>q[-c CHROM,POS,~ID,REF,ALT,dst:=src]);
run_test(\&test_vcf_annotate,$opts,in=>'annotate.escape.1',tab=>'annotate.escape.1',out=>'annotate.escape.1.1.out',args=>q[-c CHROM,POS,ISTR,FMT/FSTR]);
run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.1.out',args=>q[-c CHROM,POS,-,-,SCORE,~X,-,- -i'STR={X}']);
run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.2.out',args=>q[-c CHROM,POS,REF,ALT,SCORE,-,~X,- -i'INT={X}']);
Expand Down Expand Up @@ -1477,7 +1481,9 @@ sub test_vcf_isec
my $files = join(' ',@files);
$args{args} =~ s/{PATH}/$$opts{path}/g;
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec $args{args} $files");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files");
# Either improve or disable completely: the output type does not make sense in all modes
# test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files");
}
sub test_vcf_isec2
{
Expand Down
16 changes: 12 additions & 4 deletions vcfannotate.c
Original file line number Diff line number Diff line change
Expand Up @@ -2313,15 +2313,14 @@ static void init_columns(args_t *args)
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
col->replace = replace;
if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF);
if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY;
}
else args->alt_idx = icol;
}
else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
{
if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
if ( str.s[0]=='~' ) replace = MATCH_VALUE;
if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
Expand All @@ -2330,7 +2329,11 @@ static void init_columns(args_t *args)
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
if ( replace & MATCH_VALUE ) args->match_id = icol;
if ( replace & MATCH_VALUE )
{
args->match_id = icol;
if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID;
}
}
else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
{
Expand Down Expand Up @@ -3122,6 +3125,11 @@ static void init_data(args_t *args)
&args->index_fn, args->write_index) < 0 )
error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->tgts_is_vcf )
{
if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME;
bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic);
}
}

static void destroy_data(args_t *args)
Expand Down Expand Up @@ -3784,6 +3792,7 @@ int main_vcfannotate(int argc, char *argv[])
else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME;
else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT;
else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT;
else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID;
else error("The --pair-logic string \"%s\" not recognised.\n", optarg);
break;
case 3 :
Expand Down Expand Up @@ -3829,7 +3838,6 @@ int main_vcfannotate(int argc, char *argv[])
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME);
if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n");
}
}
Expand Down
7 changes: 4 additions & 3 deletions vcfisec.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
Copyright (C) 2012-2023 Genome Research Ltd.
Copyright (C) 2012-2024 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -460,7 +460,7 @@ static void destroy_data(args_t *args)
{
if ( !args->fnames[i] ) continue;
if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]);
int is_tbi = !args->write_index
int is_tbi = !args->write_index
|| (args->write_index&127) == HTS_FMT_TBI;
if ( args->output_type==FT_VCF_GZ && is_tbi )
{
Expand All @@ -476,8 +476,8 @@ static void destroy_data(args_t *args)
free(args->fh_out);
free(args->fnames);
if ( args->fh_sites ) fclose(args->fh_sites);
if ( args->write ) free(args->write);
}
free(args->write);
}

static void usage(void)
Expand Down Expand Up @@ -597,6 +597,7 @@ int main_vcfisec(int argc, char *argv[])
else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME;
else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE;
else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID;
else error("The --collapse string \"%s\" not recognised.\n", optarg);
break;
case 'f': args->files->apply_filters = optarg; break;
Expand Down

0 comments on commit 3d994d3

Please sign in to comment.