diff --git a/csq.c b/csq.c index b38eba10..c3fbb337 100644 --- a/csq.c +++ b/csq.c @@ -666,6 +666,7 @@ void splice_init(splice_t *splice, bcf1_t *rec) } static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) { + // beg .. the beginning of the splice region // len>0 .. beg is the first base, del filled from right // len<0 .. beg is the last base, del filled from left @@ -681,8 +682,24 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) } else { - rbeg = abeg = beg; - rlen = alen = len; + if ( beg < splice->tr->beg ) + { + // This can happen with very short exons and introns. Not real biology, but the program + // should not crash on it. This is not a real fix; the code would need a revamp to handle + // cases like this well, see test/csq/ENSCAFT00000047742 + // >chr9:104-110 + // ATGTCAGGGCC + // ATGTC-GGGCC + // 456 + // eee.eee + rbeg = abeg = splice->tr->beg; + rlen = alen = 0; + } + else + { + rbeg = abeg = beg; + rlen = alen = len; + } // check for incomplete del as above?? 
} diff --git a/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa new file mode 100644 index 00000000..04e2099b --- /dev/null +++ b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa @@ -0,0 +1,114 @@ +>chr9 chr9:19518647-19527624 +GCGGGACGGGGCGGTCCCCGCTCGGAGCCCCCGCCCAGCTGACCCGGCGGCTCTCCCCTCGCAGGCTGCTGCCCCGGCGT +GCAGGGCCCGGCCGCCGCCATGTCAGGGCCCGTTCGAGCTCTCGGTGCAGGACCTCAACGACCTGCTGTCGGACGGCAGC +GGCTGTTACAGCCTGCCGAGCCAGCCTTGCAAGGAGGTCACCCCCAGGATCTACGTGGGCAACGCGTGAGTCGCCGTTGG +GGCGCCCCGCCCACCCGAAAGCCGGGGTCGGGGCGTTGCGGGcgctggggggggggtgcggggcgTGGCCGCCCTCCGGG +AGCCCCGCGGGGCCGGGCAGGGGCTGGAGTCGCCGCCCGCCCCCGCGCAGTGGGGCGGAGGGGCTAgaccccgccccggc +cccgcgcgTCCTCCCCCCGCGGGGGGCCCGGGTCCCCGGGCGTCCGGCCCGAAGCCGCCAGCCGCTAGGGGCTTGGCCTT +GGGCGGGGTCGGGCTACCGGGATCACTTAACAGGTGGCCCTCTGTCACCCGGCCGGTCTCCAGCCCGCGCGTCATGTGAC +ATCTGCCTGGTTCTGCAGTGAGGTCACCGCGGAATGTCTGCCTTCGCTGCCATGGCAACTGGCTGACGTCACAGATCGGG +CGTGGAACTTTCCCGGCTGGGCAGGCCAGATCAAGGAATCGAAATACTCCCAATGAGGGATCCGGAGAGCTGGGCTGGGG +TTTCCTtgctccctcccctgccttccaGATAGGGTTTGGTTTACCAACGCCCTAAGTTCCATGGGCCAGGGTCTGCGTTA +GAAACCCAGAACTAACTCCCTTCTCCCCTTTGACCCAGGAGGAAATGGTGGCCCAGGGTGGTTGAGGGACTGTGAGGACA +GAGACCATGCCTTGTTCATGTCTGTATCCCCTGTCCTGTACTAGGACCTGCTGCTTACCCTGCTAAGGGCTGCCACAAAA +ATGAGGGTCATCGCGGGGCAAGAAGGGGTCCCTAAGGCAGGTCCTGCCTTCGTGGGGACTGTCTCATCcattccatccat +tcattcaccatattgagcacctactgtgtgtccgGCATTTACAGTGCACATTCATAGTGATAGTAGTGGTAACGGTAAAT +TTGTTGAACGCTGAATTAGAGAAGGGAGGTGAATGAAAATTCACCTTcagagttatttgtttttttaatcaaccaCTGTT +TAACAACCACGTGCAGTGTCATGAGCAGGAACAGCTGAAACAGTCCCAGGGCCTTAGCCGAAGTGCTCCGTAATTGCAAA +GTTCAGAGGATAGTGCTCTGTCCAGCCACCCCAGCAGCAAGTGGCCTTGGTGCCTCGGAGAACCCGCTGTCTGGGGCAGT +TTGGCACATTCAGCTGCCCCTCCTTCCCGAGTGGAAACCGAGGCGGTGGATGTCTGGTGGTAGCCAGGGTGGAAAGTGCC +GGTGGCCACTGTAGAGAAACACATCCCCAGCCCAATTAGTGAATCTGGCTTTGACTTGaccaggggtgtgggggggtaCT +CCTGCACATAGGGAGCCCCTGAACCAAATCAGGGCTCACAGGGCCTGGAGAGGAGTCTGTATGAGCCTCATTTTCTAGAT 
+CTCAGGATCTGCAGTTGATGTGGACATGCCAGGCAGAGTGAGGGGAAGACAGAGCAGGGCTGGTGTCAGGTCTTCTCCTT +GCTCTGCCCCTAAGCATCCCATGTGAGCCCCAGCACTGTCCTTCCACTGTCCTGGAAGCTTAAGCCCAGGCCTGGCTTCC +TTGCAGGCTGCTGTGACAGGCAAACAAGATCATGTCGGAGAAGGCACTTTGTAAAGTATAAACTGTTGTCTGAATTGGGA +TGTTGTTGGGAGATGAAATAAAGGAGCCCTTTGACTATTCAGATCAAATGTTTTATAACCCCCAGAGTTCTGTTGGCGCC +ACTTGGTCCCCAAGTGCTTACTCACCCAGGGGAGCTTCAGAAATGTGGCTAGTGGTCTGTGGTGGAAAGGGCCCCTCTGG +GGGACTGGAGGGGCCGCCACACTTTATAGTCCAAGGAAGGAGGCTGCCTAAGATGTgttgatttcattctttctcactgA +TTCACCCCATAGTCATTCAGCACCTATAGCCGGGTGCAATAGCAAGTGCTGGGCAGACAGACCTGGTTACCCCCTCAAAA +GTTCACAGCCTACCCATCCGAACAAATAATCACAATCAGGCATTAGAAGAGCACTGACTACGGTGTCAAGCTCTGTCTAT +AGgctagatcattttttttttttttaaatttttatttatttatgatagtcacagagagagagagagagaggcagagacac +aggcagagggagaaacaggctccatgcaccgggagcccgacgtgggattcgatcctgcgtctccaggatcgcgccctggg +ccaaaggcaggcgccaaaccgctgcgccacccagggatccctaggctaGATCATTTAATCCTTCTGCCCAGGGGGTAGGT +ATTGTTGTCAACCTCATTtgacagatagggaaactgagtcaccaaAAGGTTAAGTGGCTTGCTTAAGGTTACCTGGCTGG +CAAGTCTCAGGGTTCGACTCagacccaggcagtctgactctagAGCCCGTTTTGAGCCAGTAGGCTGTACCACCACCACA +GTATAGGGAACATGATGCCAGATGGAATGGAGGGGGCGATCAGGGCATATTGATGGAAAATGAGAGGGTGTGTGACCATG +GAGAAGCTCGAGAAGGGCATTCCagggagagggaacagcatgggGCTGAAACCACCCCAGCTATTTGGGGAAGCTGCTGG +TAATTAGATATGGCTGGAGTAAGGGAGGTGGGTCACAAGATGAGGGGAAGTTGGCAGGGGCCAAGCGTGGAGGCTTCCTC +GCTTTGCCAGGAGACTTAGaatcttctccatcttcctccttcTGGGGCTGCCAGGTGGTCAGGTAATCATCCCCCTGCTC +CTGTCTCCTGTCCAGGTCTGTGGCTCAAGACATCCCCAAGCTGCAGAAACTAGGCATCACCCATGTTCTGAATGCTGCTG +AGGGCAGGTCCTTCATGCACGTCAACACCAATGCCAACTTCTACAAGGACTCCGGCattacctacctgggcatcaagGCC +AATGACACGCAGGAGTTCAACCTCAGCGCCTACTTTGAAAGGGCTGCAGACTTCATCGACCAGGCCCTGGCTCAAAAGAA +TGGTAAGGCACATGTGGCCCAGGAAACAGTGCAAGGCAGTTctgactggatttttttctagaaaacagGCCCACAACTGG +CTTCCTCTTGGAAAACCTATCAAGTGGCCACATGTTACACAAGATTTATAAATGTTTCTCACCATGGCAGCCCCGGGGTC +CTTGGGTTTGAGTGGACTTTGCCTGAGAGTTCCTTTCTAGCCACTCCCCGCCATGCCCTCAGTTCAAATAGCTTTAGTAA +TAATGATTTTATCTTCCATGATAAGAAGTCCTGAGGTGAGGGAGCTCTGGGGCTGGTTGATTCAGCAGCTTGGTGGCGAC 
+ATCAGGAAGAGGTGCTTCCCTTTTCTCTTGCACGCCACCCTCAGGGTTCTCCCCTCATGGTCCCAAGATGGCGGTGGTAG +ACCTAGTCACCACATCCTAGCAGAACAAAGCATAGCAAAAGACCAGGCAGCCATCTCTTCCTtgtgtttcttcccttttt +ttaatgagcaagtaaaagggtgcctgagtggctcagtcggttgagggtctgccatcagctcaggtcatgatctcaggttc +ctgagattgagccctatatggagctctctgctcagtggggagtctgcatctccctcttctctgttcctctgcctgGCTCG +TGctcggtttctctctctctcaaataaataaataaaaataaagagcaagtAGAAGGGTGCTCAGAAACCCTTCATTGGGG +TTCCCCTCACATCTCAGTGGCTGAATTGGCTTACATGAACGCCTGGCCAGGGAAACAGGGCAGCCATGACTGGCTTAGGC +GGGTGCTTAGCAAGCTGTGGCCCACAGACCACATCTTCCAGCCACTTTCTAAATGCTTCCTGCAGCACCCAAATCTAAGA +CTGAGCAGAACTGCTGAGTTGCtggcaggcagggaggcaggaccTCCCTGGGGGAAACCCTAGAAGGCAGAGGGCAAACG +TCAGTTGAGCAGGCTGGGCCACTTGCCCTTTCCCTTTTAGCTTGTTTCTCTCAGCGCCTGCCTTTACCTTCAAGCTGCCT +CCCCTGgcctcttcctccctcaccaCGGCCTTCTCAGGGGAAGTGAGCCCCATGAAGGTATGATCAGCCTTGATACCCTG +ATTCtagcctgtctctgcttcttgtGCCCAACAGCCTCCCCCTGGAATGATTTCCAGTGAACAGTGTGTACTAGATGCTG +GTTATCGCATATCTGGGGCTCAGGAGGGAAGTTTTATTGCCCGGGGAATCTGCAACTTTACTGTAAGGCCCTTTGTGGTC +CTTCATGATCTTGTGTGTGTCACCCCCGCCATTCCGGCTATATTGGGTTCCTGTGTTGATGGGAACGAGGGGCCCAGTTA +AGGGGATGGATTTTGCTCTTCTTGTATGTGGGTTGGTCATCCGCTTAAGGAACTTAGAGCTTCCTGACTTTGTTTCCTCT +CTCCATACAACTGGGAACATACCTTGGCTTTTTGTGACCTCTCGCCTCAGCAGGAGGCAGCATGGGCTGAGTTTTCAGTC +TGGAaagttctctctgcttctccctatcttGTCCCCTGGCAAATTCCACCTTGTGCTTTAACCCTGGGCATAGTGTTCTC +TCTTCCATGAAGCCTGCCCTCAGTTGGGTCCACTGTGTCCCCTTCTGTCTCCCCAGCACCTGATGCAGCATTGGGCACAT +AGCAGACATGCTGCAAATAATTACTGAGTGAGTGACAAAAATGAATTCCATGGGCTCTAATGCCAAGTGTACATGCTTCT +AGGATTGACTGGCCAAGCGAGTCAACAGAAACACACTGGCCTCCTGCAGTGTGCAGGGCTCAGGAGAGCCAGATGCCCAT +CACTTTGGAGAAGGGGAAACAGCCAGGAGTGCTCAGAGGAGAAAGCAGTCACTTCTAGCCAGGAAGGTCAAGGGACACTT +TTATGGAGGACATGGCATCTGAGTGGGCCCTGGAAGGATGGGAGTCAAACCTTCCTCTTCCCCATTGTCTCTGTGAGCCC +TGGACTGCTAAATATGTTCCAGGTGGTGAGGCAGGGAAAGCCCCACTTTGCATACTTTCAGTATACTGGCTAGAGAGGGA +ACTGGCAGGAAAGTGGAGGGCTCCCCCTTCCTGCTTGGAAGGCACGGGGAAAGAGAGTCATGTTTTGGGGGCAGGGTGAT +GGGAACACAAGAGACTTTGGGCTAGCAGATTTCCTCCAGGATAATGGCAGTATTTAGGCATTGGTGGATAATCATCCACT 
+TGGCTagtgagcaaatatttattgagcaccaagtGCATACTAGAGGTTGTAGGGTATCCTAAAATCAATACCACACTGTC +CTTGCCCATTACATAAGCCAAGCATTAGCCTAAGGGCTTTATGTGACGTTTCCTTCAATCCCCTAACAACACTGTGCAAT +GGATACTATTAACACAGATGATGAATCGAGGCATAGGTGGGTTAGGTGGGTCTTAGGGTTCATAACCAGTAAATTGgcca +ggaaggaaaagagaaccaATAGAATAAAcagatattttaaggaattgtcTTATACAATTGTAGGATCTGGCAAATCTGAA +ATCTGTGGGGTAGAGTGGTAGGCTGGAAACTCGGGCAGGGCTTCTTGATTACAGCCTTGAGGcagaatttctctttctcc +agtgaACCTCACATTTTGCTCTTAAGATCTTtgactgattggatgaggcctaaACACATTATCAAGGGTGATCTCCTTTA +CTAAAAGTCAACTGATAGTCTGTGTTAATTACATTCACAAAATACCTTATAGACTAGTGTTTGACCAAGCAGCTGAGCAT +CATAGACAAGCCAGGCTGACAAATGGAACCATCAGAACCAGTAAGTTAGAATTTAAACCCACATTCATAGgacttcaaag +attttttttttttttttttttttttttaagatttattttctggggcacctgcatggcttagtcagttgagcattccactc +ttggttttggctcaggtcataatcttaggttTGGGAAATCGAACCCTGCCtcagtctctgcactcagcagggagtctgct +ttccctctccctctgcctctgctaccacttccccccaccccacctcctaaCCACCACCCTTCTCTGGcttctgcaaaata +tatatatatatatatatatatatatatatatatatatatatatatatttatttgagagagagagagtgtgtgcacatgtg +cacatgagtgaggtggggggagggggaaaggggaggCTTGgtacagggctccatctcaggactctcaGACCATGACAtga +accaaaattaagagtcagaggcttaaccgactgagccacccagatgccccaaagatTTTGGTTCTAAATCCCTAtgctga +aaaaggaaatatttgaactGGGGTTTTAAGGGATGACAGGAATTTGCCAGGTGGCCGAGAGAGACTTTGTTCCCTATAGA +GGAAATGACTTGTGGAAAGGCACCAGGGGTAGGGAAAAGCATGGGAACGTATCAAGGCCGTGGGTACTTGGCCTTGTAAG +CCACTCCTTGGAGACAAGAAGCCCTGCAAATGGTGATAGgttttatttgaatatagaaaGCCCCTTCCTGCTGCTAGGTA +GAGGGTGAATTGGTGATGATGAGGGTAGATCAAGAGCTCAGTTTGGAGGTGTGAAGAGGGTGGAGGTGAGGGTGATGAGG +ACCTGAACTAACACTCTGTGGCCTGAGTGGGTGGTGGCAGAGAGAAGCCGAGGCTTGGGGCTCTTGGGGACCCCTGGTGG +AAATGAAATGTGCTGGGACTTGTTGATGCTCTCTTTCACCAAATGCTGGGGATGCAGCAGGGAGCCAAACAGACAATTCT +GCCTTGTGGAGCTTTTCCATCTCAGGGGAAGCTGACAGCAAACAAGTGAAGATTTCTGGCCTGGGGCcaggcggggggga +ggggcgggtggGCGAAAGCAGGGAGTCAGGGATGAGAGAGGAGAAGGGCTGGGAGTGGGTTTGAGGCTTTAGAGAGGCTG +AGTGGGAGAGCAAACACCTGGGGAGAGAGGTTTGCAAGCTGAGAAGCCAGCTGGCTGgtgaagggatggagggagagagg +agacaatgtaggggcggggaggggggggtgcaGATTTTATAGGGCCTCGCGGGCCATTTGGGGAGCCGTGAGCAGGTGtt 
+tatatttacatgttttgatatttttatccTCCTGTTTGTATTTGACATGCAcgttccttttactttttaaaccactttgt +tgagttataattgacattCAACACATTTAAACATCACACATTTAAAGCCTGCAGTTTGACATTTTGACTTCTGTGTATAC +CTGTGAAACCAGGACCACAAACGAGAGTGTGAGCACACCCGTCTCGCCCCAGGCGCCTCTGTGGCCTCTCCCTTCCGCTC +CTTCTGCCTCCCATTTCTAAGCAACTGCCGCTCTGTTTTCTGTCACCAGACACAAGTGTACATTTTGTGGACTTTTGTGT +AAATGGAATTACACAGTGGGTGTGCTTTCTTGGCCTGGCCTCTGCCGGTGTCATTGCTGTGAGTTCCATCTCGGTCACTA +CACGTGTCCATAGCTCATTCCTGTTTAATGCTGACGAACCTTCCAGTGTATGGATGTGTCCCAGTTTGTGTGTTCACCTA +TTGGTGAacttttggattatttctatgttttctttttttgttttttgtttttttactttatttagttatttgataGAGGC +AGTGgggagcatgagccaggggaggagcagagggagaagcagactccccactgaacagggagcccgacgccaggcttgat +cccaggaccccaggatcatgacctgagctgaagacagatgcttaatggaccGAGCCACTGTATGTTTTGCTTttgattga +aatataaaattcacgTGGAGAAATGCTGGTATCACAAGTGTGCAGATCAGTGTGTTGCcctcagatcaagaaatagatCT +CCCCCCCAGGACTCTAGAAACCCCTAGTGTGTTCCCTCCCTATCTCACTGTTACTACCACCCAGACTTCTGTTAGCGCAG +AGCAGTTCTGCTTGCCCTCCTATTCCATGAACTCACAAATGtcctatgtgtgtgtctcatccGTGTTCTAAGTGTAAGTC +GTACACTGTTTATCGGTCCTGCCGTTGAAGGGCCTCTGGGAGCTGCCAGCTGGGGCCACTAGGAACAGTGCTGCCCTATG +AATATTCTGGTGCGTGTCTTTGGTGGACATGCATGCACctccctgctgggccctgggggttgCATGTGTCAACCTGAGTA +CGTACTGCCCTGTGGTTCGCCAGAGCCATGTGCTCACGTACACTGCCCCCTGCACAGCCACAGATAGAAGtcctgctgct +ccacatcctcgcTGATGCTTTGTGTTTTCCGTCTTCTCCATCCGGGCGGGTACACAATGGCCTCTTGTACTTTTCATTTG +CGCTTCCCCCGGAAGAGTTCTGACTCGAGTCGGGCTGTGTTGGGATGGCCCCAGCTCACGCTCCCTCGTCCCTGTCTCTT +TCAGGCCGGGTCCTTGTCCACTGCCGGGAAGGTTACAGCCGCTCCCCAACCCTAGTTATCGCGTACCTCATGATGCGGCA +GAAGATGGATGTCAAGTCTGCCCTGAGCATCGTGAGGCAGAACCGTGAGATCGGCCCCAACGATGGTTTCCTGGCCCAGC +TATGCCAGCTCAATGACAAACTAGTCAAGGAGGGGAAATTGAAACTCTAGGGCACTCCCGCTGCCTCTTCTCTAGCGGCA +GACAGGGGAGGCCCTGGT diff --git a/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa.fai b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa.fai new file mode 100644 index 00000000..ef6d4231 --- /dev/null +++ b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.fa.fai @@ -0,0 +1 @@ +chr9 8978 29 80 81 diff --git 
a/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.gff b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.gff new file mode 100644 index 00000000..9c4d8a55 --- /dev/null +++ b/test/csq/ENSCAFT00000047742/ENSCAFT00000047742.gff @@ -0,0 +1,11 @@ +chr9 ensembl gene 100 8978 . + . ID=gene:ENSCAFG00000028570;Name=DUSP3;biotype=protein_coding;description=dual specificity phosphatase 3 [Source:VGNC Symbol%3BAcc:VGNC:53236];gene_id=ENSCAFG00000028570;logic_name=ensembl;version=2 +chr9 ensembl mRNA 100 8978 . + . ID=transcript:ENSCAFT00000047742;Parent=gene:ENSCAFG00000028570;Name=DUSP3-201;biotype=protein_coding;transcript_id=ENSCAFT00000047742;version=2 +chr9 ensembl exon 100 104 . + . Parent=transcript:ENSCAFT00000047742;Name=ENSCAFE00000400429;constitutive=1;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSCAFE00000400429;rank=1;version=1 +chr9 ensembl CDS 100 104 . + 0 ID=CDS:ENSCAFP00000041069;Parent=transcript:ENSCAFT00000047742;protein_id=ENSCAFP00000041069 +chr9 ensembl exon 106 225 . + . Parent=transcript:ENSCAFT00000047742;Name=ENSCAFE00000157884;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSCAFE00000157884;rank=2;version=4 +chr9 ensembl CDS 106 225 . + 1 ID=CDS:ENSCAFP00000041069;Parent=transcript:ENSCAFT00000047742;protein_id=ENSCAFP00000041069 +chr9 ensembl exon 2976 3202 . + . Parent=transcript:ENSCAFT00000047742;Name=ENSCAFE00000157890;constitutive=1;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSCAFE00000157890;rank=3;version=1 +chr9 ensembl CDS 2976 3202 . + 1 ID=CDS:ENSCAFP00000041069;Parent=transcript:ENSCAFT00000047742;protein_id=ENSCAFP00000041069 +chr9 ensembl CDS 8725 8930 . + 2 ID=CDS:ENSCAFP00000041069;Parent=transcript:ENSCAFT00000047742;protein_id=ENSCAFP00000041069 +chr9 ensembl exon 8725 8978 . + . Parent=transcript:ENSCAFT00000047742;Name=ENSCAFE00000307165;constitutive=1;ensembl_end_phase=-1;ensembl_phase=1;exon_id=ENSCAFE00000307165;rank=4;version=2 +chr9 ensembl three_prime_UTR 8931 8978 . + . 
Parent=transcript:ENSCAFT00000047742 diff --git a/test/csq/ENSCAFT00000047742/test.txt b/test/csq/ENSCAFT00000047742/test.txt new file mode 100644 index 00000000..603cdbef --- /dev/null +++ b/test/csq/ENSCAFT00000047742/test.txt @@ -0,0 +1,3 @@ +104 CA C synonymous&splice_acceptor&splice_donor|DUSP3|ENSCAFT00000047742|protein_coding +104 CA C synonymous&splice_acceptor&splice_donor|DUSP3|ENSCAFT00000047742|protein_coding + diff --git a/test/csq/ENSCAFT00000047742/test.vcf b/test/csq/ENSCAFT00000047742/test.vcf new file mode 100644 index 00000000..ff2493d4 --- /dev/null +++ b/test/csq/ENSCAFT00000047742/test.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr9 104 . CA C . PASS EXP=synonymous&splice_acceptor&splice_donor|DUSP3|ENSCAFT00000047742|protein_coding