diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 6ac1beb..9dac7b2 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -434,7 +434,7 @@ def to_unicode(obj, encoding='utf-8'): def canonical_transcripts(db, fasta_filename): import pyfaidx - fasta = pyfaidx.Fasta(fasta_filename, as_raw=True) + fasta = pyfaidx.Fasta(fasta_filename, as_raw=False) for gene in db.features_of_type('gene'): # exons_list will contain (CDS_length, total_length, transcript, [exons]) tuples. @@ -449,20 +449,20 @@ def canonical_transcripts(db, fasta_filename): cds_len += exon_length total_len += exon_length - exon_list.append((cds_len, total_len, transcript, exons)) + exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']] )) # If we have CDS, then use the longest coding transcript if max(i[0] for i in exon_list) > 0: - best = sorted(exon_list)[0] + best = sorted(exon_list, key=lambda x: x[0], reverse=True)[0] # Otherwise, just choose the longest else: - best = sorted(exon_list, lambda x: x[1])[0] + best = sorted(exon_list, key=lambda x: x[1])[0] print(best) canonical_exons = best[-1] transcript = best[-2] - seqs = [i.sequence(fasta) for i in canonical_exons] + seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')] yield transcript, ''.join(seqs)