From 62bc4ed46ef1684e6fbc05bc169910b771834ec8 Mon Sep 17 00:00:00 2001 From: evgeny Date: Mon, 2 Mar 2020 12:26:12 +0200 Subject: [PATCH 1/4] gffutils.feature expects Sequence objects from pyfaidx, not raw strings --- gffutils/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 6ac1bebf..adf9f419 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -434,7 +434,7 @@ def to_unicode(obj, encoding='utf-8'): def canonical_transcripts(db, fasta_filename): import pyfaidx - fasta = pyfaidx.Fasta(fasta_filename, as_raw=True) + fasta = pyfaidx.Fasta(fasta_filename, as_raw=False) for gene in db.features_of_type('gene'): # exons_list will contain (CDS_length, total_length, transcript, [exons]) tuples. From ba2995f6e500c146795da722fae5e36ba65c9998 Mon Sep 17 00:00:00 2001 From: evgeny Date: Mon, 2 Mar 2020 12:40:51 +0200 Subject: [PATCH 2/4] Only take CDS length into account when it exists (otherwise uncomparable elements such as Features might get into play) --- gffutils/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffutils/helpers.py b/gffutils/helpers.py index adf9f419..e79cfd7e 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -453,7 +453,7 @@ def canonical_transcripts(db, fasta_filename): # If we have CDS, then use the longest coding transcript if max(i[0] for i in exon_list) > 0: - best = sorted(exon_list)[0] + best = sorted(exon_list, key=lambda x: x[0], reverse=True)[0] # Otherwise, just choose the longest else: best = sorted(exon_list, lambda x: x[1])[0] From 87fa72a466657d9572d4b45aeee3d2326c20dce7 Mon Sep 17 00:00:00 2001 From: evgeny Date: Mon, 2 Mar 2020 12:49:04 +0200 Subject: [PATCH 3/4] key argument name must be explicitly specified in Python3.6+ --- gffutils/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffutils/helpers.py b/gffutils/helpers.py index e79cfd7e..8e6f8fbd 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -456,7 +456,7 @@ def canonical_transcripts(db, fasta_filename): best = sorted(exon_list, key=lambda x: x[0], reverse=True)[0] # Otherwise, just choose the longest else: - best = sorted(exon_list, lambda x: x[1])[0] + best = sorted(exon_list, key=lambda x: x[1])[0] print(best) From 5c78d2db238afa2a02092237ccc34e83d00a2640 Mon Sep 17 00:00:00 2001 From: evgeny Date: Mon, 2 Mar 2020 13:38:24 +0200 Subject: [PATCH 4/4] Use only CDS exons if CDS exists and don't assume sorting. Tested against Ensembl GFFs --- gffutils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 8e6f8fbd..9dac7b2a 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -449,7 +449,7 @@ def canonical_transcripts(db, fasta_filename): cds_len += exon_length total_len += exon_length - exon_list.append((cds_len, total_len, transcript, exons)) + exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']] )) # If we have CDS, then use the longest coding transcript if max(i[0] for i in exon_list) > 0: @@ -462,7 +462,7 @@ def canonical_transcripts(db, fasta_filename): canonical_exons = best[-1] transcript = best[-2] - seqs = [i.sequence(fasta) for i in canonical_exons] + seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')] yield transcript, ''.join(seqs)