Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Funcotator - removing the required transcript file #8863

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.broadinstitute.hellbender.tools.funcotator.compositeoutput.CompositeOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.genelistoutput.GeneListOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
Expand Down Expand Up @@ -108,15 +109,19 @@ public FuncotatorEngine(final BaseFuncotatorArgumentCollection funcotatorArgs,
this.funcotatorArgs = funcotatorArgs;
inputMetadata = metadata;

// Determine whether we have to convert given variants from B37 to HG19:
mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility();

dataSourceFactories = funcotationFactories;
// Note: The dataSourceFactories must be sorted to ensure that as we iterate through them
// to create funcotations, the inherent dependencies between different funcotation types are preserved.
// For example, most FuncotationFactories require that a GencodeFuncotation is present before they can
// create their annotations. This sorting enables such dependencies.
dataSourceFactories.sort(DataSourceUtils::datasourceComparator);

// Determine whether we have to convert given variants from B37 to HG19:
mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility();
// Now set the GencodeFuncotationFactory to convert exon sequences to B37 for transcripts, if necessary:
// TODO: (NOTE:) This is a workaround for the B37 / HG19 problem. We really need to make B37 datasources.
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
dataSourceFactories.stream().filter(f -> f.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)).forEach(f -> ((GencodeFuncotationFactory)f).setDoExonContigConversionToB37ForTranscripts(mustConvertInputContigsToHg19));

// Read in the custom variant classification order file here so that it can be shared across all engines:
if (funcotatorArgs.customVariantClassificationOrderFile != null) {
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package org.broadinstitute.hellbender.utils;

import htsjdk.samtools.util.Locatable;
import javassist.Loader;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.funcotator.FuncotatorUtils;

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
* Class with utility methods for working with transcripts.
*/
public class TranscriptUtils {

//==================================================================================================================
// Public Static Members:

/**
* Extracts the transcript sequence from the reference context given the exons.
* The given exons are assumed to be all on the same contig.
* The exons will be sorted in the order they appear in the genome before extracting the sequences.
* @param refContext the reference context. Must not be {@code null}.
* @param exons the exons of the transcript. Must not be {@code null}.
* @return the transcript sequence as coded in the given reference context.
*/
public static final String extractTrascriptFromReference(final ReferenceContext refContext,
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
final List<Locatable> exons) {
return extractTrascriptFromReference(refContext, exons, false);
}

/**
* Extracts the transcript sequence from the reference context given the exons.
* The given exons are assumed to be all on the same contig.
* The exons will be sorted in the order they appear in the genome before extracting the sequences.
* @param refContext the reference context. Must not be {@code null}.
* @param exons the exons of the transcript. Must not be {@code null}.
* @param convertHg19ContigToB37Contig whether to convert the contig names from hg19 to b37.
* @return the transcript sequence as coded in the given reference context.
*/
public static final String extractTrascriptFromReference(final ReferenceContext refContext,
final List<Locatable> exons,
final boolean convertHg19ContigToB37Contig ) {

// TODO: THIS CONVERSION IS BAD. WE MUST REFACTOR THIS EVENTUALLY TO REMOVE THIS AND MAKE SOME B37 DATA SOURCES.
Utils.nonNull(refContext);
Utils.nonNull(exons);

final StringBuilder transcript = new StringBuilder();

// We should iterate through the list of exons in sorted order so we can simply append them together.
for (final Locatable exon : exons.stream().sorted(Comparator.comparingInt(Locatable::getStart).thenComparing(Locatable::getEnd)).toList() ) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again this ordering/overlapping assumption

final SimpleInterval exonInterval;

if ( convertHg19ContigToB37Contig ) {
exonInterval = new SimpleInterval(FuncotatorUtils.convertHG19ContigToB37Contig(exon.getContig()), exon.getStart(), exon.getEnd());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might be a little cleaner to read with a ternary (alternatively make some helper method somewhere called "getConvertedContigName()" (that goes for GencodeFuncotationFactory as well which also threads this a bunch)

}
else {
exonInterval = new SimpleInterval(exon.getContig(), exon.getStart(), exon.getEnd());
}
transcript.append(new String(refContext.getBases(exonInterval)));
}
return transcript.toString();
}

//==================================================================================================================
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
// Private Static Members:

//==================================================================================================================
// Private Members:

//==================================================================================================================
// Constructors:

//==================================================================================================================
// Override Methods:

//==================================================================================================================
// Static Methods:

//==================================================================================================================
// Instance Methods:

//==================================================================================================================
// Helper Data Types:

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
*
* Created by jonn on 7/21/17.
*/
public abstract class GencodeGtfFeature implements Feature, Comparable<GencodeGtfFeature> {
public abstract class GencodeGtfFeature implements Feature, Locatable, Comparable<GencodeGtfFeature> {

private static final Logger logger = LogManager.getLogger(GencodeGtfFeature.class);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,8 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
MafOutputRendererConstants.FieldName_Reference_Allele, MafOutputRendererConstants.FieldName_Tumor_Seq_Allele1, MafOutputRendererConstants.FieldName_Tumor_Seq_Allele2, MafOutputRendererConstants.FieldName_Genome_Change, MafOutputRendererConstants.FieldName_Annotation_Transcript, MafOutputRendererConstants.FieldName_Transcript_Strand, MafOutputRendererConstants.FieldName_Transcript_Exon, MafOutputRendererConstants.FieldName_Transcript_Position, MafOutputRendererConstants.FieldName_cDNA_Change, MafOutputRendererConstants.FieldName_Codon_Change, MafOutputRendererConstants.FieldName_Protein_Change, MafOutputRendererConstants.FieldName_gc_content, MafOutputRendererConstants.FieldName_ref_context, MafOutputRendererConstants.FieldName_Other_Transcripts);

private static String hg38Chr3Ref;
private static String b37Chr3Ref;
private static String b37Chr2Ref;
private static String hg19Chr3Ref;
private static String b37Chr2Ref;
private static String hg19Chr19Ref;

private static String eColiRef;
Expand All @@ -126,9 +125,8 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
}

hg38Chr3Ref = FuncotatorReferenceTestUtils.retrieveHg38Chr3Ref();
b37Chr3Ref = FuncotatorReferenceTestUtils.retrieveB37Chr3Ref();
b37Chr2Ref = FuncotatorReferenceTestUtils.retrieveB37Chr2Ref();
hg19Chr3Ref = FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref();
b37Chr2Ref = FuncotatorReferenceTestUtils.retrieveB37Chr2Ref();
hg19Chr19Ref = FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref();
eColiRef = FuncotatorReferenceTestUtils.retrieveEcoliReference();
}
Expand Down Expand Up @@ -749,7 +747,7 @@ public void testCanAnnotateMixedContigHg19Clinvar() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -788,7 +786,7 @@ public void testManualAnnotationsCorrectness() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -867,7 +865,7 @@ public void testXsvLocatableAnnotationsHaveOnlyOneEntryForMultiHitLocations() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_MULTIHIT_TEST_VCF,
outputFile,
b37Chr2Ref,
b37Reference,
DS_XSV_CLINVAR_TESTS,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -897,7 +895,7 @@ public void testXsvLocatableAnnotationsHaveCorrectColsForOnlyOnePositionSpecifie
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_COL_TEST_VCF,
outputFile,
b37Chr2Ref,
b37Reference,
DS_XSV_CLINVAR_TESTS,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1067,8 +1065,8 @@ public void testExclusionFromDatasourceVcfToVcf() {
@DataProvider(name = "provideForMafVcfConcordance")
final Object[][] provideForMafVcfConcordance() {
return new Object[][]{
{PIK3CA_VCF_HG19_SNPS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 15},
{PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 57},
{PIK3CA_VCF_HG19_SNPS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 15},
{PIK3CA_VCF_HG19_INDELS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 57},
{MUC16_VCF_HG19, hg19Chr19Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER, false, 2057},
{
PIK3CA_VCF_HG38,
Expand All @@ -1080,7 +1078,7 @@ final Object[][] provideForMafVcfConcordance() {
false,
104,
},
{PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, VCF_FIELDS_GENCODE_19_DS, MAF_FIELDS_GENCODE_DS, DS_PIK3CA_DIR, true, 57},
{PIK3CA_VCF_HG19_INDELS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, VCF_FIELDS_GENCODE_19_DS, MAF_FIELDS_GENCODE_DS, DS_PIK3CA_DIR, true, 57},
};
}

Expand Down Expand Up @@ -1325,7 +1323,7 @@ public void testVcfDatasourceAccountsForAltAlleles() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19_ALTS,
vcfOutputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
vcfOutputFormatType,
Expand Down Expand Up @@ -1463,7 +1461,7 @@ private AnnotatedIntervalCollection runPik3caHg19VcfToMaf(final Set<String> excl
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorArgumentDefinitions.OutputFormatType.MAF,
Expand Down Expand Up @@ -1502,7 +1500,7 @@ public void testVCFToVCFPreservesFields() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1555,7 +1553,7 @@ public void testMafCustomCountFields(final String tnVcf) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
tnVcf,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1611,7 +1609,7 @@ public void testMafCustomCountFieldsTumorOnly(final String tnVcf) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
tnVcf,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1665,7 +1663,7 @@ private void runSomaticVcf(final String vcfFile) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
vcfFile,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand All @@ -1685,7 +1683,7 @@ public void testSequenceDictionaryCheck() {
// The input VCF and Reference file are incompatible because
// the reference file dictionary has only chromosome 2 and the
// input VCF has a dictionary that contains all contigs for HG19.
// Therefore the reference dictionary is NOT a superset of the input VCF dictionary.
// Therefore, the reference dictionary is NOT a superset of the input VCF dictionary.

final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_COL_TEST_VCF,
Expand Down Expand Up @@ -1747,7 +1745,7 @@ public void testNoVariantsProduceMaf() {

arguments.addVCF(new File(EMPTY_VCF));
arguments.addOutput(outputFile);
arguments.addReference(new File(b37Chr3Ref));
arguments.addReference(new File(b37Reference));
arguments.add(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR);
arguments.add(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19);
arguments.add(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString());
Expand Down Expand Up @@ -1777,7 +1775,7 @@ public void testEnsureDbSnpInMaf() {

arguments.addVCF(new File(MAF_DBSNP_TEST));
arguments.addOutput(outputFile);
arguments.addReference(new File(b37Chr3Ref));
arguments.addReference(new File(b37Reference));
arguments.add(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, PIK3CA_DBSNP_DS);
arguments.add(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19);
arguments.add(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1353,8 +1353,7 @@ void testGetVariantClassificationForCodingRegions(final int chromosomeNumber,
transcript,
exonPositionList,
muc16TranscriptIdMap,
muc16TranscriptDataSource,
true);
false);

final GencodeFuncotation.VariantClassification varClass = GencodeFuncotationFactory.createVariantClassification(
variantContext,
Expand Down Expand Up @@ -1442,8 +1441,7 @@ void testSpliceSiteWindowSettings(final int chromosomeNumber,
transcript,
exonPositionList,
muc16TranscriptIdMap,
muc16TranscriptDataSource,
true);
false);

final GencodeFuncotation.VariantClassification varClass = GencodeFuncotationFactory.createVariantClassification(
variantContext,
Expand Down Expand Up @@ -2739,10 +2737,23 @@ public void testFivePrimeFlankSorting() {
new FlankSettings(5000, 0),
"TEST")) {

// We have to set the contig conversion because of the datasources / B37 mismatch:
funcotationFactory.setDoExonContigConversionToB37ForTranscripts(true);

final List<Funcotation> gencodeFuncotationList = funcotationFactory.createFuncotations(vcHg19, referenceContext, featureContext);

System.out.println(gencodeFuncotationList.size());
}
}

// @Test
// public void testGetCodingSequenceFromTranscriptFasta() {
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
//
//
// getCodingSequenceFromReferenceContext();
//
// final String transcriptFastaFile = FuncotatorTestConstants.PIK3CA_ALL_TRANSCRIPTS_GENCODE_TRANSCRIPT_FASTA_FILE;
// final String transcriptId = "ENST00000263967.1";
// final String expectedCodingSequence = "ATGAGTGGGAGCTGCT";
// }
}
Loading
Loading