From ba82829a4b37c48d95cefffc3ae878b0986d261a Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Wed, 10 Sep 2025 10:58:37 -0400
Subject: [PATCH 1/9] Build RNU4ATAC gene patches and export to ES
---
.../src/data_pipeline/data_types/gene.py | 60 +++++++++++++++++++
.../pipelines/export_to_elasticsearch.py | 10 ++++
.../data_pipeline/pipelines/gene_patches.py | 19 ++++++
3 files changed, 89 insertions(+)
create mode 100644 data-pipeline/src/data_pipeline/pipelines/gene_patches.py
diff --git a/data-pipeline/src/data_pipeline/data_types/gene.py b/data-pipeline/src/data_pipeline/data_types/gene.py
index cfa469b19..97de00c33 100644
--- a/data-pipeline/src/data_pipeline/data_types/gene.py
+++ b/data-pipeline/src/data_pipeline/data_types/gene.py
@@ -117,6 +117,66 @@ def reject_par_y_genes(genes_path=None):
return genes
+def patch_rnu4atac(genes_path=None):
+    """Return the RNU4ATAC rows of a genes table with corrected GRCh38 annotations.
+
+    Coordinates, exons, transcripts, and the MANE Select transcript are overwritten.
+    IDs, chrom, strand, reference genome, and GTEx expression are copied from the
+    first transcript of the first matching row in the table at ``genes_path``.
+    """
+    gene_symbol = "RNU4ATAC"
+    genes = hl.read_table(genes_path)
+    genes = genes.filter(genes.symbol == gene_symbol)
+
+    correct_start = 121530880
+    correct_stop = 121531009
+    correct_start_locus = hl.locus(contig="chr2", pos=correct_start, reference_genome="GRCh38")
+    correct_stop_locus = hl.locus(contig="chr2", pos=correct_stop, reference_genome="GRCh38")
+    correct_xstart = x_position(correct_start_locus)
+    correct_xstop = x_position(correct_stop_locus)
+    correct_interval = hl.interval(correct_start_locus, correct_stop_locus, includes_start=True, includes_end=True)
+    correct_exon = hl.struct(
+        feature_type="exon", start=correct_start, stop=correct_stop, xstart=correct_xstart, xstop=correct_xstop
+    )
+    # take() collects the existing row locally so its unchanged fields can be reused below.
+    incorrect_transcript = genes.take(1)[0].transcripts[0]
+    correct_transcript = hl.struct(
+        interval=correct_interval,
+        transcript_version="2",
+        gene_version="2",
+        start=correct_start,
+        stop=correct_stop,
+        xstart=correct_xstart,
+        xstop=correct_xstop,
+        exons=hl.array([correct_exon]),
+        transcript_id=incorrect_transcript.transcript_id,
+        gene_id=incorrect_transcript.gene_id,
+        chrom=incorrect_transcript.chrom,
+        strand=incorrect_transcript.strand,
+        reference_genome=incorrect_transcript.reference_genome,
+        gtex_tissue_expression=incorrect_transcript.gtex_tissue_expression,
+        refseq_id="NR_023343",
+        refseq_version="3",
+    )
+    correct_mane_select_transcript = hl.struct(
+        matched_gene_version="2",
+        ensembl_id="ENST00000580972",
+        ensembl_version="2",
+        refseq_id="NR_023343",
+        refseq_version="3",
+    )
+    return genes.annotate(
+        gene_version="2",  # a string, like every other version field set in this function
+        start=correct_start,
+        stop=correct_stop,
+        xstart=correct_xstart,
+        xstop=correct_xstop,
+        exons=[correct_exon],
+        transcripts=[correct_transcript],
+        mane_select_transcript=correct_mane_select_transcript,
+    )
+
+
###############################################
# Transcripts #
###############################################
diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
index bf074c45b..680a96f41 100644
--- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
+++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
@@ -41,6 +41,7 @@
from data_pipeline.pipelines.gnomad_v4_cnvs import pipeline as gnomad_v4_cnvs_pipeline
from data_pipeline.pipelines.gnomad_v4_lof_curation_results import pipeline as gnomad_v4_lof_curation_results_pipeline
+from data_pipeline.pipelines.gene_patches import pipeline as gnomad_v4_gene_patches
logger = logging.getLogger("gnomad_data_pipeline")
@@ -88,6 +89,15 @@ def add_liftover_document_id(ds):
"block_size": 200,
},
},
+ "gene_patches": {
+ "get_table": lambda: hl.read_table(gnomad_v4_gene_patches.get_output("gene_patches").get_output_path()),
+ "args": {
+ "index": "genes_grch38_patches",
+ "index_fields": ["gene_id", "symbol_upper_case", "search_terms", "xstart", "xstop"],
+ "id_field": "gene_id",
+ "block_size": 200,
+ },
+ },
##############################################################################################################
# Transcripts
##############################################################################################################
diff --git a/data-pipeline/src/data_pipeline/pipelines/gene_patches.py b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py
new file mode 100644
index 000000000..8c24ff183
--- /dev/null
+++ b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py
@@ -0,0 +1,19 @@
+import hail as hl
+
+from data_pipeline.pipeline import Pipeline, run_pipeline
+
+from data_pipeline.data_types.gene import patch_rnu4atac
+
+pipeline = Pipeline()
+# Arguments appear to be: task name, function, output path, keyword inputs — confirm against Pipeline.add_task.
+pipeline.add_task(
+    "patch_rnu4atac_grch38",
+    patch_rnu4atac,
+    "/genes/genes_grch38_patched.ht",
+    {"genes_path": "gs://gnomad-v4-data-pipeline/output/genes/genes_grch38_annotated_6.ht"},
+)
+# Expose that task's result as "gene_patches", the output name the Elasticsearch export asks for.
+pipeline.set_outputs({"gene_patches": "patch_rnu4atac_grch38"})
+
+if __name__ == "__main__":
+    run_pipeline(pipeline)
From 2edf0107875a1592fc486530e64206ce97362975 Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Wed, 10 Sep 2025 10:41:00 -0400
Subject: [PATCH 2/9] Use RNU4ATAC gene patches in API
---
.../data_pipeline/pipelines/gene_patches.py | 2 -
graphql-api/src/elasticsearch.ts | 24 ++-
graphql-api/src/queries/gene-queries.ts | 200 +++++++++++++-----
.../queries/helpers/elasticsearch-helpers.ts | 23 +-
4 files changed, 175 insertions(+), 74 deletions(-)
diff --git a/data-pipeline/src/data_pipeline/pipelines/gene_patches.py b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py
index 8c24ff183..022b18613 100644
--- a/data-pipeline/src/data_pipeline/pipelines/gene_patches.py
+++ b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py
@@ -1,5 +1,3 @@
-import hail as hl
-
from data_pipeline.pipeline import Pipeline, run_pipeline
from data_pipeline.data_types.gene import patch_rnu4atac
diff --git a/graphql-api/src/elasticsearch.ts b/graphql-api/src/elasticsearch.ts
index 7638d66c3..6ebd46380 100644
--- a/graphql-api/src/elasticsearch.ts
+++ b/graphql-api/src/elasticsearch.ts
@@ -82,8 +82,8 @@ const scheduleElasticsearchRequest = (fn: any) => {
const limitedElastic = {
indices: elastic.indices,
clearScroll: elastic.clearScroll.bind(elastic),
- search: (...args: Parameters) =>
- scheduleElasticsearchRequest(() => elastic.search(...args)).then((response) => {
+ search: (args: elasticsearch.RequestParams.Search) =>
+ scheduleElasticsearchRequest(() => elastic.search(args)).then((response) => {
// @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'.
if (response.body.timed_out) {
throw new Error('Elasticsearch search timed out')
@@ -95,8 +95,8 @@ const limitedElastic = {
}
return response
}),
- scroll: (...args: Parameters) =>
- scheduleElasticsearchRequest(() => elastic.scroll(...args)).then((response) => {
+ scroll: (args: { scroll: string; scrollId?: string }) =>
+ scheduleElasticsearchRequest(() => elastic.scroll(args)).then((response) => {
// @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'.
if (response.body.timed_out) {
throw new Error('Elasticsearch scroll timed out')
@@ -117,10 +117,22 @@ const limitedElastic = {
}
return response
}),
- get: (...args: Parameters) =>
- scheduleElasticsearchRequest(() => elastic.get(...args)),
+ get: (args: { index: string; type: '_doc'; id: string }) =>
+ scheduleElasticsearchRequest(() => elastic.get(args)),
mget: (...args: Parameters) =>
scheduleElasticsearchRequest(() => elastic.mget(...args)),
}
+export type LimitedElasticClient = typeof limitedElastic
+
+export type GetResponse = {
+ body: { _source: { value: Record } }
+}
+
+export type SearchHit = { _id: string; _source: any }
+
+export type SearchResponse = {
+ body: { hits: { total: { value: number }; hits: SearchHit[] }; _scroll_id?: string }
+}
+
export { limitedElastic as client }
diff --git a/graphql-api/src/queries/gene-queries.ts b/graphql-api/src/queries/gene-queries.ts
index ac7858c21..57b05e986 100644
--- a/graphql-api/src/queries/gene-queries.ts
+++ b/graphql-api/src/queries/gene-queries.ts
@@ -1,43 +1,69 @@
+import elasticsearch from '@elastic/elasticsearch'
import { withCache } from '../cache'
import { fetchAllSearchResults } from './helpers/elasticsearch-helpers'
-const GENE_INDICES = {
- GRCh37: 'genes_grch37',
- GRCh38: 'genes_grch38',
-}
+import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata'
+import { LimitedElasticClient, GetResponse, SearchResponse, SearchHit } from '../elasticsearch'
-const _fetchGeneById = async (esClient: any, geneId: any, referenceGenome: any) => {
- try {
- const response = await esClient.get({
- // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
- index: GENE_INDICES[referenceGenome],
- type: '_doc',
- id: geneId,
- })
+type GeneIndex = 'genes_grch37' | 'genes_grch38' | 'genes_grch38_patches-2025-10-23--19-35'
+
+type GeneSearchRegion = { reference_genome: ReferenceGenome; xstart: number; xstop: number }
- return response.body._source.value
- } catch (err) {
- // meta will not be present if the request times out in the queue before reaching ES
- // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'.
- if (err.meta && err.meta.body && err.meta.body.found === false) {
- return null
+const GENE_INDICES: Record = {
+ // Order matters here: later indices take precedence over earlier
+ GRCh37: ['genes_grch37'],
+ GRCh38: ['genes_grch38', 'genes_grch38_patches-2025-10-23--19-35'],
+}
+
+const _fetchGeneById = async (
+ esClient: LimitedElasticClient,
+ geneId: string,
+ referenceGenome: ReferenceGenome
+) => {
+ const indices = GENE_INDICES[referenceGenome]
+ const requests = indices.map(
+ (index) =>
+ esClient
+ .get({
+ index,
+ type: '_doc',
+ id: geneId,
+ })
+ .catch((err) => {
+ // meta will not be present if the request times out in the queue before reaching ES
+ if (err.meta && err.meta.body && err.meta.body.found === false) {
+ return null
+ }
+ throw err
+ }) as Promise
+ )
+ return Promise.all(requests).then(
+ (responses) => {
+ const responsesWithValue = responses.filter((response) => response !== null)
+ return responsesWithValue.length > 0
+ ? responsesWithValue[responsesWithValue.length - 1]!.body._source.value
+ : null
+ },
+ (err) => {
+ throw err
}
- throw err
- }
+ )
}
export const fetchGeneById = withCache(
_fetchGeneById,
- (_: any, geneId: any, referenceGenome: any) => `gene:${geneId}:${referenceGenome}`,
+ (_: any, geneId: string, referenceGenome: ReferenceGenome) => `gene:${geneId}:${referenceGenome}`,
{ expiration: 86400 }
)
-export const fetchGeneBySymbol = async (esClient: any, geneSymbol: any, referenceGenome: any) => {
- const response = await esClient.search({
- // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
- index: GENE_INDICES[referenceGenome],
- type: '_doc',
+export const fetchGeneBySymbol = async (
+ esClient: LimitedElasticClient,
+ geneSymbol: string,
+ referenceGenome: ReferenceGenome
+) => {
+ const indices = GENE_INDICES[referenceGenome]
+ const responses = await searchMultipleIndices(esClient, indices, {
body: {
query: {
bool: {
@@ -48,20 +74,22 @@ export const fetchGeneBySymbol = async (esClient: any, geneSymbol: any, referenc
size: 1,
})
- if (response.body.hits.total.value === 0) {
+ const responsesWithValue = responses.filter((response) => response.body.hits.total.value > 0)
+ if (responsesWithValue.length === 0) {
return null
}
- return response.body.hits.hits[0]._source.value
+ return responsesWithValue[responsesWithValue.length - 1].body.hits.hits[0]._source.value
}
-export const fetchGenesByRegion = async (esClient: any, region: any) => {
- const { reference_genome: referenceGenome, xstart, xstop } = region
+export const fetchGenesByRegion = async (
+ esClient: LimitedElasticClient,
+ region: GeneSearchRegion
+) => {
+ const { reference_genome, xstart, xstop } = region
+ const indices = GENE_INDICES[reference_genome]
- const hits = await fetchAllSearchResults(esClient, {
- // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
- index: GENE_INDICES[referenceGenome],
- type: '_doc',
+ const hits = await fetchAllSearchResultsFromMultipleIndices(esClient, indices, {
size: 200,
_source: [
'value.exons',
@@ -98,28 +126,91 @@ export const fetchGenesByRegion = async (esClient: any, region: any) => {
},
})
- return hits.map((hit: any) => hit._source.value)
+ const mergedHits = mergeHitsById(hits.flat())
+ return mergedHits.map((hit) => hit._source.value)
+}
+
+const fetchAllSearchResultsFromMultipleIndices = async (
+ esClient: LimitedElasticClient,
+ indices: string[],
+ searchParams: elasticsearch.RequestParams.Search
+) => {
+ const requests = indices.map((index) =>
+ fetchAllSearchResults(esClient, {
+ index,
+ type: '_doc',
+ ...searchParams,
+ })
+ )
+ return Promise.all(requests)
}
-export const fetchGenesMatchingText = async (esClient: any, query: any, referenceGenome: any) => {
+const searchMultipleIndices = async (
+  esClient: LimitedElasticClient,
+  indices: string[],
+  searchParams: elasticsearch.RequestParams.Search
+): Promise<SearchResponse[]> => {
+  // Issue one search per index concurrently. `searchParams` is spread last, so
+  // any `index` or `type` it carries overrides the per-index defaults.
+  const searches = indices.map(
+    (index) =>
+      esClient.search({ index, type: '_doc', ...searchParams }) as Promise<SearchResponse>
+  )
+
+  // Promise.all keeps input order: the result lines up with `indices`, and the
+  // whole call rejects as soon as any single search rejects.
+  return Promise.all(searches)
+}
+
+const mergeHitsById = (hits: SearchHit[]): SearchHit[] => {
+  // A Map iterates in first-insertion order and re-setting a key keeps its slot,
+  // so each _id appears once, at the position of its first hit, carrying the last
+  // hit seen for that _id. Unlike a plain-object lookup, a Map never mistakes an
+  // _id such as "constructor" for an inherited property.
+  const hitsById = new Map<string, SearchHit>()
+  hits.forEach((hit) => {
+    hitsById.set(hit._id, hit)
+  })
+  return Array.from(hitsById.values())
+}
+
+const mergeResponsesById = (responses: SearchResponse[]) => {
+  // Collects each hit's _source keyed by _id. A Map iterates in first-insertion
+  // order and re-setting a key keeps its slot, so every _id appears once, at the
+  // position where it was first seen, with the _source from the last response
+  // that contained it.
+  const docsById = new Map<string, any>()
+  responses.forEach((response) =>
+    response.body.hits.hits.forEach((hit) => {
+      docsById.set(hit._id, hit._source)
+    })
+  )
+
+  return Array.from(docsById.values())
+}
+
+export const fetchGenesMatchingText = async (
+ esClient: LimitedElasticClient,
+ query: string,
+ referenceGenome: ReferenceGenome
+) => {
const upperCaseQuery = query.toUpperCase()
// Ensembl ID
if (/^ENSG\d{11}$/.test(upperCaseQuery)) {
const gene = await _fetchGeneById(esClient, upperCaseQuery, referenceGenome)
- return [
- {
- ensembl_id: gene.gene_id,
- symbol: gene.symbol,
- },
- ]
+ return (
+ gene && [
+ {
+ ensembl_id: gene.gene_id,
+ symbol: gene.symbol,
+ },
+ ]
+ )
}
// Symbol
- const response = await esClient.search({
- // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
- index: GENE_INDICES[referenceGenome],
- type: '_doc',
+ const responses = await searchMultipleIndices(esClient, GENE_INDICES[referenceGenome], {
_source: ['gene_id', 'value.gene_version', 'value.symbol'],
body: {
query: {
@@ -134,15 +225,16 @@ export const fetchGenesMatchingText = async (esClient: any, query: any, referenc
size: 5,
})
- if (response.body.hits.total.value === 0) {
+ const responsesWithValue = responses.filter((response) => response.body.hits.total.value !== 0)
+ if (responsesWithValue.length === 0) {
return []
}
- return response.body.hits.hits
- .map((hit: any) => hit._source)
- .map((doc: any) => ({
- ensembl_id: doc.gene_id,
- ensembl_version: doc.value.gene_version,
- symbol: doc.value.symbol,
- }))
+ const mergedDocs = mergeResponsesById(responsesWithValue)
+
+ return mergedDocs.map((doc) => ({
+ ensembl_id: doc.gene_id,
+ ensembl_version: doc.value.gene_version,
+ symbol: doc.value.symbol,
+ }))
}
diff --git a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
index 5ec797ddb..ee70cb1fa 100644
--- a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
+++ b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
@@ -1,43 +1,42 @@
+import { LimitedElasticClient, SearchResponse, SearchHit } from '../../elasticsearch'
+
/**
* Search and then scroll to retrieve all pages of search results.
*
- * @param {elasticsearch.Client} client Elasticsearch client
- * @param {Object} searchParams Argument to elasticsearch.Client#search
- * @return {Object[]} Combined list of hits from all responses
*/
-export const fetchAllSearchResults = async (client: any, searchParams: any) => {
- const allResults: any = []
- const responseQueue = []
+export const fetchAllSearchResults = async (client: LimitedElasticClient, searchParams: any) => {
+ const allResults: SearchHit[] = []
+ const responseQueue: SearchResponse[] = []
const size = searchParams.size || 1000
const scroll = searchParams.scroll || '30s'
responseQueue.push(
- await client.search({
+ await (client.search({
...searchParams,
scroll,
size,
- })
+ }) as Promise)
)
while (responseQueue.length) {
- const response = responseQueue.shift()
+ const response = responseQueue.shift()!
allResults.push(...response.body.hits.hits)
if (allResults.length === response.body.hits.total.value) {
// eslint-disable-next-line no-await-in-loop
await client.clearScroll({
- scrollId: response.body._scroll_id, // eslint-disable-line no-underscore-dangle
+ scroll_id: response.body._scroll_id, // eslint-disable-line no-underscore-dangle
})
break
}
responseQueue.push(
// eslint-disable-next-line no-await-in-loop
- await client.scroll({
+ await (client.scroll({
scroll,
scrollId: response.body._scroll_id, // eslint-disable-line no-underscore-dangle
- })
+ }) as Promise)
)
}
From 17d59da0c1901049133b82ac599c8a3791d6a891 Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Tue, 30 Sep 2025 15:30:34 -0400
Subject: [PATCH 3/9] Build RNU4ATAC variant patches and export to ES
---
.../variant/patch_rnu4atac_variants.py | 48 +++++++++++++++++++
.../annotate_transcript_consequences.py | 48 ++++++++++---------
.../pipelines/export_to_elasticsearch.py | 24 ++++++++++
.../pipelines/variant_patches.py | 20 ++++++++
4 files changed, 118 insertions(+), 22 deletions(-)
create mode 100644 data-pipeline/src/data_pipeline/data_types/variant/patch_rnu4atac_variants.py
create mode 100644 data-pipeline/src/data_pipeline/pipelines/variant_patches.py
diff --git a/data-pipeline/src/data_pipeline/data_types/variant/patch_rnu4atac_variants.py b/data-pipeline/src/data_pipeline/data_types/variant/patch_rnu4atac_variants.py
new file mode 100644
index 000000000..e682c398e
--- /dev/null
+++ b/data-pipeline/src/data_pipeline/data_types/variant/patch_rnu4atac_variants.py
@@ -0,0 +1,48 @@
+import hail as hl
+
+from data_pipeline.data_types.variant.transcript_consequence.annotate_transcript_consequences import (
+ annotate_transcript_consequences_in_table,
+)
+
+
+def patch_rnu4atac_variants(vepped_path=None, freq_path=None, transcripts_data=None):
+    """Build patched transcript consequences for variants in RNU4ATAC.
+
+    Keeps only ENST00000580972 consequences from the VEP table at ``vepped_path``,
+    reformats them, and joins them to the variants table at ``freq_path``. Returns
+    a table with the variant index fields plus the patched ``transcript_consequences``.
+    ``transcripts_data`` is accepted for backward compatibility and is not used.
+    """
+    veps = hl.read_table(vepped_path)
+    freqs = hl.read_table(freq_path)
+    # Drop all consequences except for gene RNU4ATAC and transcript ENST00000580972
+    veps = veps.filter(veps.vep.transcript_consequences.any(lambda tc: tc.gene_symbol == "RNU4ATAC"))
+    veps = veps.annotate(
+        vep=veps.vep.annotate(
+            transcript_consequences=veps.vep.transcript_consequences.filter(
+                lambda tc: tc.transcript_id == "ENST00000580972"
+            )
+        )
+    )
+    veps = veps.filter(veps.vep.transcript_consequences.length() > 0)
+    # No transcripts table is passed, so versions are not looked up; they are set explicitly below.
+    veps = annotate_transcript_consequences_in_table(veps)
+    # Filter again: annotate_transcript_consequences_in_table removes consequences with unimportant terms
+    veps = veps.filter(veps.transcript_consequences.length() > 0)
+    veps = veps.annotate(
+        transcript_consequences=veps.transcript_consequences.map(
+            lambda tc: tc.annotate(
+                transcript_version="2",
+                gene_version="2",
+                is_mane_select=False,
+                is_mane_select_version=False,
+                refseq_id=hl.null(hl.tstr),
+                refseq_version=hl.null(hl.tstr),
+            ).drop("polyphen_prediction", "sift_prediction")
+        )
+    )
+    # The patched consequences replace the ones already present on the variants table.
+    freqs = freqs.drop("transcript_consequences")
+    veps = veps.join(freqs)
+    # Include just consequences and index fields
+    return veps.select(veps.variant_id, veps.rsids, veps.caid, veps.vrs, veps.transcript_consequences)
diff --git a/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py b/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py
index 1ec656b03..103d876ce 100644
--- a/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py
+++ b/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py
@@ -3,13 +3,18 @@
from .hgvs import hgvsp_from_consequence_amino_acids
from .vep import consequence_term_rank
-
OMIT_CONSEQUENCE_TERMS = hl.set(["upstream_gene_variant", "downstream_gene_variant"])
+# ruff doesn't like explicit comparisons to None, but we need them in here, so:
+# ruff: noqa: E711
+
-def annotate_transcript_consequences(variants_path, transcripts_path, mane_transcripts_path=None):
+def annotate_transcript_consequences(variants_path, transcripts_path=None, mane_transcripts_path=None):
ds = hl.read_table(variants_path)
+ return annotate_transcript_consequences_in_table(ds, transcripts_path, mane_transcripts_path)
+
+def annotate_transcript_consequences_in_table(ds, transcripts_path=None, mane_transcripts_path=None):
most_severe_consequence = ds.vep.most_severe_consequence
transcript_consequences = ds.vep.transcript_consequences
@@ -62,26 +67,25 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
transcript_consequences = transcript_consequences.map(lambda c: c.select(*consequences))
- transcripts = hl.read_table(transcripts_path)
-
- # TODO: This can potentially be improved by removing Table.collect
- # See https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/Optimize.20annotation.20with.20small.20dataset
- # and https://github.com/Nealelab/ukb_common/blob/ad94d20f8c9f3b711e40a473425925775f0b1f30/utils/generic.py#L18
- transcript_info = hl.dict(
- [
- (row.transcript_id, row.transcript_info)
- for row in transcripts.select(
- transcript_info=hl.struct(
- transcript_version=transcripts.transcript_version,
- gene_version=transcripts.gene.gene_version,
- )
- ).collect()
- ]
- )
-
- transcript_consequences = transcript_consequences.map(
- lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id))
- )
+ if transcripts_path != None:
+ transcripts = hl.read_table(transcripts_path)
+ # TODO: This can potentially be improved by removing Table.collect
+ # See https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/Optimize.20annotation.20with.20small.20dataset
+ # and https://github.com/Nealelab/ukb_common/blob/ad94d20f8c9f3b711e40a473425925775f0b1f30/utils/generic.py#L18
+ transcript_info = hl.dict(
+ [
+ (row.transcript_id, row.transcript_info)
+ for row in transcripts.select(
+ transcript_info=hl.struct(
+ transcript_version=transcripts.transcript_version,
+ gene_version=transcripts.gene.gene_version,
+ )
+ ).collect()
+ ]
+ )
+ transcript_consequences = transcript_consequences.map(
+ lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id))
+ )
if mane_transcripts_path:
mane_transcripts = hl.read_table(mane_transcripts_path)
diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
index 680a96f41..acdbe8be2 100644
--- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
+++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
@@ -143,6 +143,30 @@ def add_liftover_document_id(ds):
"block_size": 1_000,
},
},
+ "gnomad_v4_variant_patches": {
+ "get_table": lambda: subset_table(
+ add_variant_document_id(
+ hl.read_table(
+ "gs://gnomad-browser-data-pipeline/phil-scratch/output/gnomad_v4/gnomad_v4_variants_patched.ht"
+ )
+ )
+ ),
+ "args": {
+ "index": "gnomad_v4_variants_patches",
+ "index_fields": [
+ "document_id",
+ "variant_id",
+ "rsids",
+ "caid",
+ "locus",
+ "transcript_consequences.gene_id",
+ "transcript_consequences.transcript_id",
+ "vrs.alt.allele_id",
+ ],
+ "id_field": "document_id",
+ "block_size": 1_000,
+ },
+ },
"gnomad_v4_exome_coverage": {
"get_table": lambda: subset_table(
hl.read_table(gnomad_v4_coverage_pipeline.get_output("exome_coverage").get_output_path())
diff --git a/data-pipeline/src/data_pipeline/pipelines/variant_patches.py b/data-pipeline/src/data_pipeline/pipelines/variant_patches.py
new file mode 100644
index 000000000..0762e6063
--- /dev/null
+++ b/data-pipeline/src/data_pipeline/pipelines/variant_patches.py
@@ -0,0 +1,20 @@
+from data_pipeline.pipeline import Pipeline, run_pipeline
+
+from data_pipeline.data_types.variant.patch_rnu4atac_variants import patch_rnu4atac_variants
+
+pipeline = Pipeline()
+# Arguments appear to be: task name, function, output path, keyword inputs — confirm against Pipeline.add_task.
+pipeline.add_task(
+    "patch_rnu4atac_variants",
+    patch_rnu4atac_variants,
+    "/gnomad_v4/gnomad_v4_variants_patched.ht",
+    {
+        "vepped_path": "gs://gnomad-v4-data-pipeline/inputs/secondary-analyses/gnomad_v4.1.RNU4ATAC.vep115.ht",
+        "freq_path": "gs://gnomad-v4-data-pipeline/output/gnomad_v4/gnomad_v4_variants_annotated_4.ht",
+    },
+)
+# NOTE(review): the ES export in this patch reads a hard-coded gs:// path, not this output — confirm they match.
+pipeline.set_outputs({"variant_patches": "patch_rnu4atac_variants"})
+
+if __name__ == "__main__":
+    run_pipeline(pipeline)
From 517188951144b6e909b7a82eac4e8d0ecae8f8ca Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Wed, 15 Oct 2025 14:31:33 -0400
Subject: [PATCH 4/9] Use RNU4ATAC variant patches in API
---
graphql-api/src/queries/gene-queries.ts | 17 +-
.../queries/helpers/elasticsearch-helpers.ts | 16 ++
.../gnomad-v4-variant-queries.ts | 247 ++++++++++++------
3 files changed, 182 insertions(+), 98 deletions(-)
diff --git a/graphql-api/src/queries/gene-queries.ts b/graphql-api/src/queries/gene-queries.ts
index 57b05e986..51e28bdaf 100644
--- a/graphql-api/src/queries/gene-queries.ts
+++ b/graphql-api/src/queries/gene-queries.ts
@@ -1,7 +1,7 @@
import elasticsearch from '@elastic/elasticsearch'
import { withCache } from '../cache'
-import { fetchAllSearchResults } from './helpers/elasticsearch-helpers'
+import { fetchAllSearchResultsFromMultipleIndices } from './helpers/elasticsearch-helpers'
import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata'
import { LimitedElasticClient, GetResponse, SearchResponse, SearchHit } from '../elasticsearch'
@@ -130,21 +130,6 @@ export const fetchGenesByRegion = async (
return mergedHits.map((hit) => hit._source.value)
}
-const fetchAllSearchResultsFromMultipleIndices = async (
- esClient: LimitedElasticClient,
- indices: string[],
- searchParams: elasticsearch.RequestParams.Search
-) => {
- const requests = indices.map((index) =>
- fetchAllSearchResults(esClient, {
- index,
- type: '_doc',
- ...searchParams,
- })
- )
- return Promise.all(requests)
-}
-
const searchMultipleIndices = async (
esClient: LimitedElasticClient,
indices: string[],
diff --git a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
index ee70cb1fa..a18f237cb 100644
--- a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
+++ b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
@@ -1,3 +1,4 @@
+import elasticsearch from '@elastic/elasticsearch'
import { LimitedElasticClient, SearchResponse, SearchHit } from '../../elasticsearch'
/**
@@ -43,6 +44,21 @@ export const fetchAllSearchResults = async (client: LimitedElasticClient, search
return allResults
}
+export const fetchAllSearchResultsFromMultipleIndices = async (
+  esClient: LimitedElasticClient,
+  indices: string[],
+  searchParams: elasticsearch.RequestParams.Search
+) => {
+  // Page through every index concurrently. `searchParams` is spread last, so any
+  // `index` or `type` it carries overrides the per-index defaults.
+  const pendingHitLists = indices.map((index) =>
+    fetchAllSearchResults(esClient, { index, type: '_doc', ...searchParams })
+  )
+  // Resolves to one array of hits per index, in the same order as `indices`.
+  const hitLists = await Promise.all(pendingHitLists)
+  return hitLists
+}
+
// Retrieve index metadata set by data pipeline
export const fetchIndexMetadata = async (esClient: any, index: any) => {
const response = await esClient.indices.getMapping({
diff --git a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
index 46955fb37..5d96a23a4 100644
--- a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
+++ b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts
@@ -5,7 +5,10 @@ import { isRsId } from '@gnomad/identifiers'
import { UserVisibleError } from '../../errors'
import { fetchLocalAncestryPopulationsByVariant } from '../local-ancestry-queries'
-import { fetchAllSearchResults } from '../helpers/elasticsearch-helpers'
+import {
+ fetchAllSearchResults,
+ fetchAllSearchResultsFromMultipleIndices,
+} from '../helpers/elasticsearch-helpers'
import { mergeOverlappingRegions } from '../helpers/region-helpers'
import {
fetchLofCurationResultsByVariant,
@@ -16,10 +19,67 @@ import {
import { getFlagsForContext } from './shared/flags'
import { getConsequenceForContext } from './shared/transcriptConsequence'
import largeGenes from '../helpers/large-genes'
+import { LimitedElasticClient, SearchResponse } from '../../elasticsearch'
const GNOMAD_V4_VARIANT_INDEX = 'gnomad_v4_variants'
+const GNOMAD_V4_VARIANT_INDEX_PATCHES = 'gnomad_v4_variants_patches-2025-10-14--20-02'
type Subset = 'all' | 'non_ukb'
+type ESTranscriptConsequence = {
+ biotype: string
+ consequence_terms: string[]
+ gene_id: string
+ gene_symbol: string
+ gene_version: string
+ is_canonical: boolean
+ major_consequence: string
+ transcript_id: string
+ transcript_version: string
+}
+type ESPatch = {
+ variant_id: string
+ transcript_consequences: ESTranscriptConsequence[]
+}
+
+const mergeTranscriptConsequences = (
+  transcriptConsequences: ESTranscriptConsequence[] | null | undefined,
+  patchedTranscriptConsequences?: ESTranscriptConsequence[] | null
+): ESTranscriptConsequence[] => {
+  // Treat a missing list as empty so callers can always chain array methods
+  // (such as `.filter`) on the result.
+  const consequences = transcriptConsequences || []
+  if (!patchedTranscriptConsequences) {
+    return consequences
+  }
+  // Swap in the patched consequence for every transcript that has one, keeping
+  // the original order. Patched consequences for other transcripts are ignored.
+  const patches = patchedTranscriptConsequences
+  return consequences.map(
+    (csq) => patches.find((patch) => patch.transcript_id === csq.transcript_id) || csq
+  )
+}
+
+const mergeTranscriptConsequencesInVariant = (
+  variant: { variant_id: string; transcript_consequences: ESTranscriptConsequence[] },
+  patches: ESPatch[]
+) => {
+  // Patches are matched by variant_id (first match wins). An unpatched variant is
+  // returned as the same object; a patched one as a shallow copy.
+  const patch = patches.find((candidate) => candidate.variant_id === variant.variant_id)
+  return patch === undefined
+    ? variant
+    : {
+        ...variant,
+        transcript_consequences: mergeTranscriptConsequences(
+          variant.transcript_consequences,
+          patch.transcript_consequences
+        ),
+      }
+}
+
+const hasPositiveAC = (variant: any, subset: string) =>
+  (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || // genome: skipped when freq.all is falsy
+  variant.exome.freq[subset].ac_raw > 0 // exome: read unguarded, throws if freq[subset] is missing
// ================================================================================================
// Count query
@@ -69,30 +129,50 @@ const chooseIdField = (variantId: string) => {
return 'variant_id'
}
-const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) => {
+const fetchVariantById = async (
+ esClient: LimitedElasticClient,
+ variantId: string,
+ subset: Subset
+) => {
const idField = chooseIdField(variantId)
- const response = await esClient.search({
+ const query = {
+ bool: {
+ filter: { term: { [idField]: variantId } },
+ },
+ }
+
+ const variantResponsePromise = esClient.search({
index: GNOMAD_V4_VARIANT_INDEX,
body: {
- query: {
- bool: {
- filter: { term: { [idField]: variantId } },
- },
- },
+ query,
},
size: 1,
- })
+ }) as Promise
+ const patchResponsePromise = esClient.search({
+ index: GNOMAD_V4_VARIANT_INDEX_PATCHES,
+ body: { query },
+ size: 1,
+ }) as Promise
- if (response.body.hits.total.value === 0) {
+ const variantResponse = await variantResponsePromise
+
+ if (variantResponse.body.hits.total.value === 0) {
throw new UserVisibleError('Variant not found')
}
// An rsID may match multiple variants
- if (response.body.hits.total.value > 1) {
+ if (variantResponse.body.hits.total.value > 1) {
throw new UserVisibleError('Multiple variants found, query using variant ID to select one.')
}
- const variant = response.body.hits.hits[0]._source.value
+ const patchResponse = await patchResponsePromise
+ const patchedTranscriptConsequences =
+ patchResponse.body.hits.total.value > 0
+ ? (patchResponse.body.hits.hits[0]._source.value
+ .transcript_consequences as ESTranscriptConsequence[])
+ : null
+
+ const variant = variantResponse.body.hits.hits[0]._source.value
const subsetGenomeFreq = variant.genome.freq.all || {}
const subsetJointFreq = variant.joint.freq[subset] || {}
@@ -244,9 +324,10 @@ const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) =
flags: variantFlags,
// TODO: Include RefSeq transcripts once the browser supports them.
lof_curations: lofCurationResults,
- transcript_consequences: (variant.transcript_consequences || []).filter((csq: any) =>
- csq.gene_id.startsWith('ENSG')
- ),
+ transcript_consequences: mergeTranscriptConsequences(
+ variant.transcript_consequences,
+ patchedTranscriptConsequences
+ ).filter((csq: any) => csq.gene_id.startsWith('ENSG')),
in_silico_predictors: inSilicoPredictorsList,
}
@@ -454,28 +535,30 @@ const fetchVariantsByGene = async (esClient: any, gene: any, subset: Subset) =>
},
}))
- const hits = await fetchAllSearchResults(esClient, {
- index: GNOMAD_V4_VARIANT_INDEX,
- type: '_doc',
- size: pageSize,
- _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
- body: {
- query: {
- bool: {
- filter: [{ term: { gene_id: gene.gene_id } }, { bool: { should: rangeQueries } }],
+ const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices(
+ esClient,
+ [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES],
+ {
+ type: '_doc',
+ size: pageSize,
+ _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
+ body: {
+ query: {
+ bool: {
+ filter: [{ term: { gene_id: gene.gene_id } }, { bool: { should: rangeQueries } }],
+ },
},
+ sort: [{ 'locus.position': { order: 'asc' } }],
},
- sort: [{ 'locus.position': { order: 'asc' } }],
- },
- })
+ }
+ )
+
+ const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value)
const shapedHits = hits
.map((hit: any) => hit._source.value)
- .filter(
- (variant: any) =>
- (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) ||
- variant.exome.freq[subset].ac_raw > 0
- )
+ .filter((variant) => hasPositiveAC(variant, subset))
+ .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches))
.map(shapeVariantSummary(subset, { type: 'gene', geneId: gene.gene_id }))
const lofCurationResults = await fetchLofCurationResultsByGene(esClient, 'v4', gene)
@@ -507,38 +590,40 @@ const fetchVariantsByRegion = async (esClient: any, region: any, subset: Subset)
const genomeSubset = 'all'
const jointSubset = 'all'
- const hits = await fetchAllSearchResults(esClient, {
- index: GNOMAD_V4_VARIANT_INDEX,
- type: '_doc',
- size: 10000,
- _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
- body: {
- query: {
- bool: {
- filter: [
- { term: { 'locus.contig': `chr${region.chrom}` } },
- {
- range: {
- 'locus.position': {
- gte: region.start,
- lte: region.stop,
+ const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices(
+ esClient,
+ [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES],
+ {
+ type: '_doc',
+ size: 10000,
+ _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
+ body: {
+ query: {
+ bool: {
+ filter: [
+ { term: { 'locus.contig': `chr${region.chrom}` } },
+ {
+ range: {
+ 'locus.position': {
+ gte: region.start,
+ lte: region.stop,
+ },
},
},
- },
- ],
+ ],
+ },
},
+ sort: [{ 'locus.position': { order: 'asc' } }],
},
- sort: [{ 'locus.position': { order: 'asc' } }],
- },
- })
+ }
+ )
+
+ const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value)
const variants = hits
.map((hit: any) => hit._source.value)
- .filter(
- (variant: any) =>
- (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) ||
- variant.exome.freq[subset].ac_raw > 0
- )
+ .filter((variant) => hasPositiveAC(variant, subset))
+ .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches))
.map(shapeVariantSummary(subset, { type: 'region' }))
const lofCurationResults = await fetchLofCurationResultsByRegion(esClient, 'v4', region)
@@ -599,31 +684,33 @@ const fetchVariantsByTranscript = async (esClient: any, transcript: any, subset:
},
}))
- const hits = await fetchAllSearchResults(esClient, {
- index: GNOMAD_V4_VARIANT_INDEX,
- type: '_doc',
- size: 10000,
- _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
- body: {
- query: {
- bool: {
- filter: [
- { term: { transcript_id: transcript.transcript_id } },
- { bool: { should: rangeQueries } },
- ],
+ const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices(
+ esClient,
+ [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES],
+ {
+ type: '_doc',
+ size: 10000,
+ _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset),
+ body: {
+ query: {
+ bool: {
+ filter: [
+ { term: { transcript_id: transcript.transcript_id } },
+ { bool: { should: rangeQueries } },
+ ],
+ },
},
+ sort: [{ 'locus.position': { order: 'asc' } }],
},
- sort: [{ 'locus.position': { order: 'asc' } }],
- },
- })
+ }
+ )
+
+ const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value)
return hits
.map((hit: any) => hit._source.value)
- .filter(
- (variant: any) =>
- (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) ||
- variant.exome.freq[subset].ac_raw > 0
- )
+ .filter((variant) => hasPositiveAC(variant, subset))
+ .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches))
.map(
shapeVariantSummary(subset, { type: 'transcript', transcriptId: transcript.transcript_id })
)
@@ -665,11 +752,7 @@ const fetchMatchingVariants = async (
return hits
.map((hit: any) => hit._source.value)
- .filter(
- (variant: any) =>
- (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) ||
- variant.exome.freq[subset].ac_raw > 0
- )
+ .filter((variant) => hasPositiveAC(variant, subset))
.map((variant: any) => ({
variant_id: variant.variant_id,
}))
From 12a856344cc76ab071b2a5d2fc2716e98ba0311a Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Fri, 17 Oct 2025 14:20:49 -0400
Subject: [PATCH 5/9] Build RNU4ATAC transcript patches and export to ES
---
.../pipelines/export_to_elasticsearch.py | 13 +++++++++
.../pipelines/transcript_patches.py | 28 +++++++++++++++++++
2 files changed, 41 insertions(+)
create mode 100644 data-pipeline/src/data_pipeline/pipelines/transcript_patches.py
diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
index acdbe8be2..1446aa58e 100644
--- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
+++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py
@@ -42,6 +42,8 @@
from data_pipeline.pipelines.gnomad_v4_lof_curation_results import pipeline as gnomad_v4_lof_curation_results_pipeline
from data_pipeline.pipelines.gene_patches import pipeline as gnomad_v4_gene_patches
+from data_pipeline.pipelines.transcript_patches import pipeline as gnomad_v4_transcript_patches
+
logger = logging.getLogger("gnomad_data_pipeline")
@@ -119,6 +121,17 @@ def add_liftover_document_id(ds):
"block_size": 1_000,
},
},
+ "transcripts_grch38_patched": {
+ "get_table": lambda: hl.read_table(
+ gnomad_v4_transcript_patches.get_output("transcripts_grch38_patched").get_output_path()
+ ),
+ "args": {
+ "index": "transcripts_grch38_patched",
+ "index_fields": ["transcript_id"],
+ "id_field": "transcript_id",
+ "block_size": 1_000,
+ },
+ },
##############################################################################################################
# gnomAD v4
##############################################################################################################
diff --git a/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py b/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py
new file mode 100644
index 000000000..fe332d898
--- /dev/null
+++ b/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py
@@ -0,0 +1,28 @@
+from data_pipeline.pipeline import Pipeline, run_pipeline
+
+from data_pipeline.data_types.transcript import extract_transcripts
+from data_pipeline.helpers import annotate_table
+
+pipeline = Pipeline()  # builds the "transcripts_grch38_patched" table from the patched genes table
+
+pipeline.add_task(
+ "extract_patched_transcripts",  # runs extract_transcripts on the genes table given below
+ extract_transcripts,
+ "/transcripts/transcripts_grch38_patched_base.ht",
+ {"genes_path": "gs://gnomad-browser-data-pipeline/phil-scratch/output/genes/genes_grch38_patched.ht"},
+)  # NOTE(review): genes_path is a hard-coded "phil-scratch" location — confirm this is the intended input
+
+pipeline.add_task(
+ "annotate_patched_transcripts",  # runs annotate_table with the constraint table as an annotation
+ annotate_table,
+ "/transcripts/transcripts_grch38_annotated_1.ht",  # NOTE(review): name lacks "patched", unlike the base table — confirm
+ {
+ "table_path": pipeline.get_task("extract_patched_transcripts"),  # presumably resolved to that task's output — confirm
+ "gnomad_constraint": "gs://gnomad-v4-data-pipeline/output/constraint/gnomad_v4_constraint.ht",
+ },
+)
+
+pipeline.set_outputs({"transcripts_grch38_patched": "annotate_patched_transcripts"})  # output name -> producing task
+
+if __name__ == "__main__":
+ run_pipeline(pipeline)
From 427892df28b88327ade578445c7e0f90d3139b68 Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Fri, 17 Oct 2025 14:44:39 -0400
Subject: [PATCH 6/9] Use RNU4ATAC transcript patches in API
---
graphql-api/src/queries/gene-queries.ts | 17 ++----
.../queries/helpers/elasticsearch-helpers.ts | 15 ++++-
graphql-api/src/queries/transcript-queries.ts | 55 ++++++++++++-------
3 files changed, 54 insertions(+), 33 deletions(-)
diff --git a/graphql-api/src/queries/gene-queries.ts b/graphql-api/src/queries/gene-queries.ts
index 51e28bdaf..cc8b85a96 100644
--- a/graphql-api/src/queries/gene-queries.ts
+++ b/graphql-api/src/queries/gene-queries.ts
@@ -1,7 +1,10 @@
import elasticsearch from '@elastic/elasticsearch'
import { withCache } from '../cache'
-import { fetchAllSearchResultsFromMultipleIndices } from './helpers/elasticsearch-helpers'
+import {
+ fetchAllSearchResultsFromMultipleIndices,
+ getFromMultipleIndices,
+} from './helpers/elasticsearch-helpers'
import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata'
import { LimitedElasticClient, GetResponse, SearchResponse, SearchHit } from '../elasticsearch'
@@ -38,17 +41,7 @@ const _fetchGeneById = async (
throw err
}) as Promise
)
- return Promise.all(requests).then(
- (responses) => {
- const responsesWithValue = responses.filter((response) => response !== null)
- return responsesWithValue.length > 0
- ? responsesWithValue[responsesWithValue.length - 1]!.body._source.value
- : null
- },
- (err) => {
- throw err
- }
- )
+ return getFromMultipleIndices(requests)
}
export const fetchGeneById = withCache(
diff --git a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
index a18f237cb..ab946c26c 100644
--- a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
+++ b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts
@@ -1,5 +1,5 @@
import elasticsearch from '@elastic/elasticsearch'
-import { LimitedElasticClient, SearchResponse, SearchHit } from '../../elasticsearch'
+import { LimitedElasticClient, SearchResponse, SearchHit, GetResponse } from '../../elasticsearch'
/**
* Search and then scroll to retrieve all pages of search results.
@@ -69,3 +69,16 @@ export const fetchIndexMetadata = async (esClient: any, index: any) => {
// eslint-disable-next-line no-underscore-dangle
return Object.values(response.body)[0].mappings._meta
}
+
+// Resolves to `_source.value` of the last non-null response, or null when every response is null.
+export const getFromMultipleIndices = async (requests: Promise<GetResponse | null>[]) => {
+  // A rejected request rejects the whole lookup with the same error (Promise.all semantics).
+  const responses = await Promise.all(requests)
+  const found = responses.filter((response): response is GetResponse => response !== null)
+  if (found.length === 0) {
+    return null
+  }
+
+  // Responses keep request order, so a document from a later request overrides earlier ones.
+  return found[found.length - 1].body._source.value
+}
diff --git a/graphql-api/src/queries/transcript-queries.ts b/graphql-api/src/queries/transcript-queries.ts
index e36b22969..e60ffd726 100644
--- a/graphql-api/src/queries/transcript-queries.ts
+++ b/graphql-api/src/queries/transcript-queries.ts
@@ -1,24 +1,39 @@
-const TRANSCRIPT_INDICES = {
- GRCh37: 'transcripts_grch37',
- GRCh38: 'transcripts_grch38',
+import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata'
+import { GetResponse, LimitedElasticClient } from '../elasticsearch'
+import { getFromMultipleIndices } from './helpers/elasticsearch-helpers'
+
+type TranscriptIndex =
+ | 'transcripts_grch37'
+ | 'transcripts_grch38'
+ | 'transcripts_grch38_patched-2025-10-23--19-36' // NOTE(review): timestamped index name is hard-coded; the other entries are unsuffixed — confirm
+
+const TRANSCRIPT_INDICES: Record<ReferenceGenome, TranscriptIndex[]> = {
+  GRCh37: ['transcripts_grch37'],
+  GRCh38: ['transcripts_grch38', 'transcripts_grch38_patched-2025-10-23--19-36'], // last entry wins
+}
-export const fetchTranscriptById = async (es: any, transcriptId: any, referenceGenome: any) => {
- try {
- const response = await es.get({
- // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
- index: TRANSCRIPT_INDICES[referenceGenome],
- type: '_doc',
- id: transcriptId,
- })
+export const fetchTranscriptById = async (
+ esClient: LimitedElasticClient,
+ transcriptId: string,
+ referenceGenome: ReferenceGenome
+) => {
+ const indices = TRANSCRIPT_INDICES[referenceGenome]
+ const requests = indices.map(
+ (index) =>
+ esClient
+ .get({
+ index,
+ type: '_doc',
+ id: transcriptId,
+ })
+ .catch((err) => {
+ // meta will not be present if the request times out in the queue before reaching ES
+ if (err.meta && err.meta.body.found === false) {
+ return null
+ }
+ throw err
+ }) as Promise
+ )
- return response.body._source.value
- } catch (err) {
- // meta will not be present if the request times out in the queue before reaching ES
- // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'.
- if (err.meta && err.meta.body.found === false) {
- return null
- }
- throw err
- }
+ return getFromMultipleIndices(requests)
}
From 555cdd3a72e0fa9fcb6173a9948e689bc5d021df Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Thu, 23 Oct 2025 11:02:16 -0400
Subject: [PATCH 7/9] Add VEP 115 warning to RNU4ATAC
---
browser/src/GenePage/GeneFlags.spec.tsx | 8 +++++++
browser/src/GenePage/GeneFlags.tsx | 12 +++++++++++
.../__snapshots__/GeneFlags.spec.tsx.snap | 21 +++++++++++++++++++
3 files changed, 41 insertions(+)
diff --git a/browser/src/GenePage/GeneFlags.spec.tsx b/browser/src/GenePage/GeneFlags.spec.tsx
index 45d29ef28..b10c4400a 100644
--- a/browser/src/GenePage/GeneFlags.spec.tsx
+++ b/browser/src/GenePage/GeneFlags.spec.tsx
@@ -29,4 +29,12 @@ describe('GeneFlags', () => {
expect(tree).toMatchSnapshot()
})
+
+  test('renders VEP 115 warning for RNU4ATAC', () => {
+    const testGene = geneFactory.build({ symbol: 'RNU4ATAC', reference_genome: 'GRCh38' })
+    // Render the component under test with the gene so the snapshot captures the warning.
+    const tree = renderer.create(<GeneFlags gene={testGene} />)
+
+    expect(tree).toMatchSnapshot()
+  })
})
diff --git a/browser/src/GenePage/GeneFlags.tsx b/browser/src/GenePage/GeneFlags.tsx
index 3150800a8..dcf08e848 100644
--- a/browser/src/GenePage/GeneFlags.tsx
+++ b/browser/src/GenePage/GeneFlags.tsx
@@ -13,11 +13,15 @@ type Props = {
}
const allOfUsCMRGGenes = ['CBS', 'KCNE1', 'CRYAA']
+const vep115Genes = ['RNU4ATAC'] // gene symbols that get the VEP 115 warning when on GRCh38
const GeneFlags = ({ gene }: Props) => {
const shouldDisplayCMRGWarning =
gene.reference_genome === 'GRCh38' && allOfUsCMRGGenes.includes(gene.symbol)
+ const shouldDisplayVEP115Warning =
+ gene.reference_genome === 'GRCh38' && vep115Genes.includes(gene.symbol)
+
return (
<>
{shouldDisplayCMRGWarning && (
@@ -35,6 +39,14 @@ const GeneFlags = ({ gene }: Props) => {
) callset to remedy this issue in the future.
)}
+ {shouldDisplayVEP115Warning && (
+
+ Warning MANE Select and variant consequence information in
+ this gene were annotated using Ensembl VEP version 115 (GENCODE v49). For more
+ information, see our{' '}
+ help page .
+
+ )}
{gene.flags.includes('chip') && (
Note Analysis of allele balance and age data indicates that
diff --git a/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap b/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap
index 72aeb4bab..4d1b9affe 100644
--- a/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap
+++ b/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap
@@ -32,6 +32,27 @@ exports[`GeneFlags renders CMRG flag if one of 3 relevant genes 1`] = `
`;
+exports[`GeneFlags renders VEP 115 warning for RNU4ATAC 1`] = `
+
+
+ Warning
+
+ MANE Select and variant consequence information in this gene were annotated using Ensembl VEP version 115 (GENCODE v49). For more information, see our
+
+
+ help page
+
+ .
+
+`;
+
exports[`GeneFlags renders chip flag if present on gene 1`] = `
Date: Thu, 23 Oct 2025 12:22:37 -0400
Subject: [PATCH 8/9] Correct typo in identifier
---
browser/src/GenePage/GeneInfo.tsx | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/browser/src/GenePage/GeneInfo.tsx b/browser/src/GenePage/GeneInfo.tsx
index ccf461f22..526a9f9f6 100644
--- a/browser/src/GenePage/GeneInfo.tsx
+++ b/browser/src/GenePage/GeneInfo.tsx
@@ -23,12 +23,12 @@ type ManeSelectTranscriptIdProps = {
}
const ManeSelectTranscriptId = ({ gene }: ManeSelectTranscriptIdProps) => {
- const gencodeVersionOfManeSelectTransript = gene.transcripts.find(
+ const gencodeVersionOfManeSelectTranscript = gene.transcripts.find(
(transcript: any) => transcript.transcript_id === gene.mane_select_transcript.ensembl_id
)
const shouldLinkToTranscriptPage =
- gencodeVersionOfManeSelectTransript &&
- gencodeVersionOfManeSelectTransript.transcript_version ===
+ gencodeVersionOfManeSelectTranscript &&
+ gencodeVersionOfManeSelectTranscript.transcript_version ===
gene.mane_select_transcript.ensembl_version
return (
From 4a4e2b44635eb66a45ff690c92207720a8365fe6 Mon Sep 17 00:00:00 2001
From: Phil Darnowsky
Date: Thu, 23 Oct 2025 12:27:01 -0400
Subject: [PATCH 9/9] Tighten types around MANE select ID
---
browser/src/GenePage/GeneInfo.tsx | 49 ++++++++++++++++++-------------
1 file changed, 28 insertions(+), 21 deletions(-)
diff --git a/browser/src/GenePage/GeneInfo.tsx b/browser/src/GenePage/GeneInfo.tsx
index 526a9f9f6..a3bbaa35e 100644
--- a/browser/src/GenePage/GeneInfo.tsx
+++ b/browser/src/GenePage/GeneInfo.tsx
@@ -8,39 +8,40 @@ import Link from '../Link'
import GeneReferences from './GeneReferences'
type ManeSelectTranscriptIdProps = {
- gene: {
- mane_select_transcript: {
- ensembl_id: string
- ensembl_version: string
- refseq_id: string
- refseq_version: string
- }
- transcripts: {
- transcript_id: string
- transcript_version: string
- }[]
+ mane_select_transcript: {
+ ensembl_id: string
+ ensembl_version: string
+ refseq_id: string
+ refseq_version: string
}
+ transcripts: {
+ transcript_id: string
+ transcript_version: string
+ }[]
}
-const ManeSelectTranscriptId = ({ gene }: ManeSelectTranscriptIdProps) => {
- const gencodeVersionOfManeSelectTranscript = gene.transcripts.find(
- (transcript: any) => transcript.transcript_id === gene.mane_select_transcript.ensembl_id
+const ManeSelectTranscriptId = ({
+ mane_select_transcript,
+ transcripts,
+}: ManeSelectTranscriptIdProps) => {
+ const gencodeVersionOfManeSelectTranscript = transcripts.find(
+ (transcript) => transcript.transcript_id === mane_select_transcript.ensembl_id
)
const shouldLinkToTranscriptPage =
gencodeVersionOfManeSelectTranscript &&
gencodeVersionOfManeSelectTranscript.transcript_version ===
- gene.mane_select_transcript.ensembl_version
+ mane_select_transcript.ensembl_version
return (
{shouldLinkToTranscriptPage ? (
-
- {gene.mane_select_transcript.ensembl_id}.{gene.mane_select_transcript.ensembl_version}
+
+ {mane_select_transcript.ensembl_id}.{mane_select_transcript.ensembl_version}
) : (
- `${gene.mane_select_transcript.ensembl_id}.${gene.mane_select_transcript.ensembl_version}`
+ `${mane_select_transcript.ensembl_id}.${mane_select_transcript.ensembl_version}`
)}{' '}
- / {gene.mane_select_transcript.refseq_id}.{gene.mane_select_transcript.refseq_version}
+ / {mane_select_transcript.refseq_id}.{mane_select_transcript.refseq_version}
)
}
@@ -109,8 +110,14 @@ const GeneInfo = ({ gene }: GeneInfoProps) => {
}
>
- {/* @ts-expect-error TS(2322) FIXME: Type '{ gene_id: string; gene_version: string; sym... Remove this comment to see the full error message */}
- {gene.mane_select_transcript ? : 'Not available'}
+ {gene.mane_select_transcript ? (
+
+ ) : (
+ 'Not available'
+ )}
)}