Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate de novo release #654

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 78 additions & 49 deletions gnomad_qc/v4/create_release/create_de_novo_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import hail as hl
from gnomad.resources.grch38.gnomad import all_sites_an, public_release
from gnomad.sample_qc.relatedness import filter_to_trios
from gnomad.sample_qc.relatedness import call_de_novo, get_de_novo_expr
from gnomad.utils.annotations import annotate_adj
from gnomad.utils.slack import slack_notifications
from gnomad.utils.vep import process_consequences
from hail.utils import new_temp_file
Expand All @@ -28,11 +29,9 @@
SIFT_VERSION,
)
from gnomad_qc.v4.resources.annotations import get_info, get_trio_stats
from gnomad_qc.v4.resources.basics import get_gnomad_v4_vds
from gnomad_qc.v4.resources.constants import CURRENT_RELEASE
from gnomad_qc.v4.resources.meta import meta
from gnomad_qc.v4.resources.release import release_de_novo
from gnomad_qc.v4.resources.sample_qc import de_novo_mt, pedigree
from gnomad_qc.v4.resources.sample_qc import dense_trio_mt, pedigree, trio_denovo_ht
from gnomad_qc.v4.resources.variant_qc import final_filter

logging.basicConfig(
Expand Down Expand Up @@ -104,52 +103,83 @@ def filter_de_novos(ht: hl.Table) -> hl.Table:
return ht


def get_releasable_trios_dense_mt(
fam_ht: hl.Table,
var_ht: hl.Table,
def get_releasable_de_novo_calls_ht(
mt: hl.MatrixTable,
priors_ht: hl.Table,
ped: hl.Pedigree,
test: bool = False,
) -> hl.matrixtable:
) -> hl.Table:
"""
Densify the VDS to only the releasable trios.
Get de novo calls Hail Table.

:param fam_ht: Table containing family information.
:param var_ht: Table containing trio stats.
:param mt: Dense MatrixTable of the releasable trios.
:param priors_ht: Table with AFs used as population frequency priors in de novo calculations.
:param ped: Pedigree.
:param test: Whether to filter to chr20 for testing. Default is False.
:return: Dense MatrixTable with de novo variants in releasable trios.
:return: Hail Table with de novo calls.
"""
if test:
logger.info("Filtering trio stats to chr20 for testing...")
var_ht = hl.filter_intervals(var_ht, [hl.parse_locus_interval("chr20")])

# Filter the trio stats Table to de novo variants.
var_ht = filter_de_novos(var_ht)
var_ht = var_ht.repartition(100).checkpoint(new_temp_file("de_novo_filtered", "ht"))

# Filter the metadata table to only samples in high quality and releasable trios.
meta_ht = meta().ht()
meta_ht = meta_ht.filter(meta_ht.high_quality & meta_ht.project_meta.releasable)
fam_ht = fam_ht.filter(
hl.is_defined(meta_ht[fam_ht.id])
& hl.is_defined(meta_ht[fam_ht.pat_id])
& hl.is_defined(meta_ht[fam_ht.mat_id])
logger.info("Filtering to chr20 for testing...")
mt = hl.filter_intervals(mt, [hl.parse_locus_interval("chr20")])

mt = mt.annotate_rows(
n_unsplit_alleles=hl.len(mt.alleles),
mixed_site=(
(hl.len(mt.alleles) > 2)
& hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
& hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:])
),
)
mt = hl.experimental.sparse_split_multi(mt).checkpoint(
dense_trio_mt(releasable=True, split=True, test=test).path, _read_if_exists=True
)
meta_ht = filter_to_trios(meta_ht, fam_ht)
meta_ht = meta_ht.repartition(100).checkpoint(
new_temp_file("releasable_trios", "ht")
mt = annotate_adj(mt)

# Many of our larger datasets have the PL and AD fields for homref genotypes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Many of our larger datasets have the PL and AD fields for homref genotypes
# Many of our larger datasets have the PL and AD fields for homref genotypes

can you check if this is true for v3 as well? if yes, it's best to say v3 and v4 have the PL and ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they are also NA in v3 VDS. FYI, I used this code to check:

from gnomad_qc.v3.resources.basics import get_gnomad_v3_vds
from gnomad.sample_qc.relatedness import filter_to_trios

vds = get_gnomad_v3_vds(split=True, 
                       filter_intervals=['chr11:113409605-113475691'],
                       )

trios = hl.import_fam("gs://gnomad-qin/g1k.trios.fam", delimiter='\t')
vds = filter_to_trios(vds, trios)

mt = hl.vds.to_dense_mt(vds)
mt.entries().show(20)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you for looking! this is good to know

# intentionally removed to save storage space and costs. We need to approximate the
# AD and PL fields when missing.
mt = mt.annotate_entries(
AD=hl.or_else(mt.AD, [mt.DP, 0]),
PL=hl.or_else(mt.PL, [0, mt.GQ, 2 * mt.GQ]),
)
mt = mt.annotate_rows(prior=priors_ht[mt.row_key].freq[0].AF)
mt = mt.select_entries("GT", "AD", "DP", "GQ", "PL", "adj")

tm = hl.trio_matrix(mt, ped, complete_trios=True)
tm = tm.transmute_cols(is_xx=tm.is_female)
tm = tm.checkpoint(new_temp_file("trio_matrix", "mt"))

ht = tm.entries()

ht = ht.annotate(
is_de_novo=call_de_novo(
ht.locus,
ht.proband_entry,
ht.father_entry,
ht.mother_entry,
is_xx_expr=ht.is_xx,
)
)

# Get the split gnomAD VDS filtered to high quality releasable trios and de novo
# variants.
vds = get_gnomad_v4_vds(
split=True,
high_quality_only=True,
chrom="chr20" if test else None,
filter_samples_ht=meta_ht,
filter_variant_ht=var_ht,
checkpoint_variant_data=True,
ht = (
ht.filter(ht.is_de_novo)
.naive_coalesce(1000)
.checkpoint(new_temp_file("de_novo_calls", "ht"))
)

return hl.vds.to_dense_mt(vds)
ht = ht.annotate(
de_novo_call_info=get_de_novo_expr(
locus_expr=ht.locus,
alleles_expr=ht.alleles,
proband_expr=ht.proband_entry,
father_expr=ht.father_entry,
mother_expr=ht.mother_entry,
is_xx_expr=ht.is_xx,
freq_prior_expr=ht.prior,
)
)

return ht


# TODO: Change when we decide on the final de novo data to release.
Expand Down Expand Up @@ -394,14 +424,13 @@ def main(args):
test = args.test
overwrite = args.overwrite

if args.generate_dense_mt:
get_releasable_trios_dense_mt(
pedigree().ht(),
get_trio_stats(releasable_only=True).ht(),
test=test,
).naive_coalesce(args.de_novo_n_partitions).write(
de_novo_mt(releasable=True, test=test).path, overwrite=overwrite
)
if args.generate_de_novo_calls:
mt = dense_trio_mt(releasable=True).mt()
priors_ht = public_release("exomes").ht()
ped = hl.Pedigree.read(pedigree().path, delimiter="\t")

ht = get_releasable_de_novo_calls_ht(mt, priors_ht, ped, test)
ht.write(trio_denovo_ht(releasable=True, test=test).path, overwrite=overwrite)

if args.generate_final_ht:
ht = get_de_novo_release_ht(args.tables_for_join, args.version, test=test)
Expand Down Expand Up @@ -443,8 +472,8 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
action="store_true",
)
parser.add_argument(
"--generate-dense-mt",
help="Generate a dense MT for releasable trios with the de novo variants.",
"--generate-de-novo-calls",
help="Generate de novo calls HT.",
action="store_true",
)
parser.add_argument(
Expand Down
55 changes: 29 additions & 26 deletions gnomad_qc/v4/resources/sample_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,12 +999,14 @@ def ped_filter_param_json_path(

def dense_trio_mt(
releasable: bool = True,
split: bool = False,
test: bool = False,
) -> VersionedMatrixTableResource:
"""
Get the VersionedMatrixTableResource for the dense trio MatrixTable.

:param releasable: Whether to get the resource for the releasable trios only.
:param split: Whether to get the resource for the split trio MatrixTable.
:param test: Whether to use a tmp path for a test resource.
:return: VersionedMatrixTableResource of dense trio MatrixTable.
"""
Expand All @@ -1015,7 +1017,33 @@ def dense_trio_mt(
version: MatrixTableResource(
f"{get_sample_qc_root(version, test, data_type='exomes')}"
f"/relatedness/trios/gnomad.{data_type}.v{version}.trios"
f"{'.releasable' if releasable else ''}.dense.mt"
f"{'.releasable' if releasable else ''}.dense"
f"{'.split' if split else ''}.mt"
)
for version in SAMPLE_QC_VERSIONS
},
)


def trio_denovo_ht(
releasable: bool = True,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does a non-releasable version of this HT exist?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, will remove.

test: bool = False,
) -> VersionedTableResource:
"""
Get the VersionedTableResource for the trio de novo Table.

:param releasable: Whether to get the resource for the releasable trios only.
:param test: Whether to use a tmp path for a test resource.
:return: VersionedTableResource of trio de novo Table.
"""
data_type = "exomes"
return VersionedTableResource(
CURRENT_SAMPLE_QC_VERSION,
{
version: TableResource(
f"{get_sample_qc_root(version, test, data_type='exomes')}"
f"/relatedness/trios/gnomad.{data_type}.v{version}.trios"
f"{'.releasable' if releasable else ''}.denovo.ht"
)
for version in SAMPLE_QC_VERSIONS
},
Expand Down Expand Up @@ -1094,28 +1122,3 @@ def get_sample_qc_field_def_json_path(version: str = CURRENT_SAMPLE_QC_VERSION)
hgdp_tgp_relatedness = TableResource(
path="gs://gnomad/v4.0/sample_qc/additional_resources/gnomad.genomes.v4.0.hgdp_tgp_relatedness.ht",
)


def de_novo_mt(
releasable: bool = True,
test: bool = False,
) -> VersionedMatrixTableResource:
"""
Get the VersionedMatrixTableResource for the dense de novo variants MatrixTable.

:param releasable: Whether to get the resource for the releasable trios only.
:param test: Whether to use a tmp path for a test resource.
:return: VersionedMatrixTableResource of dense de novo variants MatrixTable.
"""
data_type = "exomes"
return VersionedMatrixTableResource(
CURRENT_SAMPLE_QC_VERSION,
{
version: MatrixTableResource(
f"{get_sample_qc_root(version, test, data_type='exomes')}"
f"/relatedness/trios/gnomad.{data_type}.v{version}.de_novo"
f"{'.releasable' if releasable else ''}.dense.mt"
)
for version in SAMPLE_QC_VERSIONS
},
)
Loading