40 changes: 40 additions & 0 deletions README.md
@@ -135,3 +135,43 @@ We maintain deployment configuration options and steps within a [private reposit
the production MaveDB environment. The main difference between the production setup and these local setups is that
the worker and api services are split into distinct environments, allowing them to scale up or down individually
dependent on need.

## Scripts

### export_public_data.py

Usage:

```
python3 -m mavedb.scripts.export_public_data
```

This generates a ZIP archive named `mavedb-dump.zip` in the working directory. The ZIP file has the following contents:
- `main.json`: A JSON file providing metadata for all published experiment sets, experiments, and score sets
- `variants/`
  - `[URN].counts.csv` (one per score set URN): The score set's variant count columns, sorted by variant number
  - `[URN].scores.csv` (one per score set URN): The score set's variant score columns, sorted by variant number

In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
Experiments are nested in their parent experiment sets, and score sets in their parent experiments.
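
For example, here is a minimal sketch of walking this hierarchy in Python. The `experimentSets` property name comes
from the description above; the other property and key names (`experiments`, `scoreSets`, `urn`) are assumptions and
may differ in the actual file:

```
import json

with open("main.json") as f:
    metadata = json.load(f)

# Walk experiment sets -> experiments -> score sets and print each score set URN.
for experiment_set in metadata["experimentSets"]:
    for experiment in experiment_set.get("experiments", []):  # assumed property name
        for score_set in experiment.get("scoreSets", []):  # assumed property name
            print(score_set.get("urn"))
```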

The score set URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
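
As a rough sketch, a single scores table can be read directly from the archive without unpacking it (the URN below is
just the example from above):

```
import zipfile

import pandas as pd

with zipfile.ZipFile("mavedb-dump.zip") as archive:
    with archive.open("variants/00000001-a-1.scores.csv") as f:
        scores = pd.read_csv(f)

print(scores.head())
```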

Unpublished data and data sets licensed under anything other than the Creative Commons Zero (CC0) license are not
included in the dump, and user details are limited to the ORCID IDs and names of contributors to published data sets.


### recalculate_score_set_statistics.py

Usage:

```
python3 -m mavedb.scripts.recalculate_score_set_statistics

python3 -m mavedb.scripts.recalculate_score_set_statistics -urn <score set URN>
```

This script recalculates statistics for all score sets or, if a URN is specified, for one score set. Statistics are stored as a JSON object in each score set's `statistics` property.
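
For reference, the stored object has roughly the following shape. The keys mirror those produced by
`calculate_score_set_statistics`; the values below are illustrative, and some keys are omitted when a score set's
variants lack the relevant HGVS strings:

```
{
  "num_splice_variants": 0,
  "target_length": {"dna": 1404, "protein": 468},
  "num_variants_by_mutation_count": {
    "nt": {"single": 3000, "double": 120, "triple_or_more": 10},
    "pro": {"single": 2800, "double": 90, "triple_or_more": 5}
  },
  "num_variants_by_mutation_type": {"missense": 2500, "nonsense": 150, "synonymous": 250},
  "mean_num_mutations_per_position": {"dna": 2.3, "protein": 6.5}
}
```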
24 changes: 24 additions & 0 deletions alembic/versions/b823e14d6a00_score_set_statistics.py
@@ -0,0 +1,24 @@
"""Score set statistics

Revision ID: b823e14d6a00
Revises: 9702d32bacb3
Create Date: 2024-05-12 18:37:23.203099

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'b823e14d6a00'
down_revision = '9702d32bacb3'
branch_labels = None
depends_on = None


def upgrade():
op.add_column('scoresets', sa.Column('statistics', postgresql.JSONB(astext_type=sa.Text()), nullable=True))


def downgrade():
op.drop_column('scoresets', 'statistics')
234 changes: 233 additions & 1 deletion src/mavedb/lib/score_sets.py
@@ -1,8 +1,11 @@
import csv
import io
import logging
import re
from typing import Any, BinaryIO, Iterable, Optional, Sequence
from typing import Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, Union

import mavehgvs
from mavehgvs import util as mavehgvs_util
import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
@@ -42,6 +45,8 @@
from mavedb.models.variant import Variant
from mavedb.view_models.search import ScoreSetsSearch

logger = logging.getLogger(__name__)

VariantData = dict[str, Optional[dict[str, dict]]]


@@ -635,3 +640,230 @@ def columns_for_dataset(dataset: Optional[pd.DataFrame]) -> list[str]:
return []

return [col for col in dataset.columns if col not in HGVSColumns.options()]


def get_score_set_target_lengths(score_set: ScoreSet):
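    """Compute the DNA and protein sequence lengths of each of a score set's targets.

    Returns a dictionary with "dna" and "protein" keys, each mapping to a list of per-target lengths. A DNA length is
    inferred from a protein target (protein length * 3), but a protein length is never inferred from a DNA target.
    """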
dna_lengths: list[int] = []
protein_lengths: list[int] = []
for target_gene in score_set.target_genes:
if target_gene.target_sequence is not None:
            # The sequence type is expected to be either "protein" or "dna".
            if target_gene.target_sequence.sequence_type == "protein":
protein_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
protein_lengths.append(protein_length)
dna_lengths.append(protein_length * 3) # Infer DNA target length from protein.
elif target_gene.target_sequence.sequence_type == "dna":
dna_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
protein_lengths.append(0) # Do not infer a protein target length from DNA.
dna_lengths.append(dna_length)
else:
# Invalid sequence type
raise ValidationError("Invalid sequence type")
return {
"dna": dna_lengths,
"protein": protein_lengths,
}


def summarize_nt_mutations_in_variant(variant: Variant):
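    """Summarize a variant's nucleotide-level HGVS string, returning the number of mutations it describes (0 if the
    string is absent or cannot be parsed)."""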
if variant.hgvs_nt is None:
return {"num_mutations": 0}
nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
[variant.hgvs_nt]
)
nt_variant: Optional[mavehgvs.Variant] = nt_variants[0][0]
if nt_variant is None:
return {"num_mutations": 0}
return {
"num_mutations": len(nt_variant.positions) if type(nt_variant.positions) is list else 1,
}


def summarize_pro_mutations_in_variant(variant: Variant):
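    """Summarize a variant's protein-level HGVS string.

    Returns the number of mutations it describes and the most severe mutation type among them, where severity is
    ranked other > nonsense > missense > synonymous. If the string is absent or cannot be parsed, the count is 0 and
    the mutation type is None.
    """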
if variant.hgvs_pro is None:
return {"num_mutations": 0, "most_severe_mutation_type": None}
pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
[variant.hgvs_pro]
)
pro_variant: Optional[mavehgvs.Variant] = pro_variants[0][0]
if pro_variant is None:
return {"num_mutations": 0, "most_severe_mutation_type": None}
# print(f"{pro_variant.sequence}, VTYPE: {pro_variant.variant_type}, {pro_variant.target_id}")

    has_synonymous_mutations = False
has_missense_mutations = False
has_nonsense_mutations = False
has_other_mutations = False

# The pro_variant object contains either one sequence or a list of them, and similarly one variation type or a list.
# We normalize both here, turning single elements into lists of length 1.
sequences: list[Union[str, tuple[str, str], None]] = (
pro_variant.sequence if type(pro_variant.sequence) is list else [pro_variant.sequence]
)
variation_types: list[str] = (
pro_variant.variant_type if type(pro_variant.variant_type) is list else [pro_variant.variant_type]
)

# Look at each variation (mutation) and count synonymous, nonsense, and missense mutations.
for i, variation_type in enumerate(variation_types):
if variation_type == "equal":
            has_synonymous_mutations = True
elif variation_type == "sub":
sequence = sequences[i]
# For a substitution, there should be two sequence elements (WT and mutated). The second sequence element
# may in general be one of the following:
# - An amino acid
# - Ter (stop codon). MaveHGVS doesn't support the short notation *.
# - - or del. This should not occur when the variant type is "sub."
# - =. This should not occur when the variant type is "sub," but we allow it.
if type(sequence) is not tuple:
logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
has_other_mutations = True
elif sequence[1] in ["*", "Ter"]:
has_nonsense_mutations = True
elif sequence[1] in ["-", "del"]:
logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
has_other_mutations = True
elif sequence[1] == "=" or sequence[0] == sequence[1]:
                has_synonymous_mutations = True
else:
has_missense_mutations = True
else:
# print(variation_type)
has_other_mutations = True

    # Classify the variant by its most severe mutation type: other > nonsense > missense > synonymous.
if has_other_mutations:
mutation_type = "other"
elif has_nonsense_mutations:
mutation_type = "nonsense"
elif has_missense_mutations:
mutation_type = "missense"
    elif has_synonymous_mutations:
mutation_type = "synonymous"
else:
mutation_type = "other"

return {
"num_mutations": len(pro_variant.positions) if type(pro_variant.positions) is list else 1,
"most_severe_mutation_type": mutation_type,
}


def calculate_score_set_statistics(score_set: ScoreSet):
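    """Calculate summary statistics for a score set's variants, as a JSON-serializable dictionary.

    The statistics always include the total DNA and protein target lengths and the number of splice variants. Counts
    of variants by mutation count and by most severe mutation type, and the mean number of mutations per target
    position, are included only when the relevant HGVS strings are present on some or all variants.
    """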

lengths = get_score_set_target_lengths(score_set)
dna_target_length = sum(lengths["dna"])
pro_target_length = sum(lengths["protein"])

num_single_mutant_nt_variants = 0
num_double_mutant_nt_variants = 0
num_triple_plus_mutant_nt_variants = 0

num_single_mutant_pro_variants = 0
num_double_mutant_pro_variants = 0
num_triple_plus_mutant_pro_variants = 0

num_splice_variants = 0

num_missense_variants = 0
num_nonsense_variants = 0
num_synonymous_variants = 0

# Count of all AA- and NT-sequence mutations, to be used in determining the average number of mutations per position
num_nt_mutations = 0
num_pro_mutations = 0

all_variants_have_hgvs_nt = True
all_variants_have_hgvs_pro = True
some_variants_have_hgvs_nt = False
some_variants_have_hgvs_pro = False

for v in score_set.variants:
# if not re.search(r"\*$", v.hgvs_pro):
# continue

variant_has_hgvs_nt = v.hgvs_nt is not None
variant_has_hgvs_pro = v.hgvs_pro is not None
all_variants_have_hgvs_nt = all_variants_have_hgvs_nt and variant_has_hgvs_nt
all_variants_have_hgvs_pro = all_variants_have_hgvs_pro and variant_has_hgvs_pro
some_variants_have_hgvs_nt = some_variants_have_hgvs_nt or variant_has_hgvs_nt
some_variants_have_hgvs_pro = some_variants_have_hgvs_pro or variant_has_hgvs_pro

nt_mutation_summary = summarize_nt_mutations_in_variant(v)
pro_mutation_summary = summarize_pro_mutations_in_variant(v)

        variant_num_nt_mutations = nt_mutation_summary["num_mutations"]
        variant_num_pro_mutations = pro_mutation_summary["num_mutations"]

        # Accumulate the totals used for the mean number of mutations per position.
        num_nt_mutations += variant_num_nt_mutations
        num_pro_mutations += variant_num_pro_mutations

        # Count variants by number of AA-sequence mutations.
        if variant_num_pro_mutations == 1:
            num_single_mutant_pro_variants += 1
        elif variant_num_pro_mutations == 2:
            num_double_mutant_pro_variants += 1
        elif variant_num_pro_mutations > 2:
            num_triple_plus_mutant_pro_variants += 1

        # Count variants by number of NT-sequence mutations.
        if variant_num_nt_mutations == 1:
            num_single_mutant_nt_variants += 1
        elif variant_num_nt_mutations == 2:
            num_double_mutant_nt_variants += 1
        elif variant_num_nt_mutations > 2:
            num_triple_plus_mutant_nt_variants += 1

# Count variants with hgvs_splice set.
if v.hgvs_splice is not None:
num_splice_variants += 1

        # Count protein sequence variants by most severe mutation type. Variants whose most severe type is anything
        # other than missense, nonsense, or synonymous are not counted here.
        most_severe_pro_mutation_type: Optional[str] = pro_mutation_summary["most_severe_mutation_type"]
if most_severe_pro_mutation_type == "nonsense":
num_nonsense_variants += 1
elif most_severe_pro_mutation_type == "missense":
num_missense_variants += 1
elif most_severe_pro_mutation_type == "synonymous":
num_synonymous_variants += 1

# print(f"{v.hgvs_pro}: {num_nt_mutations}/{num_pro_mutations} ({most_severe_pro_mutation_type})")

statistics = {
"num_splice_variants": num_splice_variants,
"target_length": {
"dna": dna_target_length,
"protein": pro_target_length,
},
}

if some_variants_have_hgvs_nt or some_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_count"] = {}

if some_variants_have_hgvs_nt:
statistics["num_variants_by_mutation_count"]["nt"] = { # type: ignore
"single": num_single_mutant_nt_variants,
"double": num_double_mutant_nt_variants,
"triple_or_more": num_triple_plus_mutant_nt_variants,
}

if some_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_count"]["pro"] = { # type: ignore
"single": num_single_mutant_pro_variants,
"double": num_double_mutant_pro_variants,
"triple_or_more": num_triple_plus_mutant_pro_variants,
}

if all_variants_have_hgvs_nt or all_variants_have_hgvs_pro:
statistics["mean_num_mutations_per_position"] = {}

if all_variants_have_hgvs_nt:
statistics["mean_num_mutations_per_position"]["dna"] = num_nt_mutations / dna_target_length if dna_target_length > 0 else 0 # type: ignore

if all_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_type"] = {
"missense": num_missense_variants,
"nonsense": num_nonsense_variants,
"synonymous": num_synonymous_variants,
}
statistics["mean_num_mutations_per_position"]["protein"] = num_pro_mutations / pro_target_length if pro_target_length > 0 else 0 # type: ignore

return statistics
1 change: 1 addition & 0 deletions src/mavedb/models/score_set.py
@@ -73,6 +73,7 @@ class ScoreSet(Base):
extra_metadata = Column(JSONB, nullable=False)
dataset_columns = Column(JSONB, nullable=False, default={})
external_links = Column(JSONB, nullable=False, default={})
statistics = Column(JSONB, nullable=True)

normalised = Column(Boolean, nullable=False, default=False)
private = Column(Boolean, nullable=False, default=True)
42 changes: 42 additions & 0 deletions src/mavedb/scripts/recalculate_score_set_statistics.py
@@ -0,0 +1,42 @@
"""
Recalculate and update statistics for all score sets, or for a single score set identified by URN.

Usage:
```
python3 -m mavedb.scripts.recalculate_score_set_statistics

python3 -m mavedb.scripts.recalculate_score_set_statistics -urn <score set URN>
```
"""

import argparse
import logging


from sqlalchemy import select

from mavedb.lib.score_sets import calculate_score_set_statistics
from mavedb.lib.script_environment import init_script_environment
from mavedb.models.score_set import ScoreSet

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(
prog='recalculate_score_set_statistics',
description='Calculate and store statistics for one score set or all score sets.'
)
parser.add_argument("-urn", help="URN of the score set to update")
args = parser.parse_args()

db = init_script_environment()

if args.urn is not None:
score_sets = db.scalars(select(ScoreSet).where(ScoreSet.urn == args.urn)).all()
else:
score_sets = db.scalars(select(ScoreSet)).all()

for score_set in score_sets:
statistics = calculate_score_set_statistics(score_set)
score_set.statistics = statistics
db.add(score_set)
db.commit()