40 changes: 40 additions & 0 deletions README.md
@@ -135,3 +135,43 @@ We maintain deployment configuration options and steps within a [private reposit
the production MaveDB environment. The main difference between the production setup and these local setups is that
the worker and api services are split into distinct environments, allowing them to scale up or down individually
dependent on need.

## Scripts

### export_public_data.py

Usage:

```
python3 -m mavedb.scripts.export_public_data
```

This generates a ZIP archive named `mavedb-dump.zip` in the working directory. The ZIP file has the following contents:
- `main.json`: A JSON file providing metadata for all published experiment sets, experiments, and score sets
- `variants/`
  - `[URN].counts.csv` (one per score set URN): The score set's variant count columns, sorted by variant number
  - `[URN].scores.csv` (one per score set URN): The score set's variant score columns, sorted by variant number

In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
Experiments are nested in their parent experiment sets, and score sets in their parent experiments.
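
For example, here is a minimal sketch of walking this hierarchy in Python. The `experimentSets` property name comes
from the description above; the other property and key names (`experiments`, `scoreSets`, `urn`) are assumptions and
may differ in the actual file:

```
import json

with open("main.json") as f:
    metadata = json.load(f)

# Walk experiment sets -> experiments -> score sets and print each score set URN.
for experiment_set in metadata["experimentSets"]:
    for experiment in experiment_set.get("experiments", []):  # assumed property name
        for score_set in experiment.get("scoreSets", []):  # assumed property name
            print(score_set.get("urn"))
```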

The score set URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
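
As a rough sketch, a single scores table can be read directly from the archive without unpacking it (the URN below is
just the example from above):

```
import zipfile

import pandas as pd

with zipfile.ZipFile("mavedb-dump.zip") as archive:
    with archive.open("variants/00000001-a-1.scores.csv") as f:
        scores = pd.read_csv(f)

print(scores.head())
```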

Unpublished data and data sets licensed under anything other than the Creative Commons Zero (CC0) license are not
included in the dump, and user details are limited to the ORCID IDs and names of contributors to published data sets.


### recalculate_score_set_statistics.py

Usage:

```
python3 -m mavedb.scripts.recalculate_score_set_statistics

python3 -m mavedb.scripts.recalculate_score_set_statistics -urn <score set URN>
```

This script recalculates statistics for all score sets or, if a URN is specified, for one score set. Statistics are stored as a JSON object in each score set's `statistics` property.
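
For reference, the stored object has roughly the following shape. The keys mirror those produced by
`calculate_score_set_statistics`; the values below are illustrative, and some keys are omitted when a score set's
variants lack the relevant HGVS strings:

```
{
  "num_splice_variants": 0,
  "target_length": {"dna": 1404, "protein": 468},
  "num_variants_by_mutation_count": {
    "nt": {"single": 3000, "double": 120, "triple_or_more": 10},
    "pro": {"single": 2800, "double": 90, "triple_or_more": 5}
  },
  "num_variants_by_mutation_type": {"missense": 2500, "nonsense": 150, "synonymous": 250},
  "mean_num_mutations_per_position": {"dna": 2.3, "protein": 6.5}
}
```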
24 changes: 24 additions & 0 deletions alembic/versions/b823e14d6a00_score_set_statistics.py
@@ -0,0 +1,24 @@
"""Score set statistics

Revision ID: b823e14d6a00
Revises: 9702d32bacb3
Create Date: 2024-05-12 18:37:23.203099

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'b823e14d6a00'
down_revision = '9702d32bacb3'
branch_labels = None
depends_on = None


def upgrade():
op.add_column('scoresets', sa.Column('statistics', postgresql.JSONB(astext_type=sa.Text()), nullable=True))


def downgrade():
op.drop_column('scoresets', 'statistics')
234 changes: 233 additions & 1 deletion src/mavedb/lib/score_sets.py
@@ -1,8 +1,11 @@
import csv
import io
import logging
import re
from typing import Any, BinaryIO, Iterable, Optional, Sequence
from typing import Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, Union

import mavehgvs
from mavehgvs import util as mavehgvs_util
import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
@@ -42,6 +45,8 @@
from mavedb.models.variant import Variant
from mavedb.view_models.search import ScoreSetsSearch

logger = logging.getLogger(__name__)

VariantData = dict[str, Optional[dict[str, dict]]]


@@ -635,3 +640,230 @@ def columns_for_dataset(dataset: Optional[pd.DataFrame]) -> list[str]:
return []

return [col for col in dataset.columns if col not in HGVSColumns.options()]


def get_score_set_target_lengths(score_set: ScoreSet):
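    """Compute the DNA and protein sequence lengths of each of a score set's targets.

    Returns a dictionary with "dna" and "protein" keys, each mapping to a list of per-target lengths. A DNA length is
    inferred from a protein target (protein length * 3), but a protein length is never inferred from a DNA target.
    """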
dna_lengths: list[int] = []
protein_lengths: list[int] = []
for target_gene in score_set.target_genes:
if target_gene.target_sequence is not None:
            # The sequence type is expected to be either "protein" or "dna".
            if target_gene.target_sequence.sequence_type == "protein":
protein_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
protein_lengths.append(protein_length)
dna_lengths.append(protein_length * 3) # Infer DNA target length from protein.
elif target_gene.target_sequence.sequence_type == "dna":
dna_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
protein_lengths.append(0) # Do not infer a protein target length from DNA.
dna_lengths.append(dna_length)
else:
# Invalid sequence type
raise ValidationError("Invalid sequence type")
return {
"dna": dna_lengths,
"protein": protein_lengths,
}


def summarize_nt_mutations_in_variant(variant: Variant):
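    """Summarize a variant's nucleotide-level HGVS string, returning the number of mutations it describes (0 if the
    string is absent or cannot be parsed)."""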
if variant.hgvs_nt is None:
return {"num_mutations": 0}
nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
[variant.hgvs_nt]
)
nt_variant: Optional[mavehgvs.Variant] = nt_variants[0][0]
if nt_variant is None:
return {"num_mutations": 0}
return {
"num_mutations": len(nt_variant.positions) if type(nt_variant.positions) is list else 1,
}


def summarize_pro_mutations_in_variant(variant: Variant):
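    """Summarize a variant's protein-level HGVS string.

    Returns the number of mutations it describes and the most severe mutation type among them, where severity is
    ranked other > nonsense > missense > synonymous. If the string is absent or cannot be parsed, the count is 0 and
    the mutation type is None.
    """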
if variant.hgvs_pro is None:
return {"num_mutations": 0, "most_severe_mutation_type": None}
pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
[variant.hgvs_pro]
)
pro_variant: Optional[mavehgvs.Variant] = pro_variants[0][0]
if pro_variant is None:
return {"num_mutations": 0, "most_severe_mutation_type": None}
# print(f"{pro_variant.sequence}, VTYPE: {pro_variant.variant_type}, {pro_variant.target_id}")

    has_synonymous_mutations = False
has_missense_mutations = False
has_nonsense_mutations = False
has_other_mutations = False

# The pro_variant object contains either one sequence or a list of them, and similarly one variation type or a list.
# We normalize both here, turning single elements into lists of length 1.
sequences: list[Union[str, tuple[str, str], None]] = (
pro_variant.sequence if type(pro_variant.sequence) is list else [pro_variant.sequence]
)
variation_types: list[str] = (
pro_variant.variant_type if type(pro_variant.variant_type) is list else [pro_variant.variant_type]
)

# Look at each variation (mutation) and count synonymous, nonsense, and missense mutations.
for i, variation_type in enumerate(variation_types):
if variation_type == "equal":
            has_synonymous_mutations = True
elif variation_type == "sub":
sequence = sequences[i]
# For a substitution, there should be two sequence elements (WT and mutated). The second sequence element
# may in general be one of the following:
# - An amino acid
# - Ter (stop codon). MaveHGVS doesn't support the short notation *.
# - - or del. This should not occur when the variant type is "sub."
# - =. This should not occur when the variant type is "sub," but we allow it.
if type(sequence) is not tuple:
logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
has_other_mutations = True
elif sequence[1] in ["*", "Ter"]:
has_nonsense_mutations = True
elif sequence[1] in ["-", "del"]:
logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
has_other_mutations = True
elif sequence[1] == "=" or sequence[0] == sequence[1]:
                has_synonymous_mutations = True
else:
has_missense_mutations = True
else:
# print(variation_type)
has_other_mutations = True

    # Classify the variant by its most severe mutation type: other > nonsense > missense > synonymous.
if has_other_mutations:
mutation_type = "other"
elif has_nonsense_mutations:
mutation_type = "nonsense"
elif has_missense_mutations:
mutation_type = "missense"
    elif has_synonymous_mutations:
mutation_type = "synonymous"
else:
mutation_type = "other"

return {
"num_mutations": len(pro_variant.positions) if type(pro_variant.positions) is list else 1,
"most_severe_mutation_type": mutation_type,
}


def calculate_score_set_statistics(score_set: ScoreSet):
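    """Calculate summary statistics for a score set's variants, as a JSON-serializable dictionary.

    The statistics always include the total DNA and protein target lengths and the number of splice variants. Counts
    of variants by mutation count and by most severe mutation type, and the mean number of mutations per target
    position, are included only when the relevant HGVS strings are present on some or all variants.
    """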

lengths = get_score_set_target_lengths(score_set)
dna_target_length = sum(lengths["dna"])
pro_target_length = sum(lengths["protein"])

num_single_mutant_nt_variants = 0
num_double_mutant_nt_variants = 0
num_triple_plus_mutant_nt_variants = 0

num_single_mutant_pro_variants = 0
num_double_mutant_pro_variants = 0
num_triple_plus_mutant_pro_variants = 0

num_splice_variants = 0

num_missense_variants = 0
num_nonsense_variants = 0
num_synonymous_variants = 0

# Count of all AA- and NT-sequence mutations, to be used in determining the average number of mutations per position
num_nt_mutations = 0
num_pro_mutations = 0

all_variants_have_hgvs_nt = True
all_variants_have_hgvs_pro = True
some_variants_have_hgvs_nt = False
some_variants_have_hgvs_pro = False

for v in score_set.variants:
# if not re.search(r"\*$", v.hgvs_pro):
# continue

variant_has_hgvs_nt = v.hgvs_nt is not None
variant_has_hgvs_pro = v.hgvs_pro is not None
all_variants_have_hgvs_nt = all_variants_have_hgvs_nt and variant_has_hgvs_nt
all_variants_have_hgvs_pro = all_variants_have_hgvs_pro and variant_has_hgvs_pro
some_variants_have_hgvs_nt = some_variants_have_hgvs_nt or variant_has_hgvs_nt
some_variants_have_hgvs_pro = some_variants_have_hgvs_pro or variant_has_hgvs_pro

nt_mutation_summary = summarize_nt_mutations_in_variant(v)
pro_mutation_summary = summarize_pro_mutations_in_variant(v)

        variant_num_nt_mutations = nt_mutation_summary["num_mutations"]
        variant_num_pro_mutations = pro_mutation_summary["num_mutations"]

        # Accumulate the totals used for the mean number of mutations per position.
        num_nt_mutations += variant_num_nt_mutations
        num_pro_mutations += variant_num_pro_mutations

        # Count variants by number of AA-sequence mutations.
        if variant_num_pro_mutations == 1:
            num_single_mutant_pro_variants += 1
        elif variant_num_pro_mutations == 2:
            num_double_mutant_pro_variants += 1
        elif variant_num_pro_mutations > 2:
            num_triple_plus_mutant_pro_variants += 1

        # Count variants by number of NT-sequence mutations.
        if variant_num_nt_mutations == 1:
            num_single_mutant_nt_variants += 1
        elif variant_num_nt_mutations == 2:
            num_double_mutant_nt_variants += 1
        elif variant_num_nt_mutations > 2:
            num_triple_plus_mutant_nt_variants += 1

# Count variants with hgvs_splice set.
if v.hgvs_splice is not None:
num_splice_variants += 1

        # Count protein sequence variants by most severe mutation type. Variants whose most severe type is anything
        # other than missense, nonsense, or synonymous are not counted here.
        most_severe_pro_mutation_type: Optional[str] = pro_mutation_summary["most_severe_mutation_type"]
if most_severe_pro_mutation_type == "nonsense":
num_nonsense_variants += 1
elif most_severe_pro_mutation_type == "missense":
num_missense_variants += 1
elif most_severe_pro_mutation_type == "synonymous":
num_synonymous_variants += 1

# print(f"{v.hgvs_pro}: {num_nt_mutations}/{num_pro_mutations} ({most_severe_pro_mutation_type})")

statistics = {
"num_splice_variants": num_splice_variants,
"target_length": {
"dna": dna_target_length,
"protein": pro_target_length,
},
}

if some_variants_have_hgvs_nt or some_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_count"] = {}

if some_variants_have_hgvs_nt:
statistics["num_variants_by_mutation_count"]["nt"] = { # type: ignore
"single": num_single_mutant_nt_variants,
"double": num_double_mutant_nt_variants,
"triple_or_more": num_triple_plus_mutant_nt_variants,
}

if some_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_count"]["pro"] = { # type: ignore
"single": num_single_mutant_pro_variants,
"double": num_double_mutant_pro_variants,
"triple_or_more": num_triple_plus_mutant_pro_variants,
}

if all_variants_have_hgvs_nt or all_variants_have_hgvs_pro:
statistics["mean_num_mutations_per_position"] = {}

if all_variants_have_hgvs_nt:
statistics["mean_num_mutations_per_position"]["dna"] = num_nt_mutations / dna_target_length if dna_target_length > 0 else 0 # type: ignore

if all_variants_have_hgvs_pro:
statistics["num_variants_by_mutation_type"] = {
"missense": num_missense_variants,
"nonsense": num_nonsense_variants,
"synonymous": num_synonymous_variants,
}
statistics["mean_num_mutations_per_position"]["protein"] = num_pro_mutations / pro_target_length if pro_target_length > 0 else 0 # type: ignore

return statistics
1 change: 1 addition & 0 deletions src/mavedb/models/score_set.py
@@ -73,6 +73,7 @@ class ScoreSet(Base):
extra_metadata = Column(JSONB, nullable=False)
dataset_columns = Column(JSONB, nullable=False, default={})
external_links = Column(JSONB, nullable=False, default={})
statistics = Column(JSONB, nullable=True)

normalised = Column(Boolean, nullable=False, default=False)
private = Column(Boolean, nullable=False, default=True)
42 changes: 42 additions & 0 deletions src/mavedb/scripts/recalculate_score_set_statistics.py
@@ -0,0 +1,42 @@
"""
Recalculate and update statistics for all score sets, or for a single score set identified by URN.

Usage:
```
python3 -m mavedb.scripts.recalculate_score_set_statistics

python3 -m mavedb.scripts.recalculate_score_set_statistics -urn <score set URN>
```
"""

import argparse
import logging


from sqlalchemy import select

from mavedb.lib.score_sets import calculate_score_set_statistics
from mavedb.lib.script_environment import init_script_environment
from mavedb.models.score_set import ScoreSet

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(
prog='recalculate_score_set_statistics',
description='Calculate and store statistics for one score set or all score sets.'
)
parser.add_argument("-urn", help="URN of the score set to update")
args = parser.parse_args()

db = init_script_environment()

if args.urn is not None:
score_sets = db.scalars(select(ScoreSet).where(ScoreSet.urn == args.urn)).all()
else:
score_sets = db.scalars(select(ScoreSet)).all()

for score_set in score_sets:
statistics = calculate_score_set_statistics(score_set)
score_set.statistics = statistics
db.add(score_set)
db.commit()