93 changes: 73 additions & 20 deletions src/mavedb/lib/score_sets.py
@@ -3,7 +3,7 @@
import logging
import re
from operator import attrgetter
-from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
+from typing import Any, BinaryIO, Iterable, List, Optional, TYPE_CHECKING, Sequence, Literal

from mavedb.models.mapped_variant import MappedVariant
import numpy as np
@@ -401,12 +401,12 @@ def find_publish_or_private_superseded_score_set_tail(
def get_score_set_variants_as_csv(
    db: Session,
    score_set: ScoreSet,
-    data_type: Literal["scores", "counts"],
+    data_types: List[Literal["scores", "counts", "clinVar", "gnomAD"]],
    start: Optional[int] = None,
    limit: Optional[int] = None,
    drop_na_columns: Optional[bool] = None,
-    include_custom_columns: bool = True,
-    include_post_mapped_hgvs: bool = False,
+    include_custom_columns: Optional[bool] = True,
+    include_post_mapped_hgvs: Optional[bool] = False,
) -> str:
"""
Get the variant data from a score set as a CSV string.
Expand All @@ -417,8 +417,8 @@ def get_score_set_variants_as_csv(
The database session to use.
score_set : ScoreSet
The score set to get the variants from.
data_type : {'scores', 'counts'}
The type of data to get. Either 'scores' or 'counts'.
data_types : List[Literal["scores", "counts", "clinVar", "gnomAD"]]
Collaborator:

I wonder if namespaces would be a better name for this now. The other thing that's a little tricky is that we're really working with two distinct kinds of namespaces: score/count data supplied by the investigator, and namespaces for variant data we cache in our database. While we fetch score/count columns via one method, we will fetch data for the other namespaces by a different one.

@EstelleDa (Member Author, Oct 24, 2025):

I prefer namespaces here; it's clearer. I also considered putting all of them in one function, but it's quite complex and messy. Separating them makes it easy to distinguish whether the data is ours or comes from elsewhere. We can modify this part when we're ready to add the ClinVar and gnomAD data.
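As a rough sketch, callers would drive the new parameter like this (illustrative only; the clinVar and gnomAD namespaces are accepted by the type in this diff but not yet backed by any fetch logic):

    # Hypothetical calls; db and score_set come from the surrounding request context.
    scores_only = get_score_set_variants_as_csv(db, score_set, data_types=["scores"])
    combined = get_score_set_variants_as_csv(
        db,
        score_set,
        data_types=["scores", "counts"],
        include_post_mapped_hgvs=True,  # adds the post-mapped HGVS columns
    )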

+        The data types to get: any combination of 'scores', 'counts', 'clinVar', and 'gnomAD'.
    start : int, optional
        The index to start from. If None, starts from the beginning.
    limit : int, optional
@@ -437,18 +437,33 @@ def get_score_set_variants_as_csv(
        The CSV string containing the variant data.
    """
    assert type(score_set.dataset_columns) is dict
-    custom_columns_set = "score_columns" if data_type == "scores" else "count_columns"
-    type_column = "score_data" if data_type == "scores" else "count_data"

+    custom_columns = {
+        "scores": "score_columns",
+        "counts": "count_columns",
+    }
+    custom_columns_set = [custom_columns[dt] for dt in data_types if dt in custom_columns]
+    type_to_column = {
+        "scores": "score_data",
+        "counts": "count_data"
+    }
+    type_columns = [type_to_column[dt] for dt in data_types if dt in type_to_column]
    columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"]
    if include_post_mapped_hgvs:
        columns.append("post_mapped_hgvs_g")
        columns.append("post_mapped_hgvs_p")

    if include_custom_columns:
-        custom_columns = [str(x) for x in list(score_set.dataset_columns.get(custom_columns_set, []))]
-        columns += custom_columns
-    elif data_type == "scores":
+        for column in custom_columns_set:
+            dataset_columns = [str(x) for x in list(score_set.dataset_columns.get(column, []))]
+            if column == "score_columns":
+                for c in dataset_columns:
+                    prefixed = "scores." + c
+                    columns.append(prefixed)
+            elif column == "count_columns":
+                for c in dataset_columns:
+                    prefixed = "counts." + c
+                    columns.append(prefixed)
+    elif len(data_types) == 1 and data_types[0] == "scores":
Comment on lines +440 to +466
Collaborator:

First, we might consider maintaining a dictionary of all of our namespaces. Each namespace keys a list of column names. The idea here is that if we combine the outer key and any column name from the contained list, we get a namespaced column. This lets us easily generate a namespaced column name while retaining efficient and easy access to the real internal column name. The other benefit is that we avoid key collisions right from the start of the CSV generation process, so we won't have to worry about identifier columns colliding between, say, the ClinVar and gnomAD namespaces.

In this suggestion, I also eliminated the dictionaries for the internal mappings of the column keys. I think that we are only ever going to have two of those, and doing each of them explicitly is more readable.

Note that on the last line, I think we want to add the score column whenever scores is in the data types list and we aren't including custom columns. Right now, if you supplied [clinvar, scores] and include_custom_columns = False, you wouldn't get the score column as output, which isn't what we want.

Suggested change
    namespaced_score_set_columns: dict[str, list[str]] = {
        "core": ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"],
        "mavedb": [],
    }
    if include_post_mapped_hgvs:
        namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
        namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
    for namespace in data_types:
        namespaced_score_set_columns[namespace] = []
    if include_custom_columns:
        if "scores" in namespaced_score_set_columns:
            namespaced_score_set_columns["scores"] = [
                col for col in [str(x) for x in list(score_set.dataset_columns.get("score_columns", []))]
            ]
        if "counts" in namespaced_score_set_columns:
            namespaced_score_set_columns["counts"] = [
                col for col in [str(x) for x in list(score_set.dataset_columns.get("count_columns", []))]
            ]
    elif "scores" in namespaced_score_set_columns:

Collaborator:

@EstelleDa One thing @sallybg suggested to me today was to put the hgvs strings into their own, more descriptive namespace rather than mavedb. I'm tempted by something like hgvs, but that might be confusing considering we have other columns that start with hgvs. So maybe mapped_hgvs is most precise. We can put other identifiers into their own namespaces, so I think having an hgvs-specific one is best.
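To make the rename concrete, a sketch of that layout (the mapped_hgvs key is only the proposed name; nothing in this PR uses it yet):

    # Variation on the suggestion above, with the post-mapped HGVS columns
    # moved out of "mavedb" into a dedicated "mapped_hgvs" namespace.
    namespaced_score_set_columns: dict[str, list[str]] = {
        "core": ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"],
        "mapped_hgvs": [],
    }
    if include_post_mapped_hgvs:
        namespaced_score_set_columns["mapped_hgvs"] += ["post_mapped_hgvs_g", "post_mapped_hgvs_p"]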

        columns.append(REQUIRED_SCORE_COLUMN)

    variants: Sequence[Variant] = []
@@ -488,7 +503,35 @@ def get_score_set_variants_as_csv(
        variants_query = variants_query.limit(limit)
    variants = db.scalars(variants_query).all()

-    rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column, mappings=mappings) # type: ignore
+    rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_columns, mappings=mappings) # type: ignore

Collaborator:

The nice thing about this implementation is that by the time we get to this point, we're already done. We've fetched and renamed all our export files. We just need to generate a list of columns for the drop_na_columns_from_csv_file_rows function.

    rows_columns = [f"{namespace}.{col}" for namespace, cols in namespaced_score_set_columns.items() for col in cols]

One thing your solution did that this one doesn't is remove the prefix from columns when only one namespace is being exported. For consistency with the new changes, I think I actually prefer to always export columns inside their namespace. This implementation would also require a few small changes to some of the other CSV export routers, but I think those would be trivial.
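A toy example of what that comprehension produces (the column names are illustrative):

    # Two namespaces, each keying its real internal column names.
    namespaced_score_set_columns = {
        "core": ["accession", "hgvs_nt"],
        "scores": ["score", "sd"],
    }
    rows_columns = [
        f"{namespace}.{col}"
        for namespace, cols in namespaced_score_set_columns.items()
        for col in cols
    ]
    # -> ["core.accession", "core.hgvs_nt", "scores.score", "scores.sd"]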

+    # TODO: handle the case where len(data_types) == 1, "scores"/"counts" are not in data_types,
+    # and include_post_mapped_hgvs is set, once the clinVar and gnomAD data are added
+    if len(data_types) > 1 and include_post_mapped_hgvs:
+        rename_map = {}
+        rename_map["post_mapped_hgvs_g"] = "mavedb.post_mapped_hgvs_g"
+        rename_map["post_mapped_hgvs_p"] = "mavedb.post_mapped_hgvs_p"
+
+        # Update column order list (preserve original order)
+        columns = [rename_map.get(col, col) for col in columns]
+
+        # Rename keys in each row
+        renamed_rows_data = []
+        for row in rows_data:
+            renamed_row = {rename_map.get(k, k): v for k, v in row.items()}
+            renamed_rows_data.append(renamed_row)
+
+        rows_data = renamed_rows_data
+    elif len(data_types) == 1:
+        prefix = f"{data_types[0]}."
+        columns = [col[len(prefix):] if col.startswith(prefix) else col for col in columns]
+
+        # Remove the same prefix from the keys of each row
+        renamed_rows_data = []
+        for row in rows_data:
+            renamed_row = {(k[len(prefix):] if k.startswith(prefix) else k): v for k, v in row.items()}
+            renamed_rows_data.append(renamed_row)
+        rows_data = renamed_rows_data
    if drop_na_columns:
        rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)

@@ -532,7 +575,7 @@ def is_null(value):
def variant_to_csv_row(
    variant: Variant,
    columns: list[str],
-    dtype: str,
+    dtype: list[str],
    mapping: Optional[MappedVariant] = None,
    na_rep="NA",
) -> dict[str, Any]:
@@ -546,7 +589,7 @@ def variant_to_csv_row(
    columns : list[str]
        Columns to serialize.
    dtype : str, {'scores', 'counts'}
-        The type of data requested. Either the 'score_data' or 'count_data'.
+        The types of data requested: ['score_data'], ['count_data'], or ['score_data', 'count_data'].
    na_rep : str
        String to represent null values.
@@ -577,8 +620,18 @@ def variant_to_csv_row(
            else:
                value = ""
        else:
-            parent = variant.data.get(dtype) if variant.data else None
-            value = str(parent.get(column_key)) if parent else na_rep
+            value = na_rep  # default when no namespace supplies this column
+            for dt in dtype:
+                parent = variant.data.get(dt) if variant.data else None
+                if column_key.startswith("scores."):
+                    inner_key = column_key.replace("scores.", "")
+                elif column_key.startswith("counts."):
+                    inner_key = column_key.replace("counts.", "")
+                else:
+                    # fallback for non-prefixed columns
+                    inner_key = column_key
+                if parent and inner_key in parent:
+                    value = str(parent[inner_key])
+                    break
        if is_null(value):
            value = na_rep
        row[column_key] = value
@@ -589,7 +642,7 @@ def variant_to_csv_row(
def variants_to_csv_rows(
    variants: Sequence[Variant],
    columns: list[str],
-    dtype: str,
+    dtype: List[str],
    mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
    na_rep="NA",
) -> Iterable[dict[str, Any]]:
@@ -602,8 +655,8 @@ def variants_to_csv_rows(
        List of variants.
    columns : list[str]
        Columns to serialize.
-    dtype : str, {'scores', 'counts'}
-        The type of data requested. Either the 'score_data' or 'count_data'.
+    dtype : list, {'scores', 'counts'}
+        The types of data requested: ['score_data'], ['count_data'], or ['score_data', 'count_data'].
    na_rep : str
        String to represent null values.
25 changes: 14 additions & 11 deletions src/mavedb/routers/score_sets.py
@@ -1,6 +1,6 @@
import logging
from datetime import date
-from typing import Any, List, Optional, Sequence, Union
+from typing import Any, List, Literal, Optional, Sequence, Union

import pandas as pd
from arq import ArqRedis
@@ -249,7 +249,13 @@ def get_score_set_variants_csv(
    urn: str,
    start: int = Query(default=None, description="Start index for pagination"),
    limit: int = Query(default=None, description="Maximum number of variants to return"),
+    data_types: List[Literal["scores", "counts", "clinVar", "gnomAD"]] = Query(
+        default=["scores"],
+        description="One or more data types to include: scores, counts, clinVar, gnomAD"
+    ),
    drop_na_columns: Optional[bool] = None,
+    include_custom_columns: Optional[bool] = None,
+    include_post_mapped_hgvs: Optional[bool] = None,
    db: Session = Depends(deps.get_db),
    user_data: Optional[UserData] = Depends(get_current_user),
) -> Any:
@@ -262,9 +268,6 @@ def get_score_set_variants_csv(
    TODO (https://github.com/VariantEffect/mavedb-api/issues/446) We may want to turn this into a general-purpose CSV
    export endpoint, with options governing which columns to include.

-    Parameters
-    __________
-
    Parameters
    __________
    urn : str
@@ -312,12 +315,12 @@ def get_score_set_variants_csv(
    csv_str = get_score_set_variants_as_csv(
        db,
        score_set,
-        "scores",
+        data_types,
        start,
        limit,
        drop_na_columns,
-        include_custom_columns=False,
-        include_post_mapped_hgvs=True,
+        include_custom_columns,
+        include_post_mapped_hgvs,
    )
    return StreamingResponse(iter([csv_str]), media_type="text/csv")

@@ -373,7 +376,7 @@ def get_score_set_scores_csv(

    assert_permission(user_data, score_set, Action.READ)

-    csv_str = get_score_set_variants_as_csv(db, score_set, "scores", start, limit, drop_na_columns)
+    csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], start, limit, drop_na_columns)
    return StreamingResponse(iter([csv_str]), media_type="text/csv")


@@ -428,7 +431,7 @@ async def get_score_set_counts_csv(

    assert_permission(user_data, score_set, Action.READ)

-    csv_str = get_score_set_variants_as_csv(db, score_set, "counts", start, limit, drop_na_columns)
+    csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], start, limit, drop_na_columns)
    return StreamingResponse(iter([csv_str]), media_type="text/csv")


@@ -1252,12 +1255,12 @@ async def update_score_set(
            ] + item.dataset_columns["count_columns"]

            scores_data = pd.DataFrame(
-                variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data")
+                variants_to_csv_rows(item.variants, columns=score_columns, dtype=["score_data"])
            ).replace("NA", pd.NA)

            if item.dataset_columns["count_columns"]:
                count_data = pd.DataFrame(
-                    variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data")
+                    variants_to_csv_rows(item.variants, columns=count_columns, dtype=["count_data"])
                ).replace("NA", pd.NA)
            else:
                count_data = None
4 changes: 2 additions & 2 deletions src/mavedb/scripts/export_public_data.py
@@ -147,12 +147,12 @@ def export_public_data(db: Session):
        logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
        csv_filename_base = score_set.urn.replace(":", "-")

-        csv_str = get_score_set_variants_as_csv(db, score_set, "scores")
+        csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"])
        zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

        count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
        if count_columns and len(count_columns) > 0:
-            csv_str = get_score_set_variants_as_csv(db, score_set, "counts")
+            csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"])
            zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)

