Skip to content

Commit

Permalink
Update fields and fix number templating bug (virusseq#81)
Browse files Browse the repository at this point in the history
* add two new fields

* update docker setup

* fix number templating

* update song schema to fixed one

* check for integer and float separately and fmt

* clean up
  • Loading branch information
jaserud authored Aug 10, 2021
1 parent 5920467 commit e1da65b
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 59 deletions.
2 changes: 1 addition & 1 deletion compose/song-db-init/song-init.sql

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions sample-files/metadata.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
study_id specimen collector sample ID sample collected by sequence submitted by sample collection date sample collection date null reason geo_loc_name (country) geo_loc_name (state/province/territory) organism fasta header name purpose of sampling purpose of sampling details anatomical material anatomical part body product environmental material environmental site collection device collection method host (scientific name) host disease host age host age null reason host age unit host age bin host gender purpose of sequencing purpose of sequencing details sequencing instrument sequencing protocol raw sequence data processing method dehosting method consensus sequence software name consensus sequence software version breadth of coverage value depth of coverage value reference genome accession gene name diagnostic pcr Ct value bioinformatics protocol isolate
COVIDPR Qc-L00210314 Not Provided Laboratoire de santé publique du Québec (LSPQ) 2021-03-01 Canada Quebec Severe acute respiratory syndrome coronavirus 2 Canada/Qc-L00210314/2020 Cluster/Outbreak investigation Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Homo sapiens COVID-19 41 year 40 - 49 Female Baseline surveillance (random sampling) Not Provided Oxford Nanopore 1D_DNA_MinION Trimmomatic v. 0.38 Nanostripper bcftools 1 95% 400x NC_045512.2 E gene (orf4) 21
COVIDPR Qc-L00212401 Not Provided Laboratoire de santé publique du Québec (LSPQ) 2021-03-04 Canada Quebec Severe acute respiratory syndrome coronavirus 2 Canada/Qc-L00212401/2020 Cluster/Outbreak investigation Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Homo sapiens COVID-19 42 year 40 - 49 Male Baseline surveillance (random sampling) Not Provided Oxford Nanopore 1D_DNA_MinION Trimmomatic v. 0.38 Nanostripper bcftools 1 98% 400x NC_045512.2 E gene (orf4) 21
study_id specimen collector sample ID sample collected by sequence submitted by sample collection date sample collection date null reason geo_loc_name (country) geo_loc_name (state/province/territory) organism fasta header name purpose of sampling purpose of sampling details anatomical material anatomical part body product environmental material environmental site collection device collection method host (scientific name) host disease host age host age null reason host age unit host age bin host gender purpose of sequencing purpose of sequencing details sequencing instrument sequencing protocol raw sequence data processing method dehosting method consensus sequence software name consensus sequence software version breadth of coverage value depth of coverage value reference genome accession gene name diagnostic pcr Ct value bioinformatics protocol isolate GISAID accession diagnostic pcr Ct value null reason
COVIDPR MUSE-1111 Not Provided Laboratoire de santé publique du Québec (LSPQ) 2021-03-01 Canada Quebec Severe acute respiratory syndrome coronavirus 2 Canada/Qc-L00210314/2020 Cluster/Outbreak investigation Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Homo sapiens COVID-19 41 year 40 - 49 Male Baseline surveillance (random sampling) Not Provided Oxford Nanopore 1D_DNA_MinION Trimmomatic v. 0.38 Nanostripper bcftools 1 95% 400x NC_045512.2 E gene (orf4) 21
COVIDPR MUSE-2222 Not Provided Laboratoire de santé publique du Québec (LSPQ) 2021-03-04 Canada Quebec Severe acute respiratory syndrome coronavirus 2 Canada/Qc-L00212401/2020 Cluster/Outbreak investigation Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Not Provided Homo sapiens COVID-19 42 year 40 - 49 Female Baseline surveillance (random sampling) Not Provided Oxford Nanopore 1D_DNA_MinION Trimmomatic v. 0.38 Nanostripper bcftools 1 98% 400x NC_045512.2 E gene (orf4) 21
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.text.StringSubstitutor;
import org.apache.commons.text.lookup.StringLookup;
import org.cancogenvirusseq.muse.config.MuseAppConfig;
Expand Down Expand Up @@ -91,7 +90,7 @@ private static List<String> findFilesWithHeaderOnly(
.collect(Collectors.toUnmodifiableList());
}

private static BiFunction<MapperReduceResult, Map<String, String>, MapperReduceResult>
private static BiFunction<MapperReduceResult, Map<String, Object>, MapperReduceResult>
accumulator(SubmissionBundle submissionBundle, String payloadTemplate) {
return (acc, r) -> {
val payload = convertRecordToPayload(r, payloadTemplate);
Expand Down Expand Up @@ -142,16 +141,16 @@ private static BinaryOperator<MapperReduceResult> combiner() {

@SneakyThrows
private static ObjectNode convertRecordToPayload(
Map<String, String> valuesMap, String payloadTemplate) {
Map<String, Object> valuesMap, String payloadTemplate) {
StringLookup lookupFunc =
key -> {
val value = valuesMap.getOrDefault(key, "");

// value is mapped to json value by these rules
if (NumberUtils.isCreatable(value)) {
if (value instanceof Number) {
// numeric no need to append quotes
return value;
} else if (value.trim().equals("")) {
return value.toString();
} else if (value.toString().trim().equals("")) {
// empty string map to null value
return "null";
} else {
Expand All @@ -163,8 +162,9 @@ private static ObjectNode convertRecordToPayload(
val sub = new StringSubstitutor(lookupFunc);
// throw error if valuesMap is missing template values in payloadTemplate
sub.setEnableUndefinedVariableException(true);

return new ObjectMapper().readValue(sub.replace(payloadTemplate), ObjectNode.class);
val templatedStr = sub.replace(payloadTemplate);
log.debug("Templated String - {}", templatedStr);
return new ObjectMapper().readValue(templatedStr, ObjectNode.class);
}

private static JsonNode createFilesObject(SubmissionFile submissionFile, String fileName) {
Expand Down
60 changes: 31 additions & 29 deletions src/main/java/org/cancogenvirusseq/muse/components/TsvParser.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
package org.cancogenvirusseq.muse.components;

import static java.lang.Double.parseDouble;
import static java.lang.Integer.parseInt;
import static java.util.stream.Collectors.toUnmodifiableList;
import static org.cancogenvirusseq.muse.model.tsv_parser.InvalidField.Reason.*;
import static org.cancogenvirusseq.muse.utils.StringUtils.isDouble;
import static org.cancogenvirusseq.muse.utils.StringUtils.isInteger;

import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
Expand All @@ -17,7 +21,6 @@
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.commons.lang3.math.NumberUtils;
import org.cancogenvirusseq.muse.components.security.Scopes;
import org.cancogenvirusseq.muse.config.MuseAppConfig;
import org.cancogenvirusseq.muse.exceptions.submission.InvalidFieldsException;
Expand Down Expand Up @@ -54,7 +57,7 @@ public TsvParser(List<TsvFieldSchema> tsvFieldSchemas, Scopes scopes) {
}

@SneakyThrows
public Stream<Map<String, String>> parseAndValidateTsvStrToFlatRecords(
public Stream<Map<String, Object>> parseAndValidateTsvStrToFlatRecords(
String s, List<String> userScopes) {
log.info("Parsing TSV into flat records");
val lines = s.split("\n");
Expand All @@ -72,7 +75,7 @@ public Stream<Map<String, String>> parseAndValidateTsvStrToFlatRecords(
val records =
parse(lines)
.map(this::checkRequireNotEmpty)
.map(this::checkValueTypes)
.map(this::checkAndConvertNumberTypes)
.map(checkStudyScopesOperator)
.collect(toUnmodifiableList());

Expand All @@ -81,7 +84,7 @@ public Stream<Map<String, String>> parseAndValidateTsvStrToFlatRecords(
}

log.info("Parsed TSV successfully!");
return records.stream().map(Record::getStringStringMap);
return records.stream().map(Record::getStringObjectMap);
}

private HeaderCheckResult checkHeaders(List<String> expectedHeaders, List<String> actualHeaders) {
Expand Down Expand Up @@ -111,7 +114,7 @@ private Stream<Record> parse(String[] lines) {
val line = lines[j];
val data = line.split("\t");

Map<String, String> record = new HashMap<>();
Map<String, Object> record = new HashMap<>();
for (int i = 0; i < headers.length; ++i) {
val value = i >= data.length ? "" : data[i];
record.put(headers[i], cleanup(value));
Expand All @@ -125,23 +128,32 @@ private Record checkRequireNotEmpty(Record record) {
tsvFieldSchemas.forEach(
s -> {
val fieldName = s.getName();
val value = record.getStringStringMap().get(fieldName);
if (s.isRequireNotEmpty() && isEmpty(value)) {
val value = record.getStringObjectMap().get(fieldName).toString();
if (s.isRequireNotEmpty() && value.isEmpty()) {
record.addFieldError(fieldName, NOT_ALLOWED_TO_BE_EMPTY, record.getIndex());
}
});

return record;
}

private Record checkValueTypes(Record record) {
@SneakyThrows
private Record checkAndConvertNumberTypes(Record record) {
tsvFieldSchemas.forEach(
s -> {
val fieldName = s.getName();
val value = record.getStringStringMap().get(fieldName);
if (s.getValueType().equals(TsvFieldSchema.ValueType.number)
&& isNotEmpty(value) // ignore empty because it's checked before
&& isNotNumber(value)) {
val value = record.getStringObjectMap().get(fieldName).toString();

// short circuit for non number types or if empty (we checked for required before)
if (!s.getValueType().equals(TsvFieldSchema.ValueType.number) || value.isEmpty()) {
return;
}

if (isInteger(value)) {
record.getStringObjectMap().put(fieldName, parseInt(value));
} else if (isDouble(value)) {
record.getStringObjectMap().put(fieldName, parseDouble(value));
} else {
record.addFieldError(fieldName, EXPECTING_NUMBER_TYPE, record.getIndex());
}
});
Expand All @@ -155,7 +167,8 @@ private Record checkStudyScopes(Record record, List<String> userScopes) {
.anyMatch(
scopes.isSystemScope.or(
userScope ->
userScope.contains(record.getStringStringMap().get(STUDY_FIELD_NAME))));
userScope.contains(
record.getStringObjectMap().get(STUDY_FIELD_NAME).toString())));

if (!isAuthorized) {
record.addFieldError(STUDY_FIELD_NAME, UNAUTHORIZED_FOR_STUDY_UPLOAD, record.getIndex());
Expand All @@ -181,26 +194,14 @@ private List<InvalidField> getAllInvalidFieldErrors(List<Record> records) {
}

private Boolean recordNotEmpty(Record recordsDto) {
return !recordsDto.getStringStringMap().values().stream()
.allMatch(v -> v.trim().equalsIgnoreCase(""));
return !recordsDto.getStringObjectMap().values().stream()
.allMatch(v -> v.toString().trim().equalsIgnoreCase(""));
}

private static String cleanup(String rawValue) {
return rawValue.replace("\r", "").replace("\n", "");
}

private static Boolean isEmpty(String value) {
return value == null || value.trim().equalsIgnoreCase("");
}

private static Boolean isNotEmpty(String value) {
return !isEmpty(value);
}

private static Boolean isNotNumber(String value) {
return !NumberUtils.isCreatable(value);
}

@Value
static class HeaderCheckResult {
List<String> missing;
Expand All @@ -214,12 +215,13 @@ Boolean isInvalid() {
@Value
static class Record {
Integer index;
Map<String, String> stringStringMap;
Map<String, Object> stringObjectMap;
List<InvalidField> fieldErrors;

public void addFieldError(String fieldName, InvalidField.Reason reason, Integer index) {
fieldErrors.add(
new InvalidField(fieldName, stringStringMap.getOrDefault(fieldName, ""), reason, index));
new InvalidField(
fieldName, stringObjectMap.getOrDefault(fieldName, "").toString(), reason, index));
}

public Boolean hasFieldErrors() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
public class SubmissionBundle {
private final Authentication userAuthentication;
private final Set<String> originalFileNames = new HashSet<>();
private final ArrayList<Map<String, String>> records = new ArrayList<>();
private final ArrayList<Map<String, Object>> records = new ArrayList<>();
private final ConcurrentHashMap<String, SubmissionFile> files = new ConcurrentHashMap<>();

public SubmissionBundle(@NonNull Authentication userAuthentication) {
Expand Down
18 changes: 18 additions & 0 deletions src/main/java/org/cancogenvirusseq/muse/utils/StringUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package org.cancogenvirusseq.muse.utils;

import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;

@UtilityClass
public class StringUtils {
private static final Pattern INTEGER_PATTERN = Pattern.compile("^-?\\d+$");
private static final Pattern DOUBLE_PATTERN = Pattern.compile("^-?\\d+\\.\\d+$");

public static Boolean isDouble(String s) {
return DOUBLE_PATTERN.matcher(s).matches();
}

public static Boolean isInteger(String s) {
return INTEGER_PATTERN.matcher(s).matches();
}
}
6 changes: 5 additions & 1 deletion src/main/resources/payload-template
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@
},
"pathogen_diagnostic_testing": {
"gene_name": ${gene name},
"diagnostic_pcr_ct_value": ${diagnostic pcr Ct value}
"diagnostic_pcr_ct_value": ${diagnostic pcr Ct value},
"diagnostic_pcr_ct_value_null_reason": ${diagnostic pcr Ct value null reason}
},
"database_identifiers": {
"gisaid_accession": ${GISAID accession}
}
}
8 changes: 5 additions & 3 deletions src/main/resources/tsv-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
{ "name": "collection method", "valueType": "string" },
{ "name": "host (scientific name)", "valueType": "string" },
{ "name": "host disease", "valueType": "string" },
{ "name": "host age", "valueType": "string" },
{ "name": "host age", "valueType": "number" },
{ "name": "host age null reason", "valueType": "string" },
{ "name": "host age unit", "valueType": "string" },
{ "name": "host age bin", "valueType": "string" },
Expand All @@ -33,11 +33,13 @@
{ "name": "raw sequence data processing method", "valueType": "string" },
{ "name": "dehosting method", "valueType": "string" },
{ "name": "consensus sequence software name", "valueType": "string" },
{ "name": "consensus sequence software version", "valueType": "number" },
{ "name": "consensus sequence software version", "valueType": "string" },
{ "name": "breadth of coverage value", "valueType": "string" },
{ "name": "depth of coverage value", "valueType": "string" },
{ "name": "reference genome accession", "valueType": "string" },
{ "name": "bioinformatics protocol", "valueType": "string" },
{ "name": "gene name", "valueType": "string" },
{ "name": "diagnostic pcr Ct value", "valueType": "number" }
{ "name": "diagnostic pcr Ct value", "valueType": "number" },
{ "name": "GISAID accession", "valueType": "string" },
{ "name": "diagnostic pcr Ct value null reason", "valueType": "string" }
]
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public class ComponentTestStubs {
+ "}"
+ "}";

public static final ArrayList<Map<String, String>> STUB_RECORDS =
public static final ArrayList<Map<String, Object>> STUB_RECORDS =
new ArrayList<>(
ImmutableList.of(
ImmutableMap.of(
Expand All @@ -81,7 +81,7 @@ public class ComponentTestStubs {
"fasta header name",
"ABCD/sam1/ddd/erd",
"age",
"123"),
123),
ImmutableMap.of(
"study_id",
"TEST-PR",
Expand All @@ -90,7 +90,7 @@ public class ComponentTestStubs {
"fasta header name",
"EFG/sam2/ddd/erd",
"age",
"456")));
456)));

public static final String STUB_RECORD_0_PAYLOAD =
"{ \"studyId\": \"TEST-PR\", "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ void testErrorOnFailedToMapRecordsAndFile() {
val records =
List.of(
STUB_RECORDS.get(0),
Map.of("submitter id", "sam2NotHere", "fasta header name", "notHere", "age", "456"));
Map.<String, Object>of(
"submitter id", "sam2NotHere", "fasta header name", "notHere", "age", 456));

val submissionBundle = new SubmissionBundle(authentication);
submissionBundle.getFiles().putAll(STUB_FILE_SAMPLE_MAP);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,27 @@ public TsvParserTests() {
void testTsvStrParsedToRecords() {
val tsvStr =
"age\tname\tsubmitterId\tstudy_id\n"
+ "123\tconsensus_sequence\tQc-L00244359\tTEST-STUDY\n";
+ "123\tconsensus_sequence\tQc-L00244359\tTEST-STUDY\n"
+ "44.56\tconsensus_sequence\tQc-1234\tTEST-STUDY\n";

val expected =
List.of(
Map.of(
Map.<String, Object>of(
"submitterId",
"Qc-L00244359",
"name",
"consensus_sequence",
"age",
"123",
123,
"study_id",
"TEST-STUDY"),
Map.<String, Object>of(
"submitterId",
"Qc-1234",
"name",
"consensus_sequence",
"age",
44.56,
"study_id",
"TEST-STUDY"));

Expand All @@ -92,13 +103,13 @@ void testTsvStrParsedToRecordsWithStudyScope() {
+ "123\tconsensus_sequence\tQc-L00244359\tTEST-STUDY\n";
val expected =
List.of(
Map.of(
Map.<String, Object>of(
"submitterId",
"Qc-L00244359",
"name",
"consensus_sequence",
"age",
"123",
123,
"study_id",
"TEST-STUDY"));

Expand All @@ -124,13 +135,13 @@ void testNullStudyPrefixScopeWorksCorrectly() {
+ "123\tconsensus_sequence\tQc-L00244359\tTEST-STUDY\n";
val expected =
List.of(
Map.of(
Map.<String, Object>of(
"submitterId",
"Qc-L00244359",
"name",
"consensus_sequence",
"age",
"123",
123,
"study_id",
"TEST-STUDY"));

Expand All @@ -157,13 +168,13 @@ void testEmptyStudyPrefixScopeWorksCorrectly() {
+ "123\tconsensus_sequence\tQc-L00244359\tTEST-STUDY\n";
val expected =
List.of(
Map.of(
Map.<String, Object>of(
"submitterId",
"Qc-L00244359",
"name",
"consensus_sequence",
"age",
"123",
123,
"study_id",
"TEST-STUDY"));

Expand Down

0 comments on commit e1da65b

Please sign in to comment.