Skip to content

Commit 4dd7e95

Browse files
Merge 25.3 to develop
2 parents 1dc4493 + b27cadd commit 4dd7e95

File tree

16 files changed

+442
-12
lines changed

16 files changed

+442
-12
lines changed

panoramapublic/src/org/labkey/panoramapublic/model/validation/SpecLibSourceFile.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
package org.labkey.panoramapublic.model.validation;
22

3+
import org.jetbrains.annotations.NotNull;
4+
import org.json.JSONObject;
5+
import org.labkey.api.data.Container;
6+
import org.labkey.panoramapublic.speclib.LibSourceFile;
7+
38
import java.util.Objects;
49

510
// For table panoramapublic.speclibsourcefile
@@ -78,4 +83,16 @@ public int hashCode()
7883
{
7984
return Objects.hash(getSourceType(), getName());
8085
}
86+
87+
@NotNull
88+
public JSONObject toJSON(Container container)
89+
{
90+
JSONObject jsonObject = super.toJSON(container);
91+
if (isIdFile() && LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(getName()) && !found())
92+
{
93+
jsonObject.put("statusDetails", "The DIA-NN TSV report must be in the same directory as the " +
94+
".speclib, and share some leading characters in the file name");
95+
}
96+
return jsonObject;
97+
}
8198
}

panoramapublic/src/org/labkey/panoramapublic/proteomexchange/validator/SpecLibValidator.java

Lines changed: 222 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,18 +171,40 @@ private static List<LibSourceFile> getLibSources(SpecLibReader libReader, ISpect
171171
{
172172
throw UnexpectedException.wrap(e, "Error reading source files from library file " + libFilePath.toString());
173173
}
174-
if (sourceFiles != null && sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))
174+
175+
if (sourceFiles == null) return null;
176+
177+
if (sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))
175178
{
176179
// For libraries built with MaxQuant search results we need to add additional files that are required for library building
177180
Set<String> idFileNames = sourceFiles.stream().filter(LibSourceFile::hasIdFile).map(LibSourceFile::getIdFile).collect(Collectors.toSet());
178-
for (String file: LibSourceFile.MAX_QUANT_ID_FILES)
181+
for (String file : LibSourceFile.MAX_QUANT_ID_FILES)
179182
{
180183
if (!idFileNames.contains(file))
181184
{
182185
sourceFiles.add(new LibSourceFile(null, file, null));
183186
}
184187
}
185188
}
189+
else if (sourceFiles.stream().anyMatch(LibSourceFile::isDiannSearch))
190+
{
191+
// Building a library with DIA-NN results in Skyline requires a .speclib file and a report TSV file.
192+
// The .blib file includes the name of .speclib but not the name of the report TSV file.
193+
// Building a library without the TSV gives this error message in Skyline:
194+
// "...the TSV report is required to read speclib files and must be in the same directory as the speclib
195+
// and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.tsv)..."
196+
197+
// At some point Skyline may start including the names of all source files in the .blib SQLite file,
198+
// so first check if any TSV files were listed as sources in the .blib
199+
boolean hasTsvFiles = sourceFiles.stream()
200+
.anyMatch(file -> file.hasIdFile() && file.getIdFile().toLowerCase().endsWith(".tsv"));
201+
if (!hasTsvFiles)
202+
{
203+
// If there is no TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
204+
sourceFiles.add(new LibSourceFile(null, LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, null));
205+
}
206+
}
207+
186208
return sourceFiles;
187209
}
188210

@@ -241,12 +263,29 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
241263
String idFile = source.getIdFile();
242264
if (source.hasIdFile() && !checkedFiles.contains(idFile))
243265
{
266+
if (LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(idFile)) continue; // We will look for this when we come to the .speclib file
267+
244268
checkedFiles.add(idFile);
245269
Path path = getPath(idFile, rawFilesDirPaths, false, fcs);
246270
SpecLibSourceFile sourceFile = new SpecLibSourceFile(idFile, PEPTIDE_ID);
247271
sourceFile.setSpecLibValidationId(getId());
248272
sourceFile.setPath(path != null ? path.toString() : DataFile.NOT_FOUND);
249273
idFiles.add(sourceFile);
274+
275+
if (source.isDiannSearch())
276+
{
277+
// If this is a DIA-NN .speclib file, check for the required report TSV file.
278+
// We are doing this because the .blib does not include the name of the report TSV file.
279+
// We only know that: "the TSV report is required to read speclib files and must be in the
280+
// same directory as the speclib and share some leading characters
281+
// (e.g. somedata-tsv.speclib and somedata-report.tsv)"
282+
Path reportFilePath = sourceFile.found() ? getDiannReportFilePath(path) : null;
283+
SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile(LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, PEPTIDE_ID);
284+
diannReportSourceFile.setSpecLibValidationId(getId());
285+
diannReportSourceFile.setPath(reportFilePath != null ? reportFilePath.toString() : DataFile.NOT_FOUND);
286+
idFiles.add(diannReportSourceFile);
287+
checkedFiles.add(idFile);
288+
}
250289
}
251290
}
252291
setSpectrumFiles(spectrumFiles);
@@ -266,6 +305,77 @@ private Path getPath(String name, Set<Path> rawFilesDirPaths, boolean isMaxquant
266305
return null;
267306
}
268307

308+
private static Path getDiannReportFilePath(Path speclibFilePath)
309+
{
310+
Path specLibFileDir = speclibFilePath.getParent();
311+
try (Stream<Path> paths = Files.list(specLibFileDir))
312+
{
313+
List<Path> files = paths.filter(path -> Files.isRegularFile(path)).collect(Collectors.toList());
314+
return getDiannReportFilePath(speclibFilePath.getFileName().toString(), files);
315+
}
316+
catch (IOException e)
317+
{
318+
throw UnexpectedException.wrap(e, "Error looking for DIA-NN report TSV file in " + specLibFileDir);
319+
}
320+
}
321+
322+
private static Path getDiannReportFilePath(String specLibFileName, List<Path> candidateFiles)
323+
{
324+
Map<Path, Integer> prefixLengthMap = getCommonPrefixLengthsForTsvFiles(candidateFiles, specLibFileName);
325+
326+
// Find the TSV file with the longest common prefix that also has the expected column headers in the first line
327+
return prefixLengthMap.entrySet().stream()
328+
.sorted((entry1, entry2) -> Integer.compare(entry2.getValue(), entry1.getValue())) // Sort descending by matching prefix length
329+
.map(Map.Entry::getKey) // File paths
330+
.filter(file -> hasRequiredHeaders(file)) // First line should have expected header columns
331+
.findFirst() // Get the first file that meets the conditions
332+
.orElse(null);
333+
}
334+
335+
private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> files, String specLibFileName)
336+
{
337+
String specLibFileBaseName = FileUtil.getBaseName(specLibFileName); // Remove file extension
338+
Map<Path, Integer> prefixLengthMap = new HashMap<>();
339+
files.stream()
340+
.filter(file -> file.getFileName().toString().toLowerCase().endsWith(".tsv")) // Ensure it's a TSV file
341+
.forEach(file -> {
342+
// Get the longest common prefix length
343+
int commonPrefixLength = commonPrefixLength(specLibFileBaseName, FileUtil.getBaseName(file.getFileName().toString()));
344+
345+
if (commonPrefixLength > 0)
346+
{
347+
prefixLengthMap.put(file, commonPrefixLength);
348+
}
349+
});
350+
return prefixLengthMap;
351+
}
352+
353+
private static int commonPrefixLength(String s1, String s2)
354+
{
355+
int maxLength = Math.min(s1.length(), s2.length());
356+
int index = 0;
357+
while (index < maxLength && s1.charAt(index) == s2.charAt(index))
358+
{
359+
index++;
360+
}
361+
return index;
362+
}
363+
364+
private static boolean hasRequiredHeaders(Path diannReportTsv)
365+
{
366+
try
367+
{
368+
// Read the first line of the file
369+
String firstLine = Files.lines(diannReportTsv).findFirst().orElse("");
370+
// Check if the first line has the expected header columns names
371+
return List.of(firstLine.trim().split("\t")).containsAll(LibSourceFile.DIANN_REPORT_EXPECTED_HEADERS);
372+
}
373+
catch (IOException e)
374+
{
375+
throw UnexpectedException.wrap(e, "Error reading the first line of TSV file " + diannReportTsv);
376+
}
377+
}
378+
269379
private Path findInDirectoryTree(java.nio.file.Path rawFilesDirPath, String fileName, boolean allowBaseName)
270380
{
271381
try
@@ -459,6 +569,116 @@ public void testAccept()
459569
assertTrue(accept("170428_DBS_cal_7a.d", "170428_DBS_cal_7a.d.zip"));
460570
}
461571

572+
@Test
573+
public void testCommonPrefixLength() throws IOException
574+
{
575+
Path testDataDir = getDiannTestFilesPath();
576+
577+
// The spec lib file name to compare against
578+
String specLibFileName = "report-lib.parquet.skyline-for-test.speclib";
579+
580+
Path tsvFile1 = testDataDir.resolve("report-lib.tsv");
581+
Path tsvFile2 = testDataDir.resolve("report-lib-for-test.tsv");
582+
Path tsvFile3 = testDataDir.resolve("report-lib.parquet.tsv");
583+
Path tsvFile4 = testDataDir.resolve("report-lib.parquet-test.tsv");
584+
Path tsvFile5 = testDataDir.resolve("no-prefix-match-report.tsv");
585+
Path nonTsvFile1 = testDataDir.resolve("report-lib.parquet.skyline-for-test.txt");
586+
Path nonTsvFile2 = testDataDir.resolve("report.txt");
587+
Path nonTsvFile3 = testDataDir.resolve(specLibFileName);
588+
589+
List<Path> files = List.of(tsvFile1, tsvFile2, tsvFile3, tsvFile4, tsvFile5, nonTsvFile1, nonTsvFile2, nonTsvFile3);
590+
591+
Map<Path, Integer> prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
592+
// Expect 4 TSV files in the list; files without a prefix match, and non-TSV files should be ignored.
593+
assertEquals("Unexpected size of prefixLengthMap", 4, prefixLengthMap.size());
594+
595+
// File report-lib.tsv should have a common prefix "report-lib"
596+
assertTrue(prefixLengthMap.containsKey(tsvFile1));
597+
assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile1).intValue());
598+
599+
// File report-lib-test.tsv should have a common prefix "report-lib"
600+
assertTrue(prefixLengthMap.containsKey(tsvFile2));
601+
assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile2).intValue());
602+
603+
// File report-lib.parquet.tsv should have a common prefix "report-lib.parquet"
604+
assertTrue(prefixLengthMap.containsKey(tsvFile3));
605+
assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile3).intValue());
606+
607+
// File report-lib.parquet-test.tsv should have a common prefix "report-lib.parquet"
608+
assertTrue(prefixLengthMap.containsKey(tsvFile4));
609+
assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile4).intValue());
610+
611+
// File no-prefix-match-report.tsv should not have a common prefix
612+
assertFalse(tsvFile5 + " does not share a prefix with " + specLibFileName, prefixLengthMap.containsKey(tsvFile5));
613+
614+
assertFalse(prefixLengthMap.containsKey(nonTsvFile1));
615+
assertFalse(prefixLengthMap.containsKey(nonTsvFile2));
616+
assertFalse(prefixLengthMap.containsKey(nonTsvFile3));
617+
618+
// List of files that do not share a common prefix with the speclib file
619+
files = List.of(testDataDir.resolve("abcd.tsv"), testDataDir.resolve("1234.tsv"), testDataDir.resolve("lib.parquet.skyline.tsv"));
620+
prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
621+
assertEquals(0, prefixLengthMap.size());
622+
623+
prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
624+
assertEquals(0, prefixLengthMap.size());
625+
}
626+
627+
@Test
628+
public void testGetDiannReportFilePath() throws IOException
629+
{
630+
Path testDataDir = getDiannTestFilesPath();
631+
String specLibFileName = "report-lib.parquet.skyline-for-test.speclib";
632+
633+
Path reportTsvFile = SpecLibValidator.getDiannReportFilePath(specLibFileName, Collections.emptyList());
634+
assertNull("Unexpected report TSV file path returned. Input file list is empty.", reportTsvFile);
635+
636+
// TSV Files in the test directory
637+
Path tsvFile1 = testDataDir.resolve("report.tsv");
638+
Path tsvFile2 = testDataDir.resolve("report-lib-for-test.tsv");
639+
Path tsvFile3 = testDataDir.resolve("no-prefix-match-report-for-test.tsv");
640+
Path tsvFile4 = testDataDir.resolve("report-lib.parquet-missing-headers.txt");
641+
// Non-TSV files in the test directory
642+
Path nonTsvFile1 = testDataDir.resolve("report.txt");
643+
Path nonTsvFile2 = testDataDir.resolve("report-lib.parquet.skyline-for-test.txt");
644+
Path nonTsvFile3 = testDataDir.resolve(specLibFileName);
645+
Path nonTsvFile4 = testDataDir.resolve("test_diann_library.blib");
646+
647+
List<Path> candidateFiles = new ArrayList<>();
648+
candidateFiles.add(nonTsvFile1);
649+
candidateFiles.add(nonTsvFile2);
650+
candidateFiles.add(nonTsvFile3);
651+
candidateFiles.add(nonTsvFile4);
652+
653+
assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files",
654+
SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
655+
656+
candidateFiles.add(tsvFile3); // TSV file does not share a prefix with the speclib file
657+
assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file",
658+
SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
659+
660+
candidateFiles.add(tsvFile4); // TSV file does not have the required column headers
661+
assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file" +
662+
" and have the required column headers",
663+
SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
664+
665+
candidateFiles.add(tsvFile1); // Shares a prefix and has the required column headers
666+
reportTsvFile = SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles);
667+
assertNotNull(reportTsvFile);
668+
assertEquals(tsvFile1, reportTsvFile);
669+
670+
candidateFiles.add(tsvFile2); // Shares a longer prefix with the speclib file
671+
reportTsvFile = SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles);
672+
assertNotNull(reportTsvFile);
673+
assertEquals(tsvFile2, reportTsvFile);
674+
}
675+
676+
private static Path getDiannTestFilesPath() throws IOException
677+
{
678+
return JunitUtil.getSampleData(ModuleLoader.getInstance().getModule(PanoramaPublicModule.class),
679+
"TargetedMS/panoramapublic/LibraryTest-DiaNN").toPath();
680+
}
681+
462682
private ISpectrumLibrary createLibrary(Path path)
463683
{
464684
return new ISpectrumLibrary()

panoramapublic/src/org/labkey/panoramapublic/speclib/LibSourceFile.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,14 @@ public boolean isMaxQuantSearch()
8181
{
8282
return (hasIdFile() && getIdFile().endsWith("msms.txt")) || containsScoreType("MAXQUANT SCORE");
8383
}
84+
85+
public static String DIANN_REPORT_TSV_PLACEHOLDER = "DIA-NN report file";
86+
87+
// These are some of the column headers that we expect to see in a DIA-NN report TSV file
88+
public static List<String> DIANN_REPORT_EXPECTED_HEADERS = List.of("File.Name", "Run", "Protein.Group", "Protein.Ids", "Protein.Names");
89+
90+
public boolean isDiannSearch()
91+
{
92+
return (hasIdFile() && getIdFile().toLowerCase().endsWith(".speclib"));
93+
}
8494
}
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Files downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
File.Name Run Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised PG.MaxLFQ Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique Modified.Sequence Stripped.Sequence Precursor.Id Precursor.Charge Q.Value PEP Global.Q.Value Protein.Q.Value PG.Q.Value Global.PG.Q.Value GG.Q.Value Translated.Q.Value Proteotypic Precursor.Quantity Precursor.Normalised Quantity.Quality RT RT.Start RT.Stop iRT Predicted.RT Predicted.iRT First.Protein.Description Lib.Q.Value Lib.PG.Q.Value Ms1.Profile.Corr Ms1.Area Ms1.Normalised Normalisation.Factor Evidence Spectrum.Similarity Averagine Mass.Evidence CScore Fragment.Quant.Raw Fragment.Correlations MS2.Scan IM iIM Predicted.IM Predicted.iIM
2+
Z:\Omar\20241220\Raw data\D0_rep1_DIA.mzML D0_rep1_DIA P37108 P37108 SRP14_HUMAN SRP14 2.06919e+07 3.43595e+07 2.92879e+07 2.06919e+07 3.43595e+07 2.92879e+07 2.92879e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00682898 0.0389249 0.00525982 0.000484731 0.000438597 0.000218723 0.000438982 0 1 110686 212302 0.834495 40.7852 40.6788 40.8916 45.9386 40.6427 45.3926 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.28009 81750.1 156802 1.91806 1.68953 0.127559 0.0508348 0 0.928589 0;13949.7;0;11208.5;0;0;0;9458.04;0;0;0;0; 0;0.654654;0;0.654654;0;0;0;0.654654;0;0;0;0; 57730 0 0 0 0
3+
Z:\Omar\20241220\Raw data\D0_rep2_DIA.mzML D0_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 1.60054e+07 2.4481e+07 1.99438e+07 1.60054e+07 2.4481e+07 1.99438e+07 1.99438e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00227236 0.0166455 0.00525982 0.000449035 0.000411184 0.000218723 0.000411523 0 1 308546 381628 0.819633 41.4875 41.2747 41.7003 45.9386 41.4297 44.7753 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.374476 234681 290267 1.23686 1.15183 0.138951 0.0769019 0 0.982622 0;51162.2;0;25079.7;25917.9;0;0;0;0;0;11474;200182; 0;0.380268;0;0.0403012;0.259276;0;0;0;0;0;-0.0343988;0.00347259; 58712 0 0 0 0
4+
Z:\Omar\20241220\Raw data\D2_rep2_DIA.mzML D2_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 5.96182e+07 4.16815e+07 3.39897e+07 5.96182e+07 4.16815e+07 3.39897e+07 3.39897e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00034626 0.000973613 0.00525982 0.000284657 0.000262881 0.000218723 0.000263089 0 1 1.01521e+06 480606 0.892339 40.6718 40.5124 40.7783 45.9386 40.9778 45.9936 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.381614 915531 433417 0.473405 2.07644 0.502376 0.172652 0 0.997521 35164.8;117100;65947.1;121602;109001;0;0;114272;0;13159.4;0;25451.3; 0.790086;0.414912;0;0.441212;0.524186;0;0;0.778551;0;0;0;0; 57579 0 0 0 0
5+
Z:\Omar\20241220\Raw data\D4_rep2_DIA.mzML D4_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 3.58914e+07 2.43299e+07 2.35164e+07 3.58914e+07 2.43299e+07 2.35164e+07 2.35164e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.000124016 0.00081362 0.00525982 0.000289603 0.000265887 0.000218723 0.000266028 0 1 565612 340708 0.89329 41.1094 40.9502 41.2158 45.9386 41.2654 45.7235 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.675966 450953 271641 0.602372 1.9792 0.46964 0.0769019 0 0.997199 26633.2;137531;17003.4;78320.3;38846;0;0;65957.6;0;0;0;16332.8; 0.884287;0.151077;0;-0.211294;0.884635;0;0;0.523756;0;0;0;0; 58183 0 0 0 0

0 commit comments

Comments
 (0)