Skip to content

Commit 9ff9c9e

Browse files
committed
feat: add mapping and FAQ info
1 parent ea538c1 commit 9ff9c9e

3 files changed

Lines changed: 278 additions & 5 deletions

File tree

frontend/src/lib/components/FAQ.svelte

Lines changed: 225 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,231 @@
170170

171171
<AccordionItem>
172172
<svelte:fragment slot="summary">
173-
<h3 class="h3">What should I do if I encounter technical issues?</h3>
173+
<h3 class="h3">What file formats are available for download?</h3>
174+
</svelte:fragment>
175+
<svelte:fragment slot="content">
176+
<p>
177+
{config.APP_NAME} provides two download formats for protein annotations:
178+
</p>
179+
<ul class="list-inside list-disc">
180+
<li>
181+
<strong>CSV format</strong>: A comma-separated values file containing
182+
annotation regions with columns for UniProt ID, source database,
183+
label, start position, end position, description, and length. This
184+
format is ideal for data analysis and spreadsheet applications.
185+
</li>
186+
<li>
187+
<strong>.3lines format</strong>: A TMbed Format 4 style file
188+
containing the protein sequence and topology annotations in a
189+
three-line format. This format is compatible with TMbed tools and
190+
provides per-residue annotation information. See the next FAQ section
191+
for detailed format information.
192+
</li>
193+
</ul>
194+
</svelte:fragment>
195+
</AccordionItem>
196+
197+
<AccordionItem>
198+
<svelte:fragment slot="summary">
199+
<h3 class="h3">What is the .3lines format?</h3>
200+
</svelte:fragment>
201+
<svelte:fragment slot="content">
202+
<p>
203+
The .3lines format follows the <a
204+
class="anchor"
205+
href="https://github.com/BernhoferM/TMbed?tab=readme-ov-file#prediction-output"
206+
>TMbed Format 4</a
207+
>
208+
for TMbed predictions and
209+
<a
210+
class="anchor"
211+
href="https://github.com/BernhoferM/TMbed?tab=readme-ov-file#prediction-output"
212+
>TMbed Format 2</a
213+
> specification for all other sources and contains three lines per annotation
214+
source:
215+
</p>
216+
<ol class="list-inside list-decimal">
217+
<li>
218+
<strong>Header line</strong>
219+
<pre
220+
class="pre font-bold">&gt;&#123;uniprot_accession&#125;|&#123;uniprot_id&#125; - &#123;source_name&#125;</pre>
221+
</li>
222+
<li>
223+
<strong>Sequence line</strong>: The protein amino acid sequence
224+
</li>
225+
<li>
226+
<strong>Topology line</strong>: A string of annotation symbols, one
227+
per residue, indicating the transmembrane topology
228+
</li>
229+
</ol>
230+
<p class="mt-2">Example:</p>
231+
<pre
232+
class="overflow-x-auto rounded bg-surface-200 p-2 text-sm dark:bg-surface-800">
233+
>A0A4Q4MGP0|A0A4Q4MGP0_9PLEO - UniProtKB
234+
MSSNGLTETTLRGTAIGLMVVTTAMVFARAILRSDQKKSIQWDEIWLIVGYMLFMAITGVYINKTSLLFRLLAVEEGRLAPYPSVSKDGFNAQKTFFFTSPGLWLTLWSIKFSLLAFYKRIMVGVKLYLTLWWVVLAYCVLTLVLSIMLHITACGSSPSSWFVENGCGADNVRKSLISFWEGFAVDLSTDLMIMLLPIGIIRNLQIPLARKIQIGGLFALGIFVIIASIVRVIQVGATTGASNTTPSLTWLALWSIIESSVAIMVGCGPGLYRKAKAVYSNTPVHAYNSRGYIKTTADRRPETKGNADDEYGFPMKTMSIDIAARVSRGDSEEELVSQEINGKIRVTRSVVVSHKSE
235+
...........HHHHHHHHHHHHHHHHHHHHH...........HHHHHHHHHHHHHHHHHHH.................................HHHHHHHHHHHHHHHHHHHHHHH...........HHHHHHHHHHHHHHHHHHHHHH...............................HHHHHHHHHHHHHHHHHHHH...........HHHHHHHHHHHHHHHHHHHH...................HHHHHHHHHHHHHHHHHHHH.....................................................................................
236+
>A0A4Q4MGP0|A0A4Q4MGP0_9PLEO - TMbed
237+
MSSNGLTETTLRGTAIGLMVVTTAMVFARAILRSDQKKSIQWDEIWLIVGYMLFMAITGVYINKTSLLFRLLAVEEGRLAPYPSVSKDGFNAQKTFFFTSPGLWLTLWSIKFSLLAFYKRIMVGVKLYLTLWWVVLAYCVLTLVLSIMLHITACGSSPSSWFVENGCGADNVRKSLISFWEGFAVDLSTDLMIMLLPIGIIRNLQIPLARKIQIGGLFALGIFVIIASIVRVIQVGATTGASNTTPSLTWLALWSIIESSVAIMVGCGPGLYRKAKAVYSNTPVHAYNSRGYIKTTADRRPETKGNADDEYGFPMKTMSIDIAARVSRGDSEEELVSQEINGKIRVTRSVVVSHKSE
238+
ooooooooooohhhhhhhhhhhhhhhhhhhhhiiiiiiiiiHHHHHHHHHHHHHHHHHHHHHHoooooooooooooooooooooooooooooooohhhhhhhhhhhhhhhhhhhhhhhiiiiiiiiiiHHHHHHHHHHHHHHHHHHHHHHHooooooooooooooooooooooooohhhhhhhhhhhhhhhhhhhhhhhiiiiiiiiiiiHHHHHHHHHHHHHHHHHHHHHHHHooooooooooooooohhhhhhhhhhhhhhhhhhhhhhhiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
239+
</pre>
240+
</svelte:fragment>
241+
</AccordionItem>
242+
243+
<AccordionItem>
244+
<svelte:fragment slot="summary">
245+
<h3 class="h3">What is the CSV format?</h3>
246+
</svelte:fragment>
247+
<svelte:fragment slot="content">
248+
<p>
249+
The CSV format provides a structured, tabular representation of
250+
annotation data that is easy to import into spreadsheet applications,
251+
databases, or data analysis tools.
252+
</p>
253+
<p class="mt-2">
254+
Each row in the CSV file represents a single annotation region and
255+
contains the following columns:
256+
</p>
257+
<ul class="list-inside list-disc">
258+
<li>
259+
<strong>uniprot_id</strong>: The UniProt accession identifier for the
260+
protein
261+
</li>
262+
<li>
263+
<strong>source</strong>: The source database identifier (e.g., tmbed, topdb,
263+
membranome, uniprot, tmalphafold)
265+
</li>
266+
<li>
267+
<strong>label</strong>: The annotation label from the source database
268+
(e.g., H, B, S, i, o, X, M, AH, BS)
269+
</li>
270+
<li>
271+
<strong>start</strong>: The starting position of the annotation region
272+
(1-indexed)
273+
</li>
274+
<li>
275+
<strong>end</strong>: The ending position of the annotation region
276+
(1-indexed, inclusive)
277+
</li>
278+
<li>
279+
<strong>description</strong>: A human-readable description of the
280+
annotation label
281+
</li>
282+
<li>
283+
<strong>length</strong>: The length of the annotation region in amino
284+
acids
285+
</li>
286+
</ul>
287+
<p class="mt-2">Example:</p>
288+
<pre
289+
class="overflow-x-auto rounded bg-surface-200 p-2 text-sm dark:bg-surface-800">
290+
uniprot_id,source,label,start,end,description,length
291+
A0A4Q4MGP0,uniprot,AH,12,32,Alpha-Helix,21
292+
A0A4Q4MGP0,uniprot,AH,44,62,Alpha-Helix,19
293+
A0A4Q4MGP0,uniprot,AH,96,118,Alpha-Helix,23
294+
A0A4Q4MGP0,uniprot,AH,130,151,Alpha-Helix,22
295+
A0A4Q4MGP0,uniprot,AH,183,202,Alpha-Helix,20
296+
A0A4Q4MGP0,uniprot,AH,214,233,Alpha-Helix,20
297+
A0A4Q4MGP0,uniprot,AH,253,272,Alpha-Helix,20
298+
A0A4Q4MGP0,tmbed,o,1,11,Outside,11
299+
A0A4Q4MGP0,tmbed,h,12,32,Alpha-helix (OUT-->IN),21
300+
A0A4Q4MGP0,tmbed,i,33,41,Inside,9
301+
A0A4Q4MGP0,tmbed,H,42,63,Alpha-helix (IN-->OUT),22
302+
A0A4Q4MGP0,tmbed,o,64,95,Outside,32
303+
A0A4Q4MGP0,tmbed,h,96,118,Alpha-helix (OUT-->IN),23
304+
A0A4Q4MGP0,tmbed,i,119,128,Inside,10
305+
</pre>
306+
</svelte:fragment>
307+
</AccordionItem>
308+
309+
<AccordionItem>
310+
<svelte:fragment slot="summary">
311+
<h3 class="h3">What do the annotation symbols mean?</h3>
312+
</svelte:fragment>
313+
<svelte:fragment slot="content">
314+
<p>
315+
The unified TMbed format uses the following symbols to represent
316+
transmembrane topology annotations:
317+
</p>
318+
<ul class="list-inside list-disc">
319+
<li>
320+
<strong>B</strong>: Transmembrane beta strand (IN→OUT direction)
321+
</li>
322+
<li>
323+
<strong>b</strong>: Transmembrane beta strand (OUT→IN direction)
324+
</li>
325+
<li>
326+
<strong>H</strong>: Transmembrane alpha helix (IN→OUT direction)
327+
</li>
328+
<li>
329+
<strong>h</strong>: Transmembrane alpha helix (OUT→IN direction)
330+
</li>
331+
<li><strong>S</strong>: Signal peptide</li>
332+
<li>
333+
<strong>i</strong>: Non-transmembrane region, inside (cytoplasmic
334+
side)
335+
</li>
336+
<li>
337+
<strong>o</strong>: Non-transmembrane region, outside (extracellular
338+
side)
339+
</li>
340+
<li>
341+
<strong>.</strong>: Non-membrane / unannotated region
342+
</li>
343+
</ul>
344+
<p class="mt-2">
345+
TMbed provides directional information (H/h, B/b) indicating the
346+
orientation of transmembrane segments. Other annotation sources are
347+
mapped to this unified format, which may result in some information loss
348+
(e.g., directionality) for sources that don't provide this level of
349+
detail.
350+
</p>
351+
</svelte:fragment>
352+
</AccordionItem>
353+
354+
<AccordionItem>
355+
<svelte:fragment slot="summary">
356+
<h3 class="h3">How are annotations from different databases mapped?</h3>
357+
</svelte:fragment>
358+
<svelte:fragment slot="content">
359+
<p>
360+
To ensure consistency across different annotation sources, labels from
361+
various databases are mapped to the unified TMbed format. The following
362+
mappings are applied:
363+
</p>
364+
<ul class="list-inside list-disc">
365+
<li>
366+
<strong>TopDB</strong>: X→S (Signal peptide), M→H (Membrane, assumed
367+
alpha-helix), I→i (Inside), O→o (Outside)
368+
</li>
369+
<li>
370+
<strong>Membranome</strong>: AH→H (Alpha-helix), I→i (Inside), O→o
371+
(Outside)
372+
</li>
373+
<li>
374+
<strong>UniProtKB</strong>: AH→H (Alpha-Helix), BS→B (Beta-Sheet)
375+
</li>
376+
<li>
377+
<strong>TMAlphaFold</strong>: AH→H (Alpha-Helix), BS→B (Beta-Sheet)
378+
</li>
379+
<li>
380+
<strong>TMbed</strong>: Already in unified format, no mapping needed
381+
</li>
382+
</ul>
383+
<p class="mt-2">
384+
<strong>Important limitations:</strong> Some databases don't distinguish
385+
between alpha-helical and beta-strand transmembrane regions (e.g., TopDB's
386+
'M' label), so they are mapped to the most common type (alpha-helix). Additionally,
387+
most sources don't provide directional information, so uppercase symbols
388+
(H, B) are used by default and do not imply an IN→OUT orientation for those sources. To avoid information loss, separate entries are
389+
generated for each annotation source in the .3lines format, allowing you
390+
to compare annotations directly.
391+
</p>
392+
</svelte:fragment>
393+
</AccordionItem>
394+
395+
<AccordionItem>
396+
<svelte:fragment slot="summary">
397+
<h3 class="h3">Experiencing technical issues?</h3>
174398
</svelte:fragment>
175399
<svelte:fragment slot="content">
176400
<p>

frontend/src/lib/download/generators.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import legendData from "$lib/assets/shared/legend.json";
55
import type { PublicAnnotation } from "$lib/client/model";
66
import type { ProteinInfo } from "$lib/client/model";
77
import type { SourceDB } from "$lib/annotations";
8+
import { mapLabelToUnified } from "./labelMapping";
89

910
/**
1011
* Groups annotations by their source database
@@ -126,8 +127,8 @@ export function generate3Lines(
126127
for (const [sourceDB, sourceAnnotations] of Object.entries(annotationsBySource)) {
127128
if (sourceAnnotations.length === 0) continue;
128129

129-
// Initialize per-residue array with default label 'i' (inside)
130-
const topologyLabels: string[] = new Array(sequenceLength).fill("i");
130+
// Initialize per-residue array with default label '.' (non-membrane)
131+
const topologyLabels: string[] = new Array(sequenceLength).fill(".");
131132

132133
// Apply annotations for this source
133134
for (const annotation of sourceAnnotations) {
@@ -136,10 +137,12 @@ export function generate3Lines(
136137

137138
// Validate bounds
138139
if (start >= 0 && end < sequenceLength && start <= end) {
139-
const label = annotation.label;
140+
const originalLabel = annotation.label;
141+
// Map label to unified format
142+
const mappedLabel = mapLabelToUnified(sourceDB as SourceDB, originalLabel);
140143
// Set labels for the annotation range
141144
for (let i = start; i <= end; i++) {
142-
topologyLabels[i] = label;
145+
topologyLabels[i] = mappedLabel;
143146
}
144147
}
145148
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2026 Tobias Olenyi.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
import type { SourceDB } from "$lib/annotations";
5+
6+
/**
 * Per-source lookup tables that translate database-specific annotation labels
 * into the unified TMbed-style alphabet (Format 4 style).
 *
 * Unified symbols produced by these tables: `B`/`b`, `H`/`h`, `S`, `i`, `o`
 * and `.`. Only the `tmbed` table emits the lowercase `h`/`b` variants; the
 * tables for every other source only ever produce the uppercase symbol.
 */
export const LABEL_MAPPINGS: Record<SourceDB, Record<string, string>> = {
  tmbed: {
    // Already in unified format - identity mapping
    "H": "H",
    "h": "h",
    "B": "B",
    "b": "b",
    "i": "i",
    "o": "o",
    "S": "S",
    ".": ".",
  },
  topdb: {
    "X": "S", // Signal peptide
    "M": "H", // Membrane (assume alpha-helix, no strand info)
    "I": "i", // Inside
    "O": "o", // Outside
  },
  membranome: {
    "AH": "H", // Alpha-helix (no directionality)
    "I": "i", // Inside
    "O": "o", // Outside
  },
  uniprot: {
    "AH": "H", // Alpha-Helix
    "BS": "B", // Beta-Sheet (assume Beta-strand)
  },
  tmalphafold: {
    "AH": "H", // Alpha-Helix
    "BS": "B", // Beta-Sheet
  },
};

/**
 * Maps a label from a source database to the unified TMbed format.
 *
 * Only labels that are explicitly listed in the source's table are translated.
 * Both lookups are restricted to own properties, so names inherited from
 * `Object.prototype` (e.g. "toString", "constructor") are treated as unknown
 * instead of leaking a non-string value into the topology line.
 *
 * @param sourceDb - The source database identifier
 * @param label - The original label from the source database
 * @returns The mapped label in unified format, or '.' if the source or label is unknown
 */
export function mapLabelToUnified(sourceDb: SourceDB, label: string): string {
  const hasOwn = Object.prototype.hasOwnProperty;

  // The source key may originate from an unchecked cast at the call site,
  // so verify it is a real table before indexing.
  if (!hasOwn.call(LABEL_MAPPINGS, sourceDb)) {
    return ".";
  }

  const mapping = LABEL_MAPPINGS[sourceDb];
  if (!hasOwn.call(mapping, label)) {
    return "."; // Default to '.' if unknown
  }

  return mapping[label] ?? ".";
}

0 commit comments

Comments
 (0)