Skip to content

Commit 5c5b0a3

Browse files
AmirMohammad Cheraghali
authored and committed
feat: add 25 new proteins to library with metadata (viral, structural, signaling)
1 parent 38c302e commit 5c5b0a3

2 files changed

Lines changed: 1375 additions & 541 deletions

File tree

scripts/remove_duplicates.js

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import fs from 'fs';
2+
import path from 'path';
3+
import { fileURLToPath } from 'url';
4+
5+
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The library source is addressed relative to this script's own directory.
const libraryPath = path.join(__dirname, '../src/data/library.ts');
let content = fs.readFileSync(libraryPath, { encoding: 'utf8' });

// Locate the opening of the OFFLINE_LIBRARY array literal; abort if the file
// does not contain it.
const startMarker = 'export const OFFLINE_LIBRARY: LibraryEntry[] = [';
const startIndex = content.indexOf(startMarker);
if (startIndex < 0) {
  console.error('Could not find OFFLINE_LIBRARY start');
  process.exit(1);
}

// Offset of the first character after the marker, and the text from there on.
const listStart = startIndex + startMarker.length;
const listContent = content.slice(listStart);
21+
22+
// Entries are told apart by their four-character `id: 'XXXX'` field. The
// object formatting around that field varies, so the scan keys on it alone.

// IDs encountered so far, and the subset that occurred more than once.
const seenIds = new Set();
const duplicateIds = new Set();

// Every ID occurrence in the file together with its character offset.
const idRegex = /id:\s*'([A-Z0-9]{4})'/g;
let match;
const matches = [];

for (match = idRegex.exec(content); match !== null; match = idRegex.exec(content)) {
  const [, id] = match;
  matches.push({ id, index: match.index });
  if (!seenIds.has(id)) {
    seenIds.add(id);
    continue;
  }
  duplicateIds.add(id);
}

console.log(`Found ${matches.length} entries.`);
console.log(`Found ${duplicateIds.size} duplicates:`, [...duplicateIds].join(', '));

// Nothing to clean up: stop here with a success status.
if (duplicateIds.size === 0) {
  console.log('No duplicates found.');
  process.exit(0);
}
54+
55+
// ---------------------------------------------------------------------------
// Duplicate removal
//
// For every ID that occurs more than once, the FIRST occurrence is kept and
// the object literal enclosing each later occurrence is cut out of the file.
//
// Block boundaries are found with a brace matcher that understands string
// literals and comments, so nested objects (e.g. a metadata object) and
// braces inside text fields do not truncate the cut. All cut ranges are
// computed against the unmodified text, merged, and applied in a single pass,
// so no index is ever invalidated by an earlier deletion.
// ---------------------------------------------------------------------------

// Same ID shape the rest of the script looks for: `id: 'XXXX'`.
const ENTRY_ID_PATTERN = /id:\s*'([A-Z0-9]{4})'/g;

/**
 * Returns the index just past the string or template literal that opens at
 * `openIndex`.
 *
 * @param {string} text - Source text being scanned.
 * @param {number} openIndex - Index of the opening quote character.
 * @returns {number} Index of the first character after the literal.
 */
function skipStringLiteral(text, openIndex) {
  const quote = text[openIndex];
  for (let i = openIndex + 1; i < text.length; i += 1) {
    const ch = text[i];
    if (ch === '\\') {
      i += 1; // the escaped character can never close the literal
    } else if (ch === quote) {
      return i + 1;
    } else if (ch === '\n' && quote !== '`') {
      // Quoted strings cannot span lines; resynchronise instead of
      // swallowing the rest of the file on a stray quote.
      return i;
    }
  }
  return text.length;
}

/**
 * Collects every matched `{ ... }` pair in `text`, ignoring braces that sit
 * inside string literals, template literals, or comments.
 *
 * @param {string} text - Source text to scan.
 * @returns {{open: number, close: number}[]} Indices of each pair's braces.
 */
function findBracePairs(text) {
  const pairs = [];
  const openStack = [];
  let i = 0;
  while (i < text.length) {
    const ch = text[i];
    if (ch === '"' || ch === "'" || ch === '`') {
      i = skipStringLiteral(text, i);
    } else if (ch === '/' && text[i + 1] === '/') {
      const lineEnd = text.indexOf('\n', i);
      i = lineEnd === -1 ? text.length : lineEnd;
    } else if (ch === '/' && text[i + 1] === '*') {
      const commentEnd = text.indexOf('*/', i + 2);
      i = commentEnd === -1 ? text.length : commentEnd + 2;
    } else {
      if (ch === '{') {
        openStack.push(i);
      } else if (ch === '}' && openStack.length > 0) {
        pairs.push({ open: openStack.pop(), close: i });
      }
      i += 1;
    }
  }
  return pairs;
}

/**
 * Finds the innermost brace pair that strictly contains `index`.
 *
 * @param {{open: number, close: number}[]} pairs - Output of findBracePairs.
 * @param {number} index - Position that must lie inside the pair.
 * @returns {{open: number, close: number} | null} The pair, or null if none.
 */
function findEnclosingPair(pairs, index) {
  let innermost = null;
  for (const pair of pairs) {
    const encloses = pair.open < index && index < pair.close;
    if (encloses && (innermost === null || pair.open > innermost.open)) {
      innermost = pair;
    }
  }
  return innermost;
}

/**
 * Widens a brace pair into the range that should actually be deleted: the
 * entry itself, its trailing comma, and - when the entry has the line(s) to
 * itself - the surrounding indentation and line break.
 *
 * @param {string} text - Source text the pair was found in.
 * @param {{open: number, close: number}} pair - Braces of the entry.
 * @returns {{start: number, end: number}} Half-open range to cut.
 */
function expandCutRange(text, { open, close }) {
  let start = open;
  let end = close + 1;

  // Take the separator with the entry so no `, ,` is left behind.
  const separator = /\s*,/y;
  separator.lastIndex = end;
  if (separator.test(text)) {
    end = separator.lastIndex;
  }

  // If only whitespace shares the line(s) with the entry, drop the whole line
  // rather than leaving a blank, indented one.
  const lineStart = text.lastIndexOf('\n', start - 1) + 1;
  const restOfLine = /[ \t]*(?:\r?\n|$)/y;
  restOfLine.lastIndex = end;
  if (text.slice(lineStart, start).trim() === '' && restOfLine.test(text)) {
    start = lineStart;
    end = restOfLine.lastIndex;
  }

  return { start, end };
}

/**
 * Removes the enclosing object literal of every repeated ID occurrence,
 * keeping the first occurrence of each ID.
 *
 * @param {string} text - Full source text of the library file.
 * @returns {{
 *   text: string,
 *   cuts: {id: string, index: number, start: number, end: number}[],
 *   skipped: {id: string, index: number}[]
 * }} The cleaned text, the cuts that were applied, and repeated IDs that were
 *   left alone because no enclosing block could be determined.
 */
function removeDuplicateEntries(text) {
  // Identify second and later occurrences of each ID, in document order.
  const known = new Set();
  const repeats = [];
  for (const found of text.matchAll(ENTRY_ID_PATTERN)) {
    const id = found[1];
    if (known.has(id)) {
      repeats.push({ id, index: found.index });
    } else {
      known.add(id);
    }
  }

  const pairs = repeats.length > 0 ? findBracePairs(text) : [];
  const cuts = [];
  const skipped = [];
  for (const { id, index } of repeats) {
    const pair = findEnclosingPair(pairs, index);
    if (pair === null) {
      // Refuse to guess at boundaries: a wrong cut would corrupt the file.
      skipped.push({ id, index });
    } else {
      cuts.push({ id, index, ...expandCutRange(text, pair) });
    }
  }

  // Merge overlapping ranges (e.g. two repeated IDs inside one block) so each
  // character is removed at most once.
  const ordered = [...cuts].sort((a, b) => a.start - b.start);
  const merged = [];
  for (const { start, end } of ordered) {
    const last = merged[merged.length - 1];
    if (last !== undefined && start < last.end) {
      last.end = Math.max(last.end, end);
    } else {
      merged.push({ start, end });
    }
  }

  // Stitch together everything that lies between the cut ranges.
  const kept = [];
  let cursor = 0;
  for (const { start, end } of merged) {
    kept.push(text.slice(cursor, start));
    cursor = end;
  }
  kept.push(text.slice(cursor));

  return { text: kept.join(''), cuts, skipped };
}

const {
  text: processedContent,
  cuts: removedEntries,
  skipped: skippedMatches,
} = removeDuplicateEntries(content);

for (const { id, index, start, end } of removedEntries) {
  console.log(`Cutting duplicate ${id} at index ${index} (range ${start}-${end})`);
}
for (const { id, index } of skippedMatches) {
  console.warn(`Skipping duplicate ${id} at index ${index}: no enclosing { ... } block found`);
}

// Only touch the file when something was actually removed.
if (removedEntries.length > 0) {
  fs.writeFileSync(libraryPath, processedContent, 'utf8');
  console.log('Duplicates removed.');
} else {
  console.log('Nothing was removed; file left untouched.');
}

0 commit comments

Comments
 (0)