Skip to content

Commit 8940407

Browse files
committed
translated chemstation ms parser to c++ and added python wrapper
1 parent af13daa commit 8940407

File tree

7 files changed

+204
-1
lines changed

7 files changed

+204
-1
lines changed

__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from hplc import read_chromatograms, plot_chromatograms
2+
from chemstation import read_chemstation_file
23

3-
__all__ = ["read_chromatograms", "plot_chromatograms"]
4+
__all__ = ["read_chromatograms", "plot_chromatograms", "read_chemstation_file"]

chemstation/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
from .read_ms_file import read_chemstation_file
3+
4+
__all__ = [
5+
'read_chemstation_file',
6+
]
249 Bytes
Binary file not shown.
1.89 KB
Binary file not shown.

chemstation/read_ms_file.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import parser_ms as pm
2+
import pandas as pd
3+
import plotly.graph_objs as go
4+
import plotly.express as px
5+
from typeguard import typechecked
6+
from typing import List
7+
8+
9+
@typechecked
10+
def convert_cycles_to_dfs(cycles: List[dict]) -> List[pd.DataFrame]:
11+
"""Convert Chemstation LC-MS cycles into a list of Pandas DataFrames."""
12+
cycle_dfs = []
13+
for i, cycle in enumerate(cycles):
14+
df = pd.DataFrame({
15+
"mz": cycle["mz"],
16+
"intensity": cycle["intensity"],
17+
# Repeat for each row
18+
"retention_time": [cycle["retention_time"]] * len(cycle["mz"])
19+
})
20+
df["cycle_id"] = i
21+
cycle_dfs.append(df)
22+
return cycle_dfs
23+
24+
25+
@typechecked
26+
def merge_cycles_into_df(cycles: List[dict]) -> pd.DataFrame:
27+
"""Convert all cycles into a single Pandas DataFrame with cycle_id."""
28+
cycle_dfs = convert_cycles_to_dfs(cycles)
29+
return pd.concat(cycle_dfs, ignore_index=True)
30+
31+
32+
@typechecked
33+
def read_chemstation_file(file_path: str) -> pd.DataFrame:
34+
cycles = pm.read_cycles(file_path)
35+
cycle_dfs = convert_cycles_to_dfs(cycles)
36+
print(cycle_dfs[:5])
37+
return merge_cycles_into_df(cycles)

setup.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
define_macros=[("VERSION_INFO", __version__)],
1111
extra_compile_args=["-std=c++17"],
1212
),
13+
Pybind11Extension(
14+
"parser_ms",
15+
["src/parser_ms.cpp"],
16+
define_macros=[("VERSION_INFO", __version__)],
17+
extra_compile_args=["-std=c++17"],
18+
),
1319
Pybind11Extension(
1420
"parser_xray",
1521
["src/parser_xray.cpp"],

src/parser_ms.cpp

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// NOTE: Reading data of MSD1.MS file from
2+
// Agilent ChemStation
3+
#include <cmath>
4+
#include <cstddef>
5+
#include <cstdint>
6+
#include <fstream>
7+
#include <iostream>
8+
#include <pybind11/pybind11.h>
9+
#include <pybind11/stl.h>
10+
#include <vector>
11+
12+
uint16_t endianSwapU16(uint16_t value) {
13+
return ((value & 0xFF) << 8) | ((value >> 8) & 0xFF);
14+
}
15+
16+
void endianSwapU32(uint32_t &x) {
17+
x = ((x << 24) & 0xFF000000) | ((x << 8) & 0x00FF0000) |
18+
((x >> 8) & 0x0000FF00) | ((x >> 24) & 0x000000FF);
19+
}
20+
21+
std::vector<char> ReadFile(const std::string &file_path) {
22+
std::ifstream file(file_path, std::ios::binary | std::ios::ate);
23+
if (!file.is_open()) {
24+
throw std::runtime_error("Error opening file");
25+
}
26+
std::size_t size = file.tellg();
27+
if (size == 0) {
28+
throw std::runtime_error("File is empty");
29+
}
30+
std::vector<char> buffer(size);
31+
file.seekg(0, std::ios::beg);
32+
file.read(buffer.data(), size);
33+
if (!file) {
34+
throw std::runtime_error("Error reading file");
35+
}
36+
file.close();
37+
return buffer;
38+
}
39+
40+
std::uint16_t CastToUint16(std::vector<char> &buffer, std::size_t offset) {
41+
std::uint16_t res = *reinterpret_cast<const uint16_t *>(&buffer[offset]);
42+
return endianSwapU16(res);
43+
}
44+
45+
std::uint32_t CastToUint32(std::vector<char> &buffer, std::size_t offset) {
46+
std::uint32_t res = *reinterpret_cast<const uint32_t *>(&buffer[offset]);
47+
endianSwapU32(res);
48+
return res;
49+
}
50+
51+
std::size_t NumberOfCycles(std::vector<char> &buffer) {
52+
int data_start = 0x116;
53+
return CastToUint32(buffer, data_start);
54+
}
55+
56+
std::size_t FindDataStart(std::vector<char> &buffer) {
57+
int data_start = 0x10A;
58+
int offset_correction = CastToUint16(buffer, data_start);
59+
int where = offset_correction * 2 - 2;
60+
return where;
61+
}
62+
63+
std::vector<double> ConvertMZIntensity(std::vector<uint16_t> &data) {
64+
std::vector<double> mz_intensity;
65+
mz_intensity.resize(data.size());
66+
for (std::size_t i = 0; i < data.size(); i++) {
67+
if (i % 2 != 0) { // Intensity
68+
uint16_t head_bits = data[i] >> 14; // Shift right by 14 bits
69+
uint16_t tail_bits =
70+
data[i] & 0x3FFF; // Extract tail: 0x3FFF = 0011111111111111 (14 bits)
71+
mz_intensity[i] = std::pow(8, head_bits) * tail_bits;
72+
} else { // MZ
73+
mz_intensity[i] = static_cast<double>(data[i]) / 20;
74+
}
75+
}
76+
return mz_intensity;
77+
}
78+
79+
struct Cycle {
80+
std::vector<double> mz;
81+
std::vector<double> intensity;
82+
double retention_time;
83+
84+
// Convert Cycle to a Python dictionary
85+
std::map<std::string, pybind11::object> to_dict() const {
86+
return {{"mz", pybind11::cast(mz)},
87+
{"intensity", pybind11::cast(intensity)},
88+
{"retention_time", pybind11::cast(retention_time)}};
89+
}
90+
};
91+
92+
void ReadCycleData(Cycle &cycle, std::vector<char> &buffer,
93+
std::size_t data_start, std::size_t cycle_size) {
94+
std::vector<std::uint16_t> data;
95+
data.resize(cycle_size);
96+
for (std::size_t i = 0; i < cycle_size; i++) {
97+
data[i] = CastToUint16(buffer, data_start);
98+
data_start += 2;
99+
}
100+
std::vector<double> mz_intensity = ConvertMZIntensity(data);
101+
cycle.mz.resize(mz_intensity.size() / 2);
102+
cycle.intensity.resize(mz_intensity.size() / 2);
103+
for (std::size_t i = 0; i < mz_intensity.size(); i++) {
104+
if (i % 2 == 0) {
105+
cycle.mz[i / 2] = mz_intensity[i];
106+
} else {
107+
cycle.intensity[i / 2] = mz_intensity[i];
108+
}
109+
}
110+
}
111+
112+
std::vector<Cycle> readCycles(const std::string &file_path) {
113+
std::vector<char> buffer = ReadFile(file_path);
114+
std::size_t data_start = FindDataStart(buffer);
115+
std::size_t number_of_cycles = NumberOfCycles(buffer);
116+
std::vector<Cycle> cycles;
117+
cycles.resize(number_of_cycles);
118+
std::size_t counter = data_start;
119+
for (std::size_t i = 0; i < number_of_cycles; i++) {
120+
if (counter >= buffer.size()) {
121+
throw std::runtime_error("Error extracting data");
122+
}
123+
counter += 2;
124+
std::size_t time = CastToUint32(buffer, counter);
125+
counter += 10;
126+
std::size_t temp = counter;
127+
std::size_t cycle_size = CastToUint16(buffer, counter);
128+
counter += 6;
129+
ReadCycleData(cycles[i], buffer, counter, cycle_size * 2);
130+
cycles[i].retention_time = static_cast<double>(time) / 60000;
131+
counter += cycle_size * 4;
132+
counter += 10;
133+
}
134+
return cycles;
135+
}
136+
137+
namespace py = pybind11;
138+
139+
std::vector<std::map<std::string, py::object>>
140+
py_readCycles(const std::string &file_path) {
141+
std::vector<Cycle> cycles = readCycles(file_path);
142+
std::vector<std::map<std::string, py::object>> result;
143+
for (const auto &cycle : cycles) {
144+
result.push_back(cycle.to_dict());
145+
}
146+
return result;
147+
}
148+
149+
PYBIND11_MODULE(parser_ms, m) {
150+
m.doc() = "Chemstation MS data extraction module";
151+
m.def("read_cycles", &py_readCycles,
152+
"Extract cycles from an Chemstation MS file");
153+
}

0 commit comments

Comments
 (0)