9
9
'get_trails' , 'plot_pattern' , 'get_minpos' , 'get_local_minima' , 'is_local_minima' , 'truncate' ,
10
10
'check_averagine' , 'pattern_to_mz' , 'cosine_averagine' , 'int_list_to_array' , 'mz_to_mass' , 'M_PROTON' ,
11
11
'isolate_isotope_pattern' , 'get_isotope_patterns' , 'report_' , 'feature_finder_report' , 'extract_bruker' ,
12
- 'convert_bruker' , 'map_bruker' , 'find_features' , 'replace_infs' , 'map_ms2' ]
12
+ 'convert_bruker' , 'map_bruker' , 'get_stats' , ' find_features' , 'replace_infs' , 'map_ms2' ]
13
13
14
14
# Cell
15
15
import numpy as np
@@ -628,8 +628,8 @@ def hill_stats(idx:np.ndarray, hill_range:np.ndarray, hill_ptrs:np.ndarray, hill
628
628
int_ = int_data [idx_ ]
629
629
mz_ = mass_data [idx_ ]
630
630
631
- int_sum = np .sum (int_ )
632
- int_area = np .abs (np .trapz (rt_ [rt_idx [idx_ ]], int_ )) #Area
631
+ ms1_int_sum = np .sum (int_ )
632
+ ms1_int_area = np .abs (np .trapz (rt_ [rt_idx [idx_ ]], int_ )) #Area
633
633
634
634
rt_min = rt_ [rt_idx [idx_ ]].min ()
635
635
rt_max = rt_ [rt_idx [idx_ ]].max ()
@@ -657,8 +657,8 @@ def hill_stats(idx:np.ndarray, hill_range:np.ndarray, hill_ptrs:np.ndarray, hill
657
657
658
658
stats [idx ,0 ] = average_mz
659
659
stats [idx ,1 ] = delta_m
660
- stats [idx ,2 ] = int_sum
661
- stats [idx ,3 ] = int_area
660
+ stats [idx ,2 ] = ms1_int_sum
661
+ stats [idx ,3 ] = ms1_int_area
662
662
stats [idx ,4 ] = rt_min
663
663
stats [idx ,5 ] = rt_max
664
664
@@ -1574,7 +1574,7 @@ def report_(idx:np.ndarray, isotope_charges:list, isotope_patterns:list, iso_idx
1574
1574
left_apex = np .abs (trace [:rt_apex_idx ]- half_max ).argmin ()
1575
1575
right_apex = np .abs (trace [rt_apex_idx :]- half_max ).argmin ()+ rt_apex_idx
1576
1576
1577
- int_apex = trace_sum [rt_apex_idx ]
1577
+ ms1_int_apex = trace_sum [rt_apex_idx ]
1578
1578
fwhm = rt_range [right_apex ] - rt_range [left_apex ]
1579
1579
1580
1580
n_isotopes = len (pattern )
@@ -1602,10 +1602,10 @@ def report_(idx:np.ndarray, isotope_charges:list, isotope_patterns:list, iso_idx
1602
1602
rt_start = rt_range [rt_min_idx ]
1603
1603
rt_end = rt_range [rt_max_idx ]
1604
1604
1605
- int_area = np .abs (np .trapz (trace_sum [rt_min_idx :rt_max_idx ], rt_range [rt_min_idx :rt_max_idx ]))
1606
- int_sum = trace_sum .sum ()
1605
+ ms1_int_area = np .abs (np .trapz (trace_sum [rt_min_idx :rt_max_idx ], rt_range [rt_min_idx :rt_max_idx ]))
1606
+ ms1_int_sum = trace_sum .sum ()
1607
1607
1608
- results [idx ,:] = np .array ([mz , mz_std , mz_most_abundant , charge , rt_start , rt_apex , rt_end , fwhm , n_isotopes , mass , int_apex , int_area , int_sum ])
1608
+ results [idx ,:] = np .array ([mz , mz_std , mz_most_abundant , charge , rt_start , rt_apex , rt_end , fwhm , n_isotopes , mass , ms1_int_apex , ms1_int_area , ms1_int_sum ])
1609
1609
1610
1610
# Cell
1611
1611
import pandas as pd
@@ -1639,7 +1639,7 @@ def feature_finder_report(query_data:dict, isotope_patterns:list, isotope_charge
1639
1639
1640
1640
report_ (range (len (isotope_charges )), isotope_charges , isotope_patterns , iso_idx , stats , sortindex_ , hill_ptrs , hill_data , int_data , rt_ , rt_idx , results , lookup_idx )
1641
1641
1642
- df = pd .DataFrame (results , columns = ['mz' ,'mz_std' ,'mz_most_abundant' ,'charge' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ,'n_isotopes' ,'mass' ,'int_apex ' ,'int_area ' , 'int_sum ' ])
1642
+ df = pd .DataFrame (results , columns = ['mz' ,'mz_std' ,'mz_most_abundant' ,'charge' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ,'n_isotopes' ,'mass' ,'ms1_int_apex ' ,'ms1_int_area ' , 'ms1_int_sum ' ])
1643
1643
1644
1644
df .sort_values (['rt_start' ,'mz' ])
1645
1645
@@ -1729,17 +1729,19 @@ def convert_bruker(feature_path:str)->pd.DataFrame:
1729
1729
"""
1730
1730
engine_featurefile = db .create_engine ('sqlite:///{}' .format (feature_path ))
1731
1731
feature_table = pd .read_sql_table ('LcTimsMsFeature' , engine_featurefile )
1732
-
1732
+ feature_cluster_mapping = pd . read_sql_table ( 'FeatureClusterMapping' , engine_featurefile )
1733
1733
from .constants import mass_dict
1734
1734
1735
1735
M_PROTON = mass_dict ['Proton' ]
1736
1736
feature_table ['Mass' ] = feature_table ['MZ' ].values * feature_table ['Charge' ].values - feature_table ['Charge' ].values * M_PROTON
1737
- feature_table = feature_table .rename (columns = {"MZ" : "mz" ,"Mass" : "mass" , "RT" : "rt_apex" , "RT_lower" :"rt_start" , "RT_upper" :"rt_end" , "Mobility" : "mobility" , "Mobility_lower" : "mobility_lower" , "Mobility_upper" : "mobility_upper" , "Charge" :"charge" ,"Intensity" :'int_sum ' ,"ClusterCount" :'n_isotopes' })
1737
+ feature_table = feature_table .rename (columns = {"MZ" : "mz" ,"Mass" : "mass" , "RT" : "rt_apex" , "RT_lower" :"rt_start" , "RT_upper" :"rt_end" , "Mobility" : "mobility" , "Mobility_lower" : "mobility_lower" , "Mobility_upper" : "mobility_upper" , "Charge" :"charge" ,"Intensity" :'ms1_int_sum ' ,"ClusterCount" :'n_isotopes' })
1738
1738
feature_table ['rt_apex' ] = feature_table ['rt_apex' ]/ 60
1739
1739
feature_table ['rt_start' ] = feature_table ['rt_start' ]/ 60
1740
1740
feature_table ['rt_end' ] = feature_table ['rt_end' ]/ 60
1741
1741
1742
- return feature_table
1742
+ feature_cluster_mapping = feature_cluster_mapping .rename (columns = {"FeatureId" : "feature_id" , "ClusterId" : "cluster_id" , "Monoisotopic" : "monoisotopic" , "Intensity" : "ms1_int_sum" })
1743
+
1744
+ return feature_table , feature_cluster_mapping
1743
1745
1744
1746
1745
1747
def map_bruker (feature_path :str , feature_table :pd .DataFrame , query_data :dict )-> pd .DataFrame :
@@ -1800,6 +1802,29 @@ def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->p
1800
1802
1801
1803
return features
1802
1804
1805
+ # Cell
1806
+ def get_stats (isotope_patterns , iso_idx , stats ):
1807
+ columns = ['mz_average' ,'delta_m' ,'int_sum' ,'int_area' ,'rt_min' ,'rt_max' ]
1808
+
1809
+ stats_idx = np .zeros (iso_idx [- 1 ], dtype = np .int64 )
1810
+ stats_map = np .zeros (iso_idx [- 1 ], dtype = np .int64 )
1811
+
1812
+ start_ = 0
1813
+ end_ = 0
1814
+
1815
+ for idx in range (len (iso_idx )- 1 ):
1816
+ k = isotope_patterns [iso_idx [idx ]:iso_idx [idx + 1 ]]
1817
+ end_ += len (k )
1818
+ stats_idx [start_ :end_ ] = k
1819
+ stats_map [start_ :end_ ] = idx
1820
+ start_ = end_
1821
+
1822
+ k = pd .DataFrame (stats [stats_idx ], columns = columns )
1823
+
1824
+ k ['feature_id' ] = stats_map
1825
+
1826
+ return k
1827
+
1803
1828
# Cell
1804
1829
import numpy as np
1805
1830
@@ -1861,6 +1886,8 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
1861
1886
ms_file = alphapept .io .MS_Data_File (out_file , is_read_only = False )
1862
1887
query_data = ms_file .read_DDA_query_data ()
1863
1888
1889
+ feature_cluster_mapping = pd .DataFrame ()
1890
+
1864
1891
if not settings ['workflow' ]["find_features" ]:
1865
1892
features = query_data_to_features (query_data )
1866
1893
else :
@@ -1930,12 +1957,16 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
1930
1957
lookup_idx_df = pd .DataFrame (lookup_idx , columns = ['isotope_pattern' , 'isotope_pattern_hill' ])
1931
1958
ms_file .write (lookup_idx_df , dataset_name = "feature_table_idx" )
1932
1959
1960
+ feature_cluster_mapping = get_stats (isotope_patterns , iso_idx , stats )
1961
+
1962
+
1933
1963
logging .info ('Report complete.' )
1934
1964
1935
1965
elif datatype == 'bruker' :
1936
1966
logging .info ('Feature finding on {}' .format (file_name ))
1937
1967
feature_path = extract_bruker (file_name )
1938
- feature_table = convert_bruker (feature_path )
1968
+ feature_table , feature_cluster_mapping = convert_bruker (feature_path )
1969
+
1939
1970
logging .info ('Bruker featurer finder complete. Extracted {:,} features.' .format (len (feature_table )))
1940
1971
1941
1972
# Calculate additional params
@@ -1952,8 +1983,11 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
1952
1983
else :
1953
1984
features = map_ms2 (feature_table , query_data , ** settings ['features' ])
1954
1985
1986
+ ms_file .write (feature_cluster_mapping , dataset_name = "feature_cluster_mapping" )
1987
+
1955
1988
logging .info ('Saving feature table.' )
1956
1989
ms_file .write (feature_table , dataset_name = "feature_table" )
1990
+
1957
1991
logging .info ('Feature table saved to {}' .format (out_file ))
1958
1992
1959
1993
@@ -2028,7 +2062,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
2028
2062
for i , key in enumerate (range_dict ):
2029
2063
tree_points [:,i ] = tree_points [:,i ]/ range_dict [key ][1 ]
2030
2064
2031
- matching_tree = KDTree (tree_points , metric = "minkowski " )
2065
+ matching_tree = KDTree (tree_points , metric = "euclidean " )
2032
2066
ref_points = np .array ([query_data [range_dict [_ ][0 ]] / range_dict [_ ][1 ] for _ in range_dict ]).T
2033
2067
ref_points = replace_infs (ref_points )
2034
2068
@@ -2047,7 +2081,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
2047
2081
ref_df ['query_idx' ] = ref_df .index
2048
2082
ref_df ['feature_idx' ] = idx [:,neighbor ]
2049
2083
2050
- for field in ['int_sum ' ,'int_apex ' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ,'mobility_lower' ,'mobility_upper' ]:
2084
+ for field in ['ms1_int_sum ' ,'ms1_int_apex ' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ,'mobility_lower' ,'mobility_upper' ]:
2051
2085
if field in feature_table .keys ():
2052
2086
ref_df [field ] = feature_table .iloc [idx [:,neighbor ]][field ].values
2053
2087
@@ -2062,7 +2096,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
2062
2096
_check &= mob_check
2063
2097
2064
2098
ref_matched |= _check
2065
- ref_df ['dist ' ] = dist [:,neighbor ]
2099
+ ref_df ['feature_dist ' ] = dist [:,neighbor ]
2066
2100
ref_df = ref_df [_check ]
2067
2101
2068
2102
all_df .append (ref_df )
@@ -2088,10 +2122,10 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
2088
2122
ref_df ['mobility_matched' ] = unmatched_ref ['mobility' ]
2089
2123
ref_df ['mobility_offset' ] = np .nan
2090
2124
2091
- for field in ['int_sum ' ,'int_apex ' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ]:
2125
+ for field in ['ms1_int_sum ' ,'ms1_int_apex ' ,'rt_start' ,'rt_apex' ,'rt_end' ,'fwhm' ]:
2092
2126
if field in feature_table .keys ():
2093
2127
unmatched_ref [field ] = np .nan
2094
- unmatched_ref ['dist ' ] = np .nan
2128
+ unmatched_ref ['feature_dist ' ] = np .nan
2095
2129
2096
2130
all_df .append (unmatched_ref )
2097
2131
0 commit comments