Skip to content

Commit a4a1155

Browse files
Merge pull request #405 from MannLabs/develop
Develop 0.4.0
2 parents 864cae5 + a17c5a3 commit a4a1155

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

56 files changed

+8701
-10017
lines changed

.bumpversion.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.33
2+
current_version = 0.4.0
33
commit = True
44
tag = False
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

alphapept/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.3.33"
1+
__version__ = "0.4.0"
22

33
__requirements__ = {
44
"": "requirements/requirements.txt",

alphapept/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
AUTHOR_EMAIL = "[email protected]"
3434
COPYRIGHT = "Mann Labs"
3535
BRANCH = "master"
36-
VERSION_NO = "0.3.33"
36+
VERSION_NO = "0.4.0"
3737
MIN_PYTHON = "3.6"
3838
MAX_PYTHON = "4"
3939
AUDIENCE = "Developers"

alphapept/_nbdev.py

+3
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
"extract_bruker": "04_feature_finding.ipynb",
131131
"convert_bruker": "04_feature_finding.ipynb",
132132
"map_bruker": "04_feature_finding.ipynb",
133+
"get_stats": "04_feature_finding.ipynb",
133134
"find_features": "04_feature_finding.ipynb",
134135
"replace_infs": "04_feature_finding.ipynb",
135136
"map_ms2": "04_feature_finding.ipynb",
@@ -183,6 +184,7 @@
183184
"transform": "07_recalibration.ipynb",
184185
"kneighbors_calibration": "07_recalibration.ipynb",
185186
"get_calibration": "07_recalibration.ipynb",
187+
"chunks": "07_recalibration.ipynb",
186188
"density_scatter": "07_recalibration.ipynb",
187189
"save_fragment_calibration": "07_recalibration.ipynb",
188190
"calibrate_fragments_nn": "07_recalibration.ipynb",
@@ -245,6 +247,7 @@
245247
"get_summary": "11_interface.ipynb",
246248
"parallel_execute": "11_interface.ipynb",
247249
"bcolors": "11_interface.ipynb",
250+
"is_port_in_use": "11_interface.ipynb",
248251
"run_cli": "11_interface.ipynb",
249252
"cli_overview": "11_interface.ipynb",
250253
"cli_database": "11_interface.ipynb",

alphapept/default_settings.yaml

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ workflow:
55
find_features: true
66
search_data: true
77
recalibrate_data: true
8-
align: true
8+
align: false
99
match: false
1010
lfq_quantification: true
1111
general:
@@ -87,11 +87,12 @@ calibration:
8787
matching:
8888
match_p_min: 0.05
8989
match_d_min: 3
90+
match_group_tol: 0
9091
isobaric_label:
9192
label: None
9293
reporter_frag_tolerance: 15
9394
reporter_frag_tolerance_ppm: true
9495
quantification:
9596
max_lfq: true
9697
lfq_ratio_min: 1
97-
mode: int_sum
98+
mode: ms1_int_sum

alphapept/export.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ def remove_mods(sequence):
2424
def ap_to_mq_sequence(sequence, mod_translation):
2525
"""
2626
Converts AlphaPept sequence format to MaxQuant Format
27-
returns naked_sequence, len_sequence, modifications_, mq_sequence
27+
returns sequence_naked, len_sequence, modifications_, mq_sequence
2828
2929
"""
3030
# Add leading and trailing modification
31-
naked_sequence = remove_mods(sequence)
31+
sequence_naked = remove_mods(sequence)
3232
parsed_sequence = parse(sequence)
3333

3434
mq_sequence = '_'
@@ -37,7 +37,7 @@ def ap_to_mq_sequence(sequence, mod_translation):
3737

3838
for idx, AA in enumerate(parsed_sequence):
3939

40-
mq_sequence += naked_sequence[idx]
40+
mq_sequence += sequence_naked[idx]
4141
if len(AA) != 1:
4242
if mod_translation[AA] is not None:
4343
if mod_translation[AA] in modifications:
@@ -68,9 +68,9 @@ def ap_to_mq_sequence(sequence, mod_translation):
6868

6969
mq_sequence += '_'
7070

71-
n_AA = len(naked_sequence)
71+
n_AA = len(sequence_naked)
7272

73-
return naked_sequence, n_AA, modifications_, mq_sequence
73+
return sequence_naked, n_AA, modifications_, mq_sequence
7474

7575

7676
# Cell
@@ -100,9 +100,9 @@ def prepare_ap_results(ref_ap):
100100

101101
ref_ap['id'] = ref_ap.index
102102

103-
naked_sequence, nAA, mq_modifications, mq_sequence = zip(*ref_ap['sequence'].apply(lambda x: ap_to_mq_sequence(x, mod_translation)))
103+
sequence_naked, nAA, mq_modifications, mq_sequence = zip(*ref_ap['sequence'].apply(lambda x: ap_to_mq_sequence(x, mod_translation)))
104104

105-
ref_ap['naked_sequence'] = naked_sequence
105+
ref_ap['sequence_naked'] = sequence_naked
106106
ref_ap['n_AA'] = nAA
107107
ref_ap['mq_modifications'] = mq_modifications
108108
ref_ap['mq_sequence'] = mq_sequence

alphapept/feature_finding.py

+53-19
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
'get_trails', 'plot_pattern', 'get_minpos', 'get_local_minima', 'is_local_minima', 'truncate',
1010
'check_averagine', 'pattern_to_mz', 'cosine_averagine', 'int_list_to_array', 'mz_to_mass', 'M_PROTON',
1111
'isolate_isotope_pattern', 'get_isotope_patterns', 'report_', 'feature_finder_report', 'extract_bruker',
12-
'convert_bruker', 'map_bruker', 'find_features', 'replace_infs', 'map_ms2']
12+
'convert_bruker', 'map_bruker', 'get_stats', 'find_features', 'replace_infs', 'map_ms2']
1313

1414
# Cell
1515
import numpy as np
@@ -628,8 +628,8 @@ def hill_stats(idx:np.ndarray, hill_range:np.ndarray, hill_ptrs:np.ndarray, hill
628628
int_ = int_data[idx_]
629629
mz_ = mass_data[idx_]
630630

631-
int_sum = np.sum(int_)
632-
int_area = np.abs(np.trapz(rt_[rt_idx[idx_]], int_)) #Area
631+
ms1_int_sum = np.sum(int_)
632+
ms1_int_area = np.abs(np.trapz(rt_[rt_idx[idx_]], int_)) #Area
633633

634634
rt_min = rt_[rt_idx[idx_]].min()
635635
rt_max = rt_[rt_idx[idx_]].max()
@@ -657,8 +657,8 @@ def hill_stats(idx:np.ndarray, hill_range:np.ndarray, hill_ptrs:np.ndarray, hill
657657

658658
stats[idx,0] = average_mz
659659
stats[idx,1] = delta_m
660-
stats[idx,2] = int_sum
661-
stats[idx,3] = int_area
660+
stats[idx,2] = ms1_int_sum
661+
stats[idx,3] = ms1_int_area
662662
stats[idx,4] = rt_min
663663
stats[idx,5] = rt_max
664664

@@ -1574,7 +1574,7 @@ def report_(idx:np.ndarray, isotope_charges:list, isotope_patterns:list, iso_idx
15741574
left_apex = np.abs(trace[:rt_apex_idx]-half_max).argmin()
15751575
right_apex = np.abs(trace[rt_apex_idx:]-half_max).argmin()+rt_apex_idx
15761576

1577-
int_apex = trace_sum[rt_apex_idx]
1577+
ms1_int_apex = trace_sum[rt_apex_idx]
15781578
fwhm = rt_range[right_apex] - rt_range[left_apex]
15791579

15801580
n_isotopes = len(pattern)
@@ -1602,10 +1602,10 @@ def report_(idx:np.ndarray, isotope_charges:list, isotope_patterns:list, iso_idx
16021602
rt_start = rt_range[rt_min_idx]
16031603
rt_end = rt_range[rt_max_idx]
16041604

1605-
int_area = np.abs(np.trapz(trace_sum[rt_min_idx:rt_max_idx], rt_range[rt_min_idx:rt_max_idx]))
1606-
int_sum = trace_sum.sum()
1605+
ms1_int_area = np.abs(np.trapz(trace_sum[rt_min_idx:rt_max_idx], rt_range[rt_min_idx:rt_max_idx]))
1606+
ms1_int_sum = trace_sum.sum()
16071607

1608-
results[idx,:] = np.array([mz, mz_std, mz_most_abundant, charge, rt_start, rt_apex, rt_end, fwhm, n_isotopes, mass, int_apex, int_area, int_sum])
1608+
results[idx,:] = np.array([mz, mz_std, mz_most_abundant, charge, rt_start, rt_apex, rt_end, fwhm, n_isotopes, mass, ms1_int_apex, ms1_int_area, ms1_int_sum])
16091609

16101610
# Cell
16111611
import pandas as pd
@@ -1639,7 +1639,7 @@ def feature_finder_report(query_data:dict, isotope_patterns:list, isotope_charge
16391639

16401640
report_(range(len(isotope_charges)), isotope_charges, isotope_patterns, iso_idx, stats, sortindex_, hill_ptrs, hill_data, int_data, rt_, rt_idx, results, lookup_idx)
16411641

1642-
df = pd.DataFrame(results, columns = ['mz','mz_std','mz_most_abundant','charge','rt_start','rt_apex','rt_end','fwhm','n_isotopes','mass','int_apex','int_area', 'int_sum'])
1642+
df = pd.DataFrame(results, columns = ['mz','mz_std','mz_most_abundant','charge','rt_start','rt_apex','rt_end','fwhm','n_isotopes','mass','ms1_int_apex','ms1_int_area', 'ms1_int_sum'])
16431643

16441644
df.sort_values(['rt_start','mz'])
16451645

@@ -1729,17 +1729,19 @@ def convert_bruker(feature_path:str)->pd.DataFrame:
17291729
"""
17301730
engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path))
17311731
feature_table = pd.read_sql_table('LcTimsMsFeature', engine_featurefile)
1732-
1732+
feature_cluster_mapping = pd.read_sql_table('FeatureClusterMapping', engine_featurefile)
17331733
from .constants import mass_dict
17341734

17351735
M_PROTON = mass_dict['Proton']
17361736
feature_table['Mass'] = feature_table['MZ'].values * feature_table['Charge'].values - feature_table['Charge'].values*M_PROTON
1737-
feature_table = feature_table.rename(columns={"MZ": "mz","Mass": "mass", "RT": "rt_apex", "RT_lower":"rt_start", "RT_upper":"rt_end", "Mobility": "mobility", "Mobility_lower": "mobility_lower", "Mobility_upper": "mobility_upper", "Charge":"charge","Intensity":'int_sum',"ClusterCount":'n_isotopes'})
1737+
feature_table = feature_table.rename(columns={"MZ": "mz","Mass": "mass", "RT": "rt_apex", "RT_lower":"rt_start", "RT_upper":"rt_end", "Mobility": "mobility", "Mobility_lower": "mobility_lower", "Mobility_upper": "mobility_upper", "Charge":"charge","Intensity":'ms1_int_sum',"ClusterCount":'n_isotopes'})
17381738
feature_table['rt_apex'] = feature_table['rt_apex']/60
17391739
feature_table['rt_start'] = feature_table['rt_start']/60
17401740
feature_table['rt_end'] = feature_table['rt_end']/60
17411741

1742-
return feature_table
1742+
feature_cluster_mapping = feature_cluster_mapping.rename(columns={"FeatureId": "feature_id", "ClusterId": "cluster_id", "Monoisotopic": "monoisotopic", "Intensity": "ms1_int_sum"})
1743+
1744+
return feature_table, feature_cluster_mapping
17431745

17441746

17451747
def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->pd.DataFrame:
@@ -1800,6 +1802,29 @@ def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->p
18001802

18011803
return features
18021804

1805+
# Cell
1806+
def get_stats(isotope_patterns, iso_idx, stats):
1807+
columns = ['mz_average','delta_m','int_sum','int_area','rt_min','rt_max']
1808+
1809+
stats_idx = np.zeros(iso_idx[-1], dtype=np.int64)
1810+
stats_map = np.zeros(iso_idx[-1], dtype=np.int64)
1811+
1812+
start_ = 0
1813+
end_ = 0
1814+
1815+
for idx in range(len(iso_idx)-1):
1816+
k = isotope_patterns[iso_idx[idx]:iso_idx[idx+1]]
1817+
end_ += len(k)
1818+
stats_idx[start_:end_] = k
1819+
stats_map[start_:end_] = idx
1820+
start_ = end_
1821+
1822+
k = pd.DataFrame(stats[stats_idx], columns=columns)
1823+
1824+
k['feature_id'] = stats_map
1825+
1826+
return k
1827+
18031828
# Cell
18041829
import numpy as np
18051830

@@ -1861,6 +1886,8 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
18611886
ms_file = alphapept.io.MS_Data_File(out_file, is_read_only=False)
18621887
query_data = ms_file.read_DDA_query_data()
18631888

1889+
feature_cluster_mapping = pd.DataFrame()
1890+
18641891
if not settings['workflow']["find_features"]:
18651892
features = query_data_to_features(query_data)
18661893
else:
@@ -1930,12 +1957,16 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
19301957
lookup_idx_df = pd.DataFrame(lookup_idx, columns = ['isotope_pattern', 'isotope_pattern_hill'])
19311958
ms_file.write(lookup_idx_df, dataset_name="feature_table_idx")
19321959

1960+
feature_cluster_mapping = get_stats(isotope_patterns, iso_idx, stats)
1961+
1962+
19331963
logging.info('Report complete.')
19341964

19351965
elif datatype == 'bruker':
19361966
logging.info('Feature finding on {}'.format(file_name))
19371967
feature_path = extract_bruker(file_name)
1938-
feature_table = convert_bruker(feature_path)
1968+
feature_table, feature_cluster_mapping = convert_bruker(feature_path)
1969+
19391970
logging.info('Bruker featurer finder complete. Extracted {:,} features.'.format(len(feature_table)))
19401971

19411972
# Calculate additional params
@@ -1952,8 +1983,11 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
19521983
else:
19531984
features = map_ms2(feature_table, query_data, **settings['features'])
19541985

1986+
ms_file.write(feature_cluster_mapping, dataset_name="feature_cluster_mapping")
1987+
19551988
logging.info('Saving feature table.')
19561989
ms_file.write(feature_table, dataset_name="feature_table")
1990+
19571991
logging.info('Feature table saved to {}'.format(out_file))
19581992

19591993

@@ -2028,7 +2062,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
20282062
for i, key in enumerate(range_dict):
20292063
tree_points[:,i] = tree_points[:,i]/range_dict[key][1]
20302064

2031-
matching_tree = KDTree(tree_points, metric="minkowski")
2065+
matching_tree = KDTree(tree_points, metric="euclidean")
20322066
ref_points = np.array([query_data[range_dict[_][0]] / range_dict[_][1] for _ in range_dict]).T
20332067
ref_points = replace_infs(ref_points)
20342068

@@ -2047,7 +2081,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
20472081
ref_df['query_idx'] = ref_df.index
20482082
ref_df['feature_idx'] = idx[:,neighbor]
20492083

2050-
for field in ['int_sum','int_apex','rt_start','rt_apex','rt_end','fwhm','mobility_lower','mobility_upper']:
2084+
for field in ['ms1_int_sum','ms1_int_apex','rt_start','rt_apex','rt_end','fwhm','mobility_lower','mobility_upper']:
20512085
if field in feature_table.keys():
20522086
ref_df[field] = feature_table.iloc[idx[:,neighbor]][field].values
20532087

@@ -2062,7 +2096,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
20622096
_check &= mob_check
20632097

20642098
ref_matched |= _check
2065-
ref_df['dist'] = dist[:,neighbor]
2099+
ref_df['feature_dist'] = dist[:,neighbor]
20662100
ref_df = ref_df[_check]
20672101

20682102
all_df.append(ref_df)
@@ -2088,10 +2122,10 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
20882122
ref_df['mobility_matched'] = unmatched_ref['mobility']
20892123
ref_df['mobility_offset'] = np.nan
20902124

2091-
for field in ['int_sum','int_apex','rt_start','rt_apex','rt_end','fwhm']:
2125+
for field in ['ms1_int_sum','ms1_int_apex','rt_start','rt_apex','rt_end','fwhm']:
20922126
if field in feature_table.keys():
20932127
unmatched_ref[field] = np.nan
2094-
unmatched_ref['dist'] = np.nan
2128+
unmatched_ref['feature_dist'] = np.nan
20952129

20962130
all_df.append(unmatched_ref)
20972131

alphapept/gui/experiment.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,12 @@ def submit_experiment(recorder: dict):
151151
st.info(
152152
f"Filename will be: {escape_markdown(long_name)}. Click submit button to add to queue."
153153
)
154+
155+
if (recorder['workflow']['match']) | (recorder['workflow']['match']):
156+
if len(recorder['experiment']['shortnames']) > 100:
157+
st.warning('Performance Warning: More than 100 files are selected and matching / align is selected.'
158+
'Matching / Align could take a long time. If you experience issues please contact [email protected]')
159+
154160
if st.button("Submit"):
155161
settings = load_settings_as_template(DEFAULT_SETTINGS_PATH)
156162
for group in recorder:
@@ -287,7 +293,7 @@ def experiment():
287293

288294
file_df = file_df_from_files(raw_files, file_folder)
289295
file_df["Fraction"] = [str(i+1) for i in range(len(file_df))]
290-
#file_df["Matching group"] = ""
296+
file_df["Matching group"] = [str(0)]*len(file_df)
291297

292298
gb = GridOptionsBuilder.from_dataframe(file_df)
293299
gb.configure_default_column(
@@ -315,15 +321,23 @@ def experiment():
315321
" \n- Creation date of file."
316322
" \n- Size (GB): Size in GB of the file."
317323
" \n- Shortname: Unique shortname for each file."
318-
" \n- Fraction: Fraction of each file."
319-
#" \n- Matching Group: Match-between-runs only among members of this group."
324+
" \n- Fraction: Fraction of each file. Files of the same fraction will be scored together. If dataset is not fractionated leave as is."
325+
" \n- Matching Group: Match-between-runs only among members of this group or neighboring groups. Leave as is if matching between all files."
320326
)
321327

322328
shortnames = file_df_selected["Shortname"].values.tolist()
323329
if len(shortnames) != len(set(shortnames)):
324330
st.warning("Warning: Shortnames are not unique.")
325331
error += 1
326332

333+
try:
334+
matching_groups = file_df_selected["Matching group"].values.astype('int').tolist()
335+
except:
336+
matching_groups = [str(0)]*len(file_df)
337+
338+
st.warning("Warning: Matching groups contain non-integer values. Please only use integers (0,1,2...).")
339+
error += 1
340+
327341
fasta_files_home_dir = files_in_folder(FASTA_PATH, ".fasta")
328342
fasta_files_home_dir = [
329343
os.path.join(FASTA_PATH, _) for _ in fasta_files_home_dir
@@ -351,9 +365,7 @@ def experiment():
351365
recorder["experiment"]["fractions"] = file_df_selected[
352366
"Fraction"
353367
].values.tolist()
354-
#recorder["experiment"]["matching_groups"] = file_df_selected[
355-
# "Matching group"
356-
#].values.tolist()
368+
recorder["experiment"]["matching_groups"] = matching_groups
357369

358370
f_dict = file_df_selected.groupby('Fraction')['Filename'].unique().to_dict()
359371
f_dict = {k: list(v) for k,v in f_dict.items()}

0 commit comments

Comments (0)