forked from tabdelaal/CyTOF-Linear-Classifier
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCyTOF_LDApredict.m
546 lines (513 loc) · 22.2 KB
/
CyTOF_LDApredict.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
function Predictions = CyTOF_LDApredict(TrainedModel,DataFolder,mode,RejectionThreshold)
% CyTOF_LDApredict function can be used to produce automatic cell type
% annotations for new samples, based on the trained LDA classifier using
% CyTOF_LDAtrain function.
%
% Input description
%
% TrainedModel: Model produced by CyTOF_LDAtrain. The fields read here are
%               TrainedModel.LDAclassifier (passed to predict),
%               TrainedModel.markers (column indices kept from every
%               sample) and TrainedModel.Transformation (selects the
%               transformation applied before prediction).
%
% DataFolder: path of the folder containing the test samples, which can be
%             either in FCS or CSV format. A trailing file separator is
%             optional.
%
% mode: either 'FCS' or 'CSV', defining the samples format.
%
% RejectionThreshold: Posterior probability lower threshold, below which
%                     the prediction will be 'unknown', this presents how
%                     confident the classifier is, before assigning a
%                     specific cell type to a cell.
%                     Value between 0 and 1, '0' means no rejection.
%                     Optional, defaults to 0.
%
% Output description
%
% Predictions: 1-by-N struct array (N = number of sample files, taken in
%              natural file-name order) with a single field, Labels,
%              holding the predicted label of every cell of that sample.
%              Empty (0-by-0) when the mode is invalid or no sample file
%              is found.
%
% Example:
% Predictions = CyTOF_LDApredict(Model,'HMIS-2\Samples\','CSV',0.7)
%
% For citation and further information please refer to this publication:
% "Predicting cell types in single cell mass cytometry data"

% The header documents 0 as "no rejection", so use it when the threshold is
% omitted instead of failing later on an undefined variable.
if nargin < 4 || isempty(RejectionThreshold)
    RejectionThreshold = 0;
end

% Assigned up front so that every exit path returns a defined value.
Predictions = struct('Labels',{});

if strcmp(mode,'FCS')
    IsFCS = true;
    FilePattern = '*.fcs';
elseif strcmp(mode,'CSV')
    IsFCS = false;
    FilePattern = '*.csv';
else
    msgbox('Invalid file format mode, choose FCS or CSV', ...
        'Error','error');
    return;
end

% list the sample files in natural order (sample2 before sample10)
H = dir(fullfile(DataFolder,FilePattern));
if isempty(H)
    warning('CyTOF_LDApredict:noSamples', ...
        'No %s files were found in "%s".',FilePattern,DataFolder);
    return;
end
SamplesFiles = sort_nat({H.name});

Predictions = struct('Labels',cell(1,length(SamplesFiles)));
for i = 1:length(SamplesFiles)
    % fullfile inserts the separator when DataFolder lacks a trailing one,
    % matching how the folder was listed above.
    SamplePath = fullfile(DataFolder,SamplesFiles{i});

    % read the test data
    if IsFCS
        Data = fca_readfcs(SamplePath);
    else
        Data = csvread(SamplePath);
    end
    Data = Data(:,TrainedModel.markers);
    Data = applyModelTransformation(Data,TrainedModel.Transformation);

    [Labels,probs] = predict(TrainedModel.LDAclassifier,Data);
    probs = max(probs,[],2); % get predictions posterior probabilities
    Rejected = probs < RejectionThreshold;
    if any(Rejected)
        Labels(Rejected) = cellstr('unknown');
    end
    Predictions(i).Labels = Labels;
end
end

function Data = applyModelTransformation(Data,Transformation)
% Apply the transformation selected by TrainedModel.Transformation to one
% sample (shared by the FCS and the CSV reading paths).
%
% NOTE(review): the statement order below reproduces the original code
% exactly. Whenever Transformation is "true" (logical true, or any
% non-empty char vector) asinh((x-1)/5) is applied first; for 'arcsinh' a
% second asinh(x/5) is then applied on top, and for 'log' the logarithm is
% taken of the already asinh-transformed values. This looks unintended,
% but it has to match what CyTOF_LDAtrain does to the training data, so
% confirm against that function before changing it here.
if (Transformation)
    Data = asinh((Data-1)/5);
    if strcmp(Transformation,'arcsinh')
        Data = asinh(Data/5);
    elseif strcmp(Transformation,'log')
        Data = log(Data);
        Data(isinf(Data)) = 0; % log(0) = -Inf is mapped to 0
    end
end
end
%%
function [fcsdat, fcshdr, fcsdatscaled, fcsdatcomp] = fca_readfcs(filename)
% [fcsdat, fcshdr, fcsdatscaled, fcsdatcomp] = fca_readfcs(filename);
%
% Read FCS 2.0 and FCS 3.0 type flow cytometry data file and put the list mode
% parameters to the fcsdat array with size of [TotalEvents NumOfPar]
% (one row per event, one column per parameter).
% Files whose header reads 'FCS3.1' are handled as FCS 3.0.
% Some important header data are stored in the fcshdr structure:
% TotalEvents, NumOfPar, starttime, stoptime and specific info for parameters
% as name, range, bitdepth, logscale(yes-no) and number of decades.
%
% [fcsdat, fcshdr] = fca_readfcs;
% Without filename input the user can select the desired file
% using the standard open file dialog box.
%
% [fcsdat, fcshdr, fcsdatscaled] = fca_readfcs(filename);
% Supplying the third output the fcsdatscaled array contains the scaled
% parameters. It might be useful for logscaled parameters, but no effect
% in the case of linear parameters. For the "ith" parameter:
%   gain ~= 1     : fcsdatscaled(:,i) = fcsdat(:,i)/fcshdr.par(i).gain;
%   log parameter : fcsdatscaled(:,i) = fcshdr.par(i).logzero * ...
%        10.^(fcsdat(:,i)/fcshdr.par(i).range*fcshdr.par(i).decade);
%
% [fcsdat, fcshdr, fcsdatscaled, fcsdatcomp] = fca_readfcs(filename);
% Supplying the fourth output, fcsdatcomp is fcsdatscaled with the columns
% named in the SPILLOVER keyword right-divided by the spillover matrix
% (empty when the file carries no SPILLOVER keyword).
%
% When the file cannot be read (missing, cannot be opened, unsupported
% type, or the file dialog is cancelled) fcsdat and fcshdr are returned
% empty. When the header reports zero events, fcsdat and fcsdatscaled are 0.
%
%
% Copyright (c) 2011, Laszlo Balkay
% All rights reserved.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are
% met:
%
% * Redistributions of source code must retain the above copyright
% notice, this list of conditions and the following disclaimer.
% * Redistributions in binary form must reproduce the above copyright
% notice, this list of conditions and the following disclaimer in
% the documentation and/or other materials provided with the distribution
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
% POSSIBILITY OF SUCH DAMAGE.
%
% Ver 2.5
% 2006-2009 / University of Debrecen, Institute of Nuclear Medicine
% Laszlo Balkay
%
% 14/08/2006 I made some changes in the code by the suggestion of
% Brian Harms <[email protected]> and Ivan Cao-Berg <[email protected]>
% (given at the user reviews area of Mathwork File exchage) The program should work
% in the case of Becton EPics DLM FCS2.0, CyAn Summit FCS3.0 and FACSDiva type
% list mode files.
%
% 29/01/2008 Updated to read the BD LSR II file format and including the comments of
% Allan Moser (Cira Discovery Sciences, Inc.)
%
% 24/01/2009 Updated to read the Partec CyFlow format file. Thanks for
% Gavin A Price
%

% Give the optional outputs a defined value so that every early return
% below leaves all requested outputs assigned.
fcsdatscaled = [];
fcsdatcomp = [];

% if noarg was supplied
if nargin == 0
    [FileName, FilePath] = uigetfile('*.*','Select fcs file');
    filename = [FilePath,FileName];
    if FileName == 0;
        fcsdat = []; fcshdr = [];
        return;
    end
else
    filecheck = dir(filename);
    if size(filecheck,1) == 0
        hm = msgbox([filename,': The file does not exist!'], ...
            'FcAnalysis info','warn');
        fcsdat = []; fcshdr = [];
        return;
    end
end
% if filename arg. only contain PATH, set the default dir to this
% before issuing the uigetfile command. This is an option for the "fca"
% tool
[FilePath, FileNameMain, fext] = fileparts(filename);
FilePath = [FilePath filesep];
FileName = [FileNameMain, fext];
if isempty(FileNameMain)
    currend_dir = cd;
    cd(FilePath);
    [FileName, FilePath] = uigetfile('*.*','Select FCS file');
    % restore the working directory before any return, so cancelling the
    % dialog does not leave the caller in a different folder
    cd(currend_dir);
    filename = [FilePath,FileName];
    if FileName == 0;
        fcsdat = []; fcshdr = [];
        return;
    end
end
%fid = fopen(filename,'r','ieee-be');
fid = fopen(filename,'r','b');
if fid == -1
    % the file exists but cannot be opened (e.g. missing read permission)
    hm = msgbox([filename,': The file can not be opened!'], ...
        'FcAnalysis info','warn');
    fcsdat = []; fcshdr = [];
    return;
end
fcsheader_1stline = fread(fid,64,'char');
fcsheader_type = char(fcsheader_1stline(1:6)');
%TMP: update to include FCS 3.1
if strcmp(fcsheader_type,'FCS3.1')
    fcsheader_type='FCS3.0';
end
%
%reading the header
%
if strcmp(fcsheader_type,'FCS1.0')
    hm = msgbox('FCS 1.0 file type is not supported!','FcAnalysis info','warn');
    fcsdat = []; fcshdr = [];
    fclose(fid);
    return;
elseif strcmp(fcsheader_type,'FCS2.0') || strcmp(fcsheader_type,'FCS3.0') % FCS2.0 or FCS3.0 types
    fcshdr.fcstype = fcsheader_type;
    % byte offsets of the TEXT and DATA segments, read from the fixed
    % positions of the first header line
    FcsHeaderStartPos = str2num(char(fcsheader_1stline(11:18)'));
    FcsHeaderStopPos = str2num(char(fcsheader_1stline(19:26)')); %RLF edited to full 8-byte length
    FcsDataStartPos = str2num(char(fcsheader_1stline(27:34)')); %RLF edited to full 8-byte length
    status = fseek(fid,FcsHeaderStartPos,'bof');
    fcsheader_main = fread(fid,FcsHeaderStopPos-FcsHeaderStartPos+1,'char');%read the main header
    warning off MATLAB:nonIntegerTruncatedInConversionToChar;
    fcshdr.filename = FileName;
    fcshdr.filepath = FilePath;
    % "The first character of the primary TEXT segment contains the
    % delimiter" (FCS standard)
    if fcsheader_main(1) == 12
        mnemonic_separator = 'FF';
    elseif fcsheader_main(1) == 9
        mnemonic_separator = 'TAB'; %RLF
    else
        mnemonic_separator = char(fcsheader_main(1));
    end
    if mnemonic_separator == '@';% WinMDI
        hm = msgbox([FileName,': The file can not be read (Unsupported FCS type: WinMDI histogram file)'],'FcAnalysis info','warn');
        fcsdat = []; fcshdr = [];
        fclose(fid);
        return;
    end
    fcshdr.TotalEvents = str2num(get_mnemonic_value('$TOT',fcsheader_main, mnemonic_separator));
    if fcshdr.TotalEvents == 0
        fcsdat = 0;
        fcsdatscaled = 0;
        fclose(fid); % do not leak the file handle on this early return
        return
    end
    fcshdr.NumOfPar = str2num(get_mnemonic_value('$PAR',fcsheader_main, mnemonic_separator));
    fcshdr.Creator = get_mnemonic_value('CREATOR',fcsheader_main, mnemonic_separator);
    %comp matrix reader added by RLF 12_15_10
    comp = get_mnemonic_value('SPILLOVER',fcsheader_main,mnemonic_separator);
    if ~isempty(comp)
        %%%
        % value layout: n, n labels, then the n*n matrix entries
        compcell=regexp(comp,',','split');
        nc=str2double(compcell{1});
        fcshdr.CompLabels=compcell(2:nc+1);
        fcshdr.CompMat=reshape(str2double(compcell(nc+2:end)'),[nc nc])';
    else
        fcshdr.CompLabels=[];
        fcshdr.CompMat=[];
    end
    plate = get_mnemonic_value('PLATE NAME',fcsheader_main,mnemonic_separator);
    if ~isempty(plate)
        fcshdr.plate=plate;
    end
    %%%%%%%%%%%%
    %%%%%%added by RLF to account for large files
    if FcsDataStartPos == 0
        FcsDataStartPos = str2num(get_mnemonic_value('$BEGINDATA',fcsheader_main, mnemonic_separator));
    end
    %%%%%%%%%%%%%%%%%%%%%
    for i=1:fcshdr.NumOfPar
        fcshdr.par(i).name = get_mnemonic_value(['$P',num2str(i),'N'],fcsheader_main, mnemonic_separator);
        fcshdr.par(i).name2 = get_mnemonic_value(['$P',num2str(i),'S'],fcsheader_main, mnemonic_separator);
        fcshdr.par(i).range = str2num(get_mnemonic_value(['$P',num2str(i),'R'],fcsheader_main, mnemonic_separator));
        fcshdr.par(i).bit = str2num(get_mnemonic_value(['$P',num2str(i),'B'],fcsheader_main, mnemonic_separator));
        %============== Changed way that amplification type is treated --- ARM ==================
        par_exponent_str= (get_mnemonic_value(['$P',num2str(i),'E'],fcsheader_main, mnemonic_separator));
        if isempty(par_exponent_str)
            % There is no "$PiE" mnemonic in the Lysys format
            % in that case the PiDISPLAY mnem. shows the LOG or LIN definition
            islogpar = get_mnemonic_value(['P',num2str(i),'DISPLAY'],fcsheader_main, mnemonic_separator);
            if length(islogpar)==3 && isequal(islogpar, 'LOG') % islogpar == 'LOG'
                par_exponent_str = '5,1';
            else % islogpar = LIN case
                par_exponent_str = '0,0';
            end
        end
        par_exponent= str2num(par_exponent_str);
        fcshdr.par(i).decade = par_exponent(1);
        if fcshdr.par(i).decade == 0
            fcshdr.par(i).log = 0;
            fcshdr.par(i).logzero = 0;
        else
            fcshdr.par(i).log = 1;
            if (par_exponent(2) == 0)
                fcshdr.par(i).logzero = 1;
            else
                fcshdr.par(i).logzero = par_exponent(2);
            end
        end
        gain_str = get_mnemonic_value(['$P',num2str(i),'G'],fcsheader_main, mnemonic_separator);
        if ~isempty(gain_str)
            fcshdr.par(i).gain=str2double(gain_str);
        else
            fcshdr.par(i).gain=1;
        end
        %============================================================================================
    end
    fcshdr.starttime = get_mnemonic_value('$BTIM',fcsheader_main, mnemonic_separator);
    fcshdr.stoptime = get_mnemonic_value('$ETIM',fcsheader_main, mnemonic_separator);
    fcshdr.cytometry = get_mnemonic_value('$CYT',fcsheader_main, mnemonic_separator);
    fcshdr.date = get_mnemonic_value('$DATE',fcsheader_main, mnemonic_separator);
    fcshdr.byteorder = get_mnemonic_value('$BYTEORD',fcsheader_main, mnemonic_separator);
    fcshdr.datatype = get_mnemonic_value('$DATATYPE',fcsheader_main, mnemonic_separator);
    fcshdr.system = get_mnemonic_value('$SYS',fcsheader_main, mnemonic_separator);
    fcshdr.project = get_mnemonic_value('$PROJ',fcsheader_main, mnemonic_separator);
    fcshdr.experiment = get_mnemonic_value('$EXP',fcsheader_main, mnemonic_separator);
    fcshdr.cells = get_mnemonic_value('$Cells',fcsheader_main, mnemonic_separator);
    fcshdr.creator = get_mnemonic_value('CREATOR',fcsheader_main, mnemonic_separator);
    fcshdr.cytsn = get_mnemonic_value('$CYTSN',fcsheader_main, mnemonic_separator);
else
    hm = msgbox([FileName,': The file can not be read (Unsupported FCS type)'],'FcAnalysis info','warn');
    fcsdat = []; fcshdr = [];
    fclose(fid);
    return;
end
%
%reading the events
%
% Every fread below fills a [NumOfPar TotalEvents] matrix and transposes
% it, so fcsdat has one row per event.
status = fseek(fid,FcsDataStartPos,'bof');
if strcmp(fcsheader_type,'FCS2.0')
    if strcmp(mnemonic_separator,'\') || strcmp(mnemonic_separator,'FF')... %ordinary or FacsDIVA FCS2.0
            || strcmp(mnemonic_separator,'/') || strcmp(mnemonic_separator,'TAB')% added by GAP 1/22/09 %added by RLF 09/02/10
        if fcshdr.par(1).bit == 16
            fcsdat = uint16(fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'uint16')');
            if strcmp(fcshdr.byteorder,'1,2')...% this is the Cytomics data
                    || strcmp(fcshdr.byteorder, '1,2,3,4') %added by GAP 1/22/09
                % swap the two bytes of every 16-bit value
                fcsdat = bitor(bitshift(fcsdat,-8),bitshift(fcsdat,8));
            end
        elseif fcshdr.par(1).bit == 32
            if fcshdr.datatype ~= 'F'
                fcsdat = (fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'uint32')');
            else % 'LYSYS' case
                fcsdat = (fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'float32')');
            end
        else
            bittype = ['ubit',num2str(fcshdr.par(1).bit)];
            fcsdat = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],bittype, 'ieee-le')';
        end
    elseif strcmp(mnemonic_separator,'!');% Becton EPics DLM FCS2.0
        fcsdat_ = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'uint16', 'ieee-le')';
        fcsdat = zeros(fcshdr.TotalEvents,fcshdr.NumOfPar);
        for i=1:fcshdr.NumOfPar
            bintmp = dec2bin(fcsdat_(:,i));
            fcsdat(:,i) = bin2dec(bintmp(:,7:16)); % only the first 10bit is valid for the parameter
        end
    end
    fclose(fid);
elseif strcmp(fcsheader_type,'FCS3.0')
    % if strcmp(mnemonic_separator,'|') % CyAn Summit FCS3.0
    % fcsdat_ = (fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'uint16','ieee-le')');
    % fcsdat = zeros(size(fcsdat_));
    % new_xrange = 1024;
    % for i=1:fcshdr.NumOfPar
    % fcsdat(:,i) = fcsdat_(:,i)*new_xrange/fcshdr.par(i).range;
    % fcshdr.par(i).range = new_xrange;
    % end
    % else % ordinary FCS 3.0
    %%%%%edited by RLF 06_30_10
    if strcmp(fcshdr.datatype,'D')
        if strcmp(fcshdr.byteorder, '1,2,3,4')
            fcsdat = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'double','l')';
        elseif strcmp(fcshdr.byteorder,'4,3,2,1')
            fcsdat = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'double','b')';
        end
    elseif strcmp(fcshdr.datatype,'F')
        if strcmp(fcshdr.byteorder, '1,2,3,4')
            fcsdat = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'float32','l')';
        elseif strcmp(fcshdr.byteorder,'4,3,2,1')
            fcsdat = fread(fid,[fcshdr.NumOfPar fcshdr.TotalEvents],'float32','b')';
        end
    end
    %%%%%%%%%%%%%%%%%%%%%%%%%
    % end
    fclose(fid);
end
%
%calculate the scaled events (for log scales) %RLF added gain division
if nargout>2
    fcsdatscaled = zeros(size(fcsdat));
    for i = 1 : fcshdr.NumOfPar
        Xlogdecade = fcshdr.par(i).decade;
        XChannelMax = fcshdr.par(i).range;
        Xlogvalatzero = fcshdr.par(i).logzero;
        if fcshdr.par(i).gain~=1
            fcsdatscaled(:,i) = double(fcsdat(:,i))./fcshdr.par(i).gain;
        elseif fcshdr.par(i).log
            fcsdatscaled(:,i) = Xlogvalatzero*10.^(double(fcsdat(:,i))/XChannelMax*Xlogdecade);
        else
            fcsdatscaled(:,i) = fcsdat(:,i);
        end
    end
end
if nargout>3 && ~isempty(fcshdr.CompLabels) %RLF. applied to fcsdatscaled rather than fcsdat.
    % locate the data column of every label listed in the spillover keyword
    compcols=zeros(1,nc);
    colLabels={fcshdr.par.name};
    for i=1:nc
        compcols(i)=find(strcmp(fcshdr.CompLabels{i},colLabels));
    end
    fcsdatcomp=fcsdatscaled;
    fcsdatcomp(:,compcols)=fcsdatcomp(:,compcols)/fcshdr.CompMat;
else
    fcsdatcomp=[];
end
end
%%%%
function mneval = get_mnemonic_value(mnemonic_name,fcsheader,mnemonic_separator)
% get_mnemonic_value  Extract the value of one keyword from an FCS TEXT segment.
%
%   mneval = get_mnemonic_value(mnemonic_name,fcsheader,mnemonic_separator)
%
%   mnemonic_name      : keyword to look up, e.g. '$TOT'.
%   fcsheader          : the TEXT segment as a vector of character codes
%                        (as returned by fread(...,'char')).
%   mnemonic_separator : delimiter used in the TEXT segment. 'FF' stands
%                        for form feed (code 12) and 'TAB' for tab
%                        (code 9); any other value is taken literally as
%                        a single delimiter character.
%
%   mneval is the text found between the delimiter that follows the first
%   occurrence of mnemonic_name and the next delimiter. It is [] when the
%   keyword is not present. When no closing delimiter follows, the value
%   extends to the end of the header.
%
%   The keyword is matched as a plain substring (not anchored to a
%   delimiter), so that e.g. 'SPILLOVER' also finds '$SPILLOVER'.

mneval = [];
if isempty(mnemonic_separator)
    return;
end

% Translate the separator name into the character code it stands for.
if strcmp(mnemonic_separator,'FF')
    separator_code = 12;
elseif strcmp(mnemonic_separator,'TAB')
    separator_code = 9;
else
    separator_code = double(mnemonic_separator(1));
end

fcsheader = fcsheader(:);
mnemonic_startpos = strfind(char(fcsheader'),mnemonic_name);
if isempty(mnemonic_startpos)
    return;
end

% Only the first occurrence is used. The character right after the keyword
% is the delimiter, so the value starts one position further on.
value_startpos = mnemonic_startpos(1) + length(mnemonic_name) + 1;
next_separator = find(fcsheader(value_startpos:end) == separator_code, 1);
if isempty(next_separator)
    value_stoppos = length(fcsheader);
else
    value_stoppos = value_startpos + next_separator - 2;
end
mneval = char(fcsheader(value_startpos:value_stoppos)');
end
%%
function [cs,index] = sort_nat(c,mode)
%sort_nat: Natural order sort of cell array of strings.
% usage: [S,INDEX] = sort_nat(C)
%        [S,INDEX] = sort_nat(C,MODE)
%
% where,
% C is a cell array (vector) of strings to be sorted.
% MODE is 'ascend' (the default) or 'descend', case-insensitive.
% S is C, sorted in natural order.
% INDEX is the sort order such that S = C(INDEX);
%
% In a natural order sort, every run of digits inside a string is compared
% by its numerical value rather than character by character. This is what
% one usually wants for file names that carry an index number without
% leading zeros. For example, with
% C = {'file1.txt','file2.txt','file10.txt'} a plain sort returns
%
% {'file1.txt' 'file10.txt' 'file2.txt'}
%
% while sort_nat returns
%
% {'file1.txt' 'file2.txt' 'file10.txt'}
%
% See also: sort
% Version: 1.4, 22 January 2011
% Author: Douglas M. Schwarz
% Email: dmschwarz=ieee*org, dmschwarz=urgrad*rochester*edu
% Real_email = regexprep(Email,{'=','*'},{'@','.'})

% Default sorting direction.
if nargin < 2
    mode = 'ascend';
end

% Validate the sorting direction.
modeFlags = strcmpi(mode,{'ascend','descend'});
descending = modeFlags(2);
if ~any(modeFlags)
    error('sort_nat:sortDirection',...
        'sorting direction must be ''ascend'' or ''descend''.')
end

% Collapse each run of digits to a single '0' placeholder and build the
% padded char matrix; placeholderMask marks where the placeholders sit.
collapsed = regexprep(c,'\d+','0');
charMatrix = char(collapsed);
placeholderMask = charMatrix == '0';

% Text and position of every digit run of the original strings.
[runText,runFirst,runLast] = regexp(c,'\d+','match','start','end');

% For every placeholder position store the numeric value of the run and
% the number of digits it had; all other positions stay NaN.
rowCount = length(c);
colCount = size(charMatrix,2);
runValue = NaN(rowCount,colCount);
runLength = NaN(rowCount,colCount);
for row = 1:rowCount
    runValue(row,placeholderMask(row,:)) = sscanf(sprintf('%s ',runText{row}{:}),'%f');
    runLength(row,placeholderMask(row,:)) = runLast{row} - runFirst{row} + 1;
end

% Columns holding at least one number, forced to 1-by-n (also for n = 0).
numericCols = reshape(find(~all(isnan(runValue))),1,[]);
numericCount = length(numericCols);

% Layout of the composite key matrix: after each numeric column two extra
% columns are inserted, first the value, then the digit count.
valueCols = numericCols + (1:2:2*numericCount);
lengthCols = valueCols + 1;
textCols = true(1,colCount + 2*numericCount);
textCols(valueCols) = false;
textCols(lengthCols) = false;

% Assemble the composite key matrix.
keys = zeros(rowCount,colCount + 2*numericCount);
keys(:,textCols) = double(charMatrix);
keys(:,valueCols) = runValue(:,numericCols);
keys(:,lengthCols) = runLength(:,numericCols);

% Row-sort the keys; reverse the permutation for a descending sort.
[sortedKeys,index] = sortrows(keys);
if descending
    index = index(end:-1:1);
end
index = reshape(index,size(c));
cs = c(index);
end