CU2CL/cu2cl_libTooling.cpp at master · vtsynergy/CU2CL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* CU2CL - A prototype CUDA-to-OpenCL translator built on the Clang compiler infrastructure
* Version 0.8.0b (beta)
*
* (C) 2010-2017 Virginia Polytechnic Institute & State University (also known as "Virginia Tech"). All Rights Reserved.
* This software is provided as-is.  Neither the authors, Virginia Tech nor Virginia Tech Intellectual Properties, Inc. assert, warrant, or guarantee that the software is fit for any purpose whatsoever, nor do they collectively or individually accept any responsibility or liability for any action or activity that results from the use of this software.  The entire risk as to the quality and performance of the software rests with the user, and no remedies shall be provided by the authors, Virginia Tech or Virginia Tech Intellectual Properties, Inc.
*
*    This library is free software; you can redistribute it and/or modify it under the terms of the attached GNU Lesser General Public License v2.1 as published by the Free Software Foundation.
*
*    This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
*
*   You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Authors: Paul Sathre, Gabriel Martinez
*
*/

#define CU2CL_LICENSE \
	"/* (C) 2010-2017 Virginia Polytechnic Institute & State University (also known as \"Virginia Tech\"). All Rights Reserved.\n" \
	"/* This software is provided as-is.  Neither the authors, Virginia Tech nor Virginia Tech Intellectual Properties, Inc. assert, warrant, or guarantee that the software is fit for any purpose whatsoever, nor do they collectively or individually accept any responsibility or liability for any action or activity that results from the use of this software.  The entire risk as to the quality and performance of the software rests with the user, and no remedies shall be provided by the authors, Virginia Tech or Virginia Tech Intellectual Properties, Inc.\n" \
	"*\n" \
	"*    This library is free software; you can redistribute it and/or modify it under the terms of the attached GNU Lesser General Public License v2.1 as published by the Free Software Foundation.\n" \
	"*\n" \
	"*    This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.\n" \
	"*\n" \
	"*   You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA \n" \
	"*/\n"
#include "clang/AST/AST.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/Decl.h"
//Added to fix CUDA attributes being undeclared
#include "clang/AST/Attr.h"

#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/FileManager.h"
#include "clang/Basic/SourceManager.h"

//Added during the libTooling conversion
#include "clang/Driver/Options.h"

#include "clang/Frontend/CompilerInstance.h"
#include "clang/Frontend/FrontendPluginRegistry.h"
//Added during the libTooling conversion
#include "clang/Frontend/FrontendActions.h"

#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PPCallbacks.h"

#include "clang/Rewrite/Core/Rewriter.h"

//Added during the libTooling conversion
#include "clang/Tooling/CommonOptionsParser.h"
#include "clang/Tooling/Tooling.h"
//Support the RefactoringTool class
#include "clang/Tooling/Refactoring.h"

#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/CommandLine.h"

#include <list>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <iostream>
#include <cstdio>
#include <memory>

//Injects a small amount of code to time the translation process
#define CU2CL_ENABLE_TIMING

#ifdef CU2CL_ENABLE_TIMING
#include <sys/time.h>
#endif

/*
* The following macros define data structures, functions, and kernels
*  that make up a "CU2CL Runtime", providing synthesized analogues of
*  CUDA features, that do not have native equivalences in OpenCL.
*/

//A scaffold for supporting as much of cudaDeviceProp as possible
#define CL_DEVICE_PROP \
    "struct __cu2cl_DeviceProp {\n" \
    "    char name[256];\n" \
    "    cl_ulong totalGlobalMem;\n" \
    "    cl_ulong sharedMemPerBlock;\n" \
    "    cl_uint regsPerBlock;\n" \
    "    cl_uint warpSize;\n" \
    "    size_t memPitch; //Unsupported!\n" \
    "    size_t maxThreadsPerBlock;\n" \
    "    size_t maxThreadsDim[3];\n" \
    "    int maxGridSize[3]; //Unsupported!\n" \
    "    cl_uint clockRate;\n" \
    "    size_t totalConstMem; //Unsupported!\n" \
    "    cl_uint major;\n" \
    "    cl_uint minor;\n" \
    "    size_t textureAlignment; //Unsupported!\n" \
    "    cl_bool deviceOverlap;\n" \
    "    cl_uint multiProcessorCount;\n" \
    "    cl_bool kernelExecTimeoutEnabled;\n" \
    "    cl_bool integrated;\n" \
    "    int canMapHostMemory; //Unsupported!\n" \
    "    int computeMode; //Unsupported!\n" \
    "    int maxTexture1D; //Unsupported!\n" \
    "    int maxTexture2D[2]; //Unsupported!\n" \
    "    int maxTexture3D[3]; //Unsupported!\n" \
    "    int maxTexture2DArray[3]; //Unsupported!\n" \
    "    size_t surfaceAlignment; //Unsupported!\n" \
    "    int concurrentKernels; //Unsupported!\n" \
    "    cl_bool ECCEnabled;\n" \
    "    int pciBusID; //Unsupported!\n" \
    "    int pciDeviceID; //Unsupported!\n" \
    "    int tccDriver; //Unsupported!\n" \
    "    //int __cudaReserved[21];\n" \
    "};\n\n"

//Encapsulation for reading a .cl kernel file at runtime
#define LOAD_PROGRAM_SOURCE_H \
    "size_t __cu2cl_LoadProgramSource(const char *filename, const char **progSrc);\n"

#define LOAD_PROGRAM_SOURCE \
    "size_t __cu2cl_LoadProgramSource(const char *filename, const char **progSrc) {\n" \
    "    FILE *f = fopen(filename, \"r\");\n" \
    "    fseek(f, 0, SEEK_END);\n" \
    "    size_t len = (size_t) ftell(f);\n" \
    "    *progSrc = (const char *) malloc(sizeof(char)*len);\n" \
    "    rewind(f);\n" \
    "    fread((void *) *progSrc, len, 1, f);\n" \
    "    fclose(f);\n" \
    "    return len;\n" \
    "}\n\n"

//The host-side portion of a kernel to emulate the behavior of cudaMemset
#define CL_MEMSET_H \
    "cl_int __cu2cl_Memset(cl_mem devPtr, int value, size_t count);\n"

#define CL_MEMSET \
    "cl_int __cu2cl_Memset(cl_mem devPtr, int value, size_t count) {\n" \
    "    clSetKernelArg(__cu2cl_Kernel___cu2cl_Memset, 0, sizeof(cl_mem), &devPtr);\n" \
    "    clSetKernelArg(__cu2cl_Kernel___cu2cl_Memset, 1, sizeof(cl_uchar), &value);\n" \
    "    clSetKernelArg(__cu2cl_Kernel___cu2cl_Memset, 2, sizeof(cl_uint), &count);\n" \
    "    globalWorkSize[0] = count;\n" \
    "    return clEnqueueNDRangeKernel(__cu2cl_CommandQueue, __cu2cl_Kernel___cu2cl_Memset, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);\n" \
    "}\n\n"

//The device-side kernel that emulates the behavior of cudaMemset
#define CL_MEMSET_KERNEL \
    "__kernel void __cu2cl_Memset(__global uchar *ptr, uchar value, uint num) {\n" \
    "    size_t id = get_global_id(0);\n" \
    "    if (get_global_id(0) < num) {\n" \
    "        ptr[id] = value;\n" \
    "    }\n" \
    "}\n\n"

//A stub to query a specific property in __cu2cl_DeviceProp
// can be used independently of CL_GET_DEVICE_PROPS, but is not intended
#define CL_GET_DEVICE_INFO(TYPE, NAME) \
    "    ret |= clGetDeviceInfo(device, CL_DEVICE_" #TYPE ", sizeof(prop->" \
    #NAME "), &prop->" #NAME ", NULL);\n"

//A function to query the OpenCL properties which have direct analogues in cudaDeviceProp
#define CL_GET_DEVICE_PROPS_H \
    "cl_int __cu2cl_GetDeviceProperties(struct __cu2cl_DeviceProp * prop, cl_device_id device);\n"

#define CL_GET_DEVICE_PROPS \
    "cl_int __cu2cl_GetDeviceProperties(struct __cu2cl_DeviceProp *prop, cl_device_id device) {\n" \
    "    cl_int ret = CL_SUCCESS;\n" \
    CL_GET_DEVICE_INFO(NAME, name) \
    CL_GET_DEVICE_INFO(GLOBAL_MEM_SIZE, totalGlobalMem) \
    CL_GET_DEVICE_INFO(LOCAL_MEM_SIZE, sharedMemPerBlock) \
    CL_GET_DEVICE_INFO(REGISTERS_PER_BLOCK_NV, regsPerBlock) \
    CL_GET_DEVICE_INFO(WARP_SIZE_NV, warpSize) \
    CL_GET_DEVICE_INFO(MAX_WORK_GROUP_SIZE, maxThreadsPerBlock) \
    CL_GET_DEVICE_INFO(MAX_WORK_ITEM_SIZES, maxThreadsDim) \
    CL_GET_DEVICE_INFO(MAX_CLOCK_FREQUENCY, clockRate) \
    CL_GET_DEVICE_INFO(COMPUTE_CAPABILITY_MAJOR_NV, major) \
    CL_GET_DEVICE_INFO(COMPUTE_CAPABILITY_MINOR_NV, minor) \
    CL_GET_DEVICE_INFO(GPU_OVERLAP_NV, deviceOverlap) \
    CL_GET_DEVICE_INFO(MAX_COMPUTE_UNITS, multiProcessorCount) \
    CL_GET_DEVICE_INFO(KERNEL_EXEC_TIMEOUT_NV, kernelExecTimeoutEnabled) \
    CL_GET_DEVICE_INFO(INTEGRATED_MEMORY_NV, integrated) \
    CL_GET_DEVICE_INFO(ERROR_CORRECTION_SUPPORT, ECCEnabled) \
    "    return ret;\n" \
    "}\n\n"

//A function to check the status of the command queue, emulating cudaStreamQuery
#define CL_COMMAND_QUEUE_QUERY_H \
    "cl_int __cu2cl_CommandQueueQuery(cl_command_queue commands);\n"

#define CL_COMMAND_QUEUE_QUERY \
    "cl_int __cu2cl_CommandQueueQuery(cl_command_queue commands) {\n" \
    "   cl_int ret;\n" \
    "   cl_event event;\n" \
    "   clEnqueueMarker(commands, &event);\n" \
    "   clGetEventInfo(commands, &event);\n" \
    "}\n\n"

//A function to take the time between two events, emulating cudaEventElapsedTime
#define CL_EVENT_ELAPSED_TIME_H \
    "cl_int __cu2cl_EventElapsedTime(float *ms, cl_event start, cl_event end);\n"

#define CL_EVENT_ELAPSED_TIME \
    "cl_int __cu2cl_EventElapsedTime(float *ms, cl_event start, cl_event end) {\n" \
    "    cl_int ret;\n" \
    "    cl_ulong s, e;\n" \
    "    float fs, fe;\n" \
    "    ret |= clGetEventProfilingInfo(start, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &s, NULL);\n" \
    "    ret |= clGetEventProfilingInfo(end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &e, NULL);\n" \
    "    s = e - s;\n" \
    "    *ms = ((float) s)/1000000.0;\n" \
    "    return ret;\n" \
    "}\n\n"

//A function to check whether the command queue has hit an injected event yet, emulating cudaEventQuery
#define CL_EVENT_QUERY_H \
    "cl_int __cu2cl_EventQuery(cl_event event);\n"

#define CL_EVENT_QUERY \
    "cl_int __cu2cl_EventQuery(cl_event event) {\n" \
    "    cl_int ret;\n" \
    "    clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);\n" \
    "    return ret;\n" \
    "}\n\n"

//A function to emulate the behavior (not necessarily semantics) of cudaMallocHost
// allocates a device buffer, then maps it into the host address space, and returns a pointer to it
#define CL_MALLOC_HOST_H \
    "cl_int __cu2cl_MallocHost(void **ptr, size_t size, cl_mem *clMem);\n"

#define CL_MALLOC_HOST \
    "cl_int __cu2cl_MallocHost(void **ptr, size_t size, cl_mem *clMem) {\n" \
    "    cl_int ret;\n" \
    "    *clMem = clCreateBuffer(__cu2cl_Context, CL_MEM_READ_WRITE, size, NULL, NULL);\n" \
    "    *ptr = clEnqueueMapBuffer(__cu2cl_CommandQueue, *clMem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &ret);\n" \
    "    return ret;\n" \
    "}\n\n"

//A function to emulate the behavior (not necessarily semantics) of cudaFreeHost
// unmaps a buffer allocated with __cu2cl_MallocHost, then releases the associated device buffer
#define CL_FREE_HOST_H \
    "cl_int __cu2cl_FreeHost(void *ptr, cl_mem clMem);\n"

#define CL_FREE_HOST \
    "cl_int __cu2cl_FreeHost(void *ptr, cl_mem clMem) {\n" \
    "    cl_int ret;\n" \
    "    ret = clEnqueueUnmapMemObject(__cu2cl_CommandQueue, clMem, ptr, 0, NULL, NULL);\n" \
    "    ret |= clReleaseMemObject(clMem);\n" \
    "    return ret;\n" \
    "}\n\n"

//A helper function to scan all platforms for all devices and accumulate them into a single array
// can be used independently of __cu2cl_setDevice, but not intended
#define CU2CL_SCAN_DEVICES_H \
    "void __cu2cl_ScanDevices();\n"

#define CU2CL_SCAN_DEVICES \
    "void __cu2cl_ScanDevices() {\n" \
    "   int i;\n" \
    "   cl_uint num_platforms = 0;\n" \
    "   cl_uint num_devices = 0;\n" \
    "   cl_uint p_dev_count, d_idx;\n" \
    "\n" \
    "   //allocate space for platforms\n" \
    "   clGetPlatformIDs(0, 0, &num_platforms);\n" \
    "   cl_platform_id * platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);\n" \
    "\n" \
    "   //get all platforms\n" \
    "   clGetPlatformIDs(num_platforms, &platforms[0], 0);\n" \
    "\n" \
    "   //count devices over all platforms\n" \
    "   for (i = 0; i < num_platforms; i++) {\n" \
    "       p_dev_count = 0;\n" \
    "       clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, 0, &p_dev_count);\n" \
    "       num_devices += p_dev_count;\n" \
    "   }\n" \
    "\n" \
    "   //allocate space for devices\n" \
    "   __cu2cl_AllDevices = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devices);\n" \
    "\n" \
    "   //get all devices\n" \
    "   d_idx = 0;\n" \
    "   for ( i = 0; i < num_platforms; i++) {\n" \
    "       clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices-d_idx, &__cu2cl_AllDevices[d_idx], &p_dev_count);\n" \
    "       d_idx += p_dev_count;\n" \
    "       p_dev_count = 0;\n" \
    "   }\n" \
    "\n" \
    "   __cu2cl_AllDevices_size = d_idx;\n" \
    "   free(platforms);\n" \
    "}\n\n"

//A function to reset the OpenCL context and queues for the Nth device among all system devices
// uses __cu2cl_ScanDevices to enumerate, and thus uses whatever device ordering it provides
//FIXME: cudaSetDevice preserves the context when switching, ours destroys it, need to modify
// to internally manage and intelligently deconstruct the context(s)
#define CU2CL_SET_DEVICE_H \
    "void __cu2cl_SetDevice(cl_uint devID);\n"

#define CU2CL_SET_DEVICE \
    "void __cu2cl_SetDevice(cl_uint devID) {\n" \
    "   if (__cu2cl_AllDevices_size == 0) {\n" \
    "       __cu2cl_ScanDevices();\n" \
    "   }\n" \
    "   //only switch devices if it's a valid choice\n" \
    "   if (devID < __cu2cl_AllDevices_size) {\n" \
    "       //Assume auto-initialized queue and context, and free them\n" \
    "       clReleaseCommandQueue(__cu2cl_CommandQueue);\n" \
    "       clReleaseContext(__cu2cl_Context);\n" \
    "       //update device and platform references\n" \
    "       __cu2cl_AllDevices_curr_idx = devID;\n" \
    "       __cu2cl_Device = __cu2cl_AllDevices[devID];\n" \
    "       clGetDeviceInfo(__cu2cl_Device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &__cu2cl_Platform, NULL);\n" \
    "       //and make a new context and queue for the selected device\n" \
    "       __cu2cl_Context = clCreateContext(NULL, 1, &__cu2cl_Device, NULL, NULL, NULL);\n" \
    "       __cu2cl_CommandQueue = clCreateCommandQueue(__cu2cl_Context, __cu2cl_Device, CL_QUEUE_PROFILING_ENABLE, NULL);\n" \
    "   }\n" \
    "}\n\n"

using namespace clang;
using namespace clang::tooling;
using namespace llvm::sys::path;

namespace {
    //Flags used to ensure certain pieces of boilerplate only get added once
    // Hoisted to the Tool level so they can act over all files when generating cu2cl_util.c/h
    bool UsesCUDADeviceProp = false;
    bool UsesCUDAMemset = false;
    bool UsesCUDAStreamQuery = false;
    bool UsesCUDAEventElapsedTime = false;
    bool UsesCUDAEventQuery = false;
    bool UsesCUDAMallocHost = false;
    bool UsesCUDAFreeHost = false;
    bool UsesCUDASetDevice = false;
    bool UsesCU2CLUtilCL = false;
    bool UsesCU2CLLoadSrc = false;

    //internal flags for command-line toggles
    bool AddInlineComments = true; //defaults to ON, turn off with '--inline-comments=false' at the command line
    //Extra Arguments to be appended to all generated clBuildProgram calls.
    std::string ExtraBuildArgs; //defaults to "", add more with '--cl-build-args="<args>"'
    bool FilterKernelName = false; //defaults to OFF, turn on with '--rename-kernel-files' or '--rename-kernel-files=true'

    bool UseGCCPaths = false; //defaults to OFF, turn on with '--import-gcc-paths'
    //We borrow the OutputFile data structure from Clang's CompilerInstance.h
    // So that we can use it to store output streams and emulate their temp
    // file usage at the tool level
    struct OutputFile {
	std::string Filename;
	std::string TempFilename;
	raw_ostream *OS;

	OutputFile(const std::string &filename, const std::string &tempFilename, raw_ostream *os) : Filename(filename), TempFilename(tempFilename), OS(os) { }
    };

    typedef std::map<std::string, std::vector<std::string> > FileStrCacheMap;
    typedef std::map<std::string, OutputFile *> IDOutFileMap;

    //Index structures for looking up all references to a given Decl
    //typedef std::map<Decl *, std::vector<DeclRefExpr *> > DeclToRefMap;
    typedef std::tuple<SourceManager *, Preprocessor *, LangOptions *, ASTContext *> SourceTuple;
    typedef std::map<std::string, std::vector<DeclRefExpr *> > DeclToRefMap;
    typedef std::map<std::string, std::vector<std::pair<FunctionDecl *, SourceTuple *> > > CanonicalFuncDeclMap;
    typedef std::vector<std::pair<NamedDecl*, SourceTuple *> > FlaggedDeclVec;

	bool hasFlaggedDecl(FlaggedDeclVec * vec, NamedDecl * decl) {
		for (FlaggedDeclVec::iterator itr = vec->begin(); itr != vec->end(); itr++){
			if (itr->first == decl) return true;
		}
		return false;
	}

    //Simple Vector to hold retained SourceManagers to use at the tool layer
    typedef std::vector<SourceManager *> SMVec;
    //A simple structure to retain ASTContexts so they can be later used at the tool layer (and appropriately released)
    typedef std::vector<ASTContext *> ASTContVec;
    //Global Replacement structs, contributed to by each instance of the translator (one-per-main-source-file)
    // only written to after local deduplication and coalescing
    std::vector<Replacement> GlobalHostReplace;
    std::vector<Replacement> GlobalKernReplace;

    std::map<SourceLocation, Replacement> GlobalHostVecVars;
    //All ASTContexts get pushed here as their translation units get processed
    // so that their member elements can be referred to after TU processing
    ASTContVec AllASTs;
    SMVec AllSMs;

    //All Declarations and references to them are recorded to propagate cl_mem and other critical rewrites across TU boundaries
    FlaggedDeclVec DeclsToTranslate;
    DeclToRefMap AllDeclRefsByDecl;
    CanonicalFuncDeclMap AllFuncsByCanon;

    //Global outFiles maps, moved so that they can be shared and written to at the tool level
    IDOutFileMap OutFiles;
    IDOutFileMap KernelOutFiles;

    //Global map of declaration statements to the files that own them (all others declare them "extern")
    //Filenames are original (not *-cl.cl/cpp/h) except cu2cl_util.c/h/cl
    FileStrCacheMap GlobalCDecls;
    FileStrCacheMap LocalBoilDefs;

    //Global boilerplate strings
    std::string CU2CLInit;
    std::string CU2CLClean;

    std::vector<std::string> GlobalHDecls, GlobalCFuncs, GlobalCLFuncs, UtilKernels;
    //We also borrow the loose method of dealing with temporary output files from
    // CompilerInstance::clearOutputFiles
    void clearOutputFile(OutputFile *OF, FileManager *FM) {
	if(!OF->TempFilename.empty()) {
	    SmallString<128> NewOutFile(OF->Filename);
	    FM->FixupRelativePath(NewOutFile);
	    if (llvm::error_code ec = llvm::sys::fs::rename(OF->TempFilename, NewOutFile.str()))
		llvm::errs() << "Unable to move CU2CL temporary output [" << OF->TempFilename << "] to [" << OF->Filename << "]!\n\t Diag Msg: " << ec.message() << "\n";
	    llvm::sys::fs::remove(OF->TempFilename);
	} else {
	    llvm::sys::fs::remove(OF->Filename);
	}
	delete OF->OS;
    }

    //Replace all instances of the phrase "kernel" with "knl"
    // Used to rename files as per Altera's kernel filename requirement
    std::string kernelNameFilter(std::string str) {
	std::string newStr = str;
	if (!FilterKernelName) return newStr;
	size_t pos = newStr.rfind("/"); //Only rewrite the file, not the path
	if (pos == std::string::npos) pos = 0;
	for (; ; pos += 3) {
	    pos = newStr.find("kernel", pos);
	    if (pos == std::string::npos) break;
	    newStr.erase(pos, 6);
	    newStr.insert(pos, "knl");
	}
	return newStr;
    }


bool isInBannedInclude(SourceLocation loc, SourceManager * SM, LangOptions * LO) {
	SourceLocation sloc = SM->getSpellingLoc(loc);
	//if (loc.isMacroID()) sloc = SM->getSpellingLoc(loc);
        std::string FileName = SM->getPresumedLoc(loc).getFilename();
	//llvm::errs() << "CU2CL DEBUG: " << FileName;

            llvm::StringRef fileExt = extension(FileName);
            if (fileExt.equals(".cu") || fileExt.equals(".cuh")) return false;
	//TODO check if the file was included by any file matching the below criteria
	if (filename(FileName).equals("cuda.h") || filename(FileName).equals("cuda_runtime.h") || filename(FileName).equals("cuda_runtime_api.h") || filename(FileName).equals("cuda_gl_interop.h") || filename(FileName).equals("cutil.h") || filename(FileName).equals("cutil_inline.h") || filename(FileName).equals("cutil_gl_inline.h") || filename(FileName).equals("vector_types.h") || SM->isInSystemHeader(loc) || SM->isInExternCSystemHeader(loc) || SM->isInSystemMacro(loc) || SM->isInSystemHeader(sloc) || SM->isInExternCSystemHeader(sloc) || SM->isInSystemMacro(sloc)) {
		//it's a forbidden file, just skip the file
		return true;
	}
	SourceLocation parentLoc = SM->getIncludeLoc(SM->getFileID(loc));
	//If the parent of the regular location isn't valid, try the spelling location
	if (!parentLoc.isValid() && loc.isMacroID()) parentLoc = SM->getIncludeLoc(SM->getFileID(sloc));
	if (!parentLoc.isValid()) {
		if (!SM->isInMainFile(loc)) llvm::errs() << "CU2CL DEBUG: " << loc.printToString(*SM) << "\nInvalid parent IncludeLoc\n";
		return false;
	}
	//If the include location is
	//llvm::errs() << "CU2CL DEBUG: Checking parent include from [" << parentLoc.printToString(*SM) << "]\n";
	Token fileTok;
	Lexer::getRawToken(parentLoc, fileTok, *SM, *LO);
//	SourceLocation angleLoc = Lexer::findLocationAfterToken(parentLoc, tok::angle_string_literal, *SM, *LO, true);
//	if (!angleLoc.isValid()) {
	if (!fileTok.is(tok::angle_string_literal) && !fileTok.is(tok::less)) {
		//llvm::errs() << fileTok.getName() << " :Parent is a quote #include!\n";

		//As a fallback, try banning based on the parent
		return isInBannedInclude(parentLoc, SM, LO);
	} else {
		//llvm::errs() << "Parent is an angle #include!\n";
		return true;
	}

}

//Simple timer calls that get injected if enabled
#ifdef CU2CL_ENABLE_TIMING
	uint64_t TransTime;
struct timeval startTime, endTime;

void init_time() {
    gettimeofday(&startTime, NULL);
}

uint64_t get_time() {
    gettimeofday(&endTime, NULL);
    return (uint64_t) (endTime.tv_sec - startTime.tv_sec)*1000000 +
        (endTime.tv_usec - startTime.tv_usec);
}
#endif

//Check which of two DeclGroups come first in the source
struct cmpDG {
    bool operator()(DeclGroupRef a, DeclGroupRef b) {
        SourceLocation aLoc = (a.isSingleDecl() ? a.getSingleDecl() : a.getDeclGroup()[0])->getLocStart();
        SourceLocation bLoc = (b.isSingleDecl() ? b.getSingleDecl() : b.getDeclGroup()[0])->getLocStart();
        return aLoc.getRawEncoding() < bLoc.getRawEncoding();
    }
};

//FIXME: Borrowed verbatim from Clang's Refactoring.cpp
// Just call theirs once we can (for now it's not recognized as a member of the clang::tooling namespace, though it should be
static int getRangeSize(SourceManager &Sources, const CharSourceRange &Range) {
  SourceLocation SpellingBegin = Sources.getSpellingLoc(Range.getBegin());
  SourceLocation SpellingEnd = Sources.getSpellingLoc(Range.getEnd());
  std::pair<FileID, unsigned> Start = Sources.getDecomposedLoc(SpellingBegin);
  std::pair<FileID, unsigned> End = Sources.getDecomposedLoc(SpellingEnd);
  if (Start.first != End.first) return -1;
  if (Range.isTokenRange())
    End.second += Lexer::MeasureTokenLength(SpellingEnd, Sources, LangOptions());
  return End.second - Start.second;
}

//This method is designed to walk a vector of Replacements that has already
// been deduplicated, and fuse Replacments that are enqueued on the same
// start SourceLocation
//\pre replace is sorted in order of increasing SourceLocation
//\pre replace has no duplicate Replacements
//\post replace has no more than one Replacement per SourceLocation
void coalesceReplacements(std::vector<Replacement> &replace) {
	//Must assemble a new vector in-place
	//Swap the input vector with the work vector so we can add replacements directly back as output
	std::vector<Replacement> work;
	work.swap(replace);

	//track the maximum range for a set of Replacements to be fused
	int max;
	//track the concatenated text for a set of Replacements to be fused
	std::stringstream text;
	std::vector<Replacement>::const_iterator J;

	//Iterate over every Replacement in the input vector
	for (std::vector<Replacement>::const_iterator I = work.begin(), E = work.end(); I != E; I++) {
	    //reset the max range size and string to match I
	    max = I->getLength();
	    text.str("");
	    text << I->getReplacementText().str();
	    //Look forward at all Replacements at the same location as I
	    for (J = I+1; J !=E && J->getFilePath() == I->getFilePath() && J->getOffset() == I->getOffset(); J++) {
	    	//Check if they cover a longer range, and concatenate changes
		max = (max > J->getLength() ? max : J->getLength());
		text << J->getReplacementText().str();
		//llvm::errs() << "Merging text: " << text.str();
	    }
	    //Add the coalesced Replacement back to the input vector
	    replace.push_back(Replacement(I->getFilePath(), I->getOffset(), max, text.str()));
	    //And finally move the I iterator forward to the last-fused Replacement
	    I = J-1;
	}
}
    void debugPrintReplacements(std::vector<Replacement> replace) {
	for (std::vector<Replacement>::const_iterator I = replace.begin(), E = replace.end(); I != E; I++) {
	    llvm::errs() << I->toString() << "\n";
	}

    }
    //Comments to be injected into source code are buffered until after translation
    // this struct implements a simple list for storing them, but is not meamnt for
    // use outside the bufferComment and writeComments functions
    // l is the SourceLoc pointer
    // s is the string itself
    // w declares whether it's a host (true) or device (false) comment
    //WARNING: Not threadsafe at all!
    struct commentBufferNode;
    struct commentBufferNode {
	void * l;
	char * s;
	std::vector<Replacement> * r;
	struct commentBufferNode * n;
	};
    struct commentBufferNode * tail, * head;

    //Buffer a new comment destined to be added to output OpenCL source files
    //WARNING: Not threadsafe at all!
    void bufferComment(SourceLocation loc, std::string str, std::vector<Replacement> *replacements) {
	struct commentBufferNode * n = (struct commentBufferNode *)malloc(sizeof(commentBufferNode));
	n->s = (char *)malloc(sizeof(char)*(str.length()+1));
	str.copy(n->s, str.length());
	n->s[str.length()] = '\0';
	n->l = loc.getPtrEncoding(); n->r = replacements; n->n = NULL;

	tail->n = n;
	tail = n;
    }


    // Workhorse for CU2CL diagnostics, provides independent specification of multiple err_notes
    //  and inline_notes which should be dumped to stderr and translated output, respectively
    // TODO: Eventually this could stand to be implemented using the real Basic/Diagnostic subsystem
    //  but at the moment, the set of errors isn't mature enough to make it worth it.
    // It's just cheaper to directly throw it more readily-adjustable strings until we set the
    //  error messages in stone.
    void emitCU2CLDiagnostic(SourceManager * SM, SourceLocation loc, std::string severity_str, std::string err_note, std::string inline_note, std::vector<Replacement> * replacements) {
        //Sanitize all incoming locations to make sure they're not MacroIDs
        SourceLocation expLoc = SM->getExpansionLoc(loc);
        SourceLocation writeLoc;

        //assemble both the stderr and inlined source output strings
        std::stringstream inlineStr;
        std::stringstream errStr;
	inlineStr << "/*";
        if (expLoc.isValid()){
	    //Tack the source line information onto the diagnostic
            //inlineStr << SM->getBufferName(expLoc) << ":" << SM->getExpansionLineNumber(expLoc) << ":" << SM->getExpansionColumnNumber(expLoc) << ": ";
            errStr << SM->getBufferName(expLoc) << ":" << SM->getExpansionLineNumber(expLoc) << ":" << SM->getExpansionColumnNumber(expLoc) << ": ";
            //grab the start of column write location
            writeLoc = SM->translateLineCol(SM->getFileID(expLoc), SM->getExpansionLineNumber(expLoc), 1);
        }
	//Inject the severity string to both outputs
        if (!severity_str.empty()) {
            errStr << severity_str << ": ";
            inlineStr << severity_str << " -- ";
        }
        inlineStr << inline_note << "*/\n";
        errStr << err_note << "\n";

        if (expLoc.isValid()){
            //print the inline string(s) to the output file
            bool isValid;
			//Buffer the comment for outputing after translation is finished.
			//Disable this section to turn off error emission, by default if an
			// inline error string is empty, it will turn off comment insertion for that error
			if (!inline_note.empty() && AddInlineComments) {
				bufferComment(writeLoc, inlineStr.str(), replacements);
			}
        }
        //Send the stderr string to stderr
        llvm::errs() << errStr.str();
    }

    // Convenience method for dumping the same CU2CL error to both stderr and inlined comments
    //  using the mechanism above
    // Assumes the err_note is replicated as the inline comment to add to source.
    void emitCU2CLDiagnostic(SourceManager * SM, SourceLocation loc, std::string severity_str, std::string err_note, std::vector<Replacement> * replacements) {
        emitCU2CLDiagnostic(SM, loc, severity_str, err_note, err_note, replacements);
    }
    //Convenience method for getting a string of raw text between two SourceLocations
    std::string getStmtText(LangOptions * LO, SourceManager * SM, Stmt *s) {
        SourceLocation a(SM->getExpansionLoc(s->getLocStart())), b(Lexer::getLocForEndOfToken(SourceLocation(SM->getExpansionLoc(s->getLocEnd())), 0,  *SM, *LO));
        return std::string(SM->getCharacterData(a), SM->getCharacterData(b)-SM->getCharacterData(a));
    }
	//Perform any last-minute checks on the Replacement and add it to the provided list of Replacements
    bool generateReplacement(std::vector<Replacement> &replacements, SourceManager * SM, SourceLocation sloc, int len, StringRef replace) {
	//Insert any protection logic here to make sure only legal replacements get added
	//TODO: Once Macro handling is improved, removing the SourceLoc check
	//if (!SM->isInSameSLocAddrSpace(sloc, sloc.getLocWithOffset(len), NULL)) {
	if (len < 0) { //If for some reason the length is negative (invalid) refuse to perform the replacement
	emitCU2CLDiagnostic(SM, sloc, "CU2CL Unhandled", "Replacement Range out of bounds", replace, &replacements);
	return false;
	}
        else replacements.push_back(Replacement(*SM, sloc, (unsigned)len, replace));

	return true;
    }
    //Method to output comments destined for addition to output OpenCL source
    // which have been buffered to avoid sideeffects with other rewrites
    //WARNING: Not threadsafe at all!
    void writeComments(SourceManager * SM) {
	struct commentBufferNode * curr = head->n;
	while (curr != NULL) { // as long as we have more comment nodes..
	    // inject the comment to the host output stream if true
		generateReplacement(*(curr->r), SM, SourceLocation::getFromPtrEncoding(curr->l), 0, llvm::StringRef(curr->s));
	    //move ahead, then destroy the current node
	    curr = curr->n;
	    free(head->n->s);
	    free(head->n);
	    head->n = curr;
	}

	tail = head;
    }

class RewriteCUDA;

//The class prototype necessary to trigger rewriting #included files
class RewriteIncludesCallback : public PPCallbacks {
private:
    RewriteCUDA *RCUDA;

public:
    RewriteIncludesCallback(RewriteCUDA *);

    virtual void InclusionDirective(SourceLocation, const Token &,
                                    llvm::StringRef, bool,
				    CharSourceRange, const FileEntry *,
                                    StringRef, StringRef,
				    const Module *);

};


/**
 * An AST consumer made to rewrite CUDA to OpenCL.
 * The entire translation process is essentially modeled as an ASTConsumer
 *  so that we can fully rely on Clang to construct the AST, then simply
 *  perform a full walk of the tree to identify the CUDA bits to translate.
 **/
class RewriteCUDA : public ASTConsumer {
protected:

private:
    typedef std::map<llvm::StringRef, std::list<llvm::StringRef> > StringRefListMap;

    CompilerInstance *CI;
    SourceManager *SM;
    LangOptions *LO;
    Preprocessor *PP;
    SourceTuple *ST;

    Rewriter HostRewrite;
    Rewriter KernelRewrite;

    //TODO: Once Clang updates to use vectors rather than sets for Replacements
    // change this to reflect that
    std::vector<Replacement> HostReplace;
    std::vector<Replacement> KernReplace;

    //Rewritten files
    FileID MainFileID;
    std::string mainFilename;
    OutputFile *MainOutFile;
    OutputFile *MainKernelOutFile;
    //TODO lump IDs and both outfiles together

    StringRefListMap Kernels;

    std::set<DeclGroupRef, cmpDG> GlobalVarDeclGroups;
    std::set<DeclGroupRef, cmpDG> CurVarDeclGroups;
    std::set<DeclGroupRef, cmpDG> DeviceMemDGs;
    std::set<DeclaratorDecl *> DeviceMemVars;
    std::set<DeclaratorDecl *> HostMemVars;
    std::set<VarDecl *> ConstMemVars;
    std::set<VarDecl *> SharedMemVars;
    std::set<ParmVarDecl *> CurRefParmVars;

    std::map<SourceLocation, Replacement> HostVecVars;

    TypeLoc LastLoc;

    std::string MainFuncName;
    FunctionDecl *MainDecl;

    //Preamble string to insert at top of main host file
    std::string HostPreamble;
    std::string HostIncludes;
    std::string HostDecls;
    std::string HostGlobalVars;
    std::string HostKernels;
    std::string HostFunctions;

    bool IncludingStringH;

    //Preamble string to insert at top of main kernel file
    std::string DevPreamble;
    std::string DevFunctions;

    //Pre- and Postamble strings that bundle OpenCL boilerplate for a translation unit
    //Global boilerplate is generated in CU2CLInit and CU2CLClean
    std::string CLInit;
    std::string CLClean;


void TraverseStmt(Stmt *e, unsigned int indent) {
        for (unsigned int i = 0; i < indent; i++)
            llvm::errs() << "  ";
        llvm::errs() << e->getStmtClassName() << "\n";
        indent++;
        for (Stmt::child_iterator CI = e->child_begin(), CE = e->child_end();
             CI != CE; ++CI)
            if (*CI)
                TraverseStmt(*CI, indent);
    }

    template <class T>
    T *FindStmt(Stmt *e) {
        if (T *t = dyn_cast<T>(e))
            return t;
        T *ret = NULL;
        for (Stmt::child_iterator CI = e->child_begin(), CE = e->child_end();
             CI != CE; ++CI) {
            ret = FindStmt<T>(*CI);
            if (ret)
                return ret;
        }
        return NULL;
    }


    std::string getTextFromLocs(SourceLocation a, SourceLocation b) {
	return std::string(SM->getCharacterData(a), SM->getCharacterData(b)-SM->getCharacterData(a));
    }


    //Simple function to strip attributes from host functions that may be declared as
    // both __host__ and __device__, then passes off to the host-side statement rewriter
    void RewriteHostFunction(FunctionDecl *hostFunc) {
	//Register it on the RedeclMap
	//TODO: We may want to check if this FunctionDecl (by text location) has already been added by another AST
	//TODO:  but for now we are assuming we will generate the same replacements that just get deduped
	    AllFuncsByCanon[hostFunc->getFirstDecl()->getLocStart().printToString(*SM)].push_back(std::pair<FunctionDecl *, SourceTuple *>(hostFunc, ST));


        //Remove any CUDA function attributes
        if (CUDAHostAttr *attr = hostFunc->getAttr<CUDAHostAttr>()) {
            RewriteAttr(attr, "", HostReplace);
        }
        if (CUDADeviceAttr *attr = hostFunc->getAttr<CUDADeviceAttr>()) {
            RewriteAttr(attr, "", HostReplace);
        }

        //Rewrite the body
        if (Stmt *body = hostFunc->getBody()) {
            RewriteHostStmt(body);
        }
        CurVarDeclGroups.clear();
    }

    //Forks host-side statement processing between expressions, declarations, and other statements
    void RewriteHostStmt(Stmt *s) {
        //Visit this node
        if (Expr *e = dyn_cast<Expr>(s)) {
            std::string str;
            if (RewriteHostExpr(e, str)) {
                ReplaceStmtWithText(e, str, HostReplace);
            }
        }
        else if (DeclStmt *ds = dyn_cast<DeclStmt>(s)) {
            DeclGroupRef DG = ds->getDeclGroup();
            Decl *firstDecl = DG.isSingleDecl() ? DG.getSingleDecl() : DG.getDeclGroup()[0];
            //Store VarDecl DeclGroupRefs
            if (firstDecl->getKind() == Decl::Var) {
                CurVarDeclGroups.insert(DG);
            }
            for (DeclGroupRef::iterator i = DG.begin(), e = DG.end(); i != e; ++i) {
                if (VarDecl *vd = dyn_cast<VarDecl>(*i)) {
                    RewriteHostVarDecl(vd);
                }
                //TODO other non-top level declarations??
            }
        }
        //TODO rewrite any other Stmts?

        else {
            //Traverse children and recurse
            for (Stmt::child_iterator CI = s->child_begin(), CE = s->child_end();
                 CI != CE; ++CI) {
                if (*CI)
                    RewriteHostStmt(*CI);
            }
        }
    }

    //Expressions, along with declarations, are the main meat of what needs to be rewritten
    //Host-side we primarily need to deal with CUDA C kernel launches and API call expressions
    bool RewriteHostExpr(Expr *e, std::string &newExpr) {
        //Return value specifies whether or not a rewrite occurred
        if (e->getSourceRange().isInvalid())
            return false;

        //Rewriter used for rewriting subexpressions
        Rewriter exprRewriter(*SM, *LO);
        //Instantiation locations are used to capture macros
        SourceRange realRange(SM->getExpansionLoc(e->getLocStart()),
                              SM->getExpansionLoc(e->getLocEnd()));

	//If DRE, register for potential late translation
        if (DeclRefExpr *dre = dyn_cast<DeclRefExpr>(e)) {

	    AllDeclRefsByDecl[dre->getDecl()->getLocStart().printToString(*SM)].push_back(dre);
	}

	//Detect CUDA C style kernel launches ie. fooKern<<<Grid, Block, shared, stream>>>(args..);
	// the Runtime and Driver API's launch mechanisms would be handled with the rest of the API calls
        if (CUDAKernelCallExpr *kce = dyn_cast<CUDAKernelCallExpr>(e)) {
            //Short-circuit templated kernel launches
            if (kce->isTypeDependent()) {
                emitCU2CLDiagnostic(SM, kce->getLocStart(), "CU2CL Untranslated", "Template-dependent kernel call", &HostReplace);
                return false;
            }
	    //Short-circuit launching a function pointer until we can handle it
	    else if (kce->getDirectCallee() == 0 && dyn_cast<ImplicitCastExpr>(kce->getCallee())) {
                emitCU2CLDiagnostic(SM, kce->getLocStart(), "CU2CL Unhandled", "Function pointer as kernel call", &HostReplace);
                return false;
            }
	    //If it's not a templated or pointer launch, proceed with translation
            newExpr = RewriteCUDAKernelCall(kce);
            return true;
        }
        else if (CallExpr *ce = dyn_cast<CallExpr>(e)) {
            if (ce->isTypeDependent()) {
                emitCU2CLDiagnostic(SM, ce->getLocStart(), "CU2CL Untranslated", "Template-dependent host call", &HostReplace);
                return false;
            }
            //This catches some errors encountered with heavily-nested, PP-assembled function-like macros
	    // mostly observed within the OpenGL and GLUT headers
            if (ce->getDirectCallee() == 0) {
                emitCU2CLDiagnostic(SM, SM->getExpansionLoc(ce->getLocStart()), "CU2CL Unhandled", "Could not identify direct callee in expression", &HostReplace);
            }
	    //This catches all Runtime API calls, since they are all prefixed by "cuda"
	    // and all Driver API calls that are prefixed with just "cu"
	    //Also catches cutil, cuFFT, cuBLAS, and other library calls incidentally, which may or may not be wanted
	    //TODO: Perhaps a second tier of filtering is needed
	    else if (ce->getDirectCallee()->getNameAsString().find("cu") == 0)
                return RewriteCUDACall(ce, newExpr);
        }
	//Catches expressions which refer to the member of a struct or class
	// in the CUDA case these are primarily just dim3s and cudaDeviceProp
        else if (MemberExpr *me = dyn_cast<MemberExpr>(e)) {
            //Check base Expr, if DeclRefExpr and a dim3, then rewrite
            if (DeclRefExpr *dre = dyn_cast<DeclRefExpr>(me->getBase())) {
                std::string type = dre->getDecl()->getType().getAsString();
                if (type == "dim3") {
                    std::string name = me->getMemberDecl()->getNameAsString();
                    if (name == "x") {
                        name = "[0]";
                    }
                    else if (name == "y") {
                        name = "[1]";
                    }
                    else if (name == "z") {
                        name = "[2]";
                    }
                    newExpr = getStmtText(LO, SM, dre) + name; //PrintStmtToString(dre) + name;
                    return true;
                }
                else if (type == "cudaDeviceProp") {
                    //TODO check what the reference is
                    //TODO if unsupported, print a warning

                    return false;
                }
            }
        }

	//Rewrite explicit casts of CUDA data types
        else if (ExplicitCastExpr *ece = dyn_cast<ExplicitCastExpr>(e)) {
            bool ret = true;

            TypeLoc origTL = ece->getTypeInfoAsWritten()->getTypeLoc();
            TypeLoc tl = origTL;
            while (!tl.getNextTypeLoc().isNull()) {
                tl = tl.getNextTypeLoc();
            }
            QualType qt = tl.getType();
            std::string type = qt.getAsString();

            if (type == "dim3") {
                if (origTL.getTypePtr()->isPointerType())
                    RewriteType(tl, "size_t *", exprRewriter);
                else
                    RewriteType(tl, "size_t[3]", exprRewriter);
            }
            else if (type == "struct cudaDeviceProp") {
                RewriteType(tl, "struct __cu2cl_DeviceProp", exprRewriter);
            }
            else if (type == "cudaStream_t") {
                RewriteType(tl, "cl_command_queue", exprRewriter);
            }
            else if (type == "cudaEvent_t") {
                RewriteType(tl, "cl_event", exprRewriter);
            }
            else {
                ret = false;
            }

            //Rewrite subexpression
            std::string s;
            if (RewriteHostExpr(ece->getSubExpr(), s)) {
                ReplaceStmtWithText(ece->getSubExpr(), s, exprRewriter);
                ret = true;
            }
            newExpr = exprRewriter.getRewrittenText(realRange);
            return ret;
        }
	//Rewrite unary expressions or type trait expressions (things like sizeof)
        else if (UnaryExprOrTypeTraitExpr *soe = dyn_cast<UnaryExprOrTypeTraitExpr>(e)) {
            if (soe->isArgumentType()) {
                bool ret = true;
                TypeLoc tl = soe->getArgumentTypeInfo()->getTypeLoc();
                while (!tl.getNextTypeLoc().isNull()) {
                    tl = tl.getNextTypeLoc();
                }
                QualType qt = tl.getType();
                std::string type = qt.getAsString();

                if (type == "dim3") {
                    RewriteType(tl, "size_t[3]", exprRewriter);
                }
                else if (type == "struct cudaDeviceProp") {
                    RewriteType(tl, "struct __cu2cl_DeviceProp", exprRewriter);
                }
                else if (type == "cudaStream_t") {
                    RewriteType(tl, "cl_command_queue", exprRewriter);