...

deepakkumar1984 · deepakkumar1984 · commit a44ce5dcb13e · 2019-02-11T16:37:59.000+10:30
diff --git a/Examples/MNIST/Program.cs b/Examples/MNIST/Program.cs
@@ -14,6 +14,12 @@ static void Main(string[] args)
         {
             Global.UseGpu();
 
+            Tensor x = Tensor.FromArray(Global.Device, new float[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 });
+            x = x.Reshape(3, 3);
+
+            var result = TOps.Diag(x);
+            result.Print();
+
             string datasetFolder = @"C:\dataset\MNIST";
             bool useDenseModel = false;
 
@@ -50,9 +56,9 @@ private static Sequential BuildFCModel()
         private static Sequential BuildConvModel()
         {
             Sequential model = new Sequential();
-            model.Add(new Conv2D(filters: 16, kernalSize: Tuple.Create<uint, uint>(5, 5), activation: ActType.Sigmoid));
+            model.Add(new Conv2D(filters: 16, kernalSize: Tuple.Create<uint, uint>(5, 5), activation: ActType.ReLU));
             model.Add(new MaxPooling2D(poolSize: Tuple.Create<uint, uint>(2, 2)));
-            model.Add(new Conv2D(filters: 32, kernalSize: Tuple.Create<uint, uint>(5, 5), activation: ActType.Sigmoid));
+            model.Add(new Conv2D(filters: 32, kernalSize: Tuple.Create<uint, uint>(5, 5), activation: ActType.ReLU));
             model.Add(new MaxPooling2D(poolSize: Tuple.Create<uint, uint>(2, 2)));
             //model.Add(new Dropout(0.2f));
             model.Add(new Flatten());
diff --git a/ManagedCuda/ManagedCuda.csproj b/ManagedCuda/ManagedCuda.csproj
@@ -23,7 +23,7 @@
   </PropertyGroup>
 
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
-    <DefineConstants>TRACE;WIN,CUDA90,CUDNN7</DefineConstants>
+    <DefineConstants>TRACE;WIN,CUDA100,CUDNN7</DefineConstants>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
 
diff --git a/SiaNet.Test/Im2ColTest.cs b/SiaNet.Test/Im2ColTest.cs
@@ -24,7 +24,7 @@ public void Im2Col_2d()
         [TestMethod]
         public void DiagTest()
         {
-            //Global.UseGpu();
+            Global.UseGpu();
             Tensor x = Tensor.FromArray(Global.Device, new float[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 });
             x = x.Reshape(3, 3);
 
diff --git a/Tensor/TensorSharp/Cuda/CudaBasicOps.cs b/Tensor/TensorSharp/Cuda/CudaBasicOps.cs
@@ -16,6 +16,7 @@
 using System.Linq;
 using System.Text;
 using TensorSharp.Core;
+using TensorSharp.Cuda.DeviceCode;
 using TensorSharp.CUDA.DeviceCode;
 using TensorSharp.CUDA.KernelOps;
 using TensorSharp.CUDA.MatrixMul;
@@ -61,6 +62,8 @@ public class CudaBasicOps
         /// </summary>
         private readonly ReduceDimIndexKernels reduceDimIndexKernels = new ReduceDimIndexKernels();
 
+        private readonly MatrixOps matrixOps = new MatrixOps();
+
 
         /// <summary>
         /// Initializes a new instance of the <see cref="CudaBasicOps"/> class.
@@ -986,5 +989,11 @@ public Tensor StdAll(Tensor result, Tensor src)
             return writeTarget;
         }
 
+
+        [RegisterOpStorageType("diag", typeof(CudaStorage))] // NOTE(review): MatrixOps.Diag, added in this same commit, carries the same ("diag", CudaStorage) attribute arguments - confirm the op registry accepts both
+        public Tensor Diag(Tensor src) // forwards src unchanged to the matrixOps field and returns its result
+        {
+            return matrixOps.Diag(src);
+        }
     }
 }
diff --git a/Tensor/TensorSharp/Cuda/DeviceCode/CU/MatrixOps.c b/Tensor/TensorSharp/Cuda/DeviceCode/CU/MatrixOps.c
@@ -2,69 +2,17 @@
 // tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
 // size (with the exception of 't2' in dimension 'dim').
 // This version uses a static number of dimensions.
-template <typename IndexType, int Dims>
-struct IndexToScatterGatherOffsets {
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t1, IndexType* t1Offset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = Dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			*t1Offset += curDimIndex * t1.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = Dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-};
-
 // Same as above but using a dynamic number of dimensions.
 template <typename IndexType>
-struct IndexToScatterGatherOffsets<IndexType, -1> {
+struct DiagOffsets<IndexType, -1> { // NOTE(review): written as a partial specialization, but no primary DiagOffsets template is visible in this diff - confirm one is declared elsewhere
 	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t1, IndexType* t1Offset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = index.dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			*t1Offset += curDimIndex * t1.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
+		IndexType linearId, const int dim, const TensorInfo<IndexType>& t, IndexType* tOffset) { // dim is accepted but never read by the added lines below
+		for (int d = t.dims - 1; d >= 0; d--) { // visits dimensions from the last one down to 0
+			IndexType curDimIndex = linearId % t.sizes[d]; // position of linearId along dimension d
+			*tOffset += curDimIndex * t.strides[d]; // adds that position times the stride of d to the caller's offset
 			}
-			linearId /= index.sizes[d];
-		}
-	}
 
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = index.dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
+			linearId /= t.sizes[d]; // NOTE(review): the unchanged "}" context line above the blank closes the for loop first, so after this patch the statement sits outside the loop (d out of scope) and the closing braces below no longer balance - confirm against the applied file
 		}
 	}
 };
@@ -75,20 +23,27 @@ __global__ void diag_kernel(
 	TensorInfo<IndexType> tensor,
 	TensorInfo<IndexType> src,
 	const IndexType totalElements) {
-	for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; linearId < totalElements; linearId += gridDim.x * blockDim.x) {
-		IndexType tensorOffset = 0;
-		IndexType srcOffset = 0;
-		IndexType indexOffset = 0;
+	for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < totalElements; i += gridDim.x * blockDim.x) {
+		for (IndexType j = blockIdx.x * blockDim.x + threadIdx.x; j < totalElements; j += gridDim.x * blockDim.x) {
+			IndexType tensorOffset = 0;
+			IndexType srcOffset = 0;
 
-		IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
-			index, &indexOffset,
-			tensor, &tensorOffset,
-			src, &srcOffset);
+			DiagOffsets<IndexType>::compute(i, dim, tensor, &tensorOffset);
+			DiagOffsets<IndexType>::compute(i, dim, src, &srcOffset);
 
-		IndexType indexValue = (IndexType)index.data[indexOffset];
-		srcOffset += indexValue * src.strides[dim];
+			if (i == j)
+			{
+				IndexType indexValue = (IndexType)src.data[tensorOffset];
+				srcOffset += indexValue * src.strides[dim];
 
-		tensor.data[tensorOffset] = src.data[srcOffset];
+				tensor.data[tensorOffset] = src.data[srcOffset];
+			}
+			else
+			{
+				tensor.data[tensorOffset] = 0;
+			}
+			
+		}
 	}
 };
 
diff --git a/Tensor/TensorSharp/Cuda/DeviceCode/MatrixKernels.cs b/Tensor/TensorSharp/Cuda/DeviceCode/MatrixKernels.cs
@@ -32,116 +32,6 @@ namespace TensorSharp.CUDA.DeviceCode
     [Precompile]
     public class MatrixKernels : CudaCode
     {
-        /// <summary>
-        /// The code
-        /// </summary>
-        public static string Code = @"
-// Compute the offsets into the given tensors for a linear index. For the 't2'
-// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
-// size (with the exception of 't2' in dimension 'dim').
-// This version uses a static number of dimensions.
-template <typename IndexType, int Dims>
-struct IndexToScatterGatherOffsets {
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t1, IndexType* t1Offset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = Dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			*t1Offset += curDimIndex * t1.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = Dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-};
-
-// Same as above but using a dynamic number of dimensions.
-template <typename IndexType>
-struct IndexToScatterGatherOffsets<IndexType, -1> {
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t1, IndexType* t1Offset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = index.dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			*t1Offset += curDimIndex * t1.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-
-	static __device__ void compute(
-		IndexType linearId, const int dim,
-		const TensorInfo<IndexType>& index, IndexType* indexOffset,
-		const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
-		for (int d = index.dims - 1; d >= 0; d--) {
-			IndexType curDimIndex = linearId % index.sizes[d];
-			*indexOffset += curDimIndex * index.strides[d];
-			if (d != dim) {
-				*t2Offset += curDimIndex * t2.strides[d];
-			}
-			linearId /= index.sizes[d];
-		}
-	}
-};
-
-
-template <typename IndexType, int Dims>
-__global__ void diag_kernel(
-	TensorInfo<IndexType> tensor,
-	TensorInfo<IndexType> src,
-	const IndexType totalElements) {
-	for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; linearId < totalElements; linearId += gridDim.x * blockDim.x) {
-		IndexType tensorOffset = 0;
-		IndexType srcOffset = 0;
-		IndexType indexOffset = 0;
-
-		IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
-			index, &indexOffset,
-			tensor, &tensorOffset,
-			src, &srcOffset);
-
-		IndexType indexValue = (IndexType)index.data[indexOffset];
-		srcOffset += indexValue * src.strides[dim];
-
-		tensor.data[tensorOffset] = src.data[srcOffset];
-	}
-};
-
-#define DECLARE_DIAG(KERNEL_NAME, INDEX_TYPE, DIMS) \
-    extern ""C"" {\
-        __global__ void KERNEL_NAME(\
-                                          TensorInfo<INDEX_TYPE> tensor,\
-                                          TensorInfo<INDEX_TYPE> src,\
-                                          INDEX_TYPE totalElements)\
-        {\
-            diag_kernel<INDEX_TYPE, DIMS>(tensor, src, totalElements);\
-        }\
-    }
-";
-
         /// <summary>
         /// The diag matrix base name
         /// </summary>
@@ -161,8 +51,7 @@ public MatrixKernels() : base(GetCode(), "General", "ReduceApplyUtils")
         /// <returns>System.String.</returns>
         private static string GetCode()
         {
-            Code = Resources.MatrixOps;
-            var sb = new StringBuilder(Code);
+            var sb = new StringBuilder(Resources.MatrixOps);
 
             sb.AppendLine(GetMacroInvocations(true, 1));
             sb.AppendLine(GetMacroInvocations(true, 2));
diff --git a/Tensor/TensorSharp/Cuda/KernelOps/MatrixOps.cs b/Tensor/TensorSharp/Cuda/KernelOps/MatrixOps.cs
@@ -0,0 +1,22 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Text;
+using TensorSharp.CUDA;
+using TensorSharp.CUDA.DeviceCode;
+
+namespace TensorSharp.Cuda.KernelOps // NOTE(review): the usings in this file and in CudaBasicOps.cs spell the sibling namespaces TensorSharp.CUDA.*; confirm the "Cuda" casing here is intended
+{
+    [OpsClass]
+    public class MatrixOps // wrapper whose only operation, Diag, forwards to a MatrixKernels instance
+    {
+        private readonly MatrixKernels matrixKernels = new MatrixKernels(); // one MatrixKernels is constructed per MatrixOps instance
+
+        public MatrixOps() // empty body; only the field initializer above runs
+        {
+
+        }
+
+        [RegisterOpStorageType("diag", typeof(CudaStorage))] // NOTE(review): CudaBasicOps.Diag in this same commit uses identical attribute arguments - confirm a double registration of "diag" is acceptable
+        public Tensor Diag(Tensor src) { return matrixKernels.Diag(src); } // returns the result of MatrixKernels.Diag(src) unchanged; NOTE(review): no Diag member of MatrixKernels is visible in this diff - confirm it exists
+    }
+}
diff --git a/Tensor/TensorSharp/TensorSharp.csproj b/Tensor/TensorSharp/TensorSharp.csproj
@@ -33,10 +33,6 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
 
-  <ItemGroup>
-    <Compile Remove="Cuda\DeviceCode\MatrixKernels.cs" />
-  </ItemGroup>
-
   <ItemGroup>
     <PackageReference Include="System.Drawing.Common" Version="4.5.1" />
     <PackageReference Include="System.Drawing.Primitives" Version="4.3.0" />
@@ -62,9 +58,6 @@
   </ItemGroup>
 
   <ItemGroup>
-    <None Update="Cuda\DeviceCode\CU\GatherSelect.cu">
-      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-    </None>
     <None Update="Cuda\DeviceCode\CU\MatrixOps.c">
       <CopyToOutputDirectory>Never</CopyToOutputDirectory>
     </None>

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ public void Im2Col_2d()`
`24`	`24`	`[TestMethod]`
`25`	`25`	`public void DiagTest()`
`26`	`26`	`{`
`27`		`- //Global.UseGpu();`
	`27`	`+ Global.UseGpu();`
`28`	`28`	`Tensor x = Tensor.FromArray(Global.Device, new float[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 });`
`29`	`29`	`x = x.Reshape(3, 3);`
`30`	`30`