GPUMLib  0.2.2
GPU Machine Learning Library
SumKernel.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "SumWarp.h"
22 #include "reduction.h"
23 
24 namespace GPUMLib {
25 
// Computes one partial sum per block. Each block copies up to blockDim.x
// elements of inputs into dynamic shared memory, reduces them (shared-memory
// tree via SumBeforeWarp, then warp-level via SumWarp), and thread 0 stores
// the block total to outputs[blockIdx.x].
// Launch requirements: blockSize == blockDim.x (power of two) and
// blockSize * sizeof(cudafloat) bytes of dynamic shared memory.
template <int blockSize> KERNEL Sum(cudafloat * inputs, cudafloat * outputs, int numInputs) {
	extern __shared__ cudafloat partialSums[];

	// Threads past the end of the array contribute zero, so the tail block
	// reduces correctly when numInputs is not a multiple of blockDim.x.
	int globalId = blockIdx.x * blockDim.x + threadIdx.x;
	partialSums[threadIdx.x] = (globalId < numInputs) ? inputs[globalId] : CUDA_VALUE(0.0);
	__syncthreads();

	// Fold the shared array down to the first warp, then finish inside it.
	SumBeforeWarp<blockSize>(partialSums);

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(partialSums);
		if (threadIdx.x == 0) outputs[blockIdx.x] = partialSums[0];
	}
}
44 
// Host-side dispatcher for the Sum kernel: maps the runtime blockSize onto
// the matching compile-time template instantiation. Every launch uses
// `blocks` blocks of `blockSize` threads and blockSize * sizeof(cudafloat)
// bytes of dynamic shared memory on `stream`. blockSize must be a power of
// two up to 512 (1024 when FERMI is defined); any other value launches
// nothing, exactly as before.
void KernelSum(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * outputs, int numInputs) {
	// Local helper macro: one case label per supported template instantiation.
	#define GPUMLIB_LAUNCH_SUM(N) \
		case N: \
			Sum<N><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, outputs, numInputs); \
			break;

	switch(blockSize) {
		#ifdef FERMI
		GPUMLIB_LAUNCH_SUM(1024)
		#endif
		GPUMLIB_LAUNCH_SUM(512)
		GPUMLIB_LAUNCH_SUM(256)
		GPUMLIB_LAUNCH_SUM(128)
		GPUMLIB_LAUNCH_SUM(64)
		GPUMLIB_LAUNCH_SUM(32)
		GPUMLIB_LAUNCH_SUM(16)
		GPUMLIB_LAUNCH_SUM(8)
		GPUMLIB_LAUNCH_SUM(4)
		GPUMLIB_LAUNCH_SUM(2)
		GPUMLIB_LAUNCH_SUM(1)
	}

	#undef GPUMLIB_LAUNCH_SUM
}
84 
// Single-block kernel that sums a (small) array and scales the result by
// multiplyFactor. Each thread accumulates a strided slice of the input, the
// per-thread totals are reduced in dynamic shared memory (SumBeforeWarp then
// SumWarp), and thread 0 writes the scaled grand total to output[blockIdx.x].
// Launch requirements: one block of blockSize == blockDim.x threads and
// blockSize * sizeof(cudafloat) bytes of dynamic shared memory.
template <int blockSize> KERNEL SumSmallArray(cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor) {
	extern __shared__ cudafloat threadTotals[];

	// Strided accumulation: thread t handles elements t, t + blockDim.x, ...
	// This also yields zero for threads with no elements (numInputs < blockDim.x).
	cudafloat acc = CUDA_VALUE(0.0);
	for (int element = threadIdx.x; element < numInputs; element += blockDim.x) {
		acc += inputs[element];
	}
	threadTotals[threadIdx.x] = acc;
	__syncthreads();

	SumBeforeWarp<blockSize>(threadTotals);

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(threadTotals);
		if (threadIdx.x == 0) output[blockIdx.x] = threadTotals[0] * multiplyFactor;
	}
}
101 
// Host-side dispatcher for the SumSmallArray kernel: selects the template
// instantiation matching the runtime blockSize. Always launches a single
// block of `blockSize` threads with blockSize * sizeof(cudafloat) bytes of
// dynamic shared memory on `stream`. blockSize must be a power of two up to
// 512 (1024 when FERMI is defined); any other value launches nothing,
// exactly as before.
void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor) {
	// Local helper macro: one case label per supported template instantiation.
	#define GPUMLIB_LAUNCH_SUM_SMALL(N) \
		case N: \
			SumSmallArray<N><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor); \
			break;

	switch(blockSize) {
		#ifdef FERMI
		GPUMLIB_LAUNCH_SUM_SMALL(1024)
		#endif
		GPUMLIB_LAUNCH_SUM_SMALL(512)
		GPUMLIB_LAUNCH_SUM_SMALL(256)
		GPUMLIB_LAUNCH_SUM_SMALL(128)
		GPUMLIB_LAUNCH_SUM_SMALL(64)
		GPUMLIB_LAUNCH_SUM_SMALL(32)
		GPUMLIB_LAUNCH_SUM_SMALL(16)
		GPUMLIB_LAUNCH_SUM_SMALL(8)
		GPUMLIB_LAUNCH_SUM_SMALL(4)
		GPUMLIB_LAUNCH_SUM_SMALL(2)
		GPUMLIB_LAUNCH_SUM_SMALL(1)
	}

	#undef GPUMLIB_LAUNCH_SUM_SMALL
}
141 
142 }
#define KERNEL
Defines the type of a kernel function.
void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs, cudafloat multiplyFactor)
Definition: SumKernel.cu:102
void KernelSum(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *outputs, int numInputs)
Definition: SumKernel.cu:45
#define CUDA_VALUE(X)
float cudafloat