GPUMLib 0.2.2
GPU Machine Learning Library
MaxKernel.cu
/*
	Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
	Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes

	This file is part of GPUMLib.

	GPUMLib is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "reduction.h"

namespace GPUMLib {

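// Computes, in parallel, the maximum of the inputs array. Each block reduces
// blockSize elements in shared memory and writes its partial maximum to
// output[blockIdx.x]. Elements past numInputs are padded with MIN_CUDAFLOAT,
// which acts as the identity element for the max operation.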
template <int blockSize> KERNEL Max(cudafloat * inputs, cudafloat * output, int numInputs) {
	extern __shared__ cudafloat maxvalue[];

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	cudafloat value = MIN_CUDAFLOAT;
	if (idx < numInputs) value = inputs[idx];

	maxvalue[threadIdx.x] = value;
	__syncthreads();

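	// Tree reduction in shared memory, halving the number of active threads at
	// each step. blockSize is a compile-time constant, so the untaken branches
	// are eliminated and the reduction is fully unrolled.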
	if (blockSize >= 1024) {
		if (threadIdx.x < 512 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 512]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 512];
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 256]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 256];
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 128]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 128];
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 64]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 64];
		__syncthreads();
	}

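	// The final steps run within a single warp. The volatile pointer forces
	// every read and write to go through shared memory rather than being
	// cached in registers; this warp-synchronous idiom (valid on the pre-Volta
	// GPUs this code targets) avoids further __syncthreads() calls.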
	if (threadIdx.x < 32) {
		volatile cudafloat * _maxvalue = maxvalue;

		if (blockSize >= 64) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 32]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 32];
		}

		if (blockSize >= 32) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 16]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 16];
		}

		if (blockSize >= 16) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 8]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 8];
		}

		if (blockSize >= 8) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 4]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 4];
		}

		if (blockSize >= 4) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 2]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 2];
		}

		if (blockSize >= 2) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 1]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 1];
		}

		if (threadIdx.x == 0) {
			output[blockIdx.x] = maxvalue[0];
		}
	}
}

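// Same reduction pattern as Max, but additionally tracks the position of the
// maximum. The dynamic shared-memory buffer holds blockDim.x values followed
// by blockDim.x indices, so the launch must allocate
// blockSize * (sizeof(cudafloat) + sizeof(int)) bytes.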
template <int blockSize> KERNEL MaxIndex(cudafloat * inputs, cudafloat * output, int * indexes, int numInputs) {
	extern __shared__ cudafloat maxvalue[];

	int * maxpos = (int *) (maxvalue + blockDim.x);

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	cudafloat value = MIN_CUDAFLOAT;
	if (idx < numInputs) value = inputs[idx];

	maxvalue[threadIdx.x] = value;
	maxpos[threadIdx.x] = idx;
	__syncthreads();

	if (blockSize >= 1024) {
		if (threadIdx.x < 512 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 512]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 512];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 512];
		}
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 256]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 256];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 256];
		}
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 128]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 128];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 128];
		}
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 64]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 64];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 64];
		}
		__syncthreads();
	}

	if (threadIdx.x < 32) {
		volatile cudafloat * _maxvalue = maxvalue;
		volatile int * _maxpos = maxpos;

		if (blockSize >= 64) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 32]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 32];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 32];
			}
		}

		if (blockSize >= 32) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 16]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 16];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 16];
			}
		}

		if (blockSize >= 16) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 8]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 8];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 8];
			}
		}

		if (blockSize >= 8) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 4]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 4];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 4];
			}
		}

		if (blockSize >= 4) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 2]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 2];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 2];
			}
		}

		if (blockSize >= 2) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 1]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 1];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 1];
			}
		}

		if (threadIdx.x == 0) {
			output[blockIdx.x] = maxvalue[0];
			indexes[blockIdx.x] = maxpos[0];
		}
	}
}

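// Single-block variant, launched when one block suffices. Each thread first
// strides through the whole array accumulating its own maximum, after which
// the block is reduced exactly as in Max.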
template <int blockSize> KERNEL MaxSmallArray(cudafloat * inputs, cudafloat * output, int numInputs) {
	extern __shared__ cudafloat maxvalue[];

	maxvalue[threadIdx.x] = MIN_CUDAFLOAT;
	for(int i = threadIdx.x; i < numInputs; i += blockDim.x) if (maxvalue[threadIdx.x] < inputs[i]) maxvalue[threadIdx.x] = inputs[i];
	__syncthreads();

	if (blockSize >= 1024) {
		if (threadIdx.x < 512 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 512]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 512];
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 256]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 256];
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 128]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 128];
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 64]) maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 64];
		__syncthreads();
	}

	if (threadIdx.x < 32) {
		volatile cudafloat * _maxvalue = maxvalue;

		if (blockSize >= 64) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 32]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 32];
		}

		if (blockSize >= 32) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 16]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 16];
		}

		if (blockSize >= 16) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 8]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 8];
		}

		if (blockSize >= 8) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 4]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 4];
		}

		if (blockSize >= 4) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 2]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 2];
		}

		if (blockSize >= 2) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 1]) _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 1];
		}

		if (threadIdx.x == 0) {
			output[blockIdx.x] = maxvalue[0];
		}
	}
}

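// Single-block variant of MaxIndex. When indexes is not null, each local
// position is mapped through it; this lets a second-pass reduction over
// per-block maxima report indices into the original inputs array rather than
// into the array of partial results.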
template <int blockSize> KERNEL MaxSmallArrayIndex(cudafloat * inputs, cudafloat * output, int * maxIndex, int numInputs, int * indexes) {
	extern __shared__ cudafloat maxvalue[];

	int * maxpos = (int *) (maxvalue + blockDim.x);

	maxvalue[threadIdx.x] = MIN_CUDAFLOAT;
	for(int i = threadIdx.x; i < numInputs; i += blockDim.x) {
		if (maxvalue[threadIdx.x] < inputs[i]) {
			maxvalue[threadIdx.x] = inputs[i];
			if (indexes != nullptr) {
				maxpos[threadIdx.x] = indexes[i];
			} else {
				maxpos[threadIdx.x] = i;
			}
		}
	}
	__syncthreads();

	if (blockSize >= 1024) {
		if (threadIdx.x < 512 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 512]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 512];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 512];
		}
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 256]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 256];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 256];
		}
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 128]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 128];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 128];
		}
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64 && maxvalue[threadIdx.x] < maxvalue[threadIdx.x + 64]) {
			maxvalue[threadIdx.x] = maxvalue[threadIdx.x + 64];
			maxpos[threadIdx.x] = maxpos[threadIdx.x + 64];
		}
		__syncthreads();
	}

	if (threadIdx.x < 32) {
		volatile cudafloat * _maxvalue = maxvalue;
		volatile int * _maxpos = maxpos;

		if (blockSize >= 64) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 32]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 32];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 32];
			}
		}

		if (blockSize >= 32) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 16]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 16];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 16];
			}
		}

		if (blockSize >= 16) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 8]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 8];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 8];
			}
		}

		if (blockSize >= 8) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 4]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 4];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 4];
			}
		}

		if (blockSize >= 4) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 2]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 2];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 2];
			}
		}

		if (blockSize >= 2) {
			if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 1]) {
				_maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 1];
				_maxpos[threadIdx.x] = _maxpos[threadIdx.x + 1];
			}
		}

		if (threadIdx.x == 0) {
			output[blockIdx.x] = maxvalue[0];
			maxIndex[blockIdx.x] = maxpos[0];
		}
	}
}

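// Host-side dispatcher for the max reduction. blockSize must match one of the
// power-of-two cases below (1024 only on Fermi-class GPUs); the switch turns
// the runtime value into the matching template instantiation so the kernels
// stay fully unrolled. With a single block the result is final; with several
// blocks, output receives one partial maximum per block and a second pass is
// needed.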
void KernelMax(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs) {
	if (blocks == 1) {
		switch(blockSize) {
			#ifdef FERMI
			case 1024:
				MaxSmallArray<1024><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			#endif
			case 512:
				MaxSmallArray<512><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 256:
				MaxSmallArray<256><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 128:
				MaxSmallArray<128><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 64:
				MaxSmallArray<64><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 32:
				MaxSmallArray<32><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 16:
				MaxSmallArray<16><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 8:
				MaxSmallArray<8><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 4:
				MaxSmallArray<4><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 2:
				MaxSmallArray<2><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 1:
				MaxSmallArray<1><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
		}
	} else {
		switch(blockSize) {
			#ifdef FERMI
			case 1024:
				Max<1024><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			#endif
			case 512:
				Max<512><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 256:
				Max<256><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 128:
				Max<128><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 64:
				Max<64><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 32:
				Max<32><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 16:
				Max<16><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 8:
				Max<8><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 4:
				Max<4><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 2:
				Max<2><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
			case 1:
				Max<1><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
				break;
		}
	}
}

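// Host-side dispatcher for the max-with-index reduction. The shared-memory
// allocation holds one cudafloat and one int per thread. The indexes argument
// is only consulted by the single-block MaxSmallArrayIndex path, where a
// non-null value maps each position through a previous pass's index output.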
void KernelMaxIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int * maxIndexes, int numInputs, int * indexes) {
	if (blocks == 1) {
		switch(blockSize) {
			#ifdef FERMI
			case 1024:
				MaxSmallArrayIndex<1024><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			#endif
			case 512:
				MaxSmallArrayIndex<512><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 256:
				MaxSmallArrayIndex<256><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 128:
				MaxSmallArrayIndex<128><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 64:
				MaxSmallArrayIndex<64><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 32:
				MaxSmallArrayIndex<32><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 16:
				MaxSmallArrayIndex<16><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 8:
				MaxSmallArrayIndex<8><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 4:
				MaxSmallArrayIndex<4><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 2:
				MaxSmallArrayIndex<2><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
			case 1:
				MaxSmallArrayIndex<1><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs, indexes);
				break;
		}
	} else {
		switch(blockSize) {
			#ifdef FERMI
			case 1024:
				MaxIndex<1024><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			#endif
			case 512:
				MaxIndex<512><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 256:
				MaxIndex<256><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 128:
				MaxIndex<128><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 64:
				MaxIndex<64><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 32:
				MaxIndex<32><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 16:
				MaxIndex<16><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 8:
				MaxIndex<8><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 4:
				MaxIndex<4><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 2:
				MaxIndex<2><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
			case 1:
				MaxIndex<1><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, maxIndexes, numInputs);
				break;
		}
	}
}

}
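
Example usage. The following minimal host-side sketch (not part of the library source) chains two KernelMax launches to find the maximum of n device floats. The function name DeviceMax, the buffer names, and the block size of 256 are illustrative assumptions; GPUMLib's own reduction wrappers may orchestrate this differently.

#include <cuda_runtime.h>
#include "reduction.h"

void DeviceMax(cudafloat * d_inputs, int n, cudafloat * d_max) {
	const int blockSize = 256;                       // must match one of the dispatcher's power-of-two cases
	int blocks = (n + blockSize - 1) / blockSize;    // one partial maximum per block

	if (blocks == 1) {
		// Small array: a single block reduces everything in one pass.
		GPUMLib::KernelMax(0, 1, blockSize, d_inputs, d_max, n);
		return;
	}

	cudafloat * d_partial;                           // hypothetical scratch buffer
	cudaMalloc((void **) &d_partial, blocks * sizeof(cudafloat));

	// First pass: each block writes its partial maximum to d_partial.
	GPUMLib::KernelMax(0, blocks, blockSize, d_inputs, d_partial, n);

	// Second pass: one block reduces the partial maxima. MaxSmallArray strides
	// over all of its inputs, so a single second pass suffices even when
	// blocks exceeds blockSize.
	GPUMLib::KernelMax(0, 1, blockSize, d_partial, d_max, blocks);

	cudaFree(d_partial);
}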