GPUMLib  0.2.2
GPU Machine Learning Library
MinKernel.cu
/*
    Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
    Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes

    This file is part of GPUMLib.

    GPUMLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <limits>
#include "reduction.h"

namespace GPUMLib {

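// Computes one partial minimum per block. Each thread loads one element into
// shared memory (padding with MAX_CUDAFLOAT past the end of the array) and the
// block reduces it in a tree, writing its result to output[blockIdx.x].
// blockSize must equal blockDim.x and be a power of two; shared memory must
// hold one cudafloat per thread. The last 32 threads reduce through a volatile
// pointer, relying on warp-synchronous execution instead of __syncthreads().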
template <int blockSize> KERNEL Min(cudafloat * inputs, cudafloat * output, int numInputs) {
    extern __shared__ cudafloat minvalue[];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    cudafloat value = MAX_CUDAFLOAT;
    if (idx < numInputs) value = inputs[idx];

    minvalue[threadIdx.x] = value;
    __syncthreads();

    if (blockSize >= 1024) {
        if (threadIdx.x < 512 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 512]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 512];
        __syncthreads();
    }

    if (blockSize >= 512) {
        if (threadIdx.x < 256 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 256]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 256];
        __syncthreads();
    }

    if (blockSize >= 256) {
        if (threadIdx.x < 128 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 128]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 128];
        __syncthreads();
    }

    if (blockSize >= 128) {
        if (threadIdx.x < 64 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 64]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 64];
        __syncthreads();
    }

    if (threadIdx.x < 32) {
        volatile cudafloat * _minvalue = minvalue;

        if (blockSize >= 64) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 32]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 32];
        }

        if (blockSize >= 32) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 16]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 16];
        }

        if (blockSize >= 16) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 8]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 8];
        }

        if (blockSize >= 8) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 4]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 4];
        }

        if (blockSize >= 4) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 2]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 2];
        }

        if (blockSize >= 2) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 1]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 1];
        }

        if (threadIdx.x == 0) {
            output[blockIdx.x] = minvalue[0];
        }
    }
}

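// Same reduction as Min, but also tracks the global index of each block's
// minimum, written to indexes[blockIdx.x]. Shared memory holds blockDim.x
// cudafloats followed by blockDim.x ints (value and position per thread).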
template <int blockSize> KERNEL MinIndex(cudafloat * inputs, cudafloat * output, int * indexes, int numInputs) {
    extern __shared__ cudafloat minvalue[];

    int * minpos = (int *) (minvalue + blockDim.x);

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    cudafloat value = MAX_CUDAFLOAT;
    if (idx < numInputs) value = inputs[idx];

    minvalue[threadIdx.x] = value;
    minpos[threadIdx.x] = idx;
    __syncthreads();

    if (blockSize >= 1024) {
        if (threadIdx.x < 512 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 512]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 512];
            minpos[threadIdx.x] = minpos[threadIdx.x + 512];
        }
        __syncthreads();
    }

    if (blockSize >= 512) {
        if (threadIdx.x < 256 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 256]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 256];
            minpos[threadIdx.x] = minpos[threadIdx.x + 256];
        }
        __syncthreads();
    }

    if (blockSize >= 256) {
        if (threadIdx.x < 128 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 128]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 128];
            minpos[threadIdx.x] = minpos[threadIdx.x + 128];
        }
        __syncthreads();
    }

    if (blockSize >= 128) {
        if (threadIdx.x < 64 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 64]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 64];
            minpos[threadIdx.x] = minpos[threadIdx.x + 64];
        }
        __syncthreads();
    }

    if (threadIdx.x < 32) {
        volatile cudafloat * _minvalue = minvalue;
        volatile int * _minpos = minpos;

        if (blockSize >= 64) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 32]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 32];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 32];
            }
        }

        if (blockSize >= 32) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 16]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 16];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 16];
            }
        }

        if (blockSize >= 16) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 8]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 8];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 8];
            }
        }

        if (blockSize >= 8) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 4]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 4];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 4];
            }
        }

        if (blockSize >= 4) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 2]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 2];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 2];
            }
        }

        if (blockSize >= 2) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 1]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 1];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 1];
            }
        }

        if (threadIdx.x == 0) {
            output[blockIdx.x] = minvalue[0];
            indexes[blockIdx.x] = minpos[0];
        }
    }
}

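// Single-block variant of Min: each thread strides over the whole array, so
// one launch can reduce arrays larger than blockDim.x down to a single value.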
template <int blockSize> KERNEL MinSmallArray(cudafloat * inputs, cudafloat * output, int numInputs) {
    extern __shared__ cudafloat minvalue[];

    minvalue[threadIdx.x] = MAX_CUDAFLOAT;
    for (int i = threadIdx.x; i < numInputs; i += blockDim.x) if (minvalue[threadIdx.x] > inputs[i]) minvalue[threadIdx.x] = inputs[i];
    __syncthreads();

    if (blockSize >= 1024) {
        if (threadIdx.x < 512 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 512]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 512];
        __syncthreads();
    }

    if (blockSize >= 512) {
        if (threadIdx.x < 256 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 256]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 256];
        __syncthreads();
    }

    if (blockSize >= 256) {
        if (threadIdx.x < 128 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 128]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 128];
        __syncthreads();
    }

    if (blockSize >= 128) {
        if (threadIdx.x < 64 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 64]) minvalue[threadIdx.x] = minvalue[threadIdx.x + 64];
        __syncthreads();
    }

    if (threadIdx.x < 32) {
        volatile cudafloat * _minvalue = minvalue;

        if (blockSize >= 64) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 32]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 32];
        }

        if (blockSize >= 32) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 16]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 16];
        }

        if (blockSize >= 16) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 8]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 8];
        }

        if (blockSize >= 8) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 4]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 4];
        }

        if (blockSize >= 4) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 2]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 2];
        }

        if (blockSize >= 2) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 1]) _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 1];
        }

        if (threadIdx.x == 0) {
            output[blockIdx.x] = minvalue[0];
        }
    }
}

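// Single-block variant of MinIndex. When indexes is not null, the position of
// each element is taken from indexes[i] instead of i, which lets a second pass
// map partial results back to positions in the original array.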
template <int blockSize> KERNEL MinSmallArrayIndex(cudafloat * inputs, cudafloat * output, int * minIndex, int numInputs, int * indexes) {
    extern __shared__ cudafloat minvalue[];

    int * minpos = (int *) (minvalue + blockDim.x);

    minvalue[threadIdx.x] = MAX_CUDAFLOAT;
    minpos[threadIdx.x] = 0; // keep minpos defined even if this thread never finds a value below MAX_CUDAFLOAT
    for (int i = threadIdx.x; i < numInputs; i += blockDim.x) {
        if (minvalue[threadIdx.x] > inputs[i]) {
            minvalue[threadIdx.x] = inputs[i];
            if (indexes != nullptr) {
                minpos[threadIdx.x] = indexes[i];
            } else {
                minpos[threadIdx.x] = i;
            }
        }
    }
    __syncthreads();

    if (blockSize >= 1024) {
        if (threadIdx.x < 512 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 512]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 512];
            minpos[threadIdx.x] = minpos[threadIdx.x + 512];
        }
        __syncthreads();
    }

    if (blockSize >= 512) {
        if (threadIdx.x < 256 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 256]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 256];
            minpos[threadIdx.x] = minpos[threadIdx.x + 256];
        }
        __syncthreads();
    }

    if (blockSize >= 256) {
        if (threadIdx.x < 128 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 128]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 128];
            minpos[threadIdx.x] = minpos[threadIdx.x + 128];
        }
        __syncthreads();
    }

    if (blockSize >= 128) {
        if (threadIdx.x < 64 && minvalue[threadIdx.x] > minvalue[threadIdx.x + 64]) {
            minvalue[threadIdx.x] = minvalue[threadIdx.x + 64];
            minpos[threadIdx.x] = minpos[threadIdx.x + 64];
        }
        __syncthreads();
    }

    if (threadIdx.x < 32) {
        volatile cudafloat * _minvalue = minvalue;
        volatile int * _minpos = minpos;

        if (blockSize >= 64) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 32]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 32];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 32];
            }
        }

        if (blockSize >= 32) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 16]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 16];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 16];
            }
        }

        if (blockSize >= 16) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 8]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 8];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 8];
            }
        }

        if (blockSize >= 8) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 4]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 4];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 4];
            }
        }

        if (blockSize >= 4) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 2]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 2];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 2];
            }
        }

        if (blockSize >= 2) {
            if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 1]) {
                _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 1];
                _minpos[threadIdx.x] = _minpos[threadIdx.x + 1];
            }
        }

        if (threadIdx.x == 0) {
            output[blockIdx.x] = minvalue[0];
            minIndex[blockIdx.x] = minpos[0];
        }
    }
}

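// Host-side dispatcher: selects the kernel template instance matching
// blockSize (which must be a power of two, at most 1024 on Fermi-class
// devices and 512 otherwise) and launches the single-block or multi-block
// minimum reduction with one cudafloat of shared memory per thread.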
void KernelMin(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs) {
    if (blocks == 1) {
        switch (blockSize) {
#ifdef FERMI
            case 1024:
                MinSmallArray<1024><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
#endif
            case 512:
                MinSmallArray<512><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 256:
                MinSmallArray<256><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 128:
                MinSmallArray<128><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 64:
                MinSmallArray<64><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 32:
                MinSmallArray<32><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 16:
                MinSmallArray<16><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 8:
                MinSmallArray<8><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 4:
                MinSmallArray<4><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 2:
                MinSmallArray<2><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 1:
                MinSmallArray<1><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
        }
    } else {
        switch (blockSize) {
#ifdef FERMI
            case 1024:
                Min<1024><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
#endif
            case 512:
                Min<512><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 256:
                Min<256><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 128:
                Min<128><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 64:
                Min<64><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 32:
                Min<32><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 16:
                Min<16><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 8:
                Min<8><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 4:
                Min<4><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 2:
                Min<2><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
            case 1:
                Min<1><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs);
                break;
        }
    }
}

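// Host-side dispatcher for the index-tracking reductions. Shared memory must
// hold a cudafloat and an int per thread. Note that the multi-block path
// (MinIndex) ignores the indexes parameter: it always records global input
// positions, which a subsequent single-block pass can then feed back in.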
void KernelMinIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int * minIndexes, int numInputs, int * indexes) {
    if (blocks == 1) {
        switch (blockSize) {
#ifdef FERMI
            case 1024:
                MinSmallArrayIndex<1024><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
#endif
            case 512:
                MinSmallArrayIndex<512><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 256:
                MinSmallArrayIndex<256><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 128:
                MinSmallArrayIndex<128><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 64:
                MinSmallArrayIndex<64><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 32:
                MinSmallArrayIndex<32><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 16:
                MinSmallArrayIndex<16><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 8:
                MinSmallArrayIndex<8><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 4:
                MinSmallArrayIndex<4><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 2:
                MinSmallArrayIndex<2><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
            case 1:
                MinSmallArrayIndex<1><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs, indexes);
                break;
        }
    } else {
        switch (blockSize) {
#ifdef FERMI
            case 1024:
                MinIndex<1024><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
#endif
            case 512:
                MinIndex<512><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 256:
                MinIndex<256><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 128:
                MinIndex<128><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 64:
                MinIndex<64><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 32:
                MinIndex<32><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 16:
                MinIndex<16><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 8:
                MinIndex<8><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 4:
                MinIndex<4><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 2:
                MinIndex<2><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
            case 1:
                MinIndex<1><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int)), stream>>>(inputs, output, minIndexes, numInputs);
                break;
        }
    }
}

}
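
A typical use of these dispatchers is a two-pass reduction: a first multi-block launch produces one partial minimum per block, and a second single-block launch reduces those partials to the final value. The sketch below is illustrative only and is not part of GPUMLib: DeviceMin, DeviceMinIndex, and the d_* buffer names are hypothetical, and it assumes cudafloat is float and that blockSize is one of the powers of two handled by the dispatch switches above.

// Hypothetical host-side sketch (not part of GPUMLib): two-pass minimum
// reduction of n device values using the dispatchers defined above.
void DeviceMin(cudaStream_t stream, cudafloat * d_inputs, cudafloat * d_partials, cudafloat * d_result, int n) {
    const int blockSize = 512; // must be a power of two covered by the dispatch switch
    int blocks = (n + blockSize - 1) / blockSize;

    if (blocks == 1) {
        // Small array: a single block strides over all n elements (MinSmallArray).
        GPUMLib::KernelMin(stream, 1, blockSize, d_inputs, d_result, n);
    } else {
        // Pass 1: one partial minimum per block (Min).
        GPUMLib::KernelMin(stream, blocks, blockSize, d_inputs, d_partials, n);
        // Pass 2: a single block reduces the partials (it strides if blocks > blockSize).
        GPUMLib::KernelMin(stream, 1, blockSize, d_partials, d_result, blocks);
    }
}

// Same two-pass pattern with index tracking: pass 1 records the global index
// of each block's minimum, and pass 2 feeds those indexes back in so the final
// index refers to a position in the original array.
void DeviceMinIndex(cudaStream_t stream, cudafloat * d_inputs, cudafloat * d_partials, int * d_partialIdx, cudafloat * d_result, int * d_resultIdx, int n) {
    const int blockSize = 512;
    int blocks = (n + blockSize - 1) / blockSize;

    if (blocks == 1) {
        // indexes == nullptr: positions are taken directly from d_inputs.
        GPUMLib::KernelMinIndexes(stream, 1, blockSize, d_inputs, d_result, d_resultIdx, n, nullptr);
    } else {
        GPUMLib::KernelMinIndexes(stream, blocks, blockSize, d_inputs, d_partials, d_partialIdx, n, nullptr);
        // The partial indexes map the reduced partials back to the original array.
        GPUMLib::KernelMinIndexes(stream, 1, blockSize, d_partials, d_result, d_resultIdx, blocks, d_partialIdx);
    }
}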