GPUMLib  0.2.2
GPU Machine Learning Library
RANKernels.cu
1 /*
2  Ricardo Quintas is an MSc Student at the University of Coimbra, Portugal
3  Copyright (C) 2009, 2010 Ricardo Quintas
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "../common/CudaDefinitions.h"
22 #include "rankernels.h"
23 
24 namespace GPUMLib {
25 
26 /* KERNEL Euclidian Distance */
/* KERNEL Euclidian Distance
   Computes Output[idnx + idny * output_width] =
     sqrt( sum_i (Centers[idnx][i] - Input[idny][i])^2 ),
   i.e. the Euclidean distance between every input row and every center.
   Expects a 2D launch covering output_width x output_height. */
KERNEL EuclidianDistance(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, cudafloat *Centers, int centers_width){

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idnx < output_width && idny < output_height){

		float sum = 0;

		for(int i = 0; i < centers_width; i++){
			// explicit square in single precision: pow() would promote
			// the whole expression to double on every iteration
			float d = Centers[idnx * centers_width + i] - Input[idny * input_width + i];
			sum += d * d;
		}

		Output[idnx + idny * output_width] = sqrtf(sum);
	}
}
51 
// Host wrapper: launches EuclidianDistance with 16x16 thread blocks,
// using ceil-division so partial tiles at the matrix edges are covered.
extern "C" void KernelEuclidianDistance(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, cudafloat *Centers, int centers_width)
{
	const int blockSize = 16;

	int wBlocks = (output_width  + blockSize - 1) / blockSize;
	int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, hBlocks);
	dim3 threads(blockSize, blockSize);
	EuclidianDistance<<<grid, threads>>>(Output, output_height, output_width, Input, input_width, Centers, centers_width);
}
63 
/* For each row idny of the distance matrix Output, finds the smallest
   distance to a sample of a DIFFERENT class (Targets[j] != Targets[idny])
   and records the value in min_array[idny] and its column in min_idx[idny].

   Only one thread per row performs the scan (threadIdx.x == 0): the host
   wrapper launches 16x16 blocks, so without this guard the other 15
   x-lanes would repeat the identical scan and redundantly write the same
   results to global memory. Output values are unchanged. */
KERNEL FindMinKernel(cudafloat *Output, int output_height, int output_width, float *min_array, int* min_idx, cudafloat* Targets){

	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	// -1 sentinel is safe: distances come from sqrt(), hence are >= 0
	float min_tmp = -1;
	int idx = 0;

	if(idny < output_height && threadIdx.x == 0){

		for(int j = 0; j < output_width; j++){
			if(Targets[j] != Targets[idny] && (Output[idny*output_width+j] < min_tmp || min_tmp == -1)){
				min_tmp = Output[idny*output_width+j];
				idx = j;
			}
		}

		min_array[idny] = min_tmp;
		min_idx[idny] = idx;
	}
}
84 
// Host wrapper for FindMinKernel: a single column of blocks spanning
// output_height rows; the kernel only uses the y index.
extern "C" void FindMin(cudafloat *Output, int output_height, int output_width, float *min_array, int* min_idx, cudafloat* Targets)
{
	const int blockSize = 16;

	int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);
	FindMinKernel<<<grid, threads>>>(Output, output_height, output_width, min_array, min_idx, Targets);
}
96 
97 
98 
99 /********************/
100 /* KERNEL Euclidian Distance */
/* Computes, for one sample, its Euclidean distance to every center
   (Output[idnx] for center idnx), then thread 0 serially scans Output
   and stores the smallest distance in min_value[0].

   NOTE(review): __syncthreads() only synchronizes threads WITHIN a
   block, but the final scan at idnx == 0 reads Output entries written
   by other blocks. Whenever the host wrapper launches more than one
   block (output_width > blockDim.x) this is an inter-block race and
   min_value[0] may be computed from stale data — confirm callers keep
   output_width within a single block, or restructure with a second
   kernel / atomic min. */
KERNEL FindNearestCenter(cudafloat *Output, int output_width, cudafloat *Sample, cudafloat *Centers, int centers_width, float* min_value){

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;

	if(idnx < output_width){

		// accumulate in double (original choice kept; code unchanged)
		double sum = 0;

		double a;
		double b;

		for(int i = 0; i < centers_width; i++){

			a = Centers[idnx * centers_width + i];
			b = Sample[i];

			sum = sum + pow( a - b , 2);

		}

		Output[idnx] = sqrt(sum);
	}

	// barrier is outside the if, so all threads reach it (no divergence
	// hazard) — but see the inter-block race note above
	__syncthreads();

	if(idnx == 0){

		// serial min-scan over all distances by the first thread
		min_value[0] = Output[0];

		for(int i = 0; i < output_width; i++){

			if(min_value[0] > Output[i]){
				min_value[0] = Output[i];
			}

		}

	}

}
143 
/* Host wrapper for FindNearestCenter. The kernel is purely 1-D (it
   only derives idnx from the x dimension), so blocks are launched as
   blockSize x 1 instead of the original blockSize x blockSize — the 15
   extra y-lanes per block repeated exactly the same work, including 15
   redundant copies of the serial min-scan.

   NOTE(review): the kernel's final min-scan is only synchronized within
   a block; results are reliable only when a single block is launched
   (output_width <= blockSize) — verify against callers. */
extern "C" void KernelFindNearestCenter(cudafloat *Output, int output_width, cudafloat *Sample, cudafloat *Centers, int centers_width, float* min_value)
{
	int blockSize = 16;

	int wBlocks = output_width/blockSize + ((output_width%blockSize == 0)?0:1);

	dim3 grid(wBlocks, 1);
	dim3 threads(blockSize, 1);
	FindNearestCenter<<<grid,threads>>>(Output, output_width, Sample, Centers, centers_width, min_value);
}
155 
156 
157 /****************/
/* RBF activation of every (input row, center) pair, stored column-major:
     Output[idnx * output_height + idny] =
       exp( -||Input[idny] - Centers[idnx]||^2
            / (scalingfactor * c_width[idnx]^2) )
   Non-finite activations are flushed to 0. */
KERNEL ActivationMatrix(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, cudafloat *Centers, int centers_width, float* c_width, float scalingfactor){

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idnx < output_width && idny < output_height){

		float sum = 0;

		for(int i = 0; i < centers_width; i++){
			// explicit single-precision square; pow() would promote to double
			float d = Centers[idnx * centers_width + i] - Input[idny * input_width + i];
			sum += d * d;
		}

		// sum already holds the SQUARED distance: the original computed
		// sqrt(sum) and then pow(sum, 2), which cancel — both are skipped.
		float width = c_width[idnx];
		float value = expf(-(sum / (scalingfactor * width * width)));

		if(IsInfOrNaN(value)) value = CUDA_VALUE(0.0);

		// column-major store
		Output[idnx * output_height + idny] = value;
	}
}
193 
// Host wrapper: launches ActivationMatrix with 16x16 blocks covering
// the full output_width x output_height activation matrix.
extern "C" void KernelActivationMatrix(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, cudafloat *Centers, int centers_width, float *c_width, float scalingfactor)
{
	const int blockSize = 16;

	int wBlocks = (output_width  + blockSize - 1) / blockSize;
	int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, hBlocks);
	dim3 threads(blockSize, blockSize);
	ActivationMatrix<<<grid, threads>>>(Output, output_height, output_width, Input, input_width, Centers, centers_width, c_width, scalingfactor);
}
205 
// Writes the reciprocal of each non-zero entry of S onto the diagonal
// of Output (column-major square indexing: row * output_height + row).
// Presumably S holds singular values for a pseudo-inverse — one row is
// handled per y-thread; zero entries are left untouched.
KERNEL SigmaInverse(float *Output, int output_height,int output_width, cudafloat *S){

	int row = blockIdx.y*blockDim.y + threadIdx.y;

	if(row < output_height){
		cudafloat s = S[row];
		if(s != 0)
			Output[row * output_height + row] = 1/s;
	}
}
216 
// Host wrapper for SigmaInverse: one column of 16x16 blocks spanning
// output_height rows (the kernel only uses the y index).
extern "C" void KernelSigmaInverse(float *Output, int output_height, int output_width, cudafloat *S)
{
	const int blockSize = 16;

	int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	SigmaInverse<<<grid, threads>>>(Output, output_height, output_width, S);
}
229 
230 /************************/
/* Block-level partial reduction for a squared Euclidean distance:
   each block sums (A[i]-B[i])^2 over the elements it covers (two loads
   per thread at the first level) and writes one partial sum to
   Output[blockIdx.x]. The caller adds the partials and takes the root.
   Requires blockDim.x to be a power of two and blockDim.x *
   sizeof(cudafloat) bytes of dynamic shared memory. */
KERNEL CalculateDistance(cudafloat* Output, cudafloat* A, cudafloat* B, unsigned int n){
	extern __shared__ cudafloat sdata[];

	// first level of reduction: read from global, write to shared
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

	// explicit squares in single precision; pow() promotes to double
	cudafloat d = (i < n) ? (A[i] - B[i]) : 0;
	sdata[tid] = d * d;
	if (i + blockDim.x < n) {
		cudafloat d2 = A[i+blockDim.x] - B[i+blockDim.x];
		sdata[tid] += d2 * d2;
	}

	__syncthreads();

	// tree reduction in shared memory
	for(unsigned int s=blockDim.x/2; s>0; s>>=1)
	{
		if (tid < s)
		{
			sdata[tid] += sdata[tid + s];
		}
		__syncthreads();
	}

	// write this block's partial sum to global memory
	if(tid == 0) Output[blockIdx.x] = sdata[0];
}
258 
259 
// Rounds x up to the next power of two; returns x unchanged when it is
// already a power of two. Standard bit-smearing trick: subtract one,
// propagate the highest set bit into all lower positions, add one back.
extern "C" unsigned int nextPow2( unsigned int x ) {
	x -= 1;
	for (unsigned int shift = 1; shift <= 16; shift <<= 1)
		x |= x >> shift;
	return x + 1;
}
269 
/* Host driver computing the Euclidean distance between device vectors
   A and B of length n. Launches one pass of the block reduction, which
   leaves one partial sum of squared differences per block in `output`,
   then copies the partials back and finishes the sum on the CPU.

   NOTE: the original multi-pass loop re-invoked CalculateDistance on
   (A, B, s), which RE-SQUARES the first s elements of A and B instead
   of summing the first pass's partial results — wrong whenever more
   than 15 blocks were launched (n > 15 * 2 * blockSize). A single GPU
   pass followed by a host-side sum of the partials is always correct
   and keeps the same interface. */
extern "C" cudafloat KernelCalculateDistance(cudafloat *output, cudafloat *A, cudafloat *B,int n)
{
	int blockSize = 16;

	// pick a power-of-two thread count (each thread loads two elements)
	int threads = (n < blockSize*2) ? nextPow2((n + 1)/ 2) : blockSize;
	int blocks = (n + (threads * 2 - 1)) / (threads * 2);
	int smemSize = (threads <= 32) ? 2 * threads * sizeof(cudafloat) : threads * sizeof(cudafloat);

	dim3 dimGrid(blocks, 1, 1);
	dim3 dimBlock(threads, 1, 1);

	CalculateDistance<<<dimGrid,dimBlock,smemSize>>>(output,A,B,n);

	cudafloat gpu_result = 0;

	if (blocks > 1)
	{
		// copy all per-block partial sums back and accumulate on the host
		cudafloat* h_odata = (cudafloat*) malloc(sizeof(cudafloat) * blocks);

		cudaMemcpy(h_odata, output, blocks * sizeof(cudafloat), cudaMemcpyDeviceToHost);

		for(int i = 0; i < blocks; i++)
		{
			gpu_result += h_odata[i];
		}

		free(h_odata);
	}
	else
	{
		// single partial: the blocking copy also synchronizes with the kernel
		cudaMemcpy(&gpu_result, output, sizeof(cudafloat), cudaMemcpyDeviceToHost);
	}

	return sqrt(gpu_result);
}
327 
328 
329 
330 
331 
332 
333 
334 
335 
336 
337 
338 
339 
340 
341 
342 
343 
344 #define AS(i, j) As[i][j]
345 #define BS(i, j) Bs[i][j]
346 
347 #define BLOCK_SIZE 16
348 
// Naive (untiled) matrix product. One thread computes one element of C:
//   C[row][col] = sum_k A[k * hC + row] * B[k * wC + col]
// A is indexed column-major (hC rows), B and C row-major (wC columns);
// wA is the shared/inner dimension. wB is unused here.
__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB, int wC, int hC)
{
	int col = blockIdx.x*blockDim.x + threadIdx.x;
	int row = blockIdx.y*blockDim.y + threadIdx.y;

	if(col >= wC || row >= hC)
		return;

	float acc = 0;

	for(int k = 0; k < wA; k++)
		acc += A[k * hC + row] * B[k * wC + col];

	// row-major store
	C[col + wC * row] = acc;
}
384 
// Host wrapper: launches the naive matrixMul kernel over a 16x16 grid
// covering the uiWC x uiHC result matrix.
extern "C" void matmul(cudafloat *d_C, cudafloat* d_A, cudafloat* d_B,int uiWA,int uiWB, int uiWC, int uiHC)
{
	const int blockSize = 16;

	int wBlocks = (uiWC + blockSize - 1) / blockSize;
	int hBlocks = (uiHC + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, hBlocks);
	dim3 threads(blockSize, blockSize);

	matrixMul<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB, uiWC, uiHC);
}
398 
399 
400 
401 
402 
403 
404 
405 
406 
407 
408 
409 
/* Weighted RBF response of every (class, center) pair for one sample,
   stored row-major with one row per center:
     output[idnx + NumClasses * idny] =
       dWeights[idny][idnx] * exp( -||Sample - dCenters[idny]||^2
                                   / (scaling_factor * dWidths[idny]^2) ) */
KERNEL CalculateNetworkActivation(cudafloat* output, cudafloat* Sample,int Length,cudafloat* dCenters,int NumCenters,cudafloat* dWeights,int NumClasses,cudafloat* dWidths,float scaling_factor){

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;  // class index
	int idny = blockIdx.y*blockDim.y + threadIdx.y;  // center index

	if(idnx < NumClasses && idny < NumCenters){

		float sum = 0;

		for(int i = 0; i < Length; i++){
			// explicit single-precision square; pow() would promote to double
			float d = dCenters[idny * Length + i] - Sample[i];
			sum += d * d;
		}

		// sum is already the SQUARED distance: the original sqrt(sum)
		// followed by pow(sum, 2) cancel, so both are skipped.
		float width = dWidths[idny];
		float activation = expf(-(sum / (scaling_factor * width * width)));

		// row-major store, weighted by this center's class weight
		output[idnx + NumClasses * idny] = dWeights[idny*NumClasses+idnx] * activation;
	}
}
442 
// Host wrapper: launches CalculateNetworkActivation over a 16x16 grid
// covering NumClasses columns by NumCenters rows.
extern "C" void KernelCalculateNetworkActivation(cudafloat* output, cudafloat* Sample,int Length,cudafloat* dCenters,int NumCenters,cudafloat* dWeights,int NumClasses,cudafloat* dWidths,float scaling_factor)
{
	const int blockSize = 16;

	int wBlocks = (NumClasses + blockSize - 1) / blockSize;
	int hBlocks = (NumCenters + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, hBlocks);
	dim3 threads(blockSize, blockSize);
	CalculateNetworkActivation<<<grid, threads>>>(output, Sample, Length, dCenters, NumCenters, dWeights, NumClasses, dWidths, scaling_factor);
}
454 
455 
456 
457 
// Element-wise clamp: keep each width at the minimum of its current
// value and the candidate in newWidths (shrink-only update).
KERNEL UpdateWidths(cudafloat* dWidths, cudafloat* newWidths, int Length){

	int i = blockIdx.x*blockDim.x + threadIdx.x;

	if(i >= Length)
		return;

	cudafloat candidate = newWidths[i];
	if(candidate < dWidths[i])
		dWidths[i] = candidate;
}
470 
// Host wrapper: one row of 16x16 blocks spanning Length elements
// (the kernel only uses the x index).
extern "C" void KernelUpdateWidths(cudafloat* dWidths, cudafloat* newWidths, int Length)
{
	const int blockSize = 16;

	int wBlocks = (Length + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, 1);
	dim3 threads(blockSize, blockSize);
	UpdateWidths<<<grid, threads>>>(dWidths, newWidths, Length);
}
482 
483 
484 
485 
486 
/* Per-sample absolute error: output[i] = |result[i] - target[i]|.
   The original computed sqrt(pow(d, 2)), which is exactly |d| but at
   double-precision cost; fabsf produces the identical value directly.
   Non-finite errors are zeroed. */
KERNEL CalculateError(cudafloat* result, cudafloat* target, cudafloat* output, int Length){

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;

	if(idnx < Length){

		output[idnx] = fabsf(result[idnx] - target[idnx]);

		if(IsInfOrNaN(output[idnx])){
			output[idnx] = 0;
		}
	}
}
502 
/* Root-mean-square of the per-sample errors, written to error[0].
   Intentionally launched with a single thread (see KernelCalculateError,
   which uses a 1x1 grid and block); the serial loop IS the reduction.
   pow()/sqrt() replaced by an explicit square and sqrtf to stay in
   single precision. */
KERNEL ReduceError(cudafloat* output,cudafloat* error, int Length){

	float sum = 0;

	for(int i = 0; i < Length; i++){
		float e = output[i];
		sum += e * e;
	}
	error[0] = sqrtf(sum/Length);
}
513 
// Computes per-sample absolute errors into `output`, then reduces them
// to a single RMS value in error[0] with a deliberately single-threaded
// second kernel (ReduceError loops serially over all Length entries).
extern "C" void KernelCalculateError(cudafloat* result, cudafloat* target, cudafloat* output, int Length,float* error)
{
	const int blockSize = 16;

	int wBlocks = (Length + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, 1);
	dim3 threads(blockSize, blockSize);
	CalculateError<<<grid, threads>>>(result, target, output, Length);

	dim3 grid2(1, 1);
	dim3 threads2(1, 1);
	ReduceError<<<grid2, threads2>>>(output, error, Length);
}
529 
530 
531 
532 
// Collapses the activation matrix column-wise: for each column idnx,
// sums the NumCenters entries spaced Length apart and stores the total
// back into row 0 of the same column. Each thread touches only its own
// column, so the in-place overwrite of output[idnx] is safe.
KERNEL SumActivations(cudafloat* output, int Length, int NumCenters){

	int col = blockIdx.x*blockDim.x + threadIdx.x;

	if(col >= Length)
		return;

	float total = 0;

	for(int row = 0; row < NumCenters; row++)
		total += output[col + Length*row];

	output[col] = total;
}
549 
// Host wrapper: one row of 16x16 blocks spanning Length columns
// (the kernel only uses the x index).
extern "C" void KernelSumActivations(cudafloat* output, int Length, int NumCenters)
{
	const int blockSize = 16;

	int wBlocks = (Length + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, 1);
	dim3 threads(blockSize, blockSize);
	SumActivations<<<grid, threads>>>(output, Length, NumCenters);
}
561 
562 
563 
564 
565 
// Device-to-device element copy: dst[i] = src[i] for i in [0, Length).
KERNEL Copy(cudafloat* dst, cudafloat *src,int Length){

	int i = blockIdx.x*blockDim.x + threadIdx.x;

	if(i < Length)
		dst[i] = src[i];
}
577 
// Host wrapper launching the Copy kernel over Length elements with
// 16x16 blocks (only the x dimension is used by the kernel).
extern "C" void KernelCopyTo(cudafloat* dst, cudafloat *src,int Length)
{
	const int blockSize = 16;

	int wBlocks = (Length + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, 1);
	dim3 threads(blockSize, blockSize);
	Copy<<<grid, threads>>>(dst, src, Length);
}
589 
590 }
#define KERNEL
Defines the type of a kernel function.
#define CUDA_VALUE(X)
void KernelSigmaInverse(float *Output, int output_height, int output_width, cudafloat *S)
Definition: RANKernels.cu:217
void KernelEuclidianDistance(cudafloat *d_C, cudafloat *d_A, cudafloat *d_B, int uiWA, int uiWB, int uiWC, int uiHC)
float cudafloat