GPUMLib  0.2.2
GPU Machine Learning Library
KMeansKernels.cu
1 /*
2  Ricardo Quintas is an MSc Student at the University of Coimbra, Portugal
3  Copyright (C) 2009, 2010 Ricardo Quintas
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "../common/CudaDefinitions.h"
22 #include "kmeanskernels.h"
23 
24 
25 
26 
27 
28 #define AS(i, j) As[i][j]
29 #define BS(i, j) Bs[i][j]
30 
31 #define BLOCK_SIZE 16
32 
__global__ void
EuclidianDistance( float* C, float* A, float* B, int wA, int wB, int wC, int hC)
{
	// Computes the matrix of Euclidian distances C (hC x wC) between every
	// row of A (hC x wA) and every row of B (wC x wB), tiling both inputs
	// through shared memory. Assumes wA == wB (same feature count) and a
	// BLOCK_SIZE x BLOCK_SIZE thread block.

	// Global output coordinates used for the final bounds check.
	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	// Block index
	int bx = blockIdx.x;
	int by = blockIdx.y;

	// Thread index
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// Index of the first sub-matrix of A processed by the block
	int aBegin = wA * BLOCK_SIZE * by;

	// Index of the last sub-matrix of A processed by the block
	int aEnd = aBegin + wA - 1;

	// Step size used to iterate through the sub-matrices of A
	int aStep = BLOCK_SIZE;

	// Index of the first sub-matrix of B processed by the block
	int bBegin = wB * BLOCK_SIZE * bx;

	// Step size used to iterate through the sub-matrices of B
	int bStep = BLOCK_SIZE;

	// Accumulated squared distance for this thread's output element.
	float Csub = 0.0f;

	// Loop over all the sub-matrices of A and B required to compute the
	// block sub-matrix.
	for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

		// Shared-memory tiles of A and B for this iteration.
		__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
		__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

		// Each thread loads one element of each tile. Fixed: out-of-range
		// slots are zero-filled instead of left uninitialized (the original
		// could read garbage shared memory on partial tiles).
		AS(ty, tx) = ((a + wA * ty + tx) < hC*wA) ? A[a + wA * ty + tx] : 0.0f;
		BS(ty, tx) = ((b + wB * ty + tx) < wC*wB) ? B[b + wB * ty + tx] : 0.0f;

		// Make sure both tiles are fully loaded before anyone reads them.
		__syncthreads();

		// Accumulate squared differences for the valid part of the tile.
		// Fixed: diff*diff instead of the double-precision pow(x, 2) the
		// original (manually unrolled) code used.
		#pragma unroll
		for (int k = 0; k < BLOCK_SIZE; ++k) {
			if ((a + k) <= aEnd) {
				float diff = AS(ty, k) - BS(tx, k);
				Csub += diff * diff;
			}
		}

		// All reads of the tiles must finish before the next iteration
		// overwrites them.
		__syncthreads();
	}

	// Write the final distance; each in-range thread writes one element.
	if(idnx < wC && idny < hC){
		int c = wC * BLOCK_SIZE * by + BLOCK_SIZE * bx;
		C[c + wC * ty + tx] = sqrtf(Csub); // single-precision sqrt
	}
}
134 
void KernelEuclidianDistance(cudafloat *d_C, cudafloat* d_A, cudafloat* d_B,int uiWA,int uiWB, int uiWC, int uiHC)
{
	// Launches EuclidianDistance on a 2D grid of 16x16 blocks covering the
	// uiWC x uiHC output matrix (ceiling division on both dimensions).
	const int blockSize = 16;

	dim3 threads(blockSize, blockSize);
	dim3 grid((uiWC + blockSize - 1) / blockSize,
	          (uiHC + blockSize - 1) / blockSize);

	EuclidianDistance<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB, uiWC, uiHC);
}
149 
150 
151 
152 
153 
KERNEL CenterAttribution(cudafloat *Output, int output_height, int output_width, int *attrib_center){
	// For each sample (row of the output_height x output_width distance
	// matrix Output) find the column (center) with the smallest distance and
	// store its index in attrib_center[row]. Ties keep the first minimum.
	// Only the y coordinate selects work; all threads of the same row write
	// the same value. (Fixed: removed the unused idnx local.)
	int idny = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;

	if(idny < output_height){

		int rowStart = __umul24(idny, output_width);

		float min_tmp = -1; // sentinel: no minimum seen yet (distances are >= 0)
		int idx = 0;

		for(int j = 0; j < output_width; j++){
			// Read each distance once instead of twice (compare + copy).
			float value = Output[rowStart + j];
			if(value < min_tmp || min_tmp == -1){
				min_tmp = value;
				idx = j;
			}
		}

		attrib_center[idny] = idx;
	}
}
174 
void KernelCenterAttribution(cudafloat *Output, int output_height, int output_width, int *attrib_center)
{
	// One column of 16x16 blocks; enough block rows to cover every sample.
	const int blockSize = 16;
	const int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	CenterAttribution<<<grid,threads>>>(Output, output_height, output_width, attrib_center);
}
186 
187 
188 
KERNEL PrepareCenterCopy(cudafloat *Output, int output_height, int output_width, int *attrib_center){
	// For every center column idnx, compact the indexes of the samples
	// assigned to it (attrib_center[j] == idnx) into the top rows of column
	// idnx of Output, then overwrite attrib_center[idnx + ...] with the
	// number of samples gathered.
	// NOTE(review): sample indexes are stored in a cudafloat matrix here and
	// converted back when consumed by CopyCenters — confirm precision is
	// sufficient for very large sample counts.
	int idnx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;

	int blocks = 1; // single partition of the sample range

	int count = 0;

	if(idnx < output_width){

		int start = blockIdx.y * output_height/blocks;
		int stop = start + output_height/blocks;

		for(int j = start; j < stop; j++){

			if(attrib_center[j] == idnx){

				Output[(start+count)*output_width + idnx] = j;

				count = count + 1;
			}
		}
	}

	// Fixed: the barrier was inside "if(idnx < output_width)". __syncthreads()
	// in divergent control flow is undefined behavior when some threads of
	// the block fail the test — every thread must reach it, so it is hoisted
	// out of the branch.
	__syncthreads();

	if(idnx < output_width){
		attrib_center[idnx+blockIdx.y*output_width] = count;
	}
}
237 
void KernelPrepareCenterCopy(cudafloat *Output, int output_height, int output_width, int *attrib_center)
{
	// One row of 16x16 blocks; enough block columns to cover every center.
	const int blockSize = 16;
	const int wBlocks = (output_width + blockSize - 1) / blockSize;

	dim3 grid(wBlocks, 1, 1);
	dim3 threads(blockSize, blockSize);

	PrepareCenterCopy<<<grid,threads>>>(Output,output_height,output_width,attrib_center);
}
254 
255 
256 
KERNEL CopyCenters(cudafloat *Output, int output_height, int output_width, cudafloat *Input,int input_width, int *attrib_center,cudafloat *Indexes, int idx_height, int idx_width){
	// Recomputes each center coordinate Output[idny][idnx] as the mean of
	// the input samples assigned to center idny. The sample indexes were
	// previously compacted into Indexes (by PrepareCenterCopy) and the
	// per-center counts into attrib_center.
	// NOTE(review): Indexes stores sample indexes as cudafloat; they are
	// implicitly converted to int inside __umul24 below — confirm the float
	// representation is exact for the sample counts in use.

	int idnx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
	int idny = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;

	// Number of partitions of the sample range (matches PrepareCenterCopy).
	int blocks = 1;

	if(idny < output_height && idnx < output_width){

		// Base offset of this center's row in Output.
		int idx = __umul24(idny,output_width);

		int count = 0;
		float aux = 0;

		// Walk every partition, accumulating the coordinate idnx of each
		// sample assigned to center idny.
		for(int block_number = 0; block_number < blocks; block_number++){

			int start = block_number * idx_height/blocks;

			// attrib_center[...] holds how many indexes were compacted for
			// this center in this partition.
			for(int j = 0; j < attrib_center[idny+block_number*output_height]; j++){

				aux = aux + Input[__umul24(Indexes[(j+start)*idx_width + idny],output_width)+idnx];

				count = count + 1;

			}
		}

		// Empty cluster: write 0 instead of dividing by zero.
		if(count > 0)
			Output[idx + idnx] = aux/count;
		else
			Output[idx + idnx] = 0;

	}

}
293 
void KernelCopyCenters(cudafloat *Output, int output_height, int output_width, cudafloat *Input,int input_width, int *attrib_center, cudafloat *Indexes, int idx_height, int idx_width)
{
	// 2D grid of 16x16 blocks covering the output_width x output_height
	// center matrix (ceiling division on both dimensions).
	const int blockSize = 16;

	dim3 threads(blockSize, blockSize);
	dim3 grid((output_width + blockSize - 1) / blockSize,
	          (output_height + blockSize - 1) / blockSize,
	          1);

	CopyCenters<<<grid,threads>>>(Output,output_height,output_width,Input,input_width,attrib_center,Indexes,idx_height,idx_width);
}
310 
311 
312 
313 
unsigned int nextPow2( unsigned int x ) {
	// Rounds x up to the next power of two (returns x unchanged when it is
	// already a power of two). Classic bit-smearing trick: starting from
	// x - 1, propagate the highest set bit into every lower position, then
	// add one.
	unsigned int v = x - 1;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	return v + 1;
}
323 
KERNEL reduce2(int *g_idata,int *g_odata, int *g_idata_old, unsigned int n)
{
	// Counts, per block, how many of the n entries of g_idata differ from
	// the corresponding entries of g_idata_old (used to detect k-means
	// assignment changes). Each block writes its partial count to
	// g_odata[blockIdx.x]. Requires blockDim.x to be a power of two (the
	// halving loop below) and blockDim.x ints of dynamic shared memory.
	extern __shared__ int sdata2[];

	// perform first level of reduction,
	// reading from global memory, writing to shared memory:
	// each thread compares up to two elements (i and i + blockDim.x) and
	// stores a 0/1/2 difference count in shared memory.
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

	sdata2[tid] = (i < n) ? ((g_idata[i] == g_idata_old[i])?0:1) : 0;
	if (i + blockDim.x < n)
		sdata2[tid] += (g_idata[i+blockDim.x] == g_idata_old[i+blockDim.x])?0:1;
	__syncthreads();

	// Tree reduction in shared memory; the barrier on every step keeps the
	// read of sdata2[tid + s] race-free with the previous step's writes.
	for(unsigned int s=blockDim.x/2; s>0; s>>=1)
	{
		if (tid < s)
		{
			sdata2[tid] += sdata2[tid + s];
		}
		__syncthreads();
	}

	// Thread 0 publishes this block's partial sum to global memory.
	if(tid == 0) g_odata[blockIdx.x] = sdata2[0];
}
351 
KERNEL reduce3(int *g_idata,int *g_odata, unsigned int n)
{
	// Sums the n ints in g_idata; each block writes its partial sum to
	// g_odata[blockIdx.x]. Used to fold the per-block partials produced by
	// reduce2 down to a single value. Requires blockDim.x to be a power of
	// two and blockDim.x ints of dynamic shared memory.
	extern __shared__ int sdata3[];

	// perform first level of reduction,
	// reading from global memory, writing to shared memory:
	// each thread loads and pre-adds up to two elements.
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

	sdata3[tid] = (i < n) ? g_idata[i] : 0;
	if (i + blockDim.x < n)
		sdata3[tid] += g_idata[i+blockDim.x];

	__syncthreads();

	// Tree reduction in shared memory; barrier every step for race freedom.
	for(unsigned int s=blockDim.x/2; s>0; s>>=1)
	{
		if (tid < s)
		{
			sdata3[tid] += sdata3[tid + s];
		}
		__syncthreads();
	}

	// Thread 0 publishes this block's partial sum to global memory.
	if(tid == 0) g_odata[blockIdx.x] = sdata3[0];
}
380 
void KernelReduce2(int *output, int *input, int *g_idata_old,int n)
{
	// Counts how many of the n entries of input differ from g_idata_old,
	// leaving the total in output[0]. The first pass (reduce2) compares and
	// produces per-block partial counts; subsequent passes (reduce3) sum the
	// partials in place until a single value remains. The result must be
	// read back from device memory by the caller.
	int blockSize = 16;

	int threads = (n < blockSize*2) ? nextPow2((n + 1)/ 2) : blockSize;
	int blocks = (n + (threads * 2 - 1)) / (threads * 2);
	// For very small blocks keep 2x slack so the reduction never reads past
	// the end of shared memory.
	int smemSize = (threads <= 32) ? 2 * threads * sizeof(int) : threads * sizeof(int);

	dim3 dimGrid(blocks, 1, 1);
	dim3 dimBlock(threads, 1, 1);

	reduce2<<<dimGrid,dimBlock,smemSize>>>(input,output,g_idata_old,n);

	int s = blocks;

	while(s > 1){
		threads = (s < blockSize*2) ? nextPow2((s + 1)/ 2) : blockSize;
		blocks = (s + (threads * 2 - 1)) / (threads * 2);
		// Fixed: reduce3's shared array holds int, so size the dynamic
		// shared memory with sizeof(int) — the original inconsistently used
		// sizeof(cudafloat) here (same byte size today, but wrong type).
		smemSize = (threads <= 32) ? 2 * threads * sizeof(int) : threads * sizeof(int);

		dim3 dimGrid1(blocks, 1, 1);
		dim3 dimBlock1(threads, 1, 1);

		reduce3<<<dimGrid1,dimBlock1,smemSize>>>(output,output,s);

		s = (s + (threads * 2 - 1)) / (threads * 2);
	}
}
437 
438 
439 
440 
441 
442 
443 
444 
445 
446 
447 
448 
449 
450 
451 
452 
453 
454 
455 
456 /* KERNEL Euclidian Distance */
__device__ float distance( float *v1, float *v2, int dimensions)
{
	// Euclidian distance between two float vectors of length `dimensions`.
	float dist = 0.0f;

	for( int i = 0; i < dimensions; i++ )
	{
		float tmp = v2[i] - v1[i];
		dist += tmp * tmp;
	}

	// Fixed: explicit single-precision sqrtf instead of sqrt, making the
	// intent clear and avoiding any risk of double promotion.
	return sqrtf(dist);
}
469 
KERNEL CenterAttribution_Bounds(cudafloat *Output, int output_height, int output_width, int *attrib_center, float* upperbound){
	// For each sample (row of the output_height x output_width distance
	// matrix Output), record the index of the nearest center in
	// attrib_center and that smallest distance as the sample's upper bound
	// (Elkan-style k-means bookkeeping). Ties keep the first minimum.

	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idny < output_height){

		float min_tmp = -1; // sentinel: no minimum seen yet (distances are >= 0)
		int idx = 0;

		for(int j = 0; j < output_width; j++){
			// Read each distance once instead of twice (compare + copy).
			float value = Output[idny*output_width+j];
			if(value < min_tmp || min_tmp == -1){
				min_tmp = value;
				idx = j;
			}
		}

		attrib_center[idny] = idx;
		// min_tmp already holds Output[idny*output_width+idx]; avoid the
		// original's redundant second global read.
		upperbound[idny] = min_tmp;
	}
}
490 
void KernelCenterAttribution_Bounds(cudafloat *Output, int output_height, int output_width, int *attrib_center, float* upperbound)
{
	// One column of 16x16 blocks; enough block rows to cover every sample.
	const int blockSize = 16;
	const int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	CenterAttribution_Bounds<<<grid,threads>>>(Output, output_height, output_width, attrib_center,upperbound);
}
502 
503 
504 
505 
506 
KERNEL CopyCenters2(cudafloat *Output, int output_height, int output_width, cudafloat *Input,int input_width, int *attrib_center){
	// Recomputes each center coordinate Output[idny][idnx] as the mean of
	// the samples assigned to center idny. input_width is the number of
	// samples; attrib_center[j] holds the center assigned to sample j.

	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idny < output_height && idnx < output_width){

		int count = 0;
		float aux = 0.0f;

		for(int j = 0; j < input_width; j++){

			if(attrib_center[j] == idny){

				aux = aux + Input[j*output_width+idnx];

				count = count + 1;
			}
		}

		// Fixed: guard against empty clusters. The original divided by zero
		// when no sample was assigned to this center, writing NaN into the
		// center matrix. Write 0 instead, matching CopyCenters' behavior.
		if(count > 0)
			Output[idny * output_width + idnx] = aux/count;
		else
			Output[idny * output_width + idnx] = 0;
	}
}
533 
void KernelCopyCenters2(cudafloat *Output, int output_height, int output_width, cudafloat *Input,int input_width, int *attrib_center)
{
	// 2D grid of 16x16 blocks covering the output_width x output_height
	// center matrix (ceiling division on both dimensions).
	const int blockSize = 16;

	dim3 threads(blockSize, blockSize);
	dim3 grid((output_width + blockSize - 1) / blockSize,
	          (output_height + blockSize - 1) / blockSize);

	CopyCenters2<<<grid,threads>>>(Output,output_height,output_width,Input,input_width,attrib_center);
}
546 
547 
548 
549 
550 
551 
552 
553 
KERNEL FindS(cudafloat *Output, int output_height, int output_width, float *S){
	// For each center (row of the inter-center distance matrix Output),
	// S[row] = half the distance to the closest OTHER center — the pruning
	// bound used by Elkan-style k-means.

	int row = blockIdx.y*blockDim.y + threadIdx.y;

	if(row < output_height){

		float closest = -1; // sentinel: no distance examined yet

		for(int col = 0; col < output_width; col++){
			if(row == col) continue; // skip the center's distance to itself
			float d = Output[row*output_width+col];
			if(d < closest || closest == -1){
				closest = d;
			}
		}

		S[row] = CUDA_VALUE(0.5) * closest;
	}
}
574 
void KernelS(cudafloat *Output, int output_height, int output_width, float *S)
{
	// One column of 16x16 blocks; enough block rows to cover every center.
	const int blockSize = 16;
	const int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	FindS<<<grid,threads>>>(Output, output_height, output_width, S);
}
590 
591 
592 
593 
594 
595 
596 
597 
598 
599 
KERNEL Step3(float*Input,int input_height, float* UpperBounds, float* S, bool* R,int* CenterAttrib,float* LowerBounds,float* DistanceBeetweenCenters,float* InitialDistances, float* NewCenters,int centers_height,int centers_width){
	// Step 3 of Elkan-style k-means: one thread per sample (row of Input).
	// Using the cached upper bound, lower bounds and inter-center distances,
	// decide whether the sample's assigned center can change, recomputing
	// exact distances only when the bounds cannot rule a center out.
	// centers_height = number of centers, centers_width = feature count.

	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idny < input_height){

		float upperbound = UpperBounds[idny];
		int centeratt = CenterAttrib[idny];
		float s_value = S[centeratt];

		// If the upper bound is within half the distance to the nearest
		// other center, no center can be closer — skip this sample entirely.
		if(!(upperbound <= s_value)){

			for(int j = 0; j < centers_height; j++){

				// Step 3a: candidate j survives both bound tests, so the
				// distance to the current center must be made exact.
				if(j != centeratt && upperbound > LowerBounds[idny*centers_height+j] && upperbound > CUDA_VALUE(0.5)*DistanceBeetweenCenters[centeratt*centers_height+j]){

					if(R[idny]){
						// R flags a stale upper bound: recompute the exact
						// distance to the assigned center and tighten it.
						InitialDistances[idny*centers_height+centeratt] = distance(&(Input[idny*centers_width]),&(NewCenters[centeratt*centers_width]),centers_width);

						upperbound = InitialDistances[idny*centers_height+centeratt];

						R[idny] = false;

					}else{
						// Upper bound already exact; reuse it as the distance.
						InitialDistances[idny*centers_height+centeratt] = upperbound;

					}
				}

				// Step 3b: if center j might still be closer than the
				// current assignment, compute the exact distance to j and
				// reassign when it wins.
				// NOTE(review): this test runs for every j — including
				// j == centeratt and cases where step 3a was skipped, when
				// InitialDistances[..centeratt] may not have been refreshed
				// this pass — confirm against the original Elkan algorithm.
				if(InitialDistances[idny*centers_height+centeratt] > LowerBounds[idny*centers_height+j] ||
					InitialDistances[idny*centers_height+centeratt] > CUDA_VALUE(0.5) * DistanceBeetweenCenters[centeratt*centers_height+j]){

					InitialDistances[idny*centers_height+j] = distance(&(Input[idny*centers_width]),&(NewCenters[j*centers_width]),centers_width);

					// Exact distance also tightens the lower bound for j.
					LowerBounds[idny*centers_height+j] = InitialDistances[idny*centers_height+j];

					if(InitialDistances[idny*centers_height+j] < InitialDistances[idny*centers_height+centeratt]){

						centeratt = j;

						upperbound = InitialDistances[idny*centers_height+centeratt];

					}

				}

			}

		}

		// Persist the (possibly updated) bound and assignment.
		UpperBounds[idny] = upperbound;
		CenterAttrib[idny] = centeratt;

	}

}
661 
void KernelStep3(float* Input,int input_height, float* Upperbounds, float* S,bool* R,int* CenterAttrib,float* LowerBounds,float* DistanceBeetweenCenters,float* InitialDistances, float* NewCenters,int centers_height,int centers_width)
{
	// One column of 16x16 blocks; enough block rows to cover every sample.
	const int blockSize = 16;
	const int hBlocks = (input_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	Step3<<<grid,threads>>>(Input,input_height,Upperbounds,S,R,CenterAttrib,LowerBounds,DistanceBeetweenCenters,InitialDistances,NewCenters,centers_height,centers_width);
}
676 
677 
678 
KERNEL Step5(int input_height, float* UpperBounds, bool* R,int* CenterAttrib,float* LowerBounds,float* DistanceBeetweenCenters,float* InitialDistances, float* NewCenters,int centers_height,int centers_width){
	// Step 5 of Elkan-style k-means: after the centers move, loosen the
	// per-sample bounds. Each lower bound shrinks by its center's drift
	// (clamped at zero), the upper bound grows by the assigned center's
	// drift, and the R flag is reset so Step3 recomputes exactly next pass.
	// NOTE(review): the drift appears to be stored on the diagonal of
	// DistanceBeetweenCenters — confirm against the host-side update code.

	int sample = blockIdx.y*blockDim.y + threadIdx.y;

	if(sample < input_height){

		int base = sample*centers_height;

		for(int c = 0; c < centers_height; c++){
			float drift = DistanceBeetweenCenters[c*centers_height+c];
			float lowered = LowerBounds[base+c] - drift;
			LowerBounds[base+c] = (lowered > 0) ? lowered : 0;
		}

		int att = CenterAttrib[sample];
		UpperBounds[sample] = UpperBounds[sample] + DistanceBeetweenCenters[att*centers_height+att];

		R[sample] = true;
	}
}
701 
void KernelStep5(int input_height, float* Upperbounds, bool* R,int* CenterAttrib,float* LowerBounds,float* DistanceBeetweenCenters,float* InitialDistances, float* NewCenters,int centers_height,int centers_width)
{
	// One column of 16x16 blocks; enough block rows to cover every sample.
	const int blockSize = 16;
	const int hBlocks = (input_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(blockSize, blockSize);

	Step5<<<grid,threads>>>(input_height,Upperbounds,R,CenterAttrib,LowerBounds,DistanceBeetweenCenters,InitialDistances,NewCenters,centers_height,centers_width);
}
717 
718 
719 
720 
721 
722 
723 
724 
725 
726 
727 
728 
729 
730 
731 
KERNEL reduce_bool(bool *g_idata,bool *g_odata, unsigned int n)
{
	// Logical-AND reduction of the n booleans in g_idata; each block writes
	// its partial result to g_odata[blockIdx.x]. Out-of-range slots use
	// `true`, the identity for AND. Requires blockDim.x to be a power of two
	// (the halving loop below) and blockDim.x bools of dynamic shared memory.
	extern __shared__ bool sdatabool[];

	// perform first level of reduction,
	// reading from global memory, writing to shared memory:
	// each thread ANDs up to two elements (i and i + blockDim.x).
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

	sdatabool[tid] = (i < n) ? g_idata[i] : true;
	if (i + blockDim.x < n)
		sdatabool[tid] = sdatabool[tid] && g_idata[i+blockDim.x];

	__syncthreads();

	// Tree reduction in shared memory; barrier every step for race freedom.
	for(unsigned int s=blockDim.x/2; s>0; s>>=1)
	{
		if (tid < s)
		{
			sdatabool[tid] = sdatabool[tid] && sdatabool[tid + s];
		}
		__syncthreads();
	}

	// Thread 0 publishes this block's partial result to global memory.
	if(tid == 0) g_odata[blockIdx.x] = sdatabool[0];
}
760 
void KernelReduce_bool(bool *output, bool *input,int n)
{
	// AND-reduces the n booleans in input, leaving the result in output[0].
	// The first pass reduces input into per-block partials; subsequent
	// passes reduce output in place until a single value remains. The
	// result must be read back from device memory by the caller.
	int blockSize = 16;

	int threads = (n < blockSize*2) ? nextPow2((n + 1)/ 2) : blockSize;
	int blocks = (n + (threads * 2 - 1)) / (threads * 2);
	// Fixed: reduce_bool's shared array holds bool, so size the dynamic
	// shared memory with sizeof(bool) — the original used sizeof(cudafloat),
	// over-allocating 4x. Keep 2x slack for very small blocks so the
	// reduction never reads past the end of shared memory.
	int smemSize = (threads <= 32) ? 2 * threads * sizeof(bool) : threads * sizeof(bool);

	dim3 dimGrid(blocks, 1, 1);
	dim3 dimBlock(threads, 1, 1);

	reduce_bool<<<dimGrid,dimBlock,smemSize>>>(input,output,n);

	int s = blocks;

	while(s > 1){
		threads = (s < blockSize*2) ? nextPow2((s + 1)/ 2) : blockSize;
		blocks = (s + (threads * 2 - 1)) / (threads * 2);
		smemSize = (threads <= 32) ? 2 * threads * sizeof(bool) : threads * sizeof(bool);

		dim3 dimGrid1(blocks, 1, 1);
		dim3 dimBlock1(threads, 1, 1);

		reduce_bool<<<dimGrid1,dimBlock1,smemSize>>>(output,output,s);

		s = (s + (threads * 2 - 1)) / (threads * 2);
	}
}
void KernelReduce_bool(bool *output, bool *input, int n)
void KernelCopyCenters2(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, int *attrib_center)
void KernelReduce2(int *output, int *input, int *g_idata_old, int n)
void KernelCopyCenters(cudafloat *Output, int output_height, int output_width, cudafloat *Input, int input_width, int *attrib_center, cudafloat *Indexes, int idx_height, int idx_width)
void KernelS(cudafloat *Output, int output_height, int output_width, float *S)
#define KERNEL
Defines the type of a kernel function.
void KernelCenterAttribution(cudafloat *Output, int output_height, int output_width, int *attrib_center)
void KernelStep5(int input_height, float *Upperbounds, bool *R, int *CenterAttrib, float *LowerBounds, float *DistanceBeetweenCenters, float *InitialDistances, float *NewCenters, int centers_height, int centers_width)
#define CUDA_VALUE(X)
void KernelCenterAttribution_Bounds(cudafloat *Output, int output_height, int output_width, int *attrib_center, float *upperbound)
void KernelStep3(float *Input, int input_height, float *Upperbounds, float *S, bool *R, int *CenterAttrib, float *LowerBounds, float *DistanceBeetweenCenters, float *InitialDistances, float *NewCenters, int centers_height, int centers_width)
void KernelEuclidianDistance(cudafloat *d_C, cudafloat *d_A, cudafloat *d_B, int uiWA, int uiWB, int uiWC, int uiHC)
float cudafloat
void KernelPrepareCenterCopy(cudafloat *Output, int output_height, int output_width, int *attrib_center)