GPUMLib  0.2.2
GPU Machine Learning Library
RBFKernels.cu
1 /*
2  Ricardo Quintas is an MSc Student at the University of Coimbra, Portugal
3  Copyright (C) 2009, 2010 Ricardo Quintas
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "../common/CudaDefinitions.h"
21 #include "rbfkernels.h"
22 
// Computes, for each row of the Distance matrix, the average of the squared
// distances to its (rneighbours + 1) smallest entries (the row's nearest
// neighbours, typically including the zero self-distance), storing the result
// in widths[row]. The selection is destructive: chosen entries are overwritten
// with -1 so they are not picked twice. Launch: one thread per row (y axis);
// the x dimension must be 1 — multiple x threads would race on the row sweep.
KERNEL AdjustWidths(cudafloat *Distance, int distance_height, int distance_width, int rneighbours, float *widths){

	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	if(idny < distance_height){

		float width = 0.0f;
		int count = 0;

		while(count <= rneighbours){

			// Scan the row for its smallest entry not yet consumed (-1 marks consumed).
			float min_tmp = -1.0f;
			int idx = -1;

			for(int j = 0; j < distance_width; j++){
				float d = Distance[idny*distance_width + j];
				if(d != -1.0f && (min_tmp == -1.0f || d < min_tmp)){
					min_tmp = d;
					idx = j;
				}
			}

			// No valid entry left: the row has fewer than rneighbours+1 usable
			// distances. Bail out instead of looping forever (the original code
			// spun indefinitely here because count was never incremented).
			if(idx == -1) break;

			// Consume the selected entry so the next pass skips it.
			Distance[idny*distance_width + idx] = -1.0f;

			width += min_tmp * min_tmp;  // squared distance; cheaper than pow(x, 2)
			count++;
		}

		widths[idny] = width/rneighbours;
	}
}
58 
// Host launcher for AdjustWidths: one thread per row of the Distance matrix.
//
// Distance        - device pointer, distance_height x distance_width (row-major);
//                   modified in place (selected entries overwritten with -1).
// widths          - device pointer, distance_height entries; receives the result.
//
// The kernel indexes data only through the y dimension, so we launch a single
// thread column per block. The previous 16x16 launch made 16 x-threads execute
// the same destructive row sweep concurrently — redundant work and a data race
// on Distance under independent thread scheduling.
extern "C" void KernelAdjustWidths(cudafloat *Distance, int distance_height, int distance_width, int rneighbours, float *widths)
{
	int blockSize = 16;

	// Ceiling division so every row is covered.
	int hBlocks = (distance_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(1, blockSize);
	AdjustWidths<<<grid,threads>>>(Distance, distance_height, distance_width, rneighbours, widths);
}
70 
71 
72 
73 
74 
// Builds the pseudo-inverse of a diagonal singular-value matrix: for every
// non-zero S[i], writes 1/S[i] onto the diagonal of Output (column-major
// layout, so element (i,i) lives at i*output_height + i). Zero singular
// values are left untouched. Launch: one thread per row via the y axis.
KERNEL SigmaInverse(float *Output, int output_height,int output_width, cudafloat *S){

	int row = blockIdx.y*blockDim.y + threadIdx.y;

	// Guard both the matrix bound and the divide-by-zero case in one test;
	// short-circuiting keeps the S[row] read in bounds.
	if(row < output_height && S[row] != 0)
		Output[row * output_height + row] = 1/S[row];

}
85 
// Host launcher for SigmaInverse: one thread per diagonal element (row).
//
// Output - device pointer to the output_height x output_width matrix
//          (column-major); receives 1/S[i] on its diagonal.
// S      - device pointer to the output_height singular values.
//
// The kernel indexes only through the y dimension, so a single thread column
// per block suffices. The previous 16x16 launch ran 16 redundant x-threads per
// element (they all wrote the same value — harmless, but 16x wasted work).
extern "C" void KernelSigmaInverse(float *Output, int output_height, int output_width, cudafloat *S)
{
	int blockSize = 16;

	// Ceiling division so every row is covered.
	int hBlocks = (output_height + blockSize - 1) / blockSize;

	dim3 grid(1, hBlocks);
	dim3 threads(1, blockSize);

	SigmaInverse<<<grid,threads>>>(Output, output_height, output_width, S);
}
98 
99 
100 
101 
102 
103 /*Adapted from the matrix multiplication examples of the CUDA Toolkit*/
104 #define AS(i, j) As[i][j]
105 #define BS(i, j) Bs[i][j]
106 
107 #define BLOCK_SIZE 16
108 
// Computes the pairwise Euclidean distance matrix C (hC x wC, row-major):
// C[y][x] = || A_row_y - B_row_x ||, where A rows have wA features and B rows
// have wB features. Tiled shared-memory algorithm adapted from the CUDA
// Toolkit matrix-multiplication sample; expects a 2D launch of
// BLOCK_SIZE x BLOCK_SIZE threads covering the output.
__global__ void
CalculateDistance( float* C, float* A, float* B, int wA, int wB, int wC, int hC)
{
	// Global output coordinates of this thread's element (used for the
	// bounds-checked final store).
	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// First/last index of the sub-matrices of A walked by this block, and the
	// per-iteration step for both inputs.
	int aBegin = wA * BLOCK_SIZE * by;
	int aEnd = aBegin + wA - 1;
	int aStep = BLOCK_SIZE;
	int bBegin = wB * BLOCK_SIZE * bx;
	int bStep = BLOCK_SIZE;

	// Running sum of squared per-feature differences for this output element.
	float Csub = 0.0f;

	// Shared-memory tiles of A and B (one element staged per thread).
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	// Walk the feature dimension one BLOCK_SIZE-wide tile at a time.
	for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

		// NOTE(review): like the SDK sample this was adapted from, these loads
		// are unguarded — they assume the inputs are padded so that widths and
		// the rows touched by each block exist up to multiples of BLOCK_SIZE.
		// Only the accumulation below is bounds-guarded. TODO confirm callers
		// allocate padded matrices.
		AS(ty, tx) = A[a + wA * ty + tx];
		BS(ty, tx) = B[b + wB * ty + tx];

		// All tile elements must be staged before anyone reads them.
		__syncthreads();

		// Accumulate (A - B)^2 over the valid columns of this tile. Replaces
		// the 16 hand-unrolled, pow(x,2)-based statements of the original with
		// an identical guarded loop (BLOCK_SIZE == 16) and a plain multiply.
		#pragma unroll
		for (int k = 0; k < BLOCK_SIZE; ++k) {
			if ((a + k) <= aEnd) {
				float diff = AS(ty, k) - BS(tx, k);
				Csub += diff * diff;
			}
		}

		// Finish reading the tiles before the next iteration overwrites them.
		__syncthreads();
	}

	// Each in-range thread writes one element: the square root of the summed
	// squared differences, i.e. the Euclidean distance.
	if(idnx < wC && idny < hC){
		int c = wC * BLOCK_SIZE * by + BLOCK_SIZE * bx;
		C[c + wC * ty + tx] = sqrtf(Csub);
	}
}
202 
// Host launcher for CalculateDistance: tiles the uiHC x uiWC output with
// 16x16 thread blocks and computes the pairwise Euclidean distance matrix
// between the rows of d_A (width uiWA) and the rows of d_B (width uiWB).
// All pointers are device pointers.
extern "C" void KernelCalculateDistance(cudafloat *d_C, cudafloat* d_A, cudafloat* d_B,int uiWA,int uiWB, int uiWC, int uiHC)
{
	const int blockSize = 16;

	// Round block counts up so partially-filled edge tiles are still launched.
	int wBlocks = (uiWC + blockSize - 1) / blockSize;
	int hBlocks = (uiHC + blockSize - 1) / blockSize;

	dim3 threads(blockSize, blockSize);
	dim3 grid(wBlocks, hBlocks);

	CalculateDistance<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB, uiWC, uiHC);
}
217 
218 
// Computes the RBF activation matrix: for each input row y of A and each
// centre row x of B, C holds exp(-d^2 / (scalingfactor * width_x^2)), where
// d^2 is the squared Euclidean distance between the two rows and width_x is
// the centre's width (c_width[x]). The result is stored COLUMN-major in C
// (hC rows, wC columns). Same tiled scheme as CalculateDistance, adapted from
// the CUDA Toolkit matrix-multiplication sample; expects a 2D launch of
// BLOCK_SIZE x BLOCK_SIZE threads covering the output.
__global__ void
ActivationMatrix( float* C, float* A, float* B, int wA, int wB, int wC, int hC,float scalingfactor, float* c_width)
{
	// Global output coordinates of this thread's element (used for the
	// bounds-checked final store).
	int idnx = blockIdx.x*blockDim.x + threadIdx.x;
	int idny = blockIdx.y*blockDim.y + threadIdx.y;

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// First/last index of the sub-matrices of A walked by this block, and the
	// per-iteration step for both inputs.
	int aBegin = wA * BLOCK_SIZE * by;
	int aEnd = aBegin + wA - 1;
	int aStep = BLOCK_SIZE;
	int bBegin = wB * BLOCK_SIZE * bx;
	int bStep = BLOCK_SIZE;

	// Running sum of squared per-feature differences for this output element.
	float Csub = 0.0f;

	// Shared-memory tiles of A and B (one element staged per thread).
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	// Walk the feature dimension one BLOCK_SIZE-wide tile at a time.
	for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

		// NOTE(review): like the SDK sample this was adapted from, these loads
		// are unguarded — they assume the inputs are padded so that widths and
		// the rows touched by each block exist up to multiples of BLOCK_SIZE.
		// Only the accumulation below is bounds-guarded. TODO confirm callers
		// allocate padded matrices.
		AS(ty, tx) = A[a + wA * ty + tx];
		BS(ty, tx) = B[b + wB * ty + tx];

		// All tile elements must be staged before anyone reads them.
		__syncthreads();

		// Accumulate (A - B)^2 over the valid columns of this tile. Replaces
		// the 16 hand-unrolled, pow(x,2)-based statements of the original with
		// an identical guarded loop (BLOCK_SIZE == 16) and a plain multiply.
		#pragma unroll
		for (int k = 0; k < BLOCK_SIZE; ++k) {
			if ((a + k) <= aEnd) {
				float diff = AS(ty, k) - BS(tx, k);
				Csub += diff * diff;
			}
		}

		// Finish reading the tiles before the next iteration overwrites them.
		__syncthreads();
	}

	// Column-major store of the Gaussian activation; expf/multiply replace the
	// double-precision exp/pow of the original.
	if(idnx < wC && idny < hC){
		int c = hC * BLOCK_SIZE * bx + BLOCK_SIZE * by;
		float w = c_width[idnx];
		C[c + hC * tx + ty] = expf(-(Csub / (scalingfactor * w * w)));
	}
}
312 
313 
// Host launcher for ActivationMatrix: tiles the uiHC x uiWC output with
// 16x16 thread blocks and computes the RBF activations of the rows of d_A
// (width uiWA) against the centres in d_B (width uiWB), using the per-centre
// widths in c_width. The result matrix d_C is written column-major.
// All pointers are device pointers.
extern "C" void KernelActivationMatrix(cudafloat *d_C, cudafloat* d_A, cudafloat* d_B,int uiWA,int uiWB, int uiWC, int uiHC, float scalingfactor, float* c_width)
{
	const int blockSize = 16;

	// Round block counts up so partially-filled edge tiles are still launched.
	int wBlocks = (uiWC + blockSize - 1) / blockSize;
	int hBlocks = (uiHC + blockSize - 1) / blockSize;

	dim3 threads(blockSize, blockSize);
	dim3 grid(wBlocks, hBlocks);

	ActivationMatrix<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB, uiWC, uiHC,scalingfactor,c_width);
}
328 
329 
330 
331 
void KernelActivationMatrix(cudafloat *d_C, cudafloat *d_A, cudafloat *d_B, int uiWA, int uiWB, int uiWC, int uiHC, float scalingfactor, float *c_width)
Definition: RBFKernels.cu:314
void KernelAdjustWidths(cudafloat *Distance, int distance_height, int distance_width, int rneighbours, float *widths)
Definition: RBFKernels.cu:59
void KernelCalculateDistance(cudafloat *d_C, cudafloat *d_A, cudafloat *d_B, int uiWA, int uiWB, int uiWC, int uiHC)
Definition: RBFKernels.cu:203
#define KERNEL
Defines the type of a kernel function.
void KernelSigmaInverse(float *Output, int output_height, int output_width, cudafloat *S)
Definition: RBFKernels.cu:86
float cudafloat