GPUMLib  0.2.2
GPU Machine Learning Library
SVM_cuda_code.h
/*
    Joao Goncalves is an MSc student at the University of Coimbra, Portugal
    Copyright (C) 2012 Joao Goncalves

    This file is part of GPUMLib.

    GPUMLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef SVM_CUDA_H_
#define SVM_CUDA_H_

// GPUMLib support definitions: cudafloat, CUDA_VALUE, CUDA_EXP, CUDA_POW,
// CUDA_TANH, MAX_CUDAFLOAT, KERNEL and svm_kernel_type are provided by the
// other GPUMLib headers.
#include <cuda.h>

namespace GPUMLib {

    //class SVM {
    // SMO state variables kept in device (global) memory

    // indices of the samples currently selected by the working-set heuristic
    __device__ int d_i_low;
    __device__ int d_i_high;

    // optimality indicators (f values) of the selected samples
    __device__ cudafloat d_b_low;
    __device__ cudafloat d_b_high;

    // Lagrange multipliers of the selected samples before the update...
    __device__ cudafloat d_alpha_i_low_old;
    __device__ cudafloat d_alpha_i_high_old;

    // ...and after the update
    __device__ cudafloat d_alpha_i_low_new;
    __device__ cudafloat d_alpha_i_high_new;

    // hyperplane offset (bias)
    __device__ cudafloat d_b;

    // returns the global (one-dimensional) thread ID
    __device__ int cuGetThreadID() {
        return blockIdx.x * blockDim.x + threadIdx.x;
    }

    // samples are stored attribute-major (column-major): attribute k of
    // sample s is found at samples[n_samples * k + s]
    __device__ cudafloat cuGetSampleAttribute(cudafloat * samples,
                                              int n_samples, int sample_id,
                                              int attribute_id) {
        int index = n_samples * attribute_id + sample_id;
        return samples[index];
    }
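
    // A minimal layout sketch (illustration only, not part of the library):
    // for n_samples = 3 samples with 2 attributes each, the expected packing is
    //
    //     cudafloat samples[3 * 2] = {
    //         a0_s0, a0_s1, a0_s2,   // attribute 0 of samples 0..2
    //         a1_s0, a1_s1, a1_s2    // attribute 1 of samples 0..2
    //     };
    //
    // so cuGetSampleAttribute(samples, 3, 1, 1) returns a1_s1.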

    // kernel evaluation between two rows of the model matrix
    template <svm_kernel_type kernel_type>
    __device__ cudafloat cuKernelDotProductUsingModelOnly(int sv_i, int sample_i,
                                                          cudafloat * model, int nsvs,
                                                          int num_dimensions,
                                                          cudafloat * kernel_args) {
        // select kernel_type from the available kernels
        if (kernel_type == SVM_KT_LINEAR) {
            // linear kernel (default): K(x1,x2) = x1.x2
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(model, nsvs, sample_i, i + 2);
                sum += x0_i * x1_i;
            }
            return sum;
        } else if (kernel_type == SVM_KT_POLYNOMIAL) {
            // polynomial kernel: K(x1,x2) = (a*(x1.x2) + b)^c
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(model, nsvs, sample_i, i + 2);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_POW(kernel_args[0] * sum + kernel_args[1], kernel_args[2]);
            return val;
        } else if (kernel_type == SVM_KT_RBF) {
            // radial basis function (RBF) kernel:
            // K(x1,x2) = e^(-gamma*||x1-x2||^2), with gamma = kernel_args[0]
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(model, nsvs, sample_i, i + 2);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = CUDA_EXP(-kernel_args[0] * sum_dif_squared);
            return result;
        } else if (kernel_type == SVM_KT_SIGMOID) {
            // sigmoid kernel: K(x1,x2) = tanh(a*(x1.x2) + b)
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(model, nsvs, sample_i, i + 2);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_TANH(kernel_args[0] * sum + kernel_args[1]);
            return val;
        } else if (kernel_type == SVM_KT_UKF) {
            // universal kernel function:
            // K(x1,x2) = a*(||x1-x2||^2 + b^2)^-c
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(model, nsvs, sample_i, i + 2);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = kernel_args[0] * CUDA_POW(sum_dif_squared + kernel_args[1] * kernel_args[1], -kernel_args[2]);
            return result;
        } else
            return CUDA_VALUE(0.0);
    }
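
    // Model matrix layout (as consumed above and by cuClassifierOutput below):
    // each of the nsvs rows holds one support vector, with column 0 = alpha_i,
    // column 1 = y_i, and columns 2..(num_dimensions + 1) = its attributes.
    // That is why attribute i is fetched with attribute_id = i + 2.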

    // kernel evaluation between support vector sv_i (from the model matrix)
    // and sample sample_i (from the dataset)
    template <svm_kernel_type kernel_type>
    __device__ cudafloat cuKernelDotProduct(int sv_i, int sample_i,
                                            cudafloat * model, cudafloat * dataset,
                                            int nsvs, int dataset_size,
                                            int num_dimensions,
                                            cudafloat * kernel_args) {
        // select kernel_type from the available kernels
        if (kernel_type == SVM_KT_LINEAR) {
            // linear kernel (default): K(x1,x2) = x1.x2
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(dataset, dataset_size, sample_i, i);
                sum += x0_i * x1_i;
            }
            return sum;
        } else if (kernel_type == SVM_KT_POLYNOMIAL) {
            // polynomial kernel: K(x1,x2) = (a*(x1.x2) + b)^c
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(dataset, dataset_size, sample_i, i);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_POW(kernel_args[0] * sum + kernel_args[1], kernel_args[2]);
            return val;
        } else if (kernel_type == SVM_KT_RBF) {
            // radial basis function (RBF) kernel:
            // K(x1,x2) = e^(-gamma*||x1-x2||^2), with gamma = kernel_args[0]
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(dataset, dataset_size, sample_i, i);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = CUDA_EXP(-kernel_args[0] * sum_dif_squared);
            return result;
        } else if (kernel_type == SVM_KT_SIGMOID) {
            // sigmoid kernel: K(x1,x2) = tanh(a*(x1.x2) + b)
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(dataset, dataset_size, sample_i, i);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_TANH(kernel_args[0] * sum + kernel_args[1]);
            return val;
        } else if (kernel_type == SVM_KT_UKF) {
            // universal kernel function:
            // K(x1,x2) = a*(||x1-x2||^2 + b^2)^-c
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(model, nsvs, sv_i, i + 2);
                cudafloat x1_i = cuGetSampleAttribute(dataset, dataset_size, sample_i, i);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = kernel_args[0] * CUDA_POW(sum_dif_squared + kernel_args[1] * kernel_args[1], -kernel_args[2]);
            return result;
        } else
            return CUDA_VALUE(0.0);
    }

    // kernel evaluation between two samples of the training set
    template <svm_kernel_type kernel_type>
    __device__ cudafloat cuKernelDotProduct(int i0, int i1, cudafloat * samples, int n_samples, int num_dimensions, cudafloat * kernel_args) {
        // select kernel_type from the available kernels
        if (kernel_type == SVM_KT_LINEAR) {
            // linear kernel (default): K(x1,x2) = x1.x2
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(samples, n_samples, i0, i);
                cudafloat x1_i = cuGetSampleAttribute(samples, n_samples, i1, i);
                sum += x0_i * x1_i;
            }
            return sum;
        } else if (kernel_type == SVM_KT_POLYNOMIAL) {
            // polynomial kernel: K(x1,x2) = (a*(x1.x2) + b)^c
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(samples, n_samples, i0, i);
                cudafloat x1_i = cuGetSampleAttribute(samples, n_samples, i1, i);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_POW(kernel_args[0] * sum + kernel_args[1], kernel_args[2]);
            return val;
        } else if (kernel_type == SVM_KT_RBF) {
            // radial basis function (RBF) kernel:
            // K(x1,x2) = e^(-gamma*||x1-x2||^2), with gamma = kernel_args[0]
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(samples, n_samples, i0, i);
                cudafloat x1_i = cuGetSampleAttribute(samples, n_samples, i1, i);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = CUDA_EXP(-kernel_args[0] * sum_dif_squared);
            return result;
        } else if (kernel_type == SVM_KT_SIGMOID) {
            // sigmoid kernel: K(x1,x2) = tanh(a*(x1.x2) + b)
            cudafloat sum = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(samples, n_samples, i0, i);
                cudafloat x1_i = cuGetSampleAttribute(samples, n_samples, i1, i);
                sum += x0_i * x1_i;
            }
            cudafloat val = CUDA_TANH(kernel_args[0] * sum + kernel_args[1]);
            return val;
        } else if (kernel_type == SVM_KT_UKF) {
            // universal kernel function:
            // K(x1,x2) = a*(||x1-x2||^2 + b^2)^-c
            cudafloat sum_dif_squared = CUDA_VALUE(0.0);
            for (int i = 0; i < num_dimensions; i++) {
                cudafloat x0_i = cuGetSampleAttribute(samples, n_samples, i0, i);
                cudafloat x1_i = cuGetSampleAttribute(samples, n_samples, i1, i);
                cudafloat _dif = x0_i - x1_i;
                cudafloat _dif_sq = _dif * _dif;
                sum_dif_squared += _dif_sq;
            }
            cudafloat result = kernel_args[0] * CUDA_POW(sum_dif_squared + kernel_args[1] * kernel_args[1], -kernel_args[2]);
            return result;
        } else
            return CUDA_VALUE(0.0);
    }
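
    // Because kernel_type is a template (compile-time) parameter, the if/else
    // chains in the three dot-product functions above are resolved during
    // compilation: each instantiation contains only the code of one kernel
    // function, with no runtime branching.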

    template <svm_kernel_type kernel_type>
    __global__ void cuUpdateKKTConditions(cudafloat * f, int * y, cudafloat * x, cudafloat * kernel_args, int nsamples, int ndims) {
        int tid = cuGetThreadID();
        if (tid < nsamples) {
            cudafloat kxh_xi = cuKernelDotProduct<kernel_type>(d_i_high, tid, x, nsamples, ndims, kernel_args);
            cudafloat kxl_xi = cuKernelDotProduct<kernel_type>(d_i_low, tid, x, nsamples, ndims, kernel_args);
            // do the update
            cudafloat fdelta = (d_alpha_i_high_new - d_alpha_i_high_old) * y[d_i_high] * kxh_xi + (d_alpha_i_low_new - d_alpha_i_low_old) * y[d_i_low] * kxl_xi;
            f[tid] += fdelta;
        }
    }
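
    // Rationale for the update above: each optimality indicator is
    // f_i = sum_j alpha_j * y_j * K(x_j, x_i) - y_i. A single SMO step changes
    // only alpha_high and alpha_low, so every f_i can be refreshed with the two
    // kernel evaluations computed above instead of a full recomputation.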

    template <svm_kernel_type kernel_type>
    __device__ int cuClassify(int sample_index, cudafloat * dataset, int dataset_size, cudafloat * model, int nsvs, cudafloat b, int ndims, cudafloat * kernel_args) {
        cudafloat sum = CUDA_VALUE(0.0);
        for (int sv_i = 0; sv_i < nsvs; sv_i++) {
            cudafloat alpha_i = cuGetSampleAttribute(model, nsvs, sv_i, 0);
            cudafloat y = cuGetSampleAttribute(model, nsvs, sv_i, 1);
            cudafloat k_proj = cuKernelDotProduct<kernel_type>(sv_i, sample_index, model, dataset, nsvs, dataset_size, ndims, kernel_args);
            sum += alpha_i * y * k_proj;
        }
        sum += b;
        if (sum > 0)
            return 1;
        else
            return -1;
    }

    template <svm_kernel_type kernel_type>
    __global__ void cuClassifyDataSet(int * results, cudafloat * dataset, int dataset_size, cudafloat * model, int nsvs, cudafloat b, int ndims, cudafloat * kernel_args) {
        int tid = cuGetThreadID();
        if (tid < dataset_size) {
            int result = cuClassify<kernel_type>(tid, dataset, dataset_size, model, nsvs, b, ndims, kernel_args);
            results[tid] = result;
        }
    }
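
    // A minimal host-side launch sketch (illustration only, not part of
    // GPUMLib; assumes d_results, d_dataset, d_model and d_kernel_args were
    // already allocated and copied to the device):
    //
    //     int threads = 128;
    //     int blocks = (dataset_size + threads - 1) / threads; // ceiling division
    //     cuClassifyDataSet<SVM_KT_RBF><<<blocks, threads>>>(d_results, d_dataset,
    //         dataset_size, d_model, nsvs, b, ndims, d_kernel_args);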

    template <svm_kernel_type kernel_type>
    __device__ cudafloat cuClassifierOutput(int sample_index, cudafloat * model, int nsvs, int ndims, cudafloat * kernel_args) {
        cudafloat sum = CUDA_VALUE(0.0);
        for (int sv_i = 0; sv_i < nsvs; sv_i++) {
            cudafloat alpha_i = cuGetSampleAttribute(model, nsvs, sv_i, 0);
            cudafloat y = cuGetSampleAttribute(model, nsvs, sv_i, 1);
            cudafloat k_proj = cuKernelDotProductUsingModelOnly<kernel_type>(sv_i, sample_index, model, nsvs, ndims, kernel_args);
            sum += alpha_i * y * k_proj;
        }
        return sum;
    }

    template <svm_kernel_type kernel_type>
    __global__ void cuCalculateOffsetsUsingModel(cudafloat * results, cudafloat * model, int nsvs, int ndims, cudafloat * kernel_args) {
        int tid = cuGetThreadID();
        if (tid < nsvs) {
            cudafloat result = cuClassifierOutput<kernel_type>(tid, model, nsvs, ndims, kernel_args);
            results[tid] = result;
        }
    }

    template <int blockSize>
    __global__ void cuCalculateB_1stPass(cudafloat * offsets, cudafloat * bs, int n_svs) {
        // local arrays (shared memory)
        extern __shared__ cudafloat results[];

        // load this thread's offset (zero-padded past n_svs)
        int tid = cuGetThreadID();
        cudafloat result = CUDA_VALUE(0.0);
        if (tid < n_svs) {
            result = offsets[tid];
        }
        results[threadIdx.x] = result;
        __syncthreads();

        if (blockSize >= 1024) {
            if (threadIdx.x < 512) {
                results[threadIdx.x] += results[threadIdx.x + 512];
            }
            __syncthreads();
        }

        if (blockSize >= 512) {
            if (threadIdx.x < 256) {
                results[threadIdx.x] += results[threadIdx.x + 256];
            }
            __syncthreads();
        }

        if (blockSize >= 256) {
            if (threadIdx.x < 128) {
                results[threadIdx.x] += results[threadIdx.x + 128];
            }
            __syncthreads();
        }

        if (blockSize >= 128) {
            if (threadIdx.x < 64) {
                results[threadIdx.x] += results[threadIdx.x + 64];
            }
            __syncthreads();
        }

        // final warp: no __syncthreads() needed within a single warp
        if (threadIdx.x < 32) {
            volatile cudafloat * _results = results;

            if (blockSize >= 64) {
                _results[threadIdx.x] += _results[threadIdx.x + 32];
            }
            if (blockSize >= 32) {
                _results[threadIdx.x] += _results[threadIdx.x + 16];
            }
            if (blockSize >= 16) {
                _results[threadIdx.x] += _results[threadIdx.x + 8];
            }
            if (blockSize >= 8) {
                _results[threadIdx.x] += _results[threadIdx.x + 4];
            }
            if (blockSize >= 4) {
                _results[threadIdx.x] += _results[threadIdx.x + 2];
            }
            if (blockSize >= 2) {
                _results[threadIdx.x] += _results[threadIdx.x + 1];
            }

            // thread 0 writes this block's partial sum
            if (threadIdx.x == 0) {
                bs[blockIdx.x] = results[0];
            }
        }
    }
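
    // Launch note for the two reduction passes (a sketch of the expected call,
    // assuming blockSize is a power of two matching the block dimension): the
    // dynamic shared memory must hold blockSize values, e.g.
    //
    //     cuCalculateB_1stPass<256><<<blocks, 256, 256 * sizeof(cudafloat)>>>(d_offsets, d_bs, n_svs);
    //     cuCalculateB_FinalPass<256><<<1, 256, 256 * sizeof(cudafloat)>>>(d_bs, blocks);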

    template <int blockSize>
    KERNEL cuCalculateB_FinalPass(cudafloat * data, int size) {
        extern __shared__ cudafloat sum[];

        cudafloat value = CUDA_VALUE(0.0);
        for (int i = threadIdx.x; i < size; i += blockDim.x)
            value += data[i];
        sum[threadIdx.x] = value;
        __syncthreads();

        if (blockSize >= 1024) {
            if (threadIdx.x < 512)
                sum[threadIdx.x] += sum[threadIdx.x + 512];
            __syncthreads();
        }

        if (blockSize >= 512) {
            if (threadIdx.x < 256)
                sum[threadIdx.x] += sum[threadIdx.x + 256];
            __syncthreads();
        }

        if (blockSize >= 256) {
            if (threadIdx.x < 128)
                sum[threadIdx.x] += sum[threadIdx.x + 128];
            __syncthreads();
        }

        if (blockSize >= 128) {
            if (threadIdx.x < 64)
                sum[threadIdx.x] += sum[threadIdx.x + 64];
            __syncthreads();
        }

        if (threadIdx.x < 32) {
            volatile cudafloat * _sum = sum;

            if (blockSize >= 64)
                _sum[threadIdx.x] += _sum[threadIdx.x + 32];
            if (blockSize >= 32)
                _sum[threadIdx.x] += _sum[threadIdx.x + 16];
            if (blockSize >= 16)
                _sum[threadIdx.x] += _sum[threadIdx.x + 8];
            if (blockSize >= 8)
                _sum[threadIdx.x] += _sum[threadIdx.x + 4];
            if (blockSize >= 4)
                _sum[threadIdx.x] += _sum[threadIdx.x + 2];
            if (blockSize >= 2)
                _sum[threadIdx.x] += _sum[threadIdx.x + 1];

            if (threadIdx.x == 0) {
                d_b = sum[0];
            }
        }
    }

    /************************************************************************/
    /* for all i = 0 ... n-1:
           alpha_i = 0 (and the old alphas)
           f_i = -y_i                                                       */
    /************************************************************************/
    __global__ void cuInitializeSMO(cudafloat * alphas, cudafloat * f, int * classes, int _nsamples) {
        int tid = cuGetThreadID();
        // for all i = 0 ... (N-1)
        if (tid < _nsamples) {
            // alpha_i = 0
            alphas[tid] = CUDA_VALUE(0.0);
            // f_i = -y_i
            f[tid] = -classes[tid];
        }
    }
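
    // Worked example of the initialization: for labels y = {+1, -1, +1} this
    // kernel yields alphas = {0, 0, 0} and f = {-1, +1, -1}, matching
    // f_i = sum_j alpha_j * y_j * K(x_j, x_i) - y_i with all alphas at zero.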

    // updates the alphas using a fixed clipping rule; must be executed by a
    // single thread (launch with a one-thread grid)
    template <svm_kernel_type kernel_type>
    __global__ void cuUpdateAlphasSimple(cudafloat * x, cudafloat * alphas, int * y,
                                         cudafloat constant_c_negative, cudafloat constant_c_positive, cudafloat * kernel_args,
                                         int nsamples, int ndims) {
        // get thread ID
        int tid = cuGetThreadID();
        // only one thread executes this
        if (tid == 0) {

            // new alphas
            cudafloat ailn;
            cudafloat aihn;

            // copy data from the device's memory to local registers
            int il = d_i_low;
            int ih = d_i_high;
            cudafloat aiho = alphas[ih];
            cudafloat ailo = alphas[il];
            cudafloat bl = d_b_low;
            cudafloat bh = d_b_high;

            // targets
            int y_i_low = y[il];
            int y_i_high = y[ih];

            // store the old alphas for this iteration
            d_alpha_i_low_old = ailo;
            d_alpha_i_high_old = aiho;

            // kernel computations
            cudafloat kxl_xl = cuKernelDotProduct<kernel_type>(il, il, x, nsamples, ndims, kernel_args);
            cudafloat kxh_xh = cuKernelDotProduct<kernel_type>(ih, ih, x, nsamples, ndims, kernel_args);
            cudafloat kxh_xl = cuKernelDotProduct<kernel_type>(ih, il, x, nsamples, ndims, kernel_args);

            // eta
            cudafloat eta = kxh_xh + kxl_xl - CUDA_VALUE(2.0) * kxh_xl;

            // compute the new alphas
            ailn = d_alpha_i_low_old + y_i_low * (bh - bl) / eta;
            aihn = d_alpha_i_high_old + y_i_low * y_i_high * (d_alpha_i_low_old - ailn);

            // clip the alphas to the range 0 <= a <= C
            if (aihn < CUDA_VALUE(0.0)) {
                aihn = CUDA_VALUE(0.0);
            } else if (y_i_high == -1 && aihn > constant_c_negative) {
                aihn = constant_c_negative;
            } else if (y_i_high == 1 && aihn > constant_c_positive) {
                aihn = constant_c_positive;
            }

            if (ailn < CUDA_VALUE(0.0)) {
                ailn = CUDA_VALUE(0.0);
            } else if (y_i_low == -1 && ailn > constant_c_negative) {
                ailn = constant_c_negative;
            } else if (y_i_low == 1 && ailn > constant_c_positive) {
                ailn = constant_c_positive;
            }

            // store the new alphas from registers back into the device's memory
            alphas[il] = ailn;
            d_alpha_i_low_new = ailn;
            alphas[ih] = aihn;
            d_alpha_i_high_new = aihn;
        }
    }
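
    // Note on eta: eta = K(xh,xh) + K(xl,xl) - 2*K(xh,xl) is the curvature of
    // the dual objective along the selected pair, so y_i_low * (bh - bl) / eta
    // is the unconstrained Newton step for alpha_low before clipping.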

    // updates the alphas; must be executed by a single thread (launch with a
    // one-thread grid)
    template <svm_kernel_type kernel_type>
    __global__ void cuUpdateAlphasAdvanced(cudafloat * x, cudafloat * alphas, int * y,
                                           cudafloat constant_c_negative, cudafloat constant_c_positive, cudafloat * kernel_args,
                                           int nsamples, int ndims) {
        // get thread ID
        int tid = cuGetThreadID();
        // only one thread executes this
        if (tid == 0) {

            // FIXME: this update uses a single penalty value; the
            // positive-class C is ignored here
            cudafloat constant_c = constant_c_negative;

            // new alphas
            cudafloat ailn;
            cudafloat aihn;

            // copy data from the device's memory to local registers
            int il = d_i_low;
            int ih = d_i_high;
            cudafloat aiho = alphas[ih];
            cudafloat ailo = alphas[il];
            cudafloat bl = d_b_low;
            cudafloat bh = d_b_high;

            // targets
            cudafloat y_i_low = y[il];
            cudafloat y_i_high = y[ih];

            // store the old alphas for this iteration
            d_alpha_i_low_old = ailo;
            d_alpha_i_high_old = aiho;

            // kernel computations
            cudafloat kxl_xl = cuKernelDotProduct<kernel_type>(il, il, x, nsamples, ndims, kernel_args);
            cudafloat kxh_xh = cuKernelDotProduct<kernel_type>(ih, ih, x, nsamples, ndims, kernel_args);
            cudafloat kxh_xl = cuKernelDotProduct<kernel_type>(ih, il, x, nsamples, ndims, kernel_args);

            // eta
            cudafloat eta = kxh_xh + kxl_xl - CUDA_VALUE(2.0) * kxh_xl;

            cudafloat alphadiff = ailo - aiho;
            cudafloat sign = y_i_low * y_i_high;

            // feasible range for the new alpha_low
            cudafloat alpha_l_upperbound, alpha_l_lowerbound;
            if (sign < CUDA_VALUE(0.0)) {
                if (alphadiff < 0) {
                    alpha_l_lowerbound = 0;
                    alpha_l_upperbound = constant_c + alphadiff;
                } else {
                    alpha_l_lowerbound = alphadiff;
                    alpha_l_upperbound = constant_c;
                }
            } else {
                double alpha_sum = ailo + aiho;
                if (alpha_sum < constant_c) {
                    alpha_l_upperbound = alpha_sum;
                    alpha_l_lowerbound = 0;
                } else {
                    alpha_l_lowerbound = alpha_sum - constant_c;
                    alpha_l_upperbound = constant_c;
                }
            }
            if (eta > 0) {
                // positive curvature: take the Newton step and clip it
                ailn = ailo + y_i_low * (bh - bl) / eta;
                if (ailn < alpha_l_lowerbound) {
                    ailn = alpha_l_lowerbound;
                } else if (ailn > alpha_l_upperbound) {
                    ailn = alpha_l_upperbound;
                }
            } else {
                // non-positive curvature: move to whichever bound improves the objective
                double slope = y_i_low * (bh - bl);
                double delta = slope * (alpha_l_upperbound - alpha_l_lowerbound);
                if (delta > 0) {
                    if (slope > 0) {
                        ailn = alpha_l_upperbound;
                    } else {
                        ailn = alpha_l_lowerbound;
                    }
                } else {
                    ailn = ailo;
                }
            }
            cudafloat alpha_l_diff = ailn - ailo;
            cudafloat alpha_h_diff = -sign * alpha_l_diff;
            aihn = aiho + alpha_h_diff;

            // store the new alphas from registers back into the device's memory
            alphas[il] = ailn;
            d_alpha_i_low_new = ailn;
            alphas[ih] = aihn;
            d_alpha_i_high_new = aihn;
        }
    }

    // compares two floating-point values for approximate equality within the
    // given tolerance
    inline __device__ bool cufequal(double a, double b, double tolerance) {
        double dif = fabs(a - b);
        return dif < tolerance;
    }

    template <int blockSize>
    KERNEL cuFirstOrderHeuristic1stPass(cudafloat * f, cudafloat * alphas, int * y, cudafloat * minimums, int * min_indices,
                                        cudafloat * maximums, int * max_indices, int input_size, cudafloat constant_epsilon, cudafloat constant_c) {
        // local arrays (shared memory)
        extern __shared__ cudafloat minvalues[];
        int * minposs = (int *) (minvalues + blockSize);
        //-------------
        cudafloat * maxvalues = (cudafloat *) (minposs + blockSize);
        int * maxposs = (int *) (maxvalues + blockSize);
        //-------------
        int tid = cuGetThreadID();
        //-------------
        cudafloat min_value = MAX_CUDAFLOAT;
        cudafloat max_value = -MAX_CUDAFLOAT;
        // only work on the I_low & I_high sets
        if (tid < input_size) {
            cudafloat alpha_i = alphas[tid];
            int y_i = y[tid];

            // define the sets:
            // I0 <- 0 < ai < C
            // I1 <- yi > 0, ai = 0
            // I2 <- yi < 0, ai = C
            // I3 <- yi > 0, ai = C
            // I4 <- yi < 0, ai = 0

            cudafloat constant_c_minus_eps = constant_c - constant_epsilon;

            bool I0 = (alpha_i > constant_epsilon) && (alpha_i < constant_c_minus_eps);
            bool I1 = (y_i > 0) && (alpha_i < constant_epsilon);
            bool I2 = (y_i < 0) && (alpha_i > constant_c_minus_eps);
            bool I3 = (y_i > 0) && (alpha_i > constant_c_minus_eps);
            bool I4 = (y_i < 0) && (alpha_i < constant_epsilon);

            // I_HIGH set
            if (I0 || I1 || I2) {
                min_value = f[tid];
            }
            // I_LOW set
            if (I0 || I3 || I4) {
                max_value = f[tid];
            }
        }
        minvalues[threadIdx.x] = min_value;
        maxvalues[threadIdx.x] = max_value;
        minposs[threadIdx.x] = tid;
        maxposs[threadIdx.x] = tid;

        __syncthreads();
        //-------------

        if (blockSize >= 1024) {
            if (threadIdx.x < 512) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 512]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 512];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 512];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 512]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 512];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 512];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 512) {
            if (threadIdx.x < 256) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 256]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 256];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 256];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 256]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 256];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 256];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 256) {
            if (threadIdx.x < 128) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 128]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 128];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 128];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 128]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 128];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 128];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 128) {
            if (threadIdx.x < 64) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 64]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 64];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 64];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 64]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 64];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 64];
                }
            }
            __syncthreads();
        }

        if (threadIdx.x < 32) {
            volatile cudafloat * _minvalues = minvalues;
            volatile int * _minposs = minposs;
            volatile cudafloat * _maxvalues = maxvalues;
            volatile int * _maxposs = maxposs;

            if (blockSize >= 64) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 32]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 32];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 32];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 32]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 32];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 32];
                }
            }

            if (blockSize >= 32) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 16]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 16];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 16];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 16]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 16];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 16];
                }
            }

            if (blockSize >= 16) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 8]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 8];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 8];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 8]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 8];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 8];
                }
            }

            if (blockSize >= 8) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 4]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 4];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 4];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 4]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 4];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 4];
                }
            }

            if (blockSize >= 4) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 2]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 2];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 2];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 2]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 2];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 2];
                }
            }

            if (blockSize >= 2) {
                if (_minvalues[threadIdx.x] > _minvalues[threadIdx.x + 1]) {
                    _minvalues[threadIdx.x] = _minvalues[threadIdx.x + 1];
                    _minposs[threadIdx.x] = _minposs[threadIdx.x + 1];
                }
                if (_maxvalues[threadIdx.x] < _maxvalues[threadIdx.x + 1]) {
                    _maxvalues[threadIdx.x] = _maxvalues[threadIdx.x + 1];
                    _maxposs[threadIdx.x] = _maxposs[threadIdx.x + 1];
                }
            }

            if (threadIdx.x == 0) {
                minimums[blockIdx.x] = minvalues[0];
                min_indices[blockIdx.x] = minposs[0];
                maximums[blockIdx.x] = maxvalues[0];
                max_indices[blockIdx.x] = maxposs[0];
            }
        }
    }
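
    // Shared-memory note (derived from the pointer arithmetic above): the two
    // heuristic passes carve four arrays out of a single dynamic allocation, so
    // they must be launched with 2 * blockSize * (sizeof(cudafloat) + sizeof(int))
    // bytes of shared memory.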

    // to be executed by a single block; writes the selected extremes to
    // d_b_high/d_i_high (minimum over I_high) and d_b_low/d_i_low (maximum over I_low)
    template <int blockSize>
    KERNEL cuFirstOrderHeuristicFinalPass(cudafloat * minimums_input, int * min_indices_input, cudafloat * maximums_input,
                                          int * max_indices_input, int input_size) {

        // local arrays (shared memory)
        extern __shared__ cudafloat minvalues[];
        int * minposs = (int *) (minvalues + blockDim.x);
        //-------------
        cudafloat * maxvalues = (cudafloat *) (minposs + blockDim.x);
        int * maxposs = (int *) (maxvalues + blockDim.x);
        //-------------
        minvalues[threadIdx.x] = MAX_CUDAFLOAT;
        maxvalues[threadIdx.x] = -MAX_CUDAFLOAT;
        minposs[threadIdx.x] = threadIdx.x;
        maxposs[threadIdx.x] = threadIdx.x;

        //-----------------
        for (int i = threadIdx.x; i < input_size; i += blockDim.x) {
            if (minvalues[threadIdx.x] > minimums_input[i]) {
                minvalues[threadIdx.x] = minimums_input[i];
                minposs[threadIdx.x] = min_indices_input[i];
            }
            if (maxvalues[threadIdx.x] < maximums_input[i]) {
                maxvalues[threadIdx.x] = maximums_input[i];
                maxposs[threadIdx.x] = max_indices_input[i];
            }
        }
        __syncthreads();

        if (blockSize >= 1024) {
            if (threadIdx.x < 512) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 512]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 512];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 512];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 512]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 512];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 512];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 512) {
            if (threadIdx.x < 256) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 256]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 256];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 256];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 256]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 256];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 256];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 256) {
            if (threadIdx.x < 128) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 128]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 128];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 128];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 128]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 128];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 128];
                }
            }
            __syncthreads();
        }

        if (blockSize >= 128) {
            if (threadIdx.x < 64) {
                if (minvalues[threadIdx.x] > minvalues[threadIdx.x + 64]) {
                    minvalues[threadIdx.x] = minvalues[threadIdx.x + 64];
                    minposs[threadIdx.x] = minposs[threadIdx.x + 64];
                }
                if (maxvalues[threadIdx.x] < maxvalues[threadIdx.x + 64]) {
                    maxvalues[threadIdx.x] = maxvalues[threadIdx.x + 64];
                    maxposs[threadIdx.x] = maxposs[threadIdx.x + 64];
                }
            }
            __syncthreads();
        }

        if (threadIdx.x < 32) {
            volatile cudafloat * _minvalue = minvalues;
            volatile int * _minpos = minposs;
            volatile cudafloat * _maxvalue = maxvalues;
            volatile int * _maxpos = maxposs;

            if (blockSize >= 64) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 32]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 32];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 32];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 32]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 32];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 32];
                }
            }

            if (blockSize >= 32) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 16]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 16];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 16];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 16]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 16];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 16];
                }
            }

            if (blockSize >= 16) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 8]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 8];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 8];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 8]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 8];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 8];
                }
            }

            if (blockSize >= 8) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 4]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 4];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 4];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 4]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 4];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 4];
                }
            }

            if (blockSize >= 4) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 2]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 2];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 2];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 2]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 2];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 2];
                }
            }

            if (blockSize >= 2) {
                if (_minvalue[threadIdx.x] > _minvalue[threadIdx.x + 1]) {
                    _minvalue[threadIdx.x] = _minvalue[threadIdx.x + 1];
                    _minpos[threadIdx.x] = _minpos[threadIdx.x + 1];
                }
                if (_maxvalue[threadIdx.x] < _maxvalue[threadIdx.x + 1]) {
                    _maxvalue[threadIdx.x] = _maxvalue[threadIdx.x + 1];
                    _maxpos[threadIdx.x] = _maxpos[threadIdx.x + 1];
                }
            }

            if (threadIdx.x == 0) {
                // high comes from the minimum over I_high
                d_b_high = minvalues[0];
                d_i_high = minposs[0];
                // low comes from the maximum over I_low
                d_b_low = maxvalues[0];
                d_i_low = maxposs[0];
            }
        }
    }
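
    // A rough host-side driver sketch tying the kernels together (illustration
    // only; the actual GPUMLib training loop lives elsewhere, and KT stands for
    // the chosen svm_kernel_type):
    //
    //     cuInitializeSMO<<<blocks, threads>>>(d_alphas, d_f, d_y, nsamples);
    //     do {
    //         cuFirstOrderHeuristic1stPass<256><<<blocks, 256, smem>>>(...);  // per-block min/max of f
    //         cuFirstOrderHeuristicFinalPass<256><<<1, 256, smem>>>(...);     // sets d_i_low / d_i_high
    //         cuUpdateAlphasAdvanced<KT><<<1, 1>>>(...);                      // single-thread alpha update
    //         cuUpdateKKTConditions<KT><<<blocks, threads>>>(...);            // refresh the f values
    //         // copy d_b_low / d_b_high to the host and stop once
    //         // b_low <= b_high + 2 * tolerance
    //     } while (!converged);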
}

#endif