GPUMLib  0.2.2
GPU Machine Learning Library
SVM.h
/*
Joao Goncalves is an MSc student at the University of Coimbra, Portugal
Copyright (C) 2012 Joao Goncalves

This file is part of GPUMLib.

GPUMLib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef SVM_H_
#define SVM_H_

#ifndef DEBUG
#define DEBUG 1
#endif

// standard headers used below (assert, printf/puts/exit, cout/endl, min),
// in case they are not pulled in transitively by the GPUMLib headers
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include "svm_kernel_type.h"

//GPUMLib stuff
#include "../common/CudaDefinitions.h"
#include "../common/Utilities.h"
#include "../memory/DeviceArray.h"
#include "../memory/DeviceMatrix.h"
#include "../memory/HostArray.h"
#include "../memory/HostMatrix.h"
#include <cuda.h>

#include "SVM_cuda_code.h"

using std::cout;
using std::endl;
using std::min;

namespace GPUMLib {

//! Represents an SVM which can be used to train and classify datasets on the GPU device.
class SVM {
public:

    //! Constructor to represent a SVM on the GPU. Used as a placeholder to execute device code.
    SVM() {
        //do nothing
    }

    //! Gathers the indices of the support vectors (the samples whose alpha is
    //! non-zero) into alpha_indices and returns how many were found.
    int getSupportVectorIndices(GPUMLib::HostArray<cudafloat> & h_alphas, int * alpha_indices, int size) {
        int non_zero_counter = 0;
        for (int i = 0; i < size; i++) {
            double alpha = h_alphas[i];
            if (alpha > 0.0) {
                alpha_indices[non_zero_counter] = i;
                non_zero_counter++;
            }
        }
        return non_zero_counter;
    }

    //! Returns the first (i.e., minimum) position in array whose value equals
    //! target, or -1 if no such element exists.
    int findMinimumPositionTarget_HostArray(GPUMLib::HostArray<int> & array, int array_length, int target) {
        for (int i = 0; i < array_length; i++) {
            int val = array[i];
            //cout << i << ":" << val << endl;
            if (val == target) {
                return i;
            }
        }
        return -1;
    }

    //! First pass of the two-level parallel reduction that sums the per-SV
    //! offsets used to compute the bias. The reduction kernel takes the block
    //! size as a compile-time template argument, hence the explicit dispatch.
    void calculateB_1stPass(cudaStream_t stream, int blocks, int blockSize,
        cudafloat * offsets, cudafloat * results, int n_svs) {
        if (blockSize == 1024) {
            cuCalculateB_1stPass<1024><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 512) {
            cuCalculateB_1stPass<512><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 256) {
            cuCalculateB_1stPass<256><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 128) {
            cuCalculateB_1stPass<128><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 64) {
            cuCalculateB_1stPass<64><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 32) {
            cuCalculateB_1stPass<32><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 16) {
            cuCalculateB_1stPass<16><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 8) {
            cuCalculateB_1stPass<8><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 4) {
            cuCalculateB_1stPass<4><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 2) {
            cuCalculateB_1stPass<2><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        } else if (blockSize == 1) {
            cuCalculateB_1stPass<1><<<blocks, blockSize, blockSize * sizeof(cudafloat), stream>>>(offsets, results, n_svs);
        }
    }

    //! Final pass of the bias reduction: a single block reduces the per-block
    //! partial sums produced by calculateB_1stPass into the device symbol d_b.
    void calculateB_FinalPass(cudaStream_t stream, int blockSize, cudafloat * input_floats, int input_size) {
        switch (blockSize) {
            case 1024:
                cuCalculateB_FinalPass<1024><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 512:
                cuCalculateB_FinalPass<512><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 256:
                cuCalculateB_FinalPass<256><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 128:
                cuCalculateB_FinalPass<128><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 64:
                cuCalculateB_FinalPass<64><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 32:
                cuCalculateB_FinalPass<32><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 16:
                cuCalculateB_FinalPass<16><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 8:
                cuCalculateB_FinalPass<8><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 4:
                cuCalculateB_FinalPass<4><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 2:
                cuCalculateB_FinalPass<2><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
            case 1:
                cuCalculateB_FinalPass<1><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(input_floats, input_size);
                break;
        }
    }
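
    // The block-size dispatches in this file repeat one launch per power of two
    // because the reduction kernels take the block size as a compile-time
    // template argument. A hypothetical helper macro (not part of GPUMLib)
    // could state the pattern once; a minimal sketch, kept commented out:
    //
    //   #define DISPATCH_BLOCK_SIZE(blockSize, LAUNCH) \
    //       switch (blockSize) {                       \
    //           case 1024: LAUNCH(1024); break;        \
    //           case 512:  LAUNCH(512);  break;        \
    //           case 256:  LAUNCH(256);  break;        \
    //           case 128:  LAUNCH(128);  break;        \
    //           case 64:   LAUNCH(64);   break;        \
    //           case 32:   LAUNCH(32);   break;        \
    //           case 16:   LAUNCH(16);   break;        \
    //           case 8:    LAUNCH(8);    break;        \
    //           case 4:    LAUNCH(4);    break;        \
    //           case 2:    LAUNCH(2);    break;        \
    //           case 1:    LAUNCH(1);    break;        \
    //       }
    //
    //   // usage, equivalent to the switch in calculateB_FinalPass:
    //   #define LAUNCH_B_FINAL(BS) cuCalculateB_FinalPass<BS> \
    //       <<<1, BS, BS * sizeof(cudafloat), stream>>>(input_floats, input_size)
    //   DISPATCH_BLOCK_SIZE(blockSize, LAUNCH_B_FINAL);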

    //! First pass of the first-order heuristic: each block computes a local
    //! minimum/maximum of the optimality measures f (together with their
    //! indices) over its chunk of the dataset. The shared memory holds, for
    //! each thread, a minimum value and index plus a maximum value and index.
    void kernelFirstOrderHeuristic1stPass(cudaStream_t stream, int blocks, int blockSize,
        cudafloat * f, cudafloat * alphas,
        int * y, cudafloat * minimuns, int * min_indices,
        cudafloat * maximuns, int * max_indices, int input_size,
        cudafloat constant_epsilon, cudafloat constant_c) {
        switch (blockSize) {
            case 1024:
                cuFirstOrderHeuristic1stPass<1024><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 512:
                cuFirstOrderHeuristic1stPass<512><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 256:
                cuFirstOrderHeuristic1stPass<256><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 128:
                cuFirstOrderHeuristic1stPass<128><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 64:
                cuFirstOrderHeuristic1stPass<64><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 32:
                cuFirstOrderHeuristic1stPass<32><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 16:
                cuFirstOrderHeuristic1stPass<16><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 8:
                cuFirstOrderHeuristic1stPass<8><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 4:
                cuFirstOrderHeuristic1stPass<4><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 2:
                cuFirstOrderHeuristic1stPass<2><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
            case 1:
                cuFirstOrderHeuristic1stPass<1><<<blocks, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(f, alphas, y, minimuns, min_indices, maximuns, max_indices, input_size, constant_epsilon, constant_c);
                break;
        }
    }

    //! Final pass of the first-order heuristic: a single block reduces the
    //! per-block minima/maxima into the chosen pair (i_low, i_high) and the
    //! corresponding offsets (b_low, b_high).
    void kernelFirstOrderHeuristicFinalPass(cudaStream_t stream, int blockSize,
        cudafloat * minimuns_input, int * min_indices_input,
        cudafloat * maximuns_input, int * max_indices_input,
        int input_size) {
        switch (blockSize) {
            case 1024:
                cuFirstOrderHeuristicFinalPass<1024><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 512:
                cuFirstOrderHeuristicFinalPass<512><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 256:
                cuFirstOrderHeuristicFinalPass<256><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 128:
                cuFirstOrderHeuristicFinalPass<128><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 64:
                cuFirstOrderHeuristicFinalPass<64><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 32:
                cuFirstOrderHeuristicFinalPass<32><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 16:
                cuFirstOrderHeuristicFinalPass<16><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 8:
                cuFirstOrderHeuristicFinalPass<8><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 4:
                cuFirstOrderHeuristicFinalPass<4><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 2:
                cuFirstOrderHeuristicFinalPass<2><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
            case 1:
                cuFirstOrderHeuristicFinalPass<1><<<1, blockSize, blockSize * (sizeof(cudafloat) + sizeof(int) + sizeof(cudafloat) + sizeof(int)), stream>>>(minimuns_input, min_indices_input, maximuns_input, max_indices_input, input_size);
                break;
        }
    }

    //! Updates the two Lagrange multipliers (alphas) chosen by the heuristic.
    //! cuUpdateAlphasAdvanced is launched with a single thread, as the
    //! two-variable analytic SMO update is inherently sequential.
    void updateAlphas(cudaStream_t stream, GPUMLib::svm_kernel_type kernel_type,
        GPUMLib::DeviceMatrix<cudafloat> &d_x, GPUMLib::DeviceArray<cudafloat> &d_alphas,
        GPUMLib::DeviceArray<int> &d_y, cudafloat constant_c_negative, cudafloat constant_c_positive,
        GPUMLib::DeviceArray<cudafloat> &d_kernel_args, int training_dataset_size, int ndims) {
        switch (kernel_type) {
            case SVM_KT_LINEAR:
                cuUpdateAlphasAdvanced<SVM_KT_LINEAR><<<1, 1, 0, stream>>>(d_x.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                    constant_c_negative, constant_c_positive, d_kernel_args.Pointer(),
                    training_dataset_size, ndims);
                break;
            case SVM_KT_POLYNOMIAL:
                cuUpdateAlphasAdvanced<SVM_KT_POLYNOMIAL><<<1, 1, 0, stream>>>(d_x.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                    constant_c_negative, constant_c_positive, d_kernel_args.Pointer(),
                    training_dataset_size, ndims);
                break;
            case SVM_KT_RBF:
                cuUpdateAlphasAdvanced<SVM_KT_RBF><<<1, 1, 0, stream>>>(d_x.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                    constant_c_negative, constant_c_positive, d_kernel_args.Pointer(),
                    training_dataset_size, ndims);
                break;
            case SVM_KT_SIGMOID:
                cuUpdateAlphasAdvanced<SVM_KT_SIGMOID><<<1, 1, 0, stream>>>(d_x.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                    constant_c_negative, constant_c_positive, d_kernel_args.Pointer(),
                    training_dataset_size, ndims);
                break;
            case SVM_KT_UKF:
                cuUpdateAlphasAdvanced<SVM_KT_UKF><<<1, 1, 0, stream>>>(d_x.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                    constant_c_negative, constant_c_positive, d_kernel_args.Pointer(),
                    training_dataset_size, ndims);
                break;
        }
    }

    //! Recomputes the optimality (KKT) measure f_i for every sample on the device.
    void updateKKTConditions(cudaStream_t stream, GPUMLib::svm_kernel_type kernel_type, int n_blocks, int blocksize,
        GPUMLib::DeviceArray<cudafloat> &d_f, GPUMLib::DeviceArray<int> &d_y, GPUMLib::DeviceMatrix<cudafloat> &d_x,
        GPUMLib::DeviceArray<cudafloat> &d_kernel_args, int training_dataset_size, int ndims) {
        switch (kernel_type) {
            case SVM_KT_LINEAR:
                cuUpdateKKTConditions<SVM_KT_LINEAR><<<n_blocks, blocksize, 0, stream>>>(d_f.Pointer(), d_y.Pointer(), d_x.Pointer(), d_kernel_args.Pointer(), training_dataset_size, ndims);
                break;
            case SVM_KT_POLYNOMIAL:
                cuUpdateKKTConditions<SVM_KT_POLYNOMIAL><<<n_blocks, blocksize, 0, stream>>>(d_f.Pointer(), d_y.Pointer(), d_x.Pointer(), d_kernel_args.Pointer(), training_dataset_size, ndims);
                break;
            case SVM_KT_RBF:
                cuUpdateKKTConditions<SVM_KT_RBF><<<n_blocks, blocksize, 0, stream>>>(d_f.Pointer(), d_y.Pointer(), d_x.Pointer(), d_kernel_args.Pointer(), training_dataset_size, ndims);
                break;
            case SVM_KT_SIGMOID:
                cuUpdateKKTConditions<SVM_KT_SIGMOID><<<n_blocks, blocksize, 0, stream>>>(d_f.Pointer(), d_y.Pointer(), d_x.Pointer(), d_kernel_args.Pointer(), training_dataset_size, ndims);
                break;
            case SVM_KT_UKF:
                cuUpdateKKTConditions<SVM_KT_UKF><<<n_blocks, blocksize, 0, stream>>>(d_f.Pointer(), d_y.Pointer(), d_x.Pointer(), d_kernel_args.Pointer(), training_dataset_size, ndims);
                break;
        }
    }

    //! Checks for a pending CUDA error; prints it and aborts if one occurred.
    //! (cudaGetLastError returns the last runtime error and resets it to cudaSuccess.)
    void checkCUDA_Errors() {
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
            // print the CUDA error message and exit
            printf("CUDA error: %s\n", cudaGetErrorString(error));
            exit(-1);
        }
    }

    //! Runs the Sequential Minimal Optimization (SMO) training loop on the device.
    void runSMO(GPUMLib::HostMatrix<cudafloat> &h_x, GPUMLib::HostArray<int> &h_y,
        GPUMLib::DeviceArray<cudafloat> &d_alphas,
        cudafloat constant_c_negative, cudafloat constant_c_positive,
        cudafloat constant_epsilon, cudafloat constant_tau, GPUMLib::svm_kernel_type kernel_type,
        cudafloat * kernel_args, int amount_threads) {
        int training_dataset_size = h_x.Rows();
        int ndims = h_x.Columns();

        if (DEBUG)
            cout << "started SMO..." << endl;

        int n_threads_per_block = NumberThreadsPerBlockThatBestFit(training_dataset_size, amount_threads);
        if (n_threads_per_block < 64)
            n_threads_per_block = 64;
        int n_blocks = NumberBlocks(training_dataset_size, n_threads_per_block);
        int n_threads_per_block_2ndpass = NumberThreadsPerBlockThatBestFit(n_blocks, n_threads_per_block);
        if (n_threads_per_block_2ndpass < 64)
            n_threads_per_block_2ndpass = 64;

        cout << "using " << n_blocks << " block(s) of " << n_threads_per_block << " threads each" << endl;
        cout << "using " << n_threads_per_block_2ndpass << " threads per block on the second reduction" << endl;

        // create data structures in the host's memory
        GPUMLib::HostArray<cudafloat> h_f(training_dataset_size);

        GPUMLib::HostArray<cudafloat> h_kernel_args(4);
        h_kernel_args[0] = (cudafloat) kernel_args[0];
        h_kernel_args[1] = (cudafloat) kernel_args[1];
        h_kernel_args[2] = (cudafloat) kernel_args[2];
        h_kernel_args[3] = (cudafloat) kernel_args[3];
        GPUMLib::DeviceArray<cudafloat> d_kernel_args(h_kernel_args);

        //copy to device
        GPUMLib::DeviceArray<int> d_y(h_y);
        GPUMLib::DeviceMatrix<cudafloat> d_x(h_x);
        GPUMLib::DeviceArray<cudafloat> d_f(h_f);

        cudafloat constant_c = min(constant_c_negative, constant_c_positive);

        //1. initialize
        int iteration = 0;
        //bhigh = -1
        cudafloat h_b_high = CUDA_VALUE(-1.0);
        //blow = 1
        cudafloat h_b_low = CUDA_VALUE(1.0);
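        // b_high and b_low track the boundary optimality values of the two index
        // sets used by the first-order heuristic; the loop below declares
        // convergence once the KKT gap closes, i.e. b_low <= b_high + tau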

        //we can use the host for this, no need to use the GPU
        //ihigh: the first i such that yi = 1
        int i_high = findMinimumPositionTarget_HostArray(h_y, training_dataset_size, 1);

        //ilow: the first i such that yi = -1
        int i_low = findMinimumPositionTarget_HostArray(h_y, training_dataset_size, -1);

        //make sure everything is OK
        if (i_high < 0 || i_low < 0) {
            cout << "Error: could not initialize SMO's indices (both classes must be present)." << endl;
            assert(i_high >= 0);
            assert(i_low >= 0);
        }

        GPUMLib::DeviceArray<cudafloat> d_minimums(n_blocks);
        GPUMLib::DeviceArray<cudafloat> d_maximuns(n_blocks);
        GPUMLib::DeviceArray<int> d_minimums_indices(n_blocks);
        GPUMLib::DeviceArray<int> d_maximums_indices(n_blocks);

        cudaStream_t stream_memory_transaction;
        cudaStreamCreate(&stream_memory_transaction);
        cudaStream_t stream_kernel_execution = 0;
        // cudaStreamCreate(&stream_kernel_execution);

        // initialize the alphas and the optimality conditions
        cuInitializeSMO<<<n_blocks, n_threads_per_block>>>(d_alphas.Pointer(), d_f.Pointer(), d_y.Pointer(), training_dataset_size);

        //copy the low/high indices to the device's memory
        cudaMemcpyToSymbol(d_i_high, &i_high, sizeof(int));
        cudaMemcpyToSymbol(d_i_low, &i_low, sizeof(int));
        //copy the low/high offsets to the device's memory
        cudaMemcpyToSymbol(d_b_high, &h_b_high, sizeof(cudafloat));
        cudaMemcpyToSymbol(d_b_low, &h_b_low, sizeof(cudafloat));

        // update the alphas before entering the loop (only one thread required)
        updateAlphas(stream_kernel_execution, kernel_type, d_x, d_alphas, d_y, constant_c, constant_c, d_kernel_args, training_dataset_size, ndims);

        // cout << "----started SMO" << endl;
        bool converged = false;
        bool memory_transaction_requested = false;

        int last_i_low = -1;
        int last_i_high = -1;

        int convergence_checker = 0;

        // SMO loop
        for (;;) {
            // only poll the device for convergence every 8 iterations
            if (convergence_checker >= 8) {
                convergence_checker = 0;
                if (!memory_transaction_requested) {
                    //copy the low/high offsets and indices from the device's memory
                    cudaMemcpyFromSymbolAsync(&h_b_high, d_b_high, sizeof(cudafloat), 0, cudaMemcpyDeviceToHost, stream_memory_transaction);
                    cudaMemcpyFromSymbolAsync(&h_b_low, d_b_low, sizeof(cudafloat), 0, cudaMemcpyDeviceToHost, stream_memory_transaction);
                    cudaMemcpyFromSymbolAsync(&i_low, d_i_low, sizeof(int), 0, cudaMemcpyDeviceToHost, stream_memory_transaction);
                    cudaMemcpyFromSymbolAsync(&i_high, d_i_high, sizeof(int), 0, cudaMemcpyDeviceToHost, stream_memory_transaction);
                    memory_transaction_requested = true;
                    // cudaStreamSynchronize(stream_memory_transaction);
                } else {
                    if (cudaStreamQuery(stream_memory_transaction) == cudaSuccess) {
                        memory_transaction_requested = false;

                        // DEBUG
                        // if ((iteration & 256) && !(iteration & 128) && !(iteration & 64) && !(iteration & 32) && !(iteration & 16) && !(iteration & 8) && !(iteration & 4)
                        //     && !(iteration & 2) && !(iteration & 1)) {
                        //     cout << "iteration:" << iteration << "\tgap:" << h_b_low - h_b_high <<
                        //         "\tb_low:" << h_b_low << "\tb_high:" << h_b_high <<
                        //         "\ti_low:" << i_low << "\ti_high:" << i_high << endl;
                        // }

                        //the two chosen alphas did not change for one iteration
                        //TODO: this must be solved
                        if (i_low == last_i_low && i_high == last_i_high) {
                            //unless the heuristic is changed, there is nothing to be done:
                            //terminate SMO
                            converged = true;
                        }
                        last_i_low = i_low;
                        last_i_high = i_high;

                        if (h_b_low <= h_b_high + constant_tau || converged) {
                            // cout << "----SMO converged!" << endl;
                            cout << "Converged! iteration:" << iteration << "\tgap:" << h_b_low - h_b_high << "\tb_low:" << h_b_low << "\tb_high:" << h_b_high << "\ti_low:"
                                << i_low << "\ti_high:" << i_high << endl;
                            converged = true;
                        }
                    }
                }
            }
            convergence_checker++;
            //checkCUDA_Errors();

            if (converged)
                break;

            //check the optimality conditions:
            //update f_i for all i = 0...n-1
            updateKKTConditions(stream_kernel_execution, kernel_type, n_blocks, n_threads_per_block, d_f, d_y, d_x, d_kernel_args, training_dataset_size, ndims);

            //compute b_high, i_high, b_low, i_low
            //using the I_high and I_low sets
            kernelFirstOrderHeuristic1stPass(stream_kernel_execution, n_blocks, n_threads_per_block, d_f.Pointer(), d_alphas.Pointer(), d_y.Pointer(),
                d_minimums.Pointer(), d_minimums_indices.Pointer(), d_maximuns.Pointer(), d_maximums_indices.Pointer(), training_dataset_size, constant_epsilon,
                constant_c);
            kernelFirstOrderHeuristicFinalPass(stream_kernel_execution, n_threads_per_block_2ndpass, d_minimums.Pointer(), d_minimums_indices.Pointer(),
                d_maximuns.Pointer(), d_maximums_indices.Pointer(), n_blocks);

            //update the two Lagrange multipliers
            updateAlphas(stream_kernel_execution, kernel_type, d_x, d_alphas, d_y, constant_c, constant_c, d_kernel_args, training_dataset_size, ndims);

            iteration++;
        }

        puts("computing b");

        cudaStreamSynchronize(stream_memory_transaction);
        cudaStreamSynchronize(stream_kernel_execution);

        if (stream_memory_transaction != 0)
            cudaStreamDestroy(stream_memory_transaction);
        if (stream_kernel_execution != 0)
            cudaStreamDestroy(stream_kernel_execution);

        cout << "total iterations: " << iteration << endl;
    }

    //! Trains the SVM on the GPU: runs SMO on the training samples, gathers the
    //! support vectors into h_model and computes the bias h_b.
    void train(GPUMLib::HostMatrix<cudafloat> &h_samples, GPUMLib::HostArray<int> &h_classes,
        cudafloat constant_c_negative, cudafloat constant_c_positive,
        cudafloat constant_epsilon, cudafloat constant_tau,
        svm_kernel_type kernel_type, cudafloat * kernel_args,
        int amount_threads, GPUMLib::HostArray<cudafloat> &h_alphas,
        int &n_sv, GPUMLib::HostMatrix<cudafloat> &h_model,
        cudafloat &h_b) {
        //allocate an array on the device to hold the alphas for each sample
        GPUMLib::DeviceArray<cudafloat> d_alphas(h_alphas);

        // "fire for effect!" (SMO)
        runSMO(h_samples, h_classes, d_alphas,
            constant_c_negative, constant_c_positive, constant_epsilon, constant_tau,
            kernel_type, kernel_args, amount_threads);

        // copy the alphas from the device's memory to the host
        h_alphas = d_alphas;

        //compress the alphas array (only store alphas greater than 0)
        if (DEBUG)
            cout << "creating model..." << endl;
        int training_dataset_size = h_samples.Rows();
        int * alpha_indices = new int[training_dataset_size];
        n_sv = getSupportVectorIndices(h_alphas, alpha_indices, training_dataset_size);
        cout << n_sv << " SVs" << endl;

        // populate the model matrix
        int ndims = h_samples.Columns();
        h_model.ResizeWithoutPreservingData(n_sv, ndims + 2);
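        // each model row stores [alpha_i, y_i, x_i(0) ... x_i(ndims-1)]: the
        // Lagrange multiplier, the class and the features of one support vector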
        for (int row = 0; row < n_sv; row++) {
            //the index of the current non-zero alpha (support vector) in the original dataset
            int index = alpha_indices[row];
            //the value of alpha (Lagrange multiplier)
            cudafloat alpha_i = h_alphas[index];
            //set alpha on the model
            h_model(row, 0) = alpha_i;
            //the class associated with the current alpha
            int c_i = h_classes[index];
            //set the class on the model
            h_model(row, 1) = (cudafloat) c_i;
            //set the remaining elements to the features
            for (int feature_i = 0; feature_i < ndims; feature_i++) {
                //get the original attribute
                cudafloat attribute = h_samples(index, feature_i);
                //copy it to the model
                h_model(row, feature_i + 2) = attribute;
            }
        }
        delete[] alpha_indices;

        cudaStream_t stream_bias_calculus;
        cudaStreamCreate(&stream_bias_calculus);

        GPUMLib::DeviceMatrix<cudafloat> d_model(h_model);
        // compute the bias using the model
        GPUMLib::HostArray<cudafloat> h_offsets(n_sv);
        GPUMLib::DeviceArray<cudafloat> d_offsets(h_offsets);

        GPUMLib::HostArray<cudafloat> h_kernel_args(4);
        h_kernel_args[0] = (cudafloat) kernel_args[0];
        h_kernel_args[1] = (cudafloat) kernel_args[1];
        h_kernel_args[2] = (cudafloat) kernel_args[2];
        h_kernel_args[3] = (cudafloat) kernel_args[3];
        GPUMLib::DeviceArray<cudafloat> d_kernel_args(h_kernel_args);

        int n_threads_per_block = NumberThreadsPerBlockThatBestFit(n_sv, amount_threads);
        int n_blocks = NumberBlocks(n_sv, n_threads_per_block);
        int n_threads_per_block_2ndpass = NumberThreadsPerBlockThatBestFit(n_blocks, n_threads_per_block);
        if (n_threads_per_block_2ndpass < 64)
            n_threads_per_block_2ndpass = 64;

        //printf("n_blocks:%d\tn_threads_per_block:%d\tn_threads_per_block_2ndpass:%d\n", n_blocks, n_threads_per_block, n_threads_per_block_2ndpass);

        //compute the offsets on the GPU
        switch (kernel_type) {
            case SVM_KT_LINEAR:
                cuCalculateOffsetsUsingModel<SVM_KT_LINEAR><<<n_blocks, n_threads_per_block, 0, stream_bias_calculus>>>
                    (d_offsets.Pointer(), d_model.Pointer(), n_sv, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_POLYNOMIAL:
                cuCalculateOffsetsUsingModel<SVM_KT_POLYNOMIAL><<<n_blocks, n_threads_per_block, 0, stream_bias_calculus>>>
                    (d_offsets.Pointer(), d_model.Pointer(), n_sv, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_RBF:
                cuCalculateOffsetsUsingModel<SVM_KT_RBF><<<n_blocks, n_threads_per_block, 0, stream_bias_calculus>>>
                    (d_offsets.Pointer(), d_model.Pointer(), n_sv, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_SIGMOID:
                cuCalculateOffsetsUsingModel<SVM_KT_SIGMOID><<<n_blocks, n_threads_per_block, 0, stream_bias_calculus>>>
                    (d_offsets.Pointer(), d_model.Pointer(), n_sv, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_UKF:
                cuCalculateOffsetsUsingModel<SVM_KT_UKF><<<n_blocks, n_threads_per_block, 0, stream_bias_calculus>>>
                    (d_offsets.Pointer(), d_model.Pointer(), n_sv, ndims, d_kernel_args.Pointer());
                break;
        }

        h_offsets = d_offsets;

        GPUMLib::DeviceArray<cudafloat> d_partialsums(n_blocks);

        calculateB_1stPass(stream_bias_calculus, n_blocks, n_threads_per_block, d_offsets.Pointer(), d_partialsums.Pointer(), n_sv);
        cudaStreamSynchronize(stream_bias_calculus);
        calculateB_FinalPass(stream_bias_calculus, n_threads_per_block_2ndpass, d_partialsums.Pointer(), n_blocks);
        cudaStreamSynchronize(stream_bias_calculus);
        cudaMemcpyFromSymbol(&h_b, d_b, sizeof(cudafloat), 0, cudaMemcpyDeviceToHost);
        //the bias is the average of the offsets over all the support vectors
        h_b = h_b / n_sv;
        cout << "bias: " << h_b << " nSVs: " << n_sv << endl;

        if (stream_bias_calculus != 0) {
            cudaStreamSynchronize(stream_bias_calculus);
            cudaStreamDestroy(stream_bias_calculus);
        }

        //TODO: that shared memory bug
        // checkCUDA_Errors();
    }

    //! Classifies a testing dataset on the GPU using a previously trained model,
    //! writing the predicted class of each sample to h_testing_results.
    void classify(GPUMLib::HostMatrix<cudafloat> &h_model, GPUMLib::HostMatrix<cudafloat> &h_testing_samples,
        cudafloat * kernel_args, int amount_threads,
        GPUMLib::svm_kernel_type kernel_type, int n_sv, cudafloat h_b, int ndims,
        GPUMLib::HostArray<int> &h_testing_results) {
        //create the GPU structures
        GPUMLib::DeviceMatrix<cudafloat> d_model(h_model);
        GPUMLib::DeviceMatrix<cudafloat> d_testing_samples(h_testing_samples);
        GPUMLib::DeviceArray<int> d_testing_results(h_testing_results);

        GPUMLib::HostArray<cudafloat> h_kernel_args(4);
        h_kernel_args[0] = (cudafloat) kernel_args[0];
        h_kernel_args[1] = (cudafloat) kernel_args[1];
        h_kernel_args[2] = (cudafloat) kernel_args[2];
        h_kernel_args[3] = (cudafloat) kernel_args[3];
        GPUMLib::DeviceArray<cudafloat> d_kernel_args(h_kernel_args);

        int testing_dataset_size = h_testing_samples.Rows();
        int n_threads_per_block = NumberThreadsPerBlockThatBestFit(testing_dataset_size, amount_threads);
        int n_blocks = NumberBlocks(testing_dataset_size, n_threads_per_block);
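        // the grid covers the testing set with one thread per sample, so each
        // thread (presumably) evaluates its sample against all n_sv support vectors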

        //process the dataset on the GPU
        switch (kernel_type) {
            case SVM_KT_LINEAR:
                cuClassifyDataSet<SVM_KT_LINEAR><<<n_blocks, n_threads_per_block>>>
                    (d_testing_results.Pointer(), d_testing_samples.Pointer(), testing_dataset_size, d_model.Pointer(), n_sv, h_b, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_POLYNOMIAL:
                cuClassifyDataSet<SVM_KT_POLYNOMIAL><<<n_blocks, n_threads_per_block>>>
                    (d_testing_results.Pointer(), d_testing_samples.Pointer(), testing_dataset_size, d_model.Pointer(), n_sv, h_b, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_RBF:
                cuClassifyDataSet<SVM_KT_RBF><<<n_blocks, n_threads_per_block>>>
                    (d_testing_results.Pointer(), d_testing_samples.Pointer(), testing_dataset_size, d_model.Pointer(), n_sv, h_b, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_SIGMOID:
                cuClassifyDataSet<SVM_KT_SIGMOID><<<n_blocks, n_threads_per_block>>>
                    (d_testing_results.Pointer(), d_testing_samples.Pointer(), testing_dataset_size, d_model.Pointer(), n_sv, h_b, ndims, d_kernel_args.Pointer());
                break;
            case SVM_KT_UKF:
                cuClassifyDataSet<SVM_KT_UKF><<<n_blocks, n_threads_per_block>>>
                    (d_testing_results.Pointer(), d_testing_samples.Pointer(), testing_dataset_size, d_model.Pointer(), n_sv, h_b, ndims, d_kernel_args.Pointer());
                break;
        }
        //copy the classification results from device to host
        h_testing_results = d_testing_results;
    }

};
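
// Example usage (a minimal sketch; the dataset sizes, the variable names and the
// kernel parameters below are illustrative only and not part of GPUMLib):
//
//   GPUMLib::HostMatrix<cudafloat> samples(n, d);   // n training samples, d features
//   GPUMLib::HostArray<int> classes(n);             // labels in {-1, +1}
//   // ... fill samples and classes ...
//
//   GPUMLib::HostArray<cudafloat> alphas(n);
//   GPUMLib::HostMatrix<cudafloat> model;
//   cudafloat kernel_args[4] = {CUDA_VALUE(1.0), CUDA_VALUE(0.0), CUDA_VALUE(0.0), CUDA_VALUE(0.0)};
//   int n_sv = 0;
//   cudafloat b = CUDA_VALUE(0.0);
//
//   GPUMLib::SVM svm;
//   svm.train(samples, classes,
//             CUDA_VALUE(1.0), CUDA_VALUE(1.0),      // C for the negative/positive class
//             CUDA_VALUE(0.001), CUDA_VALUE(0.001),  // epsilon, tau
//             SVM_KT_RBF, kernel_args, 256,          // kernel, its arguments, threads
//             alphas, n_sv, model, b);               // outputs
//
//   GPUMLib::HostMatrix<cudafloat> test_samples(m, d);  // m testing samples
//   GPUMLib::HostArray<int> predictions(m);
//   svm.classify(model, test_samples, kernel_args, 256,
//                SVM_KT_RBF, n_sv, b, d, predictions);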

} //namespace GPUMLib

#endif //SVM_H_