GPUMLib  0.2.2
GPU Machine Learning Library
CorrectWeightsKernel.cu
/*
	Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
	Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes

	This file is part of GPUMLib.

	GPUMLib is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "../reduction/SumWarp.h"
#include "MBPkernels.h"

#define BIAS 0

// Each block corrects one weight (connection): blockIdx.x selects the input
// (index 0 is the bias) and blockIdx.y selects the neuron.
#define INPUT blockIdx.x
#define NUM_INPUTS_INCLUDING_BIAS gridDim.x
#define NUM_INPUTS (NUM_INPUTS_INCLUDING_BIAS - 1)

#define NEURON blockIdx.y
#define NUM_NEURONS gridDim.y

namespace GPUMLib {

template <int blockSize> KERNEL CorrectLayerWeights(cudafloat * rmsF, cudafloat * bestRMS, cudafloat maxErrorGrowth, cudafloat * inputs, cudafloat * localGradient, cudafloat * weights, cudafloat * learningRate, cudafloat * lastDeltaWithoutLearningMomentum, cudafloat * lastDelta, cudafloat maxStepSize, cudafloat u, cudafloat d, cudafloat r, cudafloat momentum, int numberPatterns) {
	extern __shared__ cudafloat deltas[];

	// Skip the update when the error has grown too much relative to the best RMS (robust training).
	if (bestRMS != nullptr) {
		__shared__ cudafloat rms;
		__shared__ cudafloat bRMS;

		rms = *rmsF;
		bRMS = *bestRMS;
		if (rms >= bRMS * maxErrorGrowth) return;
	}

	// Each thread accumulates the gradient of this connection over a subset of the patterns.
	deltas[threadIdx.x] = CUDA_VALUE(0.0);
	for(int p = threadIdx.x; p < numberPatterns; p += blockDim.x) {
		cudafloat delta = localGradient[p * NUM_NEURONS + NEURON];
		if (INPUT > BIAS) delta *= inputs[p * NUM_INPUTS + (INPUT - 1)];

		deltas[threadIdx.x] += delta;
	}
	__syncthreads();

	// Reduce the per-thread sums into deltas[0].
	SumBeforeWarp<blockSize>(deltas);

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(deltas);

		if (threadIdx.x == 0) {
			int connection = NEURON * NUM_INPUTS_INCLUDING_BIAS + INPUT;

			cudafloat delta = deltas[0] / numberPatterns;
			cudafloat learnRate = learningRate[connection];

			// Adapt the step size: increase it (u) if the gradient kept its sign, decrease it (d) otherwise.
			cudafloat factor = SAME_DIRECTION(lastDeltaWithoutLearningMomentum[connection], delta) ? u : d;
			learnRate *= factor;
			if (learnRate > maxStepSize) learnRate = maxStepSize;
			learningRate[connection] = learnRate;

			lastDeltaWithoutLearningMomentum[connection] = delta;

			// Apply momentum and update the weight, discarding the step if it overflows.
			delta += momentum * lastDelta[connection];
			lastDelta[connection] = delta;

			cudafloat w = weights[connection] + (learnRate * delta);
			if (IsInfOrNaN(w)) {
				lastDelta[connection] = CUDA_VALUE(0.0);
				lastDeltaWithoutLearningMomentum[connection] = CUDA_VALUE(0.0);
				if (bestRMS != nullptr) {
					learnRate *= r;
					learningRate[connection] = learnRate;
				}
			} else {
				weights[connection] = w;
			}
		}
	}
}

void KernelCorrectLayerWeights(cudaStream_t stream, dim3 & gridSize, int blockSize, cudafloat * rmsF, cudafloat * bestRMS, cudafloat maxErrorGrowth, cudafloat * inputs, cudafloat * localGradient, cudafloat * weights, cudafloat * learningRate, cudafloat * lastDeltaWithoutLearningMomentum, cudafloat * lastDelta, cudafloat maxStepSize, cudafloat u, cudafloat d, cudafloat r, cudafloat momentum, int numberPatterns) {
	switch(blockSize) {
		#ifdef FERMI
		case 1024:
			CorrectLayerWeights<1024><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		#endif
		case 512:
			CorrectLayerWeights<512><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 256:
			CorrectLayerWeights<256><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 128:
			CorrectLayerWeights<128><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 64:
			CorrectLayerWeights<64><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 32:
			CorrectLayerWeights<32><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 16:
			CorrectLayerWeights<16><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 8:
			CorrectLayerWeights<8><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 4:
			CorrectLayerWeights<4><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 2:
			CorrectLayerWeights<2><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
		case 1:
			CorrectLayerWeights<1><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, weights, learningRate, lastDeltaWithoutLearningMomentum, lastDelta, maxStepSize, u, d, r, momentum, numberPatterns);
			break;
	}
}

}
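
For reference, a minimal host-side sketch of how this launcher could be invoked, following the grid layout implied by the INPUT/NEURON macros above (one block per connection: x spans the inputs plus the bias, y spans the neurons). The helper function, variable names and hyper-parameter values below are illustrative assumptions, not part of GPUMLib.

#include "MBPkernels.h"

// Hypothetical helper, for illustration only. All device pointers are assumed
// to be already allocated and filled by the rest of the back-propagation code.
void CorrectLayerWeightsExample(cudaStream_t stream,
                                cudafloat * d_rmsF, cudafloat * d_bestRMS,
                                cudafloat * d_inputs, cudafloat * d_localGradient,
                                cudafloat * d_weights, cudafloat * d_learningRate,
                                cudafloat * d_lastDeltaWithoutLearningMomentum,
                                cudafloat * d_lastDelta,
                                int numInputs, int numNeurons, int numberPatterns) {
	// One block per weight: gridDim.x = inputs + bias, gridDim.y = neurons
	// (matches NUM_INPUTS_INCLUDING_BIAS and NUM_NEURONS inside the kernel).
	dim3 gridSize(numInputs + 1, numNeurons);

	// Threads per block: any power of two handled by the switch above
	// (1..512, or 1024 when FERMI is defined).
	int blockSize = 256;

	// Example hyper-parameter values (assumptions, not GPUMLib defaults).
	KernelCorrectLayerWeights(stream, gridSize, blockSize,
	                          d_rmsF, d_bestRMS, CUDA_VALUE(1.001) /* maxErrorGrowth */,
	                          d_inputs, d_localGradient, d_weights, d_learningRate,
	                          d_lastDeltaWithoutLearningMomentum, d_lastDelta,
	                          CUDA_VALUE(10.0) /* maxStepSize */,
	                          CUDA_VALUE(1.1) /* u: step increase */, CUDA_VALUE(0.9) /* d: step decrease */,
	                          CUDA_VALUE(0.5) /* r: reduction on overflow */, CUDA_VALUE(0.7) /* momentum */,
	                          numberPatterns);
}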
#define SAME_DIRECTION(X, Y)
Verifies whether X and Y have the same sign.

void KernelCorrectLayerWeights(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *rmsF, cudafloat *bestRMS, cudafloat maxErrorGrowth, cudafloat *inputs, cudafloat *localGradient, cudafloat *weights, cudafloat *learningRate, cudafloat *lastDeltaWithoutLearningMomentum, cudafloat *lastDelta, cudafloat maxStepSize, cudafloat u, cudafloat d, cudafloat r, cudafloat momentum, int numberPatterns)
Host wrapper that selects the template instantiation matching blockSize and launches the CorrectLayerWeights kernel on the given stream.

#define KERNEL
Defines the type of a kernel function.

#define CUDA_VALUE(X)
Defines a floating-point constant of type cudafloat.

typedef float cudafloat
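
The symbols above are declared in GPUMLib's common headers rather than in this file. A minimal sketch of plausible definitions, consistent with how the listing uses them and assuming a single-precision build (these are assumptions, not the library's actual source):

typedef float cudafloat;                 // device floating-point type (single precision assumed)

#define KERNEL __global__ void           // marks a function as a CUDA kernel entry point

#define CUDA_VALUE(X) ((cudafloat) (X))  // literal constant in the device precision

// True when X and Y are both positive or both negative (same sign).
#define SAME_DIRECTION(X, Y) (((X) > CUDA_VALUE(0.0) && (Y) > CUDA_VALUE(0.0)) || ((X) < CUDA_VALUE(0.0) && (Y) < CUDA_VALUE(0.0)))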