GPUMLib  0.2.2
GPU Machine Learning Library
CorrectWeightsSelInputs.cu
/*
	Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal (for more information see readme.txt)
	Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes

	This file is part of Multiple Back-Propagation.

	Multiple Back-Propagation is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "MBPkernels.h"

// Each thread block updates one selective-input neuron: the block index identifies the
// neuron and the grid size gives the number of selective-input neurons.
#define NEURON blockIdx.x
#define NUM_NEURONS gridDim.x

namespace GPUMLib {

// Adjusts the weights and biases of the selective-input neurons. Each block reduces the
// gradient contributions of all patterns for its neuron, then thread 0 applies the update,
// adapting the per-neuron step sizes (multiplied by u or d, capped at maxStepSize) and
// adding momentum. blockSize is the number of threads per block and is used to unroll the
// parallel reduction at compile time.
template <int blockSize> KERNEL CorrectWeightsSelectiveInputs(cudafloat * rmsF, cudafloat * bestRMS, cudafloat maxErrorGrowth, cudafloat * inputs, cudafloat * localGradient, cudafloat * selectiveNeuronsWeights, cudafloat * selectiveNeuronsBias, cudafloat * learningRateWeights, cudafloat * learningRateBias, cudafloat * lastDeltaWithoutLearningMomentumWeights, cudafloat * lastDeltaWithoutLearningMomentumBias, cudafloat * lastDeltaWeights, cudafloat * lastDeltaBias, cudafloat u, cudafloat d, cudafloat r, cudafloat maxStepSize, cudafloat momentum, int numberPatterns) {
	extern __shared__ cudafloat deltasWeights[];
	cudafloat * deltasBias = (deltasWeights + blockDim.x);

	// Robust training: skip the update when the current RMS has grown too much relative to
	// the best RMS found so far.
	if (bestRMS != NULL) {
		__shared__ cudafloat rms;
		__shared__ cudafloat bRMS;

		rms = *rmsF;
		bRMS = *bestRMS;
		if (rms >= bRMS * maxErrorGrowth) return;
	}

	// Each thread accumulates the bias and weight gradient contributions of its subset of patterns.
	deltasBias[threadIdx.x] = CUDA_VALUE(0.0);
	deltasWeights[threadIdx.x] = CUDA_VALUE(0.0);
	for(int p = threadIdx.x; p < numberPatterns; p += blockDim.x) {
		int n = p * NUM_NEURONS + NEURON;

		cudafloat i = inputs[n];
		if (!IsInfOrNaN(i)) { // patterns with an invalid (missing) value for this input are skipped
			cudafloat delta = localGradient[n];

			deltasBias[threadIdx.x] += delta;
			deltasWeights[threadIdx.x] += delta * i;
		}
	}
	__syncthreads();

	// Sum the per-thread accumulators in shared memory (tree reduction, unrolled for blockSize).
	if (blockSize >= 1024) {
		if (threadIdx.x < 512) {
			deltasBias[threadIdx.x] += deltasBias[threadIdx.x + 512];
			deltasWeights[threadIdx.x] += deltasWeights[threadIdx.x + 512];
		}
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256) {
			deltasBias[threadIdx.x] += deltasBias[threadIdx.x + 256];
			deltasWeights[threadIdx.x] += deltasWeights[threadIdx.x + 256];
		}
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128) {
			deltasBias[threadIdx.x] += deltasBias[threadIdx.x + 128];
			deltasWeights[threadIdx.x] += deltasWeights[threadIdx.x + 128];
		}
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64) {
			deltasBias[threadIdx.x] += deltasBias[threadIdx.x + 64];
			deltasWeights[threadIdx.x] += deltasWeights[threadIdx.x + 64];
		}
		__syncthreads();
	}

	// Final warp: relies on warp-synchronous execution; the volatile pointers prevent the
	// compiler from caching the shared values in registers between steps.
	if (threadIdx.x < 32) {
		volatile cudafloat * _deltasBias = deltasBias;
		volatile cudafloat * _deltasWeights = deltasWeights;

		if (blockSize >= 64) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 32];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 32];
		}

		if (blockSize >= 32) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 16];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 16];
		}

		if (blockSize >= 16) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 8];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 8];
		}

		if (blockSize >= 8) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 4];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 4];
		}

		if (blockSize >= 4) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 2];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 2];
		}

		if (blockSize >= 2) {
			_deltasBias[threadIdx.x] += _deltasBias[threadIdx.x + 1];
			_deltasWeights[threadIdx.x] += _deltasWeights[threadIdx.x + 1];
		}

		if (threadIdx.x == 0) {
			// Average the accumulated gradients over the patterns.
			cudafloat deltaB = deltasBias[0] / numberPatterns;
			cudafloat deltaW = deltasWeights[0] / numberPatterns;

			cudafloat learnRateB = learningRateBias[NEURON];
			cudafloat learnRateW = learningRateWeights[NEURON];

			// Grow the step size (u) while the gradient keeps its direction; shrink it (d) otherwise.
			cudafloat factorB = SAME_DIRECTION(lastDeltaWithoutLearningMomentumBias[NEURON], deltaB) ? u : d;
			cudafloat factorW = SAME_DIRECTION(lastDeltaWithoutLearningMomentumWeights[NEURON], deltaW) ? u : d;

			learnRateB *= factorB;
			learnRateW *= factorW;

			if (learnRateB > maxStepSize) learnRateB = maxStepSize;
			if (learnRateW > maxStepSize) learnRateW = maxStepSize;

			learningRateBias[NEURON] = learnRateB;
			learningRateWeights[NEURON] = learnRateW;

			lastDeltaWithoutLearningMomentumBias[NEURON] = deltaB;
			lastDeltaWithoutLearningMomentumWeights[NEURON] = deltaW;

			// Add the momentum term and keep the resulting deltas for the next update.
			deltaB += momentum * lastDeltaBias[NEURON];
			deltaW += momentum * lastDeltaWeights[NEURON];

			lastDeltaBias[NEURON] = deltaB;
			lastDeltaWeights[NEURON] = deltaW;

			cudafloat wb = selectiveNeuronsBias[NEURON] + (learnRateB * deltaB);
			cudafloat w = selectiveNeuronsWeights[NEURON] + (learnRateW * deltaW);

			// Discard an update that produced an invalid value; with robust training the
			// corresponding step size is also reduced by r.
			if (IsInfOrNaN(wb)) {
				lastDeltaBias[NEURON] = CUDA_VALUE(0.0);
				lastDeltaWithoutLearningMomentumBias[NEURON] = CUDA_VALUE(0.0);
				if (bestRMS != NULL) {
					learnRateB *= r;
					learningRateBias[NEURON] = learnRateB;
				}
			} else {
				selectiveNeuronsBias[NEURON] = wb;
			}

			if (IsInfOrNaN(w)) {
				lastDeltaWeights[NEURON] = CUDA_VALUE(0.0);
				lastDeltaWithoutLearningMomentumWeights[NEURON] = CUDA_VALUE(0.0);
				if (bestRMS != NULL) {
					learnRateW *= r;
					learningRateWeights[NEURON] = learnRateW;
				}
			} else {
				selectiveNeuronsWeights[NEURON] = w;
			}
		}
	}
}

// Launches the kernel with X threads per block, one block per neuron, and shared memory
// for the two reduction arrays (bias and weight partial sums).
#define CORRECT_WEIGHTS(X) CorrectWeightsSelectiveInputs<X><<<neurons, X, 2 * patterns * sizeof(cudafloat), stream>>>(rmsF, bestRMS, maxErrorGrowth, inputs, localGradient, selectiveNeuronsWeights, selectiveNeuronsBias, learningRateWeights, learningRateBias, lastDeltaWithoutLearningMomentumWeights, lastDeltaWithoutLearningMomentumBias, lastDeltaWeights, lastDeltaBias, u, d, r, maxStepSize, momentum, numberPatterns);

// Host wrapper: selects the template instantiation that matches the requested block size
// (the patterns argument, a power of two of at most 512 threads) and launches the kernel.
void KernelCorrectWeightsSelectiveInputs(cudaStream_t stream, int neurons, int patterns, cudafloat * rmsF, cudafloat * bestRMS, cudafloat maxErrorGrowth, cudafloat * inputs, cudafloat * localGradient, cudafloat * selectiveNeuronsWeights, cudafloat * selectiveNeuronsBias, cudafloat * learningRateWeights, cudafloat * learningRateBias, cudafloat * lastDeltaWithoutLearningMomentumWeights, cudafloat * lastDeltaWithoutLearningMomentumBias, cudafloat * lastDeltaWeights, cudafloat * lastDeltaBias, cudafloat u, cudafloat d, cudafloat r, cudafloat maxStepSize, cudafloat momentum, int numberPatterns) {
	switch(patterns) {
		case 512:
			CORRECT_WEIGHTS(512);
			break;
		case 256:
			CORRECT_WEIGHTS(256);
			break;
		case 128:
			CORRECT_WEIGHTS(128);
			break;
		case 64:
			CORRECT_WEIGHTS(64);
			break;
		case 32:
			CORRECT_WEIGHTS(32);
			break;
		case 16:
			CORRECT_WEIGHTS(16);
			break;
		case 8:
			CORRECT_WEIGHTS(8);
			break;
		case 4:
			CORRECT_WEIGHTS(4);
			break;
		case 2:
			CORRECT_WEIGHTS(2);
			break;
		case 1:
			CORRECT_WEIGHTS(1);
			break;
	}
}

}
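
For context, here is a minimal host-side sketch of how the wrapper above might be invoked. The layer dimensions, buffer names, and training parameters (u, d, r, maxStepSize, momentum) are illustrative assumptions, not values taken from the library; within GPUMLib these buffers are normally managed by the library's network classes rather than allocated by hand.

// Illustrative example only (not part of GPUMLib): update the weights of 3 selective-input
// neurons from the gradients of 1000 patterns, without the robust-training checks.
// inputs and localGradient are device arrays of neurons * numberPatterns cudafloats.
#include <cuda_runtime.h>
#include "MBPkernels.h"

void ExampleCorrectSelectiveInputs(cudafloat * inputs, cudafloat * localGradient) {
	const int neurons = 3;           // one block per selective-input neuron
	const int blockSize = 256;       // "patterns" argument: threads per block (power of two, <= 512)
	const int numberPatterns = 1000; // actual number of training patterns

	// Per-neuron device arrays (assumed to be initialised elsewhere: weights/bias randomised,
	// learning rates set to an initial step size, delta arrays zeroed).
	cudafloat *weights, *bias, *lrWeights, *lrBias;
	cudafloat *lastDeltaNoMomW, *lastDeltaNoMomB, *lastDeltaW, *lastDeltaB;
	cudafloat ** perNeuron[] = { &weights, &bias, &lrWeights, &lrBias,
	                             &lastDeltaNoMomW, &lastDeltaNoMomB, &lastDeltaW, &lastDeltaB };
	for (int i = 0; i < 8; i++) cudaMalloc((void **) perNeuron[i], neurons * sizeof(cudafloat));

	// rmsF and bestRMS set to NULL: the error-growth check and the step-size shrinking (r)
	// inside the kernel are skipped. maxErrorGrowth is then unused.
	KernelCorrectWeightsSelectiveInputs(NULL, neurons, blockSize, NULL, NULL, CUDA_VALUE(1.001),
		inputs, localGradient, weights, bias, lrWeights, lrBias,
		lastDeltaNoMomW, lastDeltaNoMomB, lastDeltaW, lastDeltaB,
		CUDA_VALUE(1.1), CUDA_VALUE(0.9), CUDA_VALUE(0.5), CUDA_VALUE(10.0), CUDA_VALUE(0.7),
		numberPatterns);

	// The launch is asynchronous; real code would synchronise and check errors before
	// using the updated weights.
	cudaDeviceSynchronize();

	for (int i = 0; i < 8; i++) cudaFree(*perNeuron[i]);
}
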
Referenced macros, typedefs, and functions:

#define SAME_DIRECTION(X, Y)
 Checks whether X and Y have the same sign.

#define KERNEL
 Defines the type of a kernel function.

#define CUDA_VALUE(X)

typedef float cudafloat

void KernelCorrectWeightsSelectiveInputs(cudaStream_t stream, int neurons, int patterns, cudafloat *rmsF, cudafloat *bestRMS, cudafloat maxErrorGrowth, cudafloat *inputs, cudafloat *localGradient, cudafloat *selectiveNeuronsWeights, cudafloat *selectiveNeuronsBias, cudafloat *learningRateWeights, cudafloat *learningRateBias, cudafloat *lastDeltaWithoutLearningMomentumWeights, cudafloat *lastDeltaWithoutLearningMomentumBias, cudafloat *lastDeltaWeights, cudafloat *lastDeltaBias, cudafloat u, cudafloat d, cudafloat r, cudafloat maxStepSize, cudafloat momentum, int numberPatterns)
 Launches the CorrectWeightsSelectiveInputs kernel on the given stream, selecting the template instantiation that matches the requested block size (patterns threads per block).
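
The helpers above are defined in the library's headers, which are not reproduced on this page. As a rough guide, single-precision definitions compatible with the way the kernel uses them might look like the sketch below; these are assumptions for illustration, not GPUMLib's actual definitions.

// Assumed, illustrative definitions only; GPUMLib's own headers are the authoritative source.
typedef float cudafloat;                      // single-precision build

#define KERNEL __global__ void                // a kernel function returns nothing and runs on the device
#define CUDA_VALUE(X) ((cudafloat)(X))        // constant expressed in the cudafloat type

// True when X and Y are both positive or both negative (a zero counts as a direction change).
#define SAME_DIRECTION(X, Y) \
	(((X) > CUDA_VALUE(0.0) && (Y) > CUDA_VALUE(0.0)) || ((X) < CUDA_VALUE(0.0) && (Y) < CUDA_VALUE(0.0)))

// Used to detect missing inputs and invalid (overflowed) weight updates.
#define IsInfOrNaN(X) (isinf(X) || isnan(X))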