GPUMLib  0.2.2
GPU Machine Learning Library
BackPropagation.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <assert.h>
22 #include <stdlib.h>
23 #include "BackPropagation.h"
24 #include "MBPkernels.h"
25 
26 namespace GPUMLib {
27 
// Number of neurons in the last space-network layer, i.e. the neurons whose
// outputs modulate (selectively activate) the main-network neurons. Set in
// CreateNetwork when the last space layer is built; 0 when there is none.
int BackPropagation::Layer::totalNeuronsWithSelectiveActivation = 0;

// Number of training patterns (samples); shared by all layers of the network
// and set in CreateNetwork.
int BackPropagation::Layer::patterns;
30 
// Initializes all layer connection weights with uniform random values in
// [minValue, maxValue], sets every per-connection learning rate to
// initialLearningRate and clears the step deltas. The host arrays are then
// copied to the corresponding device arrays (best weights start equal to the
// current weights).
void BackPropagation::Layer::RandomizeWeights(cudafloat minValue, cudafloat maxValue, cudafloat initialLearningRate) {
	assert(maxValue > minValue);

	HostArray<cudafloat> initialWeights(connections);
	HostArray<cudafloat> initialRates(connections);
	HostArray<cudafloat> initialDeltas(connections);

	cudafloat range = maxValue - minValue;

	for (int w = 0; w < connections; w++) {
		initialWeights[w] = range * ((cudafloat) rand() / RAND_MAX) + minValue;
		initialRates[w] = initialLearningRate;
		initialDeltas[w] = CUDA_VALUE(0.0);
	}

	d_weights = initialWeights;
	d_bestWeights = initialWeights;
	d_learnRate = initialRates;
	d_lastDelta = initialDeltas;
	d_lastDeltaWithoutLearningMomentum = initialDeltas;
}
48 
// Computes this layer's outputs for all training patterns on the given
// stream. For the output layer the kernels additionally compute the error
// terms (local gradients and per-pattern RMS contributions).
//
// Launches are split into chunks of at most 65535 patterns because a grid
// dimension is limited to 65535 blocks on the devices this code targets.
//
// BUG FIX: the chunk size is now clamped to the *remaining* patterns
// (patterns - processed) instead of the total. The previous code relaunched
// full-size chunks, reading and writing past the end of the device buffers
// whenever patterns > 65535.
void BackPropagation::Layer::Fire(cudaStream_t stream) {
	dim3 dimNeuronsPatterns;
	dimNeuronsPatterns.x = neurons;

	if (isOutputLayer) {
		if(connections > MAX_THREADS_PER_BLOCK) {
			int processed = 0;
			do {
				int patternsToProcess = patterns - processed;
				if (patternsToProcess > 65535) patternsToProcess = 65535;
				dimNeuronsPatterns.y = patternsToProcess;
				// NOTE(review): in this branch d_rms holds patterns * neurons
				// entries (see CreateNetwork), so the offset d_rms + processed
				// looks suspicious for chunked launches — verify against
				// KernelFireOutputLayer's internal indexing.
				KernelFireOutputLayer(stream, dimNeuronsPatterns, inputsBlockSize, d_inputs + (processed * inputsWithoutBias), d_weights.Pointer(), (d_m != nullptr) ? d_m + (processed * totalNeuronsWithSelectiveActivation) : nullptr, mOffset, totalNeuronsWithSelectiveActivation, d_desOutputs + (processed * neurons), d_outputs.Pointer() + (processed * neurons), d_localGradient.Pointer() + (processed * neurons), d_rms + processed, (d_localGradSpaceNet == nullptr) ? nullptr : d_localGradSpaceNet + (processed * totalNeuronsWithSelectiveActivation), inputsWithoutBias);
				processed += patternsToProcess;
			} while (processed < patterns);
		} else {
			int processed = 0;
			do {
				int patternsToProcess = patterns - processed;
				if (patternsToProcess > 65535) patternsToProcess = 65535;
				FireOutputLayer<<<patternsToProcess, dimInputsNeurons, sharedMemFire, stream>>>(d_inputs + (processed * inputsWithoutBias), d_weights.Pointer(), (d_m == nullptr) ? nullptr : d_m + (processed * totalNeuronsWithSelectiveActivation), mOffset, totalNeuronsWithSelectiveActivation, d_desOutputs + (processed * neurons), d_outputs.Pointer() + (processed * neurons), d_localGradient.Pointer() + (processed * neurons), d_rms + processed, (d_m == nullptr) ? nullptr : d_localGradSpaceNet + (processed * totalNeuronsWithSelectiveActivation));
				processed += patternsToProcess;
			} while (processed < patterns);
		}
	} else {
		if(connections > MAX_THREADS_PER_BLOCK) {
			int processed = 0;
			do {
				int patternsToProcess = patterns - processed;
				if (patternsToProcess > 65535) patternsToProcess = 65535;
				dimNeuronsPatterns.y = patternsToProcess;
				KernelFireLayer(stream, dimNeuronsPatterns, inputsBlockSize, d_inputs + (processed * inputsWithoutBias), d_weights.Pointer(), (d_m != nullptr) ? d_m + (processed * totalNeuronsWithSelectiveActivation) : nullptr, mOffset, totalNeuronsWithSelectiveActivation, d_outputs.Pointer() + (processed * neurons), inputsWithoutBias);
				processed += patternsToProcess;
			} while (processed < patterns);
		} else {
			int processed = 0;
			do {
				int patternsToProcess = patterns - processed;
				if (patternsToProcess > 65535) patternsToProcess = 65535;
				FireLayer<<<patternsToProcess, dimInputsNeurons, sharedMemFire, stream>>>(d_inputs + (processed * inputsWithoutBias), d_weights.Pointer(), (d_m != nullptr) ? d_m + (processed * totalNeuronsWithSelectiveActivation) : nullptr, mOffset, totalNeuronsWithSelectiveActivation, d_outputs.Pointer() + (processed * neurons));
				processed += patternsToProcess;
			} while (processed < patterns);
		}
	}
}
89 
// Back-propagates the local gradients from nextLayer into this layer for all
// patterns, in chunks of at most 65535 patterns (grid-dimension limit).
//
// BUG FIX: the chunk size is now clamped to the remaining patterns
// (patterns - processed) instead of the total, which previously caused
// out-of-bounds kernel launches whenever patterns > 65535.
void BackPropagation::Layer::CalculateLocalGradient(cudaStream_t stream, cudafloat * rms, cudafloat * bestRMS, cudafloat rmsGrowToApplyRobustLearning, Layer & nextLayer) {
	int processed = 0;
	do {
		int patternsToProcess = patterns - processed;
		if (patternsToProcess > 65535) patternsToProcess = 65535;
		GPUMLib::CalculateLocalGradient<<<patternsToProcess, dimOutputsNeurons, sharedMemGradients, stream>>>(rms, bestRMS, rmsGrowToApplyRobustLearning, d_outputs.Pointer() + (processed * neurons), nextLayer.d_weights.Pointer(), (d_m != nullptr) ? d_m + (processed * totalNeuronsWithSelectiveActivation) : nullptr, mOffset, totalNeuronsWithSelectiveActivation, nextLayer.d_localGradient.Pointer() + (processed * dimOutputsNeurons.x), d_localGradient.Pointer() + (processed * neurons), (d_m == nullptr) ? nullptr : d_localGradSpaceNet + (processed * totalNeuronsWithSelectiveActivation));
		processed += patternsToProcess;
	} while (processed < patterns);
}
98 
// Updates this layer's weights from the local gradients, using per-connection
// adaptive step sizes (up/down factors u and d, clamped to maxStepSize) plus
// momentum; when rms/bestRMS are supplied the kernel applies robust learning
// (robustFactor) as well. Runs on the given stream; the whole update is a
// single wrapped kernel launch over all patterns.
void BackPropagation::Layer::CorrectWeights(cudaStream_t stream, int patternsBlockSize, cudafloat * rms, cudafloat * bestRMS, cudafloat rmsGrowToApplyRobustLearning, cudafloat robustFactor, cudafloat momentum, cudafloat u, cudafloat d, cudafloat maxStepSize) {
	KernelCorrectLayerWeights(stream, dimInputsNeurons, patternsBlockSize, rms, bestRMS, rmsGrowToApplyRobustLearning, d_inputs, d_localGradient.Pointer(), d_weights.Pointer(), d_learnRate.Pointer(), d_lastDeltaWithoutLearningMomentum.Pointer(), d_lastDelta.Pointer(), maxStepSize, u, d, robustFactor, momentum, patterns);
}
102 
// Configures a layer: records its topology, randomizes the weights in
// [-1, 1], allocates the per-pattern output and local-gradient buffers and
// pre-computes the launch geometry used by Fire / CalculateLocalGradient.
void BackPropagation::Layer::Init(int neurons, int inputs, int nextLayerNeurons, cudafloat initialLearningRate, cudafloat * layerInputs, bool isOutputLayer, cudafloat * m, cudafloat * localGradSpaceNet, int mOffset) {
	this->neurons = neurons;
	this->isOutputLayer = isOutputLayer;
	this->mOffset = mOffset;

	inputsWithoutBias = inputs - 1;
	connections = inputs * neurons;

	// connections must be set before the weights are randomized
	RandomizeWeights(CUDA_VALUE(-1.0), CUDA_VALUE(1.0), initialLearningRate);

	d_m = m;
	d_localGradSpaceNet = localGradSpaceNet;
	d_inputs = layerInputs;

	// Power-of-two block size covering the inputs, capped at MAX_THREADS_PER_BLOCK.
	for (inputsBlockSize = 1; inputsBlockSize < MAX_THREADS_PER_BLOCK && inputsBlockSize < inputs; inputsBlockSize <<= 1);

	d_outputs.ResizeWithoutPreservingData(neurons * patterns);
	d_localGradient.ResizeWithoutPreservingData(neurons * patterns);

	// Shared memory: the firing kernels cache the weights; the gradient
	// kernels cache the next layer's weights plus one gradient per neuron.
	sharedMemFire = connections * sizeof(cudafloat);
	sharedMemGradients = (nextLayerNeurons * (neurons + 1)) * sizeof(cudafloat);

	dimInputsNeurons.x = inputs;
	dimInputsNeurons.y = neurons;

	dimOutputsNeurons.x = nextLayerNeurons;
	dimOutputsNeurons.y = neurons;
}
133 
// Randomizes the weight and bias of each selective input (inputs flagged in
// selectiveInputs, i.e. inputs that can contain missing values); all other
// inputs get zero weight and bias. Learning rates are initialized to
// initialLearningRate and the step deltas cleared, for both the weight and
// the bias device arrays (best values start equal to the current ones).
void BackPropagation::SelectiveInputLayer::RandomizeWeights(cudafloat minValue, cudafloat maxValue, cudafloat initialLearningRate, HostArray<bool> & selectiveInputs) {
	assert(maxValue > minValue);

	int numInputs = selectiveInputs.Length();
	cudafloat range = maxValue - minValue;

	HostArray<cudafloat> initialWeights(numInputs);
	HostArray<cudafloat> initialBias(numInputs);
	HostArray<cudafloat> initialRates(numInputs);
	HostArray<cudafloat> initialDeltas(numInputs);

	for (int i = 0; i < numInputs; i++) {
		if (!selectiveInputs[i]) {
			initialWeights[i] = CUDA_VALUE(0.0);
			initialBias[i] = CUDA_VALUE(0.0);
		} else {
			initialWeights[i] = range * ((cudafloat) rand() / RAND_MAX) + minValue;
			initialBias[i] = range * ((cudafloat) rand() / RAND_MAX) + minValue;
		}

		initialRates[i] = initialLearningRate;
		initialDeltas[i] = CUDA_VALUE(0.0);
	}

	d_weights = initialWeights;
	d_bestWeights = initialWeights;
	d_bias = initialBias;
	d_bestBias = initialBias;
	d_learnRate = initialRates;
	d_learnRateBias = initialRates;
	d_lastDelta = initialDeltas;
	d_lastDeltaBias = initialDeltas;
	d_lastDeltaWithoutLearningMomentum = initialDeltas;
	d_lastDeltaWithoutLearningMomentumBias = initialDeltas;
}
162 
// Computes the selective-input outputs for all patterns on the given stream,
// in chunks of at most 65535 patterns (grid-dimension limit).
//
// BUG FIX: the chunk size is now clamped to the remaining patterns
// (patterns - processed) instead of the total, which previously caused
// out-of-bounds kernel launches whenever patterns > 65535.
void BackPropagation::SelectiveInputLayer::Fire(cudaStream_t stream) {
	int processed = 0;
	do {
		int patternsToProcess = patterns - processed;
		if (patternsToProcess > 65535) patternsToProcess = 65535;
		FireSelectiveInputs<<<patternsToProcess, neurons, 0, stream>>>(d_inputs + (processed * neurons), d_weights.Pointer(), d_bias.Pointer(), d_outputs.Pointer() + (processed * neurons), neurons);
		processed += patternsToProcess;
	} while (processed < patterns);
}
171 
// Back-propagates the local gradients from nextLayer into the selective
// inputs for all patterns, in chunks of at most 65535 patterns.
//
// BUG FIX: the chunk size is now clamped to the remaining patterns
// (patterns - processed) instead of the total, which previously caused
// out-of-bounds kernel launches whenever patterns > 65535.
void BackPropagation::SelectiveInputLayer::CalculateLocalGradient(cudaStream_t stream, cudafloat * rms, cudafloat * bestRMS, cudafloat rmsGrowToApplyRobustLearning, Layer & nextLayer) {
	int processed = 0;
	do {
		int patternsToProcess = patterns - processed;
		if (patternsToProcess > 65535) patternsToProcess = 65535;
		CalcLocalGradSelectiveInputs<<<patternsToProcess, dimOutputsNeurons, sharedMemGradients, stream>>>(rms, bestRMS, rmsGrowToApplyRobustLearning, d_inputs + (processed * neurons), d_weights.Pointer(), d_bias.Pointer(), nextLayer.d_weights.Pointer(), nextLayer.d_localGradient.Pointer() + (processed * dimOutputsNeurons.x), d_localGradient.Pointer() + (processed * neurons));
		processed += patternsToProcess;
	} while (processed < patterns);
}
180 
// Updates the selective-input weights and biases from the local gradients,
// using adaptive step sizes (u/d, clamped to maxStepSize), momentum and,
// when rms/bestRMS are supplied, robust learning (robustFactor).
//
// BUG FIX: KernelCorrectWeightsSelectiveInputs expects its scalar tail as
// (u, d, r, maxStepSize, momentum, numberPatterns), where r is the robust
// reduction factor — see how KernelCorrectLayerWeights is called in
// Layer::CorrectWeights. The previous call passed maxStepSize for r and
// robustFactor for maxStepSize, swapping the two values.
void BackPropagation::SelectiveInputLayer::CorrectWeights(cudaStream_t stream, cudafloat * rms, cudafloat * bestRMS, cudafloat rmsGrowToApplyRobustLearning, cudafloat robustFactor, cudafloat momentum, cudafloat u, cudafloat d, cudafloat maxStepSize) {
	KernelCorrectWeightsSelectiveInputs(stream, neurons, patterns, rms, bestRMS, rmsGrowToApplyRobustLearning, d_inputs, d_localGradient.Pointer(), d_weights.Pointer(), d_bias.Pointer(), d_learnRate.Pointer(), d_learnRateBias.Pointer(), d_lastDeltaWithoutLearningMomentum.Pointer(), d_lastDeltaWithoutLearningMomentumBias.Pointer(), d_lastDelta.Pointer(), d_lastDeltaBias.Pointer(), u, d, robustFactor, maxStepSize, momentum, patterns);
}
184 
// Builds the whole network on the device: copies the training data, detects
// selective (missing-value) inputs, creates the optional space network and
// the main layers, gathers the per-layer pointer tables used by the robust
// learning kernel and initializes the training hyper-parameters.
// sizeSpaceLayers/selectiveNeurons may be nullptr (no space network).
void BackPropagation::CreateNetwork(HostArray<int> & sizeLayers, HostArray<int> * sizeSpaceLayers, HostArray<bool> * selectiveNeurons, HostMatrix<cudafloat> & trainInputPatterns, HostMatrix<cudafloat> & trainDesiredOutputPatterns, cudafloat initialLearningRate) {
	int nsamples = trainInputPatterns.Rows();
	int ninputs = trainInputPatterns.Columns();

	Layer::patterns = nsamples;
	assert(Layer::patterns > 0 && Layer::patterns == trainDesiredOutputPatterns.Rows());

	d_inputs = trainInputPatterns;
	d_desOutputs = trainDesiredOutputPatterns;

	d_rmsOut.ResizeWithoutPreservingData(1);

	this->initialLearningRate = initialLearningRate;
	assert(initialLearningRate > CUDA_VALUE(0.0));

	// Check for selective inputs: an input is selective when at least one
	// sample holds an Inf/NaN (missing) value for it.
	bool hasSelectiveInputs = false;

	selectiveInputs.ResizeWithoutPreservingData(ninputs);
	for(int i = 0; i < ninputs; i++) selectiveInputs[i] = false;

	// fi/li shrink the scanned column range as the edge columns are flagged,
	// so fully-flagged prefixes/suffixes are not rescanned for every sample.
	int fi = 0;
	int li = ninputs - 1;

	for(int s = 0; s < nsamples; s++) {
		for(int i = fi; i <= li; i++) {
			if (!selectiveInputs[i] && IsInfOrNaN(trainInputPatterns(s, i))) {
				selectiveInputs[i] = hasSelectiveInputs = true;
				if (i == fi) fi++; else if (i == li) li--;
			}
		}

		if (fi >= li) break;
	}

	// Create the space layers (the auxiliary network that produces the
	// selective-activation values m for the main network).
	int numberSpaceLayers = (sizeSpaceLayers == nullptr) ? 0 : sizeSpaceLayers->Length();

	selectiveInputLayerSpaceNetwork = nullptr;

	if (numberSpaceLayers) {
		assert(selectiveNeurons != nullptr);

		spaceLayers.ResizeWithoutPreservingData(numberSpaceLayers);

		int inputsWithoutBias = sizeLayers[0];

		cudafloat * layerInputs = d_inputs.Pointer();

		if (hasSelectiveInputs) {
			selectiveInputLayerSpaceNetwork = new SelectiveInputLayer(nsamples, selectiveInputs, (*sizeSpaceLayers)[0], layerInputs, initialLearningRate);
			layerInputs = selectiveInputLayerSpaceNetwork->d_outputs.Pointer();
		}

		for(int l = 0; l < numberSpaceLayers; l++) {
			int neurons = (*sizeSpaceLayers)[l];

			int nextLayerNeurons;

			if (l == numberSpaceLayers - 1) {
				// The last space layer defines how many selective-activation
				// values exist per pattern.
				Layer::totalNeuronsWithSelectiveActivation = neurons;
				nextLayerNeurons = 0;
			} else {
				nextLayerNeurons = (*sizeSpaceLayers)[l + 1];
			}

			// +1 accounts for the bias input of each neuron.
			spaceLayers[l].Init(neurons, inputsWithoutBias + 1, nextLayerNeurons, initialLearningRate, layerInputs, false);

			// Chain: this layer's outputs feed the next layer.
			layerInputs = spaceLayers[l].d_outputs.Pointer();
			inputsWithoutBias = neurons;
		}
	}

	// Create the main-network layers.
	int numberLayers = sizeLayers.Length() - 1;
	assert(numberLayers > 0);

	layers.ResizeWithoutPreservingData(numberLayers);

	int outputLayer = numberLayers - 1;

	int inputsWithoutBias = sizeLayers[0];
	assert(inputsWithoutBias > 0 && inputsWithoutBias == trainInputPatterns.Columns());

	cudafloat * layerInputs = d_inputs.Pointer();

	if (hasSelectiveInputs) {
		selectiveInputLayer = new SelectiveInputLayer(nsamples, selectiveInputs, sizeLayers[1], layerInputs, initialLearningRate);
		layerInputs = selectiveInputLayer->d_outputs.Pointer();
	} else {
		selectiveInputLayer = nullptr;
	}

	// m: selective-activation values (last space layer outputs);
	// localGradSpaceNet: where the main layers accumulate gradients for it.
	cudafloat * m = (numberSpaceLayers == 0) ? nullptr : spaceLayers[numberSpaceLayers - 1].d_outputs.Pointer();
	cudafloat * localGradSpaceNet = (numberSpaceLayers == 0) ? nullptr : spaceLayers[numberSpaceLayers - 1].d_localGradient.Pointer();
	int mOffset = 0;

	for(int l = 0; l < numberLayers; l++) {
		int neurons = sizeLayers[l + 1];
		assert(neurons > 0);

		bool isOutputLayer = (l == outputLayer) ? true : false;

		int nextLayerNeurons = (isOutputLayer) ? 0 : sizeLayers[l + 2];

		bool hasSelectiveNeurons = (numberSpaceLayers > 0 && (*selectiveNeurons)[l]) ? true : false;

		layers[l].Init(neurons, inputsWithoutBias + 1, nextLayerNeurons, initialLearningRate, layerInputs, isOutputLayer, (hasSelectiveNeurons) ? m : nullptr, (hasSelectiveNeurons) ? localGradSpaceNet : nullptr, mOffset);

		// Each selectively-activated layer consumes a distinct slice of m.
		if (hasSelectiveNeurons) mOffset += neurons;

		layerInputs = layers[l].d_outputs.Pointer();
		inputsWithoutBias = neurons;
	}

	// Robust learning: build the per-"layer" pointer tables consumed by the
	// RobustLearning kernel. Each selective input layer contributes two
	// entries (weights and bias).
	layersRobustTraining = numberLayers + numberSpaceLayers;
	if (hasSelectiveInputs) layersRobustTraining += (numberSpaceLayers) ? 4 : 2;

	HostArray<int> numberWeightsLayer(layersRobustTraining);
	HostArray<cudafloat *> weightsLayers(layersRobustTraining);
	HostArray<cudafloat *> bestWeightsLayers(layersRobustTraining);
	HostArray<cudafloat *> learnRatesLayers(layersRobustTraining);
	HostArray<cudafloat *> lastDeltaLayers(layersRobustTraining);
	HostArray<cudafloat *> lastDeltaWithoutLMlayers(layersRobustTraining);

	maxNumberWeigths = 0;

	int ll = 0;
	while (ll < numberSpaceLayers) {
		int connections = spaceLayers[ll].connections;
		if (connections > maxNumberWeigths) maxNumberWeigths = connections;

		numberWeightsLayer[ll] = connections;
		weightsLayers[ll] = spaceLayers[ll].d_weights.Pointer();
		bestWeightsLayers[ll] = spaceLayers[ll].d_bestWeights.Pointer();
		learnRatesLayers[ll] = spaceLayers[ll].d_learnRate.Pointer();
		lastDeltaLayers[ll] = spaceLayers[ll].d_lastDelta.Pointer();
		lastDeltaWithoutLMlayers[ll] = spaceLayers[ll].d_lastDeltaWithoutLearningMomentum.Pointer();

		ll++;
	}

	for(int l = 0; l < numberLayers; l++) {
		int connections = layers[l].connections;
		if (connections > maxNumberWeigths) maxNumberWeigths = connections;

		numberWeightsLayer[ll] = connections;
		weightsLayers[ll] = layers[l].d_weights.Pointer();
		bestWeightsLayers[ll] = layers[l].d_bestWeights.Pointer();
		learnRatesLayers[ll] = layers[l].d_learnRate.Pointer();
		lastDeltaLayers[ll] = layers[l].d_lastDelta.Pointer();
		lastDeltaWithoutLMlayers[ll] = layers[l].d_lastDeltaWithoutLearningMomentum.Pointer();

		ll++;
	}

	if (hasSelectiveInputs) {
		numberWeightsLayer[ll] = ninputs;
		weightsLayers[ll] = selectiveInputLayer->d_weights.Pointer();
		bestWeightsLayers[ll] = selectiveInputLayer->d_bestWeights.Pointer();
		learnRatesLayers[ll] = selectiveInputLayer->d_learnRate.Pointer();
		lastDeltaLayers[ll] = selectiveInputLayer->d_lastDelta.Pointer();
		lastDeltaWithoutLMlayers[ll] = selectiveInputLayer->d_lastDeltaWithoutLearningMomentum.Pointer();
		ll++;

		numberWeightsLayer[ll] = ninputs;
		weightsLayers[ll] = selectiveInputLayer->d_bias.Pointer();
		bestWeightsLayers[ll] = selectiveInputLayer->d_bestBias.Pointer();
		learnRatesLayers[ll] = selectiveInputLayer->d_learnRateBias.Pointer();
		lastDeltaLayers[ll] = selectiveInputLayer->d_lastDeltaBias.Pointer();
		lastDeltaWithoutLMlayers[ll] = selectiveInputLayer->d_lastDeltaWithoutLearningMomentumBias.Pointer();
		ll++;

		if (numberSpaceLayers) {
			numberWeightsLayer[ll] = ninputs;
			weightsLayers[ll] = selectiveInputLayerSpaceNetwork->d_weights.Pointer();
			bestWeightsLayers[ll] = selectiveInputLayerSpaceNetwork->d_bestWeights.Pointer();
			learnRatesLayers[ll] = selectiveInputLayerSpaceNetwork->d_learnRate.Pointer();
			lastDeltaLayers[ll] = selectiveInputLayerSpaceNetwork->d_lastDelta.Pointer();
			lastDeltaWithoutLMlayers[ll] = selectiveInputLayerSpaceNetwork->d_lastDeltaWithoutLearningMomentum.Pointer();
			ll++;

			numberWeightsLayer[ll] = ninputs;
			weightsLayers[ll] = selectiveInputLayerSpaceNetwork->d_bias.Pointer();
			bestWeightsLayers[ll] = selectiveInputLayerSpaceNetwork->d_bestBias.Pointer();
			learnRatesLayers[ll] = selectiveInputLayerSpaceNetwork->d_learnRateBias.Pointer();
			lastDeltaLayers[ll] = selectiveInputLayerSpaceNetwork->d_lastDeltaBias.Pointer();
			lastDeltaWithoutLMlayers[ll] = selectiveInputLayerSpaceNetwork->d_lastDeltaWithoutLearningMomentumBias.Pointer();
			ll++;
		}
	}

	d_numberWeightsLayer = numberWeightsLayer;
	d_weightsLayers = weightsLayers;
	d_bestWeightsLayers = bestWeightsLayers;
	d_learnRatesLayers = learnRatesLayers;
	d_lastDeltaLayers = lastDeltaLayers;
	d_lastDeltaWithoutLMlayers = lastDeltaWithoutLMlayers;

	// Default training hyper-parameters.
	robustLearning = true;
	rmsGrowToApplyRobustLearning = CUDA_VALUE(1.001); // apply when RMS grows > 0.1%
	robustFactor = CUDA_VALUE(0.5);
	momentum = CUDA_VALUE(0.7);
	u = CUDA_VALUE(1.2);
	d = CUDA_VALUE(0.8);
	maxStepSize = CUDA_VALUE(10.0);

	// Create the RMS vector: one entry per pattern, or one per
	// pattern-neuron pair when the output layer uses the large-layer kernel.
	int sizeRMSvector = (layers[outputLayer].connections > MAX_THREADS_PER_BLOCK) ? Layer::patterns * layers[outputLayer].neurons : Layer::patterns;
	d_rms.ResizeWithoutPreservingData(sizeRMSvector);

	layers[outputLayer].d_desOutputs = d_desOutputs.Pointer();
	layers[outputLayer].d_rms = d_rms.Pointer();
	// The output-layer firing kernel also caches one value per neuron.
	layers[outputLayer].sharedMemFire += layers[outputLayer].neurons * sizeof(cudafloat);

	// Initialize the initial (best) RMS to 1.0.
	HostArray<cudafloat> h_bestRMS(1);
	h_bestRMS[0] = CUDA_VALUE(1.0);
	d_bestRMS = h_bestRMS;
	rms.Value() = h_bestRMS[0];

	// Power-of-two block size covering the patterns, capped at MAX_THREADS_PER_BLOCK.
	patternsBlockSize = 1;
	while(patternsBlockSize < MAX_THREADS_PER_BLOCK && patternsBlockSize < Layer::patterns) patternsBlockSize <<= 1;

	numberPatternsNeurons = (cudafloat) Layer::patterns * (cudafloat) layers[outputLayer].neurons;

	epoch = 0;
}
415 
// Constructs a plain back-propagation network (no space network / selective
// activation), delegating the full setup to CreateNetwork.
BackPropagation::BackPropagation(HostArray<int> & sizeLayers, HostMatrix<cudafloat> & trainInputPatterns, HostMatrix<cudafloat> & trainDesiredOutputPatterns, cudafloat initialLearningRate) {
	CreateNetwork(sizeLayers, nullptr, nullptr, trainInputPatterns, trainDesiredOutputPatterns, initialLearningRate);
}
419 
421  assert(layer >= 0 && layer < layers.Length());
422  return HostArray<cudafloat>(layers[layer].d_weights);
423 }
424 
426  assert(layer >= 0 && layer < layers.Length());
427  layers[layer].d_weights = weights;
428 }
429 
431  assert(layer >= 0 && layer < layers.Length());
432 
433  Layer * l = &(layers[layer]);
434  int neurons = l->neurons;
435  int inputs = weights.Columns();
436 
437  assert(neurons == bias.Length());
438 
439  HostArray<cudafloat> weights_bias(weights.Elements() + bias.Length());
440 
441  int w = 0;
442  for(int n = 0; n < neurons; n++) {
443  weights_bias[w++] = bias[n];
444  for(int i = 0; i < inputs; i++) {
445  weights_bias[w++] = weights(n, i);
446  }
447  }
448 
449  layers[layer].d_weights = weights_bias;
450 }
451 
453  return HostArray<cudafloat>(selectiveInputLayer->d_weights);
454 }
455 
457  selectiveInputLayer->d_weights = weights;
458 }
459 
461  return HostArray<cudafloat>(selectiveInputLayer->d_bias);
462 }
463 
465  selectiveInputLayer->d_bias = bias;
466 }
467 
469  int nSpaceLayers = spaceLayers.Length();
470  for (int layer = 0; layer < nSpaceLayers; layer++) spaceLayers[layer].RandomizeWeights(minValue, maxValue, initialLearningRate);
471 
472  int nLayers = layers.Length();
473  for (int layer = 0; layer < nLayers; layer++) layers[layer].RandomizeWeights(minValue, maxValue, initialLearningRate);
474 
475  if (selectiveInputLayerSpaceNetwork) selectiveInputLayerSpaceNetwork->RandomizeWeights(minValue, maxValue, initialLearningRate, selectiveInputs);
476  if (selectiveInputLayer) selectiveInputLayer->RandomizeWeights(minValue, maxValue, initialLearningRate, selectiveInputs);
477  epoch = 0;
478 }
479 
481  return robustLearning;
482 }
483 
485  robustLearning = value;
486 }
487 
489  return rmsGrowToApplyRobustLearning - CUDA_VALUE(1.0);
490 }
491 
493  assert(value > CUDA_VALUE(0.0));
494  rmsGrowToApplyRobustLearning = CUDA_VALUE(1.0) + value;
495 }
496 
498  return robustFactor;
499 }
500 
502  assert(value > CUDA_VALUE(0.0) && value < CUDA_VALUE(1.0));
503  robustFactor = value;
504 }
505 
507  return momentum;
508 }
509 
511  assert(value > CUDA_VALUE(0.0) && value < CUDA_VALUE(1.0));
512  momentum = value;
513 }
514 
516  return u;
517 }
518 
520  assert(value > CUDA_VALUE(1.0));
521  u = value;
522 }
523 
525  return d;
526 }
527 
529  assert(value > CUDA_VALUE(0.0) && value < CUDA_VALUE(1.0));
530  d = value;
531 }
532 
534  return maxStepSize;
535 }
536 
538  assert(value > CUDA_VALUE(0.0));
539  maxStepSize = value;
540 }
541 
543  return epoch;
544 }
545 
547  return layers.Length();
548 }
549 
551  return layers[0].inputsWithoutBias;
552 }
553 
555  return layers[layers.Length() - 1].neurons;
556 }
557 
// Returns the number of neurons of the given (zero-based) main-network layer.
int BackPropagation::GetNumberNeurons(int layer) const {
	assert(layer >= 0 && layer < layers.Length());
	return layers[layer].neurons;
}
562 
// Propagates the training patterns through the entire network on the
// kernels stream, in order: space-network selective inputs, space layers,
// main-network selective inputs, and finally the main layers.
void BackPropagation::Fire() {
	if (selectiveInputLayerSpaceNetwork != nullptr) selectiveInputLayerSpaceNetwork->Fire(streamKernels);

	int nSpaceLayers = spaceLayers.Length();
	for (int l = 0; l < nSpaceLayers; l++) spaceLayers[l].Fire(streamKernels);

	if (selectiveInputLayer != nullptr) selectiveInputLayer->Fire(streamKernels);

	int numLayers = layers.Length();
	for(int l = 0; l < numLayers; l++) layers[l].Fire(streamKernels);
}
574 
576  cudaDeviceSynchronize();
577 
578  Fire(); // Determine the network outputs
579 
580  // Calculate the RMS
581  KernelCalculateRMS(streamKernels, patternsBlockSize, d_rms.Pointer(), d_rmsOut.Pointer(), d_rms.Length(), numberPatternsNeurons);
582  rms.UpdateValue(d_rmsOut.Pointer());
583 
584  return rms.Value();
585 }
586 
588  cudafloat RMS = rms.Value();
589 
590  if (epoch == 0 && RMS >= CUDA_VALUE(1.0)) return GetRMS();
591 
592  return RMS;
593 }
594 
	// NOTE(review): the signature line of this function (TrainOneEpoch, per the
	// trailing doxygen index) was lost in this extraction; only its body is
	// annotated here.
	int numLayers = layers.Length();
	int nSpaceLayers = spaceLayers.Length();

	Fire(); // Determine the network outputs

	// Calculate the RMS / Robust training. With robust learning the RMS is
	// reduced on the kernels stream so the RobustLearning kernel can compare
	// it against the best RMS; otherwise it is refreshed opportunistically on
	// the RMS stream (only when that stream is idle).
	if (robustLearning) {
		KernelCalculateRMS(streamKernels, patternsBlockSize, d_rms.Pointer(), d_rmsOut.Pointer(), d_rms.Length(), numberPatternsNeurons);
		if (cudaStreamQuery(streamRMS) == cudaSuccess) rms.UpdateValueAsync(d_rmsOut.Pointer(), streamRMS);

		RobustLearning<<<1, maxNumberWeigths, 0, streamKernels>>>(d_rmsOut.Pointer(), d_bestRMS.Pointer(), (cudafloat) rmsGrowToApplyRobustLearning, layersRobustTraining, d_numberWeightsLayer.Pointer(), d_weightsLayers.Pointer(), d_bestWeightsLayers.Pointer(), d_learnRatesLayers.Pointer(), robustFactor, d_lastDeltaWithoutLMlayers.Pointer(), d_lastDeltaLayers.Pointer());
	} else {
		if (cudaStreamQuery(streamRMS) == cudaSuccess) {
			KernelCalculateRMS(streamRMS, patternsBlockSize, d_rms.Pointer(), d_rmsOut.Pointer(), d_rms.Length(), numberPatternsNeurons);
			rms.UpdateValueAsync(d_rmsOut.Pointer(), streamRMS);
		}
	}

	// Calculate local gradients. The local gradient for the output layer was already calculated.
	cudafloat * rms = (robustLearning) ? d_rmsOut.Pointer() : nullptr;
	cudafloat * bestRMS = (robustLearning) ? d_bestRMS.Pointer() : nullptr;

	// Back-propagate through the hidden layers (output layer excluded).
	for(int l = numLayers - 2; l >= 0; l--) {
		layers[l].CalculateLocalGradient(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, layers[l + 1]);
	}

	if (selectiveInputLayer != nullptr) selectiveInputLayer->CalculateLocalGradient(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, layers[0]);

	// Same for the space network.
	for (int l = nSpaceLayers -2; l >= 0; l--) spaceLayers[l].CalculateLocalGradient(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, spaceLayers[l + 1]);

	if (selectiveInputLayerSpaceNetwork != nullptr) selectiveInputLayerSpaceNetwork->CalculateLocalGradient(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, spaceLayers[0]);

	// Correct the weights, from the output layer back to the inputs.
	for(int l = numLayers - 1; l >= 0; l--) {
		layers[l].CorrectWeights(streamKernels, patternsBlockSize, rms, bestRMS, rmsGrowToApplyRobustLearning, robustFactor, momentum, u, d, maxStepSize);
	}

	if (selectiveInputLayer != nullptr) selectiveInputLayer->CorrectWeights(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, robustFactor, momentum, u, d, maxStepSize);

	for (int l = nSpaceLayers - 1; l >= 0; l--) spaceLayers[l].CorrectWeights(streamKernels, patternsBlockSize, rms, bestRMS, rmsGrowToApplyRobustLearning, robustFactor, momentum, u, d, maxStepSize);

	if (selectiveInputLayerSpaceNetwork != nullptr) selectiveInputLayerSpaceNetwork->CorrectWeights(streamKernels, rms, bestRMS, rmsGrowToApplyRobustLearning, robustFactor, momentum, u, d, maxStepSize);

	epoch++;
641 
// Trains the network for the specified number of epochs.
void BackPropagation::Train(int epochs) {
	for (int remaining = epochs; remaining > 0; remaining--) TrainOneEpoch();
}
645 
// Trains the network for at most the specified number of epochs, stopping
// early once the RMS estimate drops to rmsStop or below.
void BackPropagation::Train(int epochs, cudafloat rmsStop) {
	// The RMS estimate may still reflect a previously trained network, so
	// compute the actual RMS before training starts.
	GetRMS();

	int e = 0;
	while (e < epochs) {
		TrainOneEpoch();
		if (GetRMSestimate() <= rmsStop) break;
		e++;
	}
}
656 
	// NOTE(review): the signature line of this function (which takes a
	// HostMatrix<cudafloat> & inputs and returns the network outputs for it)
	// was lost in this extraction; only its body is annotated here.
	int patterns = inputs.Rows();
	int numberLayers = layers.Length();
	int numberSpaceLayers = spaceLayers.Length();

	// Copy the input patterns to the device.
	DeviceMatrix<cudafloat> d_inputs(inputs);

	// Scratch output matrices, one per space layer and one per main layer.
	HostArray< DeviceMatrix<cudafloat> * > spaceLayerOutputs;
	spaceLayerOutputs.ResizeWithoutPreservingData(numberSpaceLayers);
	for (int l = 0; l < numberSpaceLayers; l++) {
		spaceLayerOutputs[l] = new DeviceMatrix<cudafloat>(patterns, spaceLayers[l].neurons);
	}

	HostArray< DeviceMatrix<cudafloat> * > layerOutputs;
	layerOutputs.ResizeWithoutPreservingData(numberLayers);
	for (int l = 0; l < numberLayers; l++) {
		layerOutputs[l] = new DeviceMatrix<cudafloat>(patterns, layers[l].neurons);
	}

	cudafloat * layerInputs = d_inputs.Pointer();

	int ninputs = d_inputs.Columns();
	DeviceArray<cudafloat> outputsSelectiveInput(patterns * ninputs);

	// NOTE(review): every chunking loop below computes
	// patternsToProcess = (patterns > 65535) ? 65535 : patterns
	// without subtracting `processed`, so for patterns > 65535 later chunks
	// overrun the buffers — same defect fixed in Layer::Fire; should be
	// clamped to (patterns - processed).
	if (selectiveInputLayerSpaceNetwork != nullptr) {
		int processed = 0;
		do {
			int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
			FireSelectiveInputs<<<patternsToProcess, ninputs, 0, streamKernels>>>(layerInputs + (processed * ninputs), selectiveInputLayerSpaceNetwork->d_weights.Pointer(), selectiveInputLayerSpaceNetwork->d_bias.Pointer(), outputsSelectiveInput.Pointer() + (processed * ninputs), ninputs);
			processed += patternsToProcess;
		} while (processed < patterns);

		layerInputs = outputsSelectiveInput.Pointer();
	}

	// Propagate through the space network.
	for (int l = 0; l < numberSpaceLayers; l++) {
		if(spaceLayers[l].connections > MAX_THREADS_PER_BLOCK) {
			dim3 dimNeuronsPatterns;
			dimNeuronsPatterns.x = spaceLayers[l].neurons;

			int processed = 0;
			do {
				int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
				dimNeuronsPatterns.y = patternsToProcess;
				// NOTE(review): the output pointer is offset by
				// processed * inputsWithoutBias, but the output matrix has
				// `neurons` columns — looks like it should be
				// processed * neurons (only matters for chunked launches).
				KernelFireLayer(streamKernels, dimNeuronsPatterns, spaceLayers[l].inputsBlockSize, layerInputs + (processed * spaceLayers[l].inputsWithoutBias), spaceLayers[l].d_weights.Pointer(), nullptr, 0, Layer::totalNeuronsWithSelectiveActivation, spaceLayerOutputs[l]->Pointer() + (processed * spaceLayers[l].inputsWithoutBias), spaceLayers[l].inputsWithoutBias);
				processed += patternsToProcess;
			} while (processed < patterns);
		} else {
			int processed = 0;
			do {
				int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
				FireLayer<<<patternsToProcess, spaceLayers[l].dimInputsNeurons, spaceLayers[l].sharedMemFire, streamKernels>>>(layerInputs + (processed * spaceLayers[l].inputsWithoutBias), spaceLayers[l].d_weights.Pointer(), nullptr, 0, Layer::totalNeuronsWithSelectiveActivation, spaceLayerOutputs[l]->Pointer() + (processed * spaceLayers[l].inputsWithoutBias));
				processed += patternsToProcess;
			} while (processed < patterns);
		}

		layerInputs = spaceLayerOutputs[l]->Pointer();
	}

	// The last space layer outputs are the selective-activation values m.
	cudafloat * d_m = nullptr;
	if (numberSpaceLayers > 0) d_m = layerInputs;

	layerInputs = d_inputs.Pointer();

	if (selectiveInputLayer != nullptr) {
		int processed = 0;
		do {
			int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
			FireSelectiveInputs<<<patternsToProcess, ninputs, 0, streamKernels>>>(layerInputs + (processed * ninputs), selectiveInputLayer->d_weights.Pointer(), selectiveInputLayer->d_bias.Pointer(), outputsSelectiveInput.Pointer() + (processed * ninputs), ninputs);
			processed += patternsToProcess;
		} while (processed < patterns);

		layerInputs = outputsSelectiveInput.Pointer();
	}

	// Propagate through the main network.
	for (int l = 0; l < numberLayers; l++) {
		if(layers[l].connections > MAX_THREADS_PER_BLOCK) {
			dim3 dimNeuronsPatterns;
			dimNeuronsPatterns.x = layers[l].neurons;

			int processed = 0;
			do {
				int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
				dimNeuronsPatterns.y = patternsToProcess;
				// NOTE(review): same suspicious output offset as above —
				// processed * inputsWithoutBias instead of processed * neurons.
				KernelFireLayer(streamKernels, dimNeuronsPatterns, layers[l].inputsBlockSize, layerInputs + (processed * layers[l].inputsWithoutBias), layers[l].d_weights.Pointer(), (layers[l].d_m != nullptr) ? d_m + (processed * Layer::totalNeuronsWithSelectiveActivation) : nullptr, layers[l].mOffset, Layer::totalNeuronsWithSelectiveActivation, layerOutputs[l]->Pointer() + (processed * layers[l].inputsWithoutBias), layers[l].inputsWithoutBias);
				processed += patternsToProcess;
			} while (processed < patterns);
		} else {
			int processed = 0;
			do {
				int patternsToProcess = (patterns > 65535) ? 65535 : patterns;
				FireLayer<<<patternsToProcess, layers[l].dimInputsNeurons, layers[l].sharedMemFire, streamKernels>>>(layerInputs + (processed * layers[l].inputsWithoutBias), layers[l].d_weights.Pointer(), (layers[l].d_m != nullptr) ? d_m + (processed * Layer::totalNeuronsWithSelectiveActivation) : nullptr, layers[l].mOffset, Layer::totalNeuronsWithSelectiveActivation, layerOutputs[l]->Pointer() + (processed * layers[l].inputsWithoutBias));
				processed += patternsToProcess;
			} while (processed < patterns);
		}

		layerInputs = layerOutputs[l]->Pointer();
	}

	// Copy the output-layer results back to the host.
	HostMatrix<cudafloat> outputs(*(layerOutputs[numberLayers - 1]));

	// Release the scratch device matrices.
	for (int l = 0; l < numberSpaceLayers; l++) {
		delete spaceLayerOutputs[l];
	}

	for (int l = 0; l < numberLayers; l++) {
		delete layerOutputs[l];
	}

	return outputs;
768 
769 }
cudafloat GetMaxPercentageRMSGrow() const
void KernelCalculateRMS(cudaStream_t stream, int blockSize, cudafloat *rms, cudafloat *rmsOut, int numberPatterns, cudafloat numberPatternsNeurons)
Definition: CalculateRMS.cu:46
void KernelCorrectWeightsSelectiveInputs(cudaStream_t stream, int neurons, int patterns, cudafloat *rmsF, cudafloat *bestRMS, cudafloat maxErrorGrowth, cudafloat *inputs, cudafloat *localGradient, cudafloat *selectiveNeuronsWeights, cudafloat *selectiveNeuronsBias, cudafloat *learningRateWeights, cudafloat *learningRateBias, cudafloat *lastDeltaWithoutLearningMomentumWeights, cudafloat *lastDeltaWithoutLearningMomentumBias, cudafloat *lastDeltaWeights, cudafloat *lastDeltaBias, cudafloat u, cudafloat d, cudafloat r, cudafloat maxStepSize, cudafloat momentum, int numberPatterns)
void SetSelectiveInputBias(HostArray< cudafloat > &bias)
cudafloat GetDownStepSizeFactor() const
void SetMaxStepSize(cudafloat value)
HostArray< cudafloat > GetSelectiveInputWeights()
void SetMomentum(cudafloat value)
int Elements() const
Definition: BaseMatrix.h:94
cudafloat GetRobustFactor() const
Create an array of any type, on the host, that automatically manages the memory used to hold its elements.
Definition: HostArray.h:40
KERNEL FireLayer(cudafloat *inputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *outputs)
void TrainOneEpoch()
Trains the network one epoch.
Type * Pointer() const
Definition: BaseArray.h:70
void SetLayerWeights(int layer, HostArray< cudafloat > &weights)
int ResizeWithoutPreservingData(int size)
Definition: BaseArray.h:77
void KernelFireLayer(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *inputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *outputs, int numInputs)
void SetSelectiveInputWeights(HostArray< cudafloat > &weights)
void KernelCorrectLayerWeights(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *rmsF, cudafloat *bestRMS, cudafloat maxErrorGrowth, cudafloat *inputs, cudafloat *localGradient, cudafloat *weights, cudafloat *learningRate, cudafloat *lastDeltaWithoutLearningMomentum, cudafloat *lastDelta, cudafloat maxStepSize, cudafloat u, cudafloat d, cudafloat r, cudafloat momentum, int numberPatterns)
KERNEL CalculateLocalGradient(cudafloat *rmsF, cudafloat *bestRMS, cudafloat maxErrorGrowth, cudafloat *outputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *localGradientNextLayer, cudafloat *localGradient, cudafloat *localGradientSpaceNet)
void SetDownStepSizeFactor(cudafloat value)
cudafloat GetMomentum() const
void RandomizeWeights(cudafloat minValue, cudafloat maxValue)
HostArray< cudafloat > GetLayerWeights(int layer)
int GetNumberNeurons(int layer) const
Type * Pointer() const
Definition: BaseMatrix.h:88
void SetUpStepSizeFactor(cudafloat value)
#define MAX_THREADS_PER_BLOCK
Defines the maximum threads per block.
int Columns() const
Definition: BaseMatrix.h:80
cudafloat GetUpStepSizeFactor() const
int Length() const
Definition: BaseArray.h:63
cudafloat GetMaxStepSize() const
void SetMaxPercentageRMSGrow(cudafloat value)
void SetRobustFactor(cudafloat value)
int Rows() const
Definition: BaseMatrix.h:74
void SetRobustLearning(bool value)
HostArray< cudafloat > GetSelectiveInputBias()
HostMatrix< cudafloat > GetOutputs(HostMatrix< cudafloat > &inputs)
#define CUDA_VALUE(X)
void KernelFireOutputLayer(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *inputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *desiredOutputs, cudafloat *outputs, cudafloat *localGradient, cudafloat *rms, cudafloat *localGradientSpaceNet, int numInputs)
float cudafloat