GPUMLib  0.2.2
GPU Machine Learning Library
rbm.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <stdlib.h>
22 
23 #include "../common/CudaDefinitions.h"
24 #include "../common/Utilities.h"
25 #include "../random/random.h"
26 #include "RBM.h"
27 
28 namespace GPUMLib {
29 
// ---------------------------------------------------------------------------
// Forward declarations of the CUDA kernels and host-side launch wrappers used
// by the RBM methods in this file (definitions live elsewhere in the project).
// From their usage below: I = number of visible/input units, J = number of
// hidden units, v = visible activations, h = hidden activations, a = visible
// biases, b = hidden biases, w = weight matrix (inferred from call sites —
// confirm against the kernel definitions).
// ---------------------------------------------------------------------------

// Initializes hidden biases to initialBias and zeroes/initializes the per-weight
// and per-hidden-bias delta and learning-rate buffers (launched over `weights` elements).
30 KERNEL InitBiasDeltasRBM(cudafloat * bias, cudafloat initialBias, cudafloat * lastDeltaW, cudafloat * lastDeltaB, cudafloat * lastDeltaWithoutLearningMomentumW, cudafloat * lastDeltaWithoutLearningMomentumB, cudafloat * learningRateW, cudafloat * learningRateB, cudafloat initialLearningRate, int weights, int J);
// Initializes visible-unit biases from the training data `v` and the
// per-visible-bias delta/learning-rate buffers (launched over I elements).
31 KERNEL InitInputBiasDeltasRBM(cudafloat * v, cudafloat * bias, cudafloat * lastDeltaA, cudafloat * lastDeltaWithoutLearningMomentumA, cudafloat * learningRateA, cudafloat initialLearningRate, int I, int samples);
// Contrastive-divergence weight/bias update: compares data statistics (v_data,
// h_data) against reconstruction statistics (v_recon, h_recon) and applies
// momentum + adaptive learning rates (u/d factors); also accumulates `errors`.
32 KERNEL CorrectWeightsRBM(cudafloat * v_data, cudafloat * h_data, cudafloat * v_recon, cudafloat * h_recon, int samples, cudafloat * learningRateW, cudafloat * lastDeltaWithoutLearningMomentumW, cudafloat * lastDeltaW, cudafloat * learningRateB, cudafloat * lastDeltaWithoutLearningMomentumB, cudafloat * lastDeltaB, cudafloat * learningRateA, cudafloat * lastDeltaWithoutLearningMomentumA, cudafloat * lastDeltaA, cudafloat u, cudafloat d, cudafloat momentum, cudafloat * weights, cudafloat * b, cudafloat * a, cudafloat * errors, int I, int J);
33 
// Host wrappers that launch the "large" (generic, 2D-grid) unit-status kernels;
// gridSize is passed by reference and blockSize chosen by the caller.
34 void KernelComputeStatusVisibleUnitsRBM(dim3 & gridSize, int blockSize, cudafloat * h, cudafloat * weights, cudafloat * a, cudafloat * v, float * randomValues, int J);
35 void KernelComputeStatusHiddenUnitsRBM(dim3 & gridSize, int blockSize, cudafloat * v, cudafloat * weights, cudafloat * b, cudafloat * h, float * randomValues, int I);
// "Small" variants: launched with one block per sample and a dynamic
// shared-memory buffer sized to the whole weight matrix (see call sites below).
36 KERNEL ComputeStatusHiddenUnitsSmallRBM(cudafloat * v, cudafloat * weights, cudafloat * b, cudafloat * h, float * randomValues);
37 KERNEL ComputeStatusVisibleUnitsSmallRBM(cudafloat * h, cudafloat * weights, cudafloat * a, cudafloat * v, float * randomValues);
38 
// --- Body of RBM::RandomizeWeights() (signature on original line 39, not in this view) ---
// Draws uniform random weights in [-stdWeights, stdWeights] on the host,
// uploads them to the device, then launches the kernels that (re)initialize
// biases, deltas and adaptive learning rates. Resets the epoch counter.
40  int nWeights = w.Elements();
41 
42  cudafloat * weights = w.HostPointer();
43 
// Host-side RNG: rand()/RAND_MAX in [0,1] scaled to [-stdWeights, stdWeights].
// NOTE(review): rand() is not seeded here — reproducibility depends on the caller.
44  for (int i = 0; i < nWeights; i++) weights[i] = CUDA_VALUE(2.0) * stdWeights * ((cudafloat) rand() / RAND_MAX) - stdWeights;
// Push the freshly randomized host weights to the device copy.
45  w.UpdateDevice();
46 
// One thread per weight element.
47  int blockSize = NumberThreadsPerBlockThatBestFit(nWeights);
48  int blocks = NumberBlocks(nWeights, blockSize);
49 
// Hidden biases -> INITIAL_BIAS_HIDDEN_UNITS; weight/hidden-bias deltas and
// learning rates -> initial values (covers nWeights elements, J hidden units).
50  InitBiasDeltasRBM<<<blocks, blockSize>>>(b.DevicePointer(), INITIAL_BIAS_HIDDEN_UNITS, lastDelta.w.Pointer(), lastDelta.b.Pointer(), lastDeltaWithoutLearningMomentum.w.Pointer(), lastDeltaWithoutLearningMomentum.b.Pointer(), learningRate.w.Pointer(), learningRate.b.Pointer(), initialLearningRate, nWeights, J);
51 
// Re-dimension the launch for the I visible units.
52  blocks = NumberBlocks(I, inputsBlockSize);
53 
// Visible biases initialized from the training data `v` (one entry per visible unit).
54  InitInputBiasDeltasRBM<<<blocks, inputsBlockSize>>>(v.Pointer(), a.DevicePointer(), lastDelta.a.Pointer(), lastDeltaWithoutLearningMomentum.a.Pointer(), learningRate.a.Pointer(), initialLearningRate, I, samples);
55 
// Training restarts from scratch after a re-randomization.
56  epoch = 0;
57 }
58 
// Computes the status (activations) of the hidden units from the visible
// units `v`, and — when `v_reconstructed` is non-null — reconstructs the
// visible units from those hidden activations.
//
// v               : visible-unit values (device), one row of I units per sample
// h               : output hidden-unit values (device), J units per sample
// v_reconstructed : output reconstructed visible units, or nullptr to skip
// samples         : number of samples processed in this call
// rnd             : pre-generated uniform random numbers used for stochastic
//                   unit sampling; J per sample for the hidden layer, plus I
//                   per sample for the visible layer when binary visible
//                   reconstruction is enabled (may be nullptr)
void RBM::ComputeStatusUnits(cudafloat * v, cudafloat * h, cudafloat * v_reconstructed, int samples, float * rnd) {
    // The "small" kernels cache the whole weight matrix in shared memory, so
    // they are only usable when it fits within a single block's budget.
    int connections = w.Elements();

    // Hidden layer: grid is J units wide by `samples` deep.
    dim3 hiddenGrid(J, samples);

    if (connections > MAX_THREADS_PER_BLOCK) {
        KernelComputeStatusHiddenUnitsRBM(hiddenGrid, inputsBlockSize, v, w.DevicePointer(), b.DevicePointer(), h, rnd, I);
    } else {
        // One block per sample; dynamic shared memory holds all the weights.
        ComputeStatusHiddenUnitsSmallRBM<<<samples, dimIJ, connections * sizeof(cudafloat)>>>(v, w.DevicePointer(), b.DevicePointer(), h, rnd);
    }

    if (v_reconstructed == nullptr) return;

    // Visible layer reconstruction: grid is I units wide by `samples` deep.
    dim3 visibleGrid(I, samples);

    // The hidden pass consumed J randoms per sample; binary visible
    // reconstruction uses the next I per sample, otherwise no sampling is done.
    rnd = useBinaryValuesVisibleReconstruction ? (rnd + J * samples) : nullptr;

    if (connections > MAX_THREADS_PER_BLOCK) {
        KernelComputeStatusVisibleUnitsRBM(visibleGrid, hiddenUnitsBlockSize, h, w.DevicePointer(), a.DevicePointer(), v_reconstructed, rnd, J);
    } else {
        ComputeStatusVisibleUnitsSmallRBM<<<samples, dimJI, connections * sizeof(cudafloat)>>>(h, w.DevicePointer(), a.DevicePointer(), v_reconstructed, rnd);
    }
}
86 
// --- Body of RBM::ContrastiveDivergence(int n) (signature on original line 87, not in this view) ---
// Runs one epoch of CD-n training: for each mini-batch, performs n Gibbs
// steps (data -> hidden -> reconstruction, repeated) and then updates the
// weights/biases with CorrectWeightsRBM.
//
// Partition the `samples` training cases into mini-batches. With
// miniBatchSize == 0 the whole dataset is a single batch; otherwise the last
// batch holds the remainder (or a full miniBatchSize when it divides evenly).
88  int sizeLastBatch = samples;
89  int batches = 1;
90 
91  if (miniBatchSize > 0) {
92  batches = samples / miniBatchSize;
93  sizeLastBatch = samples % miniBatchSize;
94  if (sizeLastBatch > 0) {
95  batches++;
96  } else {
97  sizeLastBatch = miniBatchSize;
98  }
99  }
100 
// 16x16 thread blocks over an I x J grid for the weight-update kernel.
101  dim3 block;
102  block.x = 16;
103  block.y = 16;
104 
105  dim3 grid;
106  grid.x = NumberBlocks(I, block.x);
107  grid.y = NumberBlocks(J, block.y);
108 
// Device cursors into the data / reconstruction / error buffers; advanced
// per batch at the bottom of the loop.
109  cudafloat * vd = v.Pointer();
110  cudafloat * hd = h_data.Pointer();
111  cudafloat * vr = v_recon.Pointer();
112  cudafloat * hr = h_recon.Pointer();
113  cudafloat * cerrors = errors.Pointer();
114 
// One fresh pool of uniform randoms for the whole epoch; consumed
// incrementally by the Gibbs steps below.
115  Random::Fill(randomValues);
116  float * rnd = randomValues.Pointer();
117 
118  int lastBatch = batches - 1;
119  for(int batch = 0; batch < batches; batch++) {
// NOTE(review): this local `samples` shadows the member/outer `samples`
// for the rest of the loop body (intentional, but easy to misread).
120  int samples = (batch == lastBatch) ? sizeLastBatch : miniBatchSize;
121 
// Positive phase: hidden states from the data, plus first reconstruction.
122  ComputeStatusUnits(vd, hd, vr, samples, rnd);
// Each step consumes J randoms per sample (hidden), plus I per sample
// when binary visible reconstruction is enabled.
123  rnd += samples * (useBinaryValuesVisibleReconstruction ? (I + J) : J);
124 
// Remaining n-1 Gibbs steps of CD-n, chaining on the reconstruction.
125  for (int k = 1; k < n; k++) {
126  ComputeStatusUnits(vr, hr, vr, samples, rnd);
127  rnd += samples * (useBinaryValuesVisibleReconstruction ? (I + J) : J);
128  }
129 
// Final hidden states of the reconstruction (no further sampling:
// nullptr reconstruction target and nullptr randoms).
130  ComputeStatusUnits(vr, hr, nullptr, samples, nullptr);
131 
// Negative/positive statistics -> weight, hidden-bias and visible-bias
// updates with momentum and adaptive learning rates (U/D factors).
132  CorrectWeightsRBM<<<grid, block>>>(vd, hd, vr, hr, samples, learningRate.w.Pointer(), lastDeltaWithoutLearningMomentum.w.Pointer(), lastDelta.w.Pointer(), learningRate.b.Pointer(), lastDeltaWithoutLearningMomentum.b.Pointer(), lastDelta.b.Pointer(), learningRate.a.Pointer(), lastDeltaWithoutLearningMomentum.a.Pointer(), lastDelta.a.Pointer(), U_FACTOR, D_FACTOR, momentum, w.DevicePointer(), b.DevicePointer(), a.DevicePointer(), cerrors, I, J);
133 
// Advance to the next batch. NOTE(review): cursors move by miniBatchSize
// *elements*, not miniBatchSize * I (or * J) — this presumes the matrices
// are laid out with one column per sample so consecutive samples are
// miniBatchSize elements apart; confirm against CudaMatrix's storage order.
134  vd += miniBatchSize;
135  hd += miniBatchSize;
136  vr += miniBatchSize;
137  hr += miniBatchSize;
138  cerrors += miniBatchSize;
139  }
140 
141  epoch++;
142 }
143 
144 }
int NumberThreadsPerBlockThatBestFit(int threads, int maxThreadsPerBlock=MAX_THREADS_PER_BLOCK)
Definition: Utilities.h:37
void ContrastiveDivergence(int n)
Definition: rbm.cu:87
Type * Pointer() const
Definition: BaseArray.h:70
Type * HostPointer() const
Definition: CudaMatrix.h:139
int NumberBlocks(int threads, int blockSize)
Definition: Utilities.h:49
int Elements() const
Definition: CudaMatrix.h:194
Type * Pointer() const
Definition: BaseMatrix.h:88
#define KERNEL
Defines the type of a kernel function.
#define MAX_THREADS_PER_BLOCK
Defines the maximum threads per block.
void RandomizeWeights()
Randomizes the weights of the RBM.
Definition: rbm.cu:39
Type * DevicePointer() const
Definition: CudaMatrix.h:146
Type * DevicePointer() const
Definition: CudaArray.h:136
#define CUDA_VALUE(X)
static void Fill(DeviceArray< float > &a)
Definition: random.cu:56
float cudafloat