// GPUMLib 0.2.2 — GPU Machine Learning Library
// RBMstatus.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonša Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "../reduction/SumWarp.h"
22 
23 namespace GPUMLib {
24 
// Grid layout used by the kernels in this file: each block computes the
// status of one unit (neuron) for one sample — blockIdx.x selects the
// neuron, blockIdx.y selects the sample, and gridDim.x is therefore the
// total number of units in the layer being computed.
#define NEURON blockIdx.x
#define NUM_NEURONS gridDim.x

#define SAMPLE blockIdx.y
29 
//! Computes the status of the hidden units of an RBM.
//! Grid layout: blockIdx.x = hidden neuron, blockIdx.y = sample (see the
//! NEURON / SAMPLE macros). Requires blockSize * sizeof(cudafloat) bytes of
//! dynamic shared memory; blockSize must equal blockDim.x.
//! \param v visible units (one row of I values per sample)
//! \param weights weight matrix, row-major with I weights per hidden neuron
//! \param b hidden unit biases (one per neuron)
//! \param h output: status of each hidden unit for each sample
//! \param randomValues when not nullptr, one random value per (sample, neuron)
//!        used to binarize the activation; when nullptr the sigmoid
//!        probability itself is stored
//! \param I number of visible units
template <int blockSize> KERNEL ComputeStatusHiddenUnitsRBM(cudafloat * v, cudafloat * weights, cudafloat * b, cudafloat * h, float * randomValues, int I) {
	extern __shared__ cudafloat iw[];

	// Each thread accumulates a strided slice of the dot product
	// v(sample) . weights(neuron).
	iw[threadIdx.x] = CUDA_VALUE(0.0);
	for(int i = threadIdx.x; i < I; i += blockDim.x) {
		iw[threadIdx.x] += v[SAMPLE * I + i] * weights[NEURON * I + i];
	}
	__syncthreads();

	// Shared-memory tree reduction down to the last warp. blockSize is a
	// template argument, so each comparison is resolved at compile time.
	if (blockSize >= 1024) {
		if (threadIdx.x < 512) iw[threadIdx.x] += iw[threadIdx.x + 512];
		__syncthreads();
	}

	if (blockSize >= 512) {
		if (threadIdx.x < 256) iw[threadIdx.x] += iw[threadIdx.x + 256];
		__syncthreads();
	}

	if (blockSize >= 256) {
		if (threadIdx.x < 128) iw[threadIdx.x] += iw[threadIdx.x + 128];
		__syncthreads();
	}

	if (blockSize >= 128) {
		if (threadIdx.x < 64) iw[threadIdx.x] += iw[threadIdx.x + 64];
		__syncthreads();
	}

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(iw);

		if (threadIdx.x == 0) {
			// Fix: the result is produced and consumed by thread 0 only, so a
			// register local suffices — the previous __shared__ variable was
			// unnecessary and inconsistent with ComputeStatusVisibleUnitsRBM.
			cudafloat output = CUDA_SIGMOID(iw[0] + b[NEURON]);
			int idx = SAMPLE * NUM_NEURONS + NEURON;
			if (randomValues != nullptr) output = (output > randomValues[idx]) ? CUDA_VALUE(1.0) : CUDA_VALUE(0.0);
			h[idx] = output;
		}
	}
}
71 
//! Host-side launcher for ComputeStatusHiddenUnitsRBM. Selects the template
//! instantiation matching blockSize (a power of two; up to 1024 when FERMI
//! is defined, otherwise up to 512). Any other value launches nothing.
void KernelComputeStatusHiddenUnitsRBM(dim3 & gridSize, int blockSize, cudafloat * v, cudafloat * weights, cudafloat * b, cudafloat * h, float * randomValues, int I) {
	// One cudafloat of dynamic shared memory per thread (reduction buffer).
	size_t sharedMemSize = blockSize * sizeof(cudafloat);

	switch(blockSize) {
		#ifdef FERMI
		case 1024:
			ComputeStatusHiddenUnitsRBM<1024><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		#endif
		case 512:
			ComputeStatusHiddenUnitsRBM<512><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 256:
			ComputeStatusHiddenUnitsRBM<256><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 128:
			ComputeStatusHiddenUnitsRBM<128><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 64:
			ComputeStatusHiddenUnitsRBM<64><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 32:
			ComputeStatusHiddenUnitsRBM<32><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 16:
			ComputeStatusHiddenUnitsRBM<16><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 8:
			ComputeStatusHiddenUnitsRBM<8><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 4:
			ComputeStatusHiddenUnitsRBM<4><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 2:
			ComputeStatusHiddenUnitsRBM<2><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
		case 1:
			ComputeStatusHiddenUnitsRBM<1><<<gridSize, blockSize, sharedMemSize>>>(v, weights, b, h, randomValues, I);
			break;
	}
}
111 
//! Computes the status of the visible units of an RBM.
//! Grid layout: blockIdx.x = visible neuron, blockIdx.y = sample (see the
//! NEURON / SAMPLE macros). Requires blockSize * sizeof(cudafloat) bytes of
//! dynamic shared memory; blockSize must equal blockDim.x.
//! \param h hidden units (one row of J values per sample)
//! \param weights weight matrix, indexed as weights[j * NUM_NEURONS + NEURON]
//! \param a visible unit biases (one per neuron)
//! \param v output: status of each visible unit for each sample
//! \param randomValues when not nullptr, one random value per (sample, neuron)
//!        used to binarize the activation; when nullptr the sigmoid
//!        probability itself is stored
//! \param J number of hidden units
template <int blockSize> KERNEL ComputeStatusVisibleUnitsRBM(cudafloat * h, cudafloat * weights, cudafloat * a, cudafloat * v, float * randomValues, int J) {
	extern __shared__ cudafloat partials[];

	// Accumulate this thread's strided slice of h(sample) . weights(:, neuron)
	// in a register, then publish it to shared memory for the reduction.
	cudafloat acc = CUDA_VALUE(0.0);
	for (int j = threadIdx.x; j < J; j += blockDim.x) {
		acc += h[SAMPLE * J + j] * weights[j * NUM_NEURONS + NEURON];
	}
	partials[threadIdx.x] = acc;
	__syncthreads();

	// Shared-memory tree reduction down to the last warp. blockSize is a
	// template argument, so each guard folds away at compile time and this
	// performs exactly the 512/256/128/64 steps of the unrolled form.
	for (int stride = 512; stride >= 64; stride >>= 1) {
		if (blockSize >= stride * 2) {
			if (threadIdx.x < stride) partials[threadIdx.x] += partials[threadIdx.x + stride];
			__syncthreads();
		}
	}

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(partials);

		if (threadIdx.x == 0) {
			cudafloat activation = CUDA_SIGMOID(partials[0] + a[NEURON]);

			int idx = SAMPLE * NUM_NEURONS + NEURON;
			if (randomValues != nullptr) activation = (activation > randomValues[idx]) ? CUDA_VALUE(1.0) : CUDA_VALUE(0.0);
			v[idx] = activation;
		}
	}
}
153 
//! Host-side launcher for ComputeStatusVisibleUnitsRBM. Selects the template
//! instantiation matching blockSize (a power of two; up to 1024 when FERMI
//! is defined, otherwise up to 512). Any other value launches nothing.
void KernelComputeStatusVisibleUnitsRBM(dim3 & gridSize, int blockSize, cudafloat * h, cudafloat * weights, cudafloat * a, cudafloat * v, float * randomValues, int J) {
	// One cudafloat of dynamic shared memory per thread (reduction buffer).
	size_t sharedMemSize = blockSize * sizeof(cudafloat);

	switch(blockSize) {
		#ifdef FERMI
		case 1024:
			ComputeStatusVisibleUnitsRBM<1024><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		#endif
		case 512:
			ComputeStatusVisibleUnitsRBM<512><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 256:
			ComputeStatusVisibleUnitsRBM<256><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 128:
			ComputeStatusVisibleUnitsRBM<128><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 64:
			ComputeStatusVisibleUnitsRBM<64><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 32:
			ComputeStatusVisibleUnitsRBM<32><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 16:
			ComputeStatusVisibleUnitsRBM<16><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 8:
			ComputeStatusVisibleUnitsRBM<8><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 4:
			ComputeStatusVisibleUnitsRBM<4><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 2:
			ComputeStatusVisibleUnitsRBM<2><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
		case 1:
			ComputeStatusVisibleUnitsRBM<1><<<gridSize, blockSize, sharedMemSize>>>(h, weights, a, v, randomValues, J);
			break;
	}
}
193 
194 }
// NOTE(review): the lines below are documentation-extraction residue, not
// compilable code. They list symbols defined elsewhere in GPUMLib:
//   #define KERNEL          — defines the type of a kernel function.
//   #define CUDA_SIGMOID(X) — sigmoid macro (definition elsewhere).
//   #define CUDA_VALUE(X)   — floating-point literal macro (definition elsewhere).
//   cudafloat               — floating-point type (float).