GPUMLib  0.2.2
GPU Machine Learning Library
FireLayerNeuronsKernel.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "../reduction/SumWarp.h"
22 #include "MBPkernels.h"
23 
24 #define BIAS 0
25 
26 #define NEURON blockIdx.x
27 #define NUM_NEURONS gridDim.x
28 
29 #define PATTERN blockIdx.y
30 
31 namespace GPUMLib {
32 
// Fires the neurons of a hidden layer: each block computes the output of one
// neuron for one pattern. Launch layout (see KernelFireLayer): blockIdx.x is
// the neuron, blockIdx.y is the pattern, and blockDim.x == blockSize threads
// cooperate on the weighted-input sum. Requires blockSize * sizeof(cudafloat)
// bytes of dynamic shared memory.
template <int blockSize> KERNEL FireLayerNeurons(cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * outputs, int numInputs) {
	extern __shared__ cudafloat partialSums[];

	// Each thread accumulates a strided slice of the numInputs + 1 terms:
	// index BIAS (0) contributes the bias weight alone; index i > 0
	// contributes weight[i] * input[i - 1].
	cudafloat threadSum = CUDA_VALUE(0.0);
	for (int idx = threadIdx.x; idx <= numInputs; idx += blockDim.x) {
		cudafloat term = weights[NEURON * (numInputs + 1) + idx];
		if (idx > BIAS) term *= inputs[PATTERN * numInputs + (idx - 1)];
		threadSum += term;
	}
	partialSums[threadIdx.x] = threadSum;
	__syncthreads();

	// Block-wide reduction of the partial sums; SumBeforeWarp folds the array
	// down to warp size and SumWarp finishes within the first warp.
	SumBeforeWarp<blockSize>(partialSums);

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(partialSums);

		if (threadIdx.x == 0) {
			// Sigmoid activation, optionally scaled by the selective-activation
			// factor m (when a space network is present).
			cudafloat output = CUDA_SIGMOID(partialSums[0]);
			if (m != nullptr) output *= m[PATTERN * totalNeuronsWithSelectiveActivation + NEURON + mOffset];
			outputs[PATTERN * NUM_NEURONS + NEURON] = output;
		}
	}
}
56 
// Host-side dispatcher for FireLayerNeurons: selects the template
// instantiation matching the runtime block size (the reduction helpers need
// the block size as a compile-time constant) and launches it on the given
// stream with blockSize * sizeof(cudafloat) bytes of dynamic shared memory.
// Unsupported block sizes (not a power of two in the table below) are ignored.
void KernelFireLayer(cudaStream_t stream, dim3 & gridSize, int blockSize, cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * outputs, int numInputs) {
	#define FIRE_LAYER_CASE(SIZE) \
		case SIZE: \
			FireLayerNeurons<SIZE><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, weights, m, mOffset, totalNeuronsWithSelectiveActivation, outputs, numInputs); \
			break;

	switch(blockSize) {
		#ifdef FERMI
		FIRE_LAYER_CASE(1024)
		#endif
		FIRE_LAYER_CASE(512)
		FIRE_LAYER_CASE(256)
		FIRE_LAYER_CASE(128)
		FIRE_LAYER_CASE(64)
		FIRE_LAYER_CASE(32)
		FIRE_LAYER_CASE(16)
		FIRE_LAYER_CASE(8)
		FIRE_LAYER_CASE(4)
		FIRE_LAYER_CASE(2)
		FIRE_LAYER_CASE(1)
	}

	#undef FIRE_LAYER_CASE
}
96 
// Fires the neurons of the output layer and, in the same pass, computes the
// per-neuron error, local gradient and squared error used by back-propagation.
// Launch layout (see KernelFireOutputLayer): blockIdx.x is the neuron,
// blockIdx.y is the pattern, blockDim.x == blockSize. Requires
// blockSize * sizeof(cudafloat) bytes of dynamic shared memory.
template <int blockSize> KERNEL FireOutputLayerNeurons(cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * desiredOutputs, cudafloat * outputs, cudafloat * localGradient, cudafloat * rms, cudafloat * localGradientSpaceNet, int numInputs) {
	extern __shared__ cudafloat partialSums[];

	// Strided accumulation of the numInputs + 1 terms (bias at index BIAS == 0,
	// then weight[i] * input[i - 1] for i > 0), identical to FireLayerNeurons.
	cudafloat threadSum = CUDA_VALUE(0.0);
	for (int idx = threadIdx.x; idx <= numInputs; idx += blockDim.x) {
		cudafloat term = weights[NEURON * (numInputs + 1) + idx];
		if (idx > BIAS) term *= inputs[PATTERN * numInputs + (idx - 1)];
		threadSum += term;
	}
	partialSums[threadIdx.x] = threadSum;
	__syncthreads();

	SumBeforeWarp<blockSize>(partialSums);

	if (threadIdx.x < 32) {
		SumWarp<blockSize>(partialSums);

		if (threadIdx.x == 0) {
			// Flat indices: n into the per-neuron output arrays, nSelAct into
			// the selective-activation arrays (offset by this layer's position).
			int n = PATTERN * NUM_NEURONS + NEURON;
			int nSelAct = PATTERN * totalNeuronsWithSelectiveActivation + NEURON + mOffset;

			cudafloat output = CUDA_SIGMOID(partialSums[0]);
			// Without a space network, m is null and the scale factor is 1.
			cudafloat M = (m != nullptr) ? m[nSelAct] : CUDA_VALUE(1.0);
			cudafloat outn = output * M;

			cudafloat error = desiredOutputs[n] - outn;

			// Gradient for the space network's multiplier (chain rule through M).
			if (m != nullptr) localGradientSpaceNet[nSelAct] = error * output * CUDA_SIGMOID_DERIVATE(M);

			outputs[n] = outn;

			// Gradient for this neuron's weights (chain rule through the sigmoid).
			localGradient[n] = error * M * CUDA_SIGMOID_DERIVATE(output);

			rms[n] = error * error;
		}
	}
}
133 
// Host-side dispatcher for FireOutputLayerNeurons: maps the runtime block
// size to the matching compile-time template instantiation and launches it on
// the given stream with blockSize * sizeof(cudafloat) bytes of dynamic shared
// memory. Unsupported block sizes are silently ignored, mirroring
// KernelFireLayer.
void KernelFireOutputLayer(cudaStream_t stream, dim3 & gridSize, int blockSize, cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * desiredOutputs, cudafloat * outputs, cudafloat * localGradient, cudafloat * rms, cudafloat * localGradientSpaceNet, int numInputs) {
	#define FIRE_OUTPUT_LAYER_CASE(SIZE) \
		case SIZE: \
			FireOutputLayerNeurons<SIZE><<<gridSize, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, weights, m, mOffset, totalNeuronsWithSelectiveActivation, desiredOutputs, outputs, localGradient, rms, localGradientSpaceNet, numInputs); \
			break;

	switch(blockSize) {
		#ifdef FERMI
		FIRE_OUTPUT_LAYER_CASE(1024)
		#endif
		FIRE_OUTPUT_LAYER_CASE(512)
		FIRE_OUTPUT_LAYER_CASE(256)
		FIRE_OUTPUT_LAYER_CASE(128)
		FIRE_OUTPUT_LAYER_CASE(64)
		FIRE_OUTPUT_LAYER_CASE(32)
		FIRE_OUTPUT_LAYER_CASE(16)
		FIRE_OUTPUT_LAYER_CASE(8)
		FIRE_OUTPUT_LAYER_CASE(4)
		FIRE_OUTPUT_LAYER_CASE(2)
		FIRE_OUTPUT_LAYER_CASE(1)
	}

	#undef FIRE_OUTPUT_LAYER_CASE
}
173 
174 }
void KernelFireLayer(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *inputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *outputs, int numInputs)
#define KERNEL
Defines the type of a kernel function.
#define CUDA_SIGMOID_DERIVATE(OUTPUT)
#define CUDA_SIGMOID(X)
#define CUDA_VALUE(X)
void KernelFireOutputLayer(cudaStream_t stream, dim3 &gridSize, int blockSize, cudafloat *inputs, cudafloat *weights, cudafloat *m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat *desiredOutputs, cudafloat *outputs, cudafloat *localGradient, cudafloat *rms, cudafloat *localGradientSpaceNet, int numInputs)
float cudafloat