GPUMLib  0.2.2
GPU Machine Learning Library
FireLayerKernel.cu
/*
 Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
 Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes

 This file is part of GPUMLib.

 GPUMLib is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "MBPkernels.h"

#define INPUT threadIdx.x
#define NUM_INPUTS_INCLUDING_BIAS blockDim.x
#define NUM_INPUTS (NUM_INPUTS_INCLUDING_BIAS - 1)
#define BIAS 0

#define NEURON threadIdx.y
#define NUM_NEURONS blockDim.y

#define PATTERN blockIdx.x

#define THREAD_ID connection
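
/*
 Thread geometry (comment added for clarity): each block processes one training
 pattern (PATTERN = blockIdx.x). Within a block, threadIdx.x indexes a neuron's
 input connections (index 0 being the bias) and threadIdx.y indexes the layer
 neurons, so a block runs NUM_INPUTS_INCLUDING_BIAS * NUM_NEURONS threads, one
 per connection.
*/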

namespace GPUMLib {

__device__ void SumInputWeight(int connection, cudafloat * inputs, cudafloat * weights) {
	extern __shared__ cudafloat iw[];

	/*******
	 For each input connection of all the layer neurons, calculate weight * input.
	 Results will be held in iw[]. This is done for the current pattern.
	*******/
	iw[connection] = weights[connection];
	if (INPUT > BIAS) iw[connection] *= inputs[PATTERN * NUM_INPUTS + (INPUT - 1)];
	__syncthreads();

	/*******
	 For each layer neuron, calculate its activation (sum(weight * input)).
	 Results for neuron n will be held in iw[n * NUM_INPUTS_INCLUDING_BIAS].
	 This is done for the current pattern.
	*******/
	int numberElemSum = NUM_INPUTS_INCLUDING_BIAS;
	for(int sumUpTo = (numberElemSum >> 1); numberElemSum > 1; sumUpTo = (numberElemSum >> 1)) {
		int nextNumberElemSum = sumUpTo;
		if (numberElemSum & 1) nextNumberElemSum++;

		if (INPUT < sumUpTo) iw[connection] += iw[connection + nextNumberElemSum];
		numberElemSum = nextNumberElemSum;

		__syncthreads();
	}
}
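
/*
 Illustrative trace (comment added for clarity): the loop above is a pairwise
 reduction that also copes with an odd number of remaining elements. For one
 neuron with NUM_INPUTS_INCLUDING_BIAS = 5 and products a0..a4:

   pass 1: numberElemSum = 5, sumUpTo = 2, nextNumberElemSum = 3 -> { a0+a3, a1+a4, a2 }
   pass 2: numberElemSum = 3, sumUpTo = 1, nextNumberElemSum = 2 -> { a0+a3+a2, a1+a4 }
   pass 3: numberElemSum = 2, sumUpTo = 1, nextNumberElemSum = 1 -> { a0+a1+a2+a3+a4 }

 After the loop, iw[NEURON * NUM_INPUTS_INCLUDING_BIAS] holds the neuron's net input.
*/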

KERNEL FireLayer(cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * outputs) {
	extern __shared__ cudafloat iw[];

	int connection = NEURON * NUM_INPUTS_INCLUDING_BIAS + INPUT;

	SumInputWeight(connection, inputs, weights);

	/*******
	 For each layer neuron, calculate its output. Results for neuron n will be held in outputs[].
	 Note that outputs[] will contain the layer neuron outputs for all the patterns.
	*******/
	if (INPUT == 0) {
		int n = PATTERN * NUM_NEURONS + NEURON;

		cudafloat output = CUDA_SIGMOID(iw[THREAD_ID]);
		if (m != nullptr) output *= m[PATTERN * totalNeuronsWithSelectiveActivation + NEURON + mOffset];
		outputs[n] = output;
	}
}
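
// A minimal host-side launch sketch (illustrative addition, not part of the
// original library; numPatterns, numInputs, numNeurons and the d_* pointers are
// hypothetical names): one block per pattern, one thread per connection, and
// the dynamic shared memory sized to hold one cudafloat per connection.
//
//	dim3 block(numInputs + 1, numNeurons); // +1 for the bias connection
//	dim3 grid(numPatterns);
//	size_t sharedMem = block.x * block.y * sizeof(cudafloat);
//	FireLayer<<<grid, block, sharedMem>>>(d_inputs, d_weights, nullptr, 0, 0, d_outputs);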

KERNEL FireOutputLayer(cudafloat * inputs, cudafloat * weights, cudafloat * m, int mOffset, int totalNeuronsWithSelectiveActivation, cudafloat * desiredOutputs, cudafloat * outputs, cudafloat * localGradient, cudafloat * rms, cudafloat * localGradientSpaceNet) {
	extern __shared__ cudafloat iw[];

	int connection = NEURON * NUM_INPUTS_INCLUDING_BIAS + INPUT;
	SumInputWeight(connection, inputs, weights);

	/*******
	 - For each layer neuron, calculate its output. Results for neuron n will be held in outputs[].
	   Note that outputs[] will contain the layer neuron outputs for all the patterns.
	 - Determine the local gradient for the current pattern. Results will be held in localGradient[].
	 - Determine the contribution of this pattern to the RMS error (rms[]).
	   This value will be used in the kernel CorrectOutputLayerWeights to calculate the RMS of the current epoch.
	*******/

	cudafloat * shared_rms = (iw + (NUM_INPUTS_INCLUDING_BIAS * NUM_NEURONS));

	if (INPUT == 0) {
		int n = PATTERN * NUM_NEURONS + NEURON;
		int nSelAct = PATTERN * totalNeuronsWithSelectiveActivation + NEURON + mOffset;

		cudafloat output = CUDA_SIGMOID(iw[THREAD_ID]);
		cudafloat M = (m != nullptr) ? m[nSelAct] : CUDA_VALUE(1.0);
		cudafloat outn = output * M;

		cudafloat error = (desiredOutputs[n] - outn);

		if (m != nullptr) localGradientSpaceNet[nSelAct] = error * output * CUDA_SIGMOID_DERIVATE(M);

		outputs[n] = outn;
		localGradient[n] = error * M * CUDA_SIGMOID_DERIVATE(output);

		shared_rms[NEURON] = error * error;
	}
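
	/*
	 Derivation note (comment added for clarity, assuming CUDA_SIGMOID_DERIVATE(y)
	 expands to y * (1 - y)): with outn = M * sigmoid(net), the squared error
	 gives dE/dnet = error * M * output * (1 - output), stored in localGradient[],
	 and, treating M as a sigmoid output of the space network,
	 dE/dnet_m = error * output * M * (1 - M), stored in localGradientSpaceNet[].
	*/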

	if (NUM_NEURONS > 1) {
		__syncthreads();

		// Loop unrolling (interval = 1)
		if (INPUT == 0 && (NEURON & 1) == 0 && NEURON + 1 < NUM_NEURONS) shared_rms[NEURON] += shared_rms[NEURON + 1];
		__syncthreads();

		int nextInterval;
		for (int interval = 2; interval < NUM_NEURONS; interval = nextInterval) {
			nextInterval = interval << 1;

			if (INPUT == 0 && (NEURON & (nextInterval - 1)) == 0 && NEURON + interval < NUM_NEURONS) shared_rms[NEURON] += shared_rms[NEURON + interval];
			__syncthreads();
		}
	}
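
	/*
	 Illustrative trace (comment added for clarity): with NUM_NEURONS = 5 and
	 squared errors e0..e4, the unrolled first step and the interval-doubling
	 loop accumulate:

	   interval 1: shared_rms = { e0+e1, -, e2+e3, -, e4 }
	   interval 2: shared_rms = { e0+e1+e2+e3, -, -, -, e4 }
	   interval 4: shared_rms = { e0+e1+e2+e3+e4 }

	 leaving the pattern's summed squared error in shared_rms[0].
	*/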

	if (NEURON == 0 && INPUT == 0) rms[PATTERN] = shared_rms[0];
}
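
// A minimal host-side launch sketch for FireOutputLayer (illustrative addition,
// not part of the original library; variable names are hypothetical). Unlike
// FireLayer, the dynamic shared memory must also hold the NUM_NEURONS partial
// RMS values that shared_rms[] appends after iw[].
//
//	dim3 block(numInputs + 1, numNeurons);
//	dim3 grid(numPatterns);
//	size_t sharedMem = (block.x * block.y + numNeurons) * sizeof(cudafloat);
//	FireOutputLayer<<<grid, block, sharedMem>>>(d_inputs, d_weights, nullptr, 0, 0,
//		d_desiredOutputs, d_outputs, d_localGradient, d_rms, nullptr);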

}