GPUMLib  0.2.2
GPU Machine Learning Library
reduction.h
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef GPUMLib_reduction_h
22 #define GPUMLib_reduction_h
23 
24 #include <cuda_runtime.h>
25 #include <cmath>
26 
27 #include "../common/CudaDefinitions.h"
28 #include "../common/Utilities.h"
29 #include "../memory/CudaArray.h"
30 #include "../memory/DeviceMatrix.h"
31 #include "../memory/DeviceAccessibleVariable.h"
32 
33 using namespace std;
34 
35 namespace GPUMLib {
36 
39 
48 void KernelSum(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * outputs, int numInputs);
49 
58 void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor);
59 
67 void KernelMin(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs);
68 
78 void KernelMinIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int * minIndexes, int numInputs, int * indexes);
79 
87 void KernelMax(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs);
88 
98 void KernelMaxIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat * inputs, cudafloat * output, int * maxIndexes, int numInputs, int * indexes);
99 
101 class Reduction {
102  private:
103  void static Sum(cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor, cudaStream_t stream);
104 
105  void static MinIndex(cudafloat * inputs, cudafloat * output, int * minIndex, int numInputs, cudaStream_t stream);
106  void static Min(cudafloat * inputs, cudafloat * output, int numInputs, cudaStream_t stream);
107 
108  void static Max(cudafloat * inputs, cudafloat * output, int numInputs, cudaStream_t stream);
109  void static MaxIndex(cudafloat * inputs, cudafloat * output, int * minIndex, int numInputs, cudaStream_t stream);
110 
111  public:
114 
120  void static Sum(DeviceArray<cudafloat> & inputs, cudafloat * output, cudafloat multiplyFactor = CUDA_VALUE(1.0), cudaStream_t stream = nullptr) {
121  Sum(inputs.Pointer(), output, inputs.Length(), multiplyFactor, stream);
122  }
123 
129  void static Sum(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudafloat multiplyFactor = CUDA_VALUE(1.0), cudaStream_t stream = nullptr) {
130  Sum(inputs.Pointer(), output.Pointer(), inputs.Length(), multiplyFactor, stream);
131  }
132 
137  void static Average(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudaStream_t stream = nullptr) {
138  double multiplyFactor = 1.0 / inputs.Length();
139  Sum(inputs.Pointer(), output.Pointer(), inputs.Length(), (cudafloat) multiplyFactor, stream);
140  }
141 
146  void static Min(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudaStream_t stream = nullptr) {
147  Min(inputs.Pointer(), output.Pointer(), inputs.Length(), stream);
148  }
149 
154  void static Min(DeviceMatrix<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudaStream_t stream = nullptr) {
155  Min(inputs.Pointer(), output.Pointer(), inputs.Elements(), stream);
156  }
157 
163  void static MinIndex(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & min, DeviceArray<int> & minIndex, cudaStream_t stream = nullptr) {
164  MinIndex(inputs.Pointer(), min.Pointer(), minIndex.Pointer(), inputs.Length(), stream);
165  }
166 
172  void static MinIndex(DeviceMatrix<cudafloat> & inputs, DeviceArray<cudafloat> & min, DeviceArray<int> & minIndex, cudaStream_t stream = nullptr) {
173  MinIndex(inputs.Pointer(), min.Pointer(), minIndex.Pointer(), inputs.Elements(), stream);
174  }
175 
180  void static Max(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudaStream_t stream = nullptr) {
181  Max(inputs.Pointer(), output.Pointer(), inputs.Length(), stream);
182  }
183 
188  void static Max(DeviceMatrix<cudafloat> & inputs, DeviceArray<cudafloat> & output, cudaStream_t stream = nullptr) {
189  Max(inputs.Pointer(), output.Pointer(), inputs.Elements(), stream);
190  }
191 
197  void static MaxIndex(DeviceArray<cudafloat> & inputs, DeviceArray<cudafloat> & max, DeviceArray<int> & maxIndex, cudaStream_t stream = nullptr) {
198  MaxIndex(inputs.Pointer(), max.Pointer(), maxIndex.Pointer(), inputs.Length(), stream);
199  }
200 
206  void static MaxIndex(DeviceMatrix<cudafloat> & inputs, DeviceArray<cudafloat> & max, DeviceArray<int> & maxIndex, cudaStream_t stream = nullptr) {
207  MaxIndex(inputs.Pointer(), max.Pointer(), maxIndex.Pointer(), inputs.Elements(), stream);
208  }
209 };
210 
212 
213 }
214 
215 #endif
Provides reduction functions (Sum, Average, Max, Min, ...).
Definition: reduction.h:101
void KernelMinIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int *minIndexes, int numInputs, int *indexes)
Definition: MinKernel.cu:432
static void Average(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudaStream_t stream=nullptr)
Definition: reduction.h:137
static void MaxIndex(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &max, DeviceArray< int > &maxIndex, cudaStream_t stream=nullptr)
Definition: reduction.h:197
int Elements() const
Definition: BaseMatrix.h:94
Type * Pointer() const
Definition: BaseArray.h:70
void KernelMin(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs)
Definition: MinKernel.cu:352
static void MaxIndex(DeviceMatrix< cudafloat > &inputs, DeviceArray< cudafloat > &max, DeviceArray< int > &maxIndex, cudaStream_t stream=nullptr)
Definition: reduction.h:206
static void MinIndex(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &min, DeviceArray< int > &minIndex, cudaStream_t stream=nullptr)
Definition: reduction.h:163
Type * Pointer() const
Definition: BaseMatrix.h:88
void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs, cudafloat multiplyFactor)
Definition: SumKernel.cu:102
int Length() const
Definition: BaseArray.h:63
static void Sum(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudafloat multiplyFactor=CUDA_VALUE(1.0), cudaStream_t stream=nullptr)
Definition: reduction.h:129
static void MinIndex(DeviceMatrix< cudafloat > &inputs, DeviceArray< cudafloat > &min, DeviceArray< int > &minIndex, cudaStream_t stream=nullptr)
Definition: reduction.h:172
static void Min(DeviceMatrix< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudaStream_t stream=nullptr)
Definition: reduction.h:154
void KernelMaxIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int *maxIndexes, int numInputs, int *indexes)
Definition: MaxKernel.cu:431
void KernelSum(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *outputs, int numInputs)
Definition: SumKernel.cu:45
void KernelMax(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs)
Definition: MaxKernel.cu:351
static void Max(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudaStream_t stream=nullptr)
Definition: reduction.h:180
#define CUDA_VALUE(X)
static void Min(DeviceArray< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudaStream_t stream=nullptr)
Definition: reduction.h:146
static void Max(DeviceMatrix< cudafloat > &inputs, DeviceArray< cudafloat > &output, cudaStream_t stream=nullptr)
Definition: reduction.h:188
static void Sum(DeviceArray< cudafloat > &inputs, cudafloat *output, cudafloat multiplyFactor=CUDA_VALUE(1.0), cudaStream_t stream=nullptr)
Definition: reduction.h:120
float cudafloat
static DeviceArray< cudafloat > temporaryBuffer
Temporary buffer used for the reduction tasks. Programmers may take advantage of it for other tasks (...
Definition: reduction.h:113