GPUMLib  0.2.2
GPU Machine Learning Library
reduction.cpp
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "reduction.h"
22 
23 namespace GPUMLib {
24 
// Out-of-class definition of the static scratch buffer of the Reduction class.
// The reduction functions in this file use it to hold the per-block partial
// results produced by the first kernel pass (and, for MinIndex/MaxIndex, the
// matching per-block indices stored right after them).
// NOTE(review): a single buffer is shared by every call, whatever stream is
// used - confirm that callers never overlap reductions on different streams.
25 DeviceArray<cudafloat> Reduction::temporaryBuffer;
26 
27 void Reduction::Sum(cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor, cudaStream_t stream) {
29 
30  if (numInputs > SIZE_SMALL_CUDA_VECTOR) {
31  int blocks = NumberBlocks(numInputs, blockSize);
33 
34  KernelSum(stream, blocks, blockSize, inputs, temporaryBuffer.Pointer(), numInputs);
35 
36  inputs = temporaryBuffer.Pointer();
37  numInputs = blocks;
38 
39  blockSize = NumberThreadsPerBlockThatBestFit(numInputs);
40  }
41 
42  KernelSumSmallArray(stream, blockSize, inputs, output, numInputs, multiplyFactor);
43 }
44 
45 void Reduction::Min(cudafloat * inputs, cudafloat * output, int numInputs, cudaStream_t stream) {
47 
48  if (numInputs > SIZE_SMALL_CUDA_VECTOR) {
49  int blocks = NumberBlocks(numInputs, blockSize);
51 
52  KernelMin(stream, blocks, blockSize, inputs, temporaryBuffer.Pointer(), numInputs);
53 
54  inputs = temporaryBuffer.Pointer();
55  numInputs = blocks;
56 
57  blockSize = NumberThreadsPerBlockThatBestFit(numInputs);
58  }
59 
60  KernelMin(stream, 1, blockSize, inputs, output, numInputs);
61 }
62 
63 void Reduction::MinIndex(cudafloat * inputs, cudafloat * output, int * minIndex, int numInputs, cudaStream_t stream) {
65 
66  int * indexes = nullptr;
67 
68  if (numInputs > SIZE_SMALL_CUDA_VECTOR) {
69  int blocks = NumberBlocks(numInputs, blockSize);
70 
71  int minSizeBuffer = blocks + (int) ceil(blocks * (sizeof(int) / (float) sizeof(float)));
72  if (temporaryBuffer.Length() < minSizeBuffer) temporaryBuffer.ResizeWithoutPreservingData(minSizeBuffer);
73 
74  indexes = (int *)(temporaryBuffer.Pointer() + blocks);
75 
76  KernelMinIndexes(stream, blocks, blockSize, inputs, temporaryBuffer.Pointer(), indexes, numInputs, nullptr);
77 
78  inputs = temporaryBuffer.Pointer();
79  numInputs = blocks;
80 
81  blockSize = NumberThreadsPerBlockThatBestFit(numInputs);
82  }
83 
84  KernelMinIndexes(stream, 1, blockSize, inputs, output, minIndex, numInputs, indexes);
85 }
86 
87 void Reduction::Max(cudafloat * inputs, cudafloat * output, int numInputs, cudaStream_t stream) {
89 
90  if (numInputs > SIZE_SMALL_CUDA_VECTOR) {
91  int blocks = NumberBlocks(numInputs, blockSize);
93 
94  KernelMax(stream, blocks, blockSize, inputs, temporaryBuffer.Pointer(), numInputs);
95 
96  inputs = temporaryBuffer.Pointer();
97  numInputs = blocks;
98 
99  blockSize = NumberThreadsPerBlockThatBestFit(numInputs);
100  }
101 
102  KernelMax(stream, 1, blockSize, inputs, output, numInputs);
103 }
104 
105 void Reduction::MaxIndex(cudafloat * inputs, cudafloat * output, int * maxIndex, int numInputs, cudaStream_t stream) {
107 
108  int * indexes = nullptr;
109 
110  if (numInputs > SIZE_SMALL_CUDA_VECTOR) {
111  int blocks = NumberBlocks(numInputs, blockSize);
112 
113  int minSizeBuffer = blocks + (int) ceil(blocks * (sizeof(int) / (float) sizeof(float)));
114  if (temporaryBuffer.Length() < minSizeBuffer) temporaryBuffer.ResizeWithoutPreservingData(minSizeBuffer);
115 
116  indexes = (int *)(temporaryBuffer.Pointer() + blocks);
117 
118  KernelMaxIndexes(stream, blocks, blockSize, inputs, temporaryBuffer.Pointer(), indexes, numInputs, nullptr);
119 
120  inputs = temporaryBuffer.Pointer();
121  numInputs = blocks;
122 
123  blockSize = NumberThreadsPerBlockThatBestFit(numInputs);
124  }
125 
126  KernelMaxIndexes(stream, 1, blockSize, inputs, output, maxIndex, numInputs, indexes);
127 }
128 
129 }
void KernelMinIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int *minIndexes, int numInputs, int *indexes)
Definition: MinKernel.cu:432
int NumberThreadsPerBlockThatBestFit(int threads, int maxThreadsPerBlock=MAX_THREADS_PER_BLOCK)
Definition: Utilities.h:37
Type * Pointer() const
Definition: BaseArray.h:70
void KernelMin(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs)
Definition: MinKernel.cu:352
int ResizeWithoutPreservingData(int size)
Definition: BaseArray.h:77
#define SIZE_SMALL_CUDA_VECTOR
int NumberBlocks(int threads, int blockSize)
Definition: Utilities.h:49
#define OPTIMAL_BLOCK_SIZE_REDUCTION
void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs, cudafloat multiplyFactor)
Definition: SumKernel.cu:102
int Length() const
Definition: BaseArray.h:63
void KernelMaxIndexes(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int *maxIndexes, int numInputs, int *indexes)
Definition: MaxKernel.cu:431
void KernelSum(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *outputs, int numInputs)
Definition: SumKernel.cu:45
void KernelMax(cudaStream_t stream, int blocks, int blockSize, cudafloat *inputs, cudafloat *output, int numInputs)
Definition: MaxKernel.cu:351
float cudafloat
static DeviceArray< cudafloat > temporaryBuffer
Temporary buffer used for the reduction tasks. Programmers may take advantage of it for other tasks (...
Definition: reduction.h:113