GPUMLib  0.2.2
GPU Machine Learning Library
SumWarp.h
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonça Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef GPUMLib_SumWarp_h
22 #define GPUMLib_SumWarp_h
23 
24 #include "../common/CudaDefinitions.h"
25 
26 namespace GPUMLib {
27 
//! Block-wide part of a tree sum: repeatedly folds the upper half of the
//! active range of \c s onto the lower half until at most 64 partial sums
//! remain in s[0..63].
//!
//! For each offset in {512, 256, 128, 64} whose step applies
//! (blockSize >= 2 * offset), threads with threadIdx.x < offset add
//! s[threadIdx.x + offset] into s[threadIdx.x], and then all threads of the
//! block meet at a __syncthreads() barrier. When blockSize < 128 the function
//! does nothing.
//!
//! \tparam blockSize Compile-time block size selecting which folding steps
//!         are generated (presumably a power of two equal to blockDim.x;
//!         confirm against the callers).
//! \param s Array holding one partial value per thread. It must be visible to
//!        every thread of the block (presumably __shared__ memory).
//! \attention Must be called by all threads of the block, because it contains
//!            __syncthreads(). No barrier is issued before the first read, so
//!            the caller has to synchronise after filling \c s.
template <int blockSize> __device__ __forceinline__ void SumBeforeWarp(cudafloat * s) {
    const unsigned int tid = threadIdx.x;

    // blockSize is a template constant, so this loop unrolls completely and
    // the steps that do not apply are removed at compile time.
    #pragma unroll
    for (int offset = 512; offset >= 64; offset >>= 1) {
        if (blockSize >= 2 * offset) {
            if (tid < offset) s[tid] += s[tid + offset];

            // Every thread reaches this barrier: the enclosing condition
            // depends only on compile-time values, never on the thread index.
            __syncthreads();
        }
    }
}
49 
//! Warp-level barrier (and memory ordering point) used between the steps of
//! SumWarp. Expands to __syncwarp() on toolkits that provide it (CUDA 9.0 or
//! later) and to nothing on older toolkits, which cannot target devices with
//! independent thread scheduling anyway.
__device__ __forceinline__ void SumWarpBarrier() {
    #if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
    __syncwarp();
    #endif
}

//! Final, single-warp part of a tree sum: folds up to 64 partial sums stored
//! in \c s so that the total ends up in s[0].
//!
//! For each offset in {32, 16, 8, 4, 2, 1} whose step applies
//! (blockSize >= 2 * offset), every calling thread adds
//! s[threadIdx.x + offset] to its own element s[threadIdx.x]. For a
//! power-of-two blockSize, s[0] finally holds the sum of the first
//! min(blockSize, 64) elements. Elements other than s[0] are overwritten with
//! intermediate values.
//!
//! Earlier versions relied on the lanes of a warp executing in lockstep.
//! That does not hold on devices with independent thread scheduling (compute
//! capability 7.0 and above), where a lane could overwrite its element before
//! a neighbour had read it. Each step therefore reads into a register, waits
//! for the whole warp, writes back, and waits again.
//!
//! \tparam blockSize Compile-time block size selecting which folding steps
//!         are generated (presumably a power of two; confirm against the
//!         callers).
//! \param s Array of partial sums (presumably __shared__ memory). Each
//!        calling thread reads s[threadIdx.x + offset] for every applicable
//!        offset, so the array must be readable at those indices.
//! \attention Intended to be called by the threads of a single warp
//!            (presumably those with threadIdx.x < 32). On CUDA 9.0+ every
//!            non-exited thread of that warp must make the call, since
//!            __syncwarp() is used with the default full mask.
template <int blockSize> __device__ __forceinline__ void SumWarp(volatile cudafloat * s) {
    // Nothing to fold for a single element; also avoids touching s at all,
    // exactly like the original code.
    if (blockSize < 2) return;

    const unsigned int lane = threadIdx.x;

    // Only this thread ever writes s[lane], so the running sum can live in a
    // register and always mirrors the value stored in s[lane].
    cudafloat sum = s[lane];

    // blockSize is a template constant, so this loop unrolls completely and
    // the steps that do not apply are removed at compile time.
    #pragma unroll
    for (int offset = 32; offset >= 1; offset >>= 1) {
        if (blockSize >= 2 * offset) {
            sum += s[lane + offset];

            // All lanes must have read their operand before any lane
            // overwrites it.
            SumWarpBarrier();

            s[lane] = sum;

            // The new values must be visible before the next step reads them.
            SumWarpBarrier();
        }
    }
}
58 
59 }
60 
61 #endif
float cudafloat