// GPUMLib 0.2.2 — GPU Machine Learning Library
// NMFadditiveDivergenceKernels.cu
1 /*
2  Noel Lopes is an Assistant Professor at the Polytechnic Institute of Guarda, Portugal
3  Copyright (C) 2009, 2010, 2011, 2012 Noel de Jesus Mendonša Lopes
4 
5  This file is part of GPUMLib.
6 
7  GPUMLib is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "NMFkernels.h"
22 
23 namespace GPUMLib {
24 
27 
28 #ifdef ROW_MAJOR_H
29  #define HMATRIX(_ROW, _COL, _R, _M) (H[(_ROW) * (_M) + (_COL)])
30  #define IDX_HMATRIX(_ROW, _COL, _R, _M) ((_ROW) * (_M) + (_COL))
31 #else
32  #define HMATRIX(_ROW, _COL, _R, _M) (H[(_COL) * (_R) + (_ROW)])
33  #define IDX_HMATRIX(_ROW, _COL, _R, _M) ((_COL) * (_R) + (_ROW))
34 #endif
35 
36 #ifdef ROW_MAJOR_W
37  #define WMATRIX(_ROW, _COL, _N, _R) (W[(_ROW) * (_R) + (_COL)])
38  #define IDX_WMATRIX(_ROW, _COL, _N, _R) ((_ROW) * (_R) + (_COL))
39 #else
40  #define WMATRIX(_ROW, _COL, _N, _R) (W[(_COL) * (_N) + (_ROW)])
41  #define IDX_WMATRIX(_ROW, _COL, _N, _R) ((_COL) * (_N) + (_ROW))
42 #endif
43 
44 //#define SW(_R, _C) sw[(_R)][(_C)]
45 #define SW(_R, _C) (sw[(_C)][(_R)])
46 
47 #define SVH(_R, _C) svh[(_R)][(_C)]
48 //#define SVH(_R, _C) (svh[(_C)][(_R)])
49 
50 //#define SH(_R, _C) sh[(_R)][(_C)]
51 #define SH(_R, _C) sh[(_C)][(_R)]
52 
53 #define SVW(_R, _C) svw[(_R)][(_C)]
54 //#define SVW(_R, _C) svw[(_C)][(_R)]
55 
56 KERNEL UpdateW_AD(cudafloat * W, cudafloat * H, cudafloat * V, cudafloat * WH, cudafloat * sumH, int n, int m, int r) {
57  __shared__ cudafloat SH(32, 32);
58  __shared__ cudafloat SVW(32, 32);
59 
60  int x = blockIdx.x * 32 + threadIdx.x;
61  int y = blockIdx.y * 32 + threadIdx.y;
62 
63  cudafloat sum1 = CUDA_VALUE(0.0);
64  //cudafloat sumH1 = CUDA_VALUE(0.0);
65  cudafloat sum2 = CUDA_VALUE(0.0);
66  //cudafloat sumH2 = CUDA_VALUE(0.0);
67 
68  for(int k = 0; k < m; k += 32) {
69  int tx = threadIdx.x + 16;
70 
71  if (x < r && threadIdx.y + k < m) {
72  int ky = k + threadIdx.y;
73  SH(threadIdx.x, threadIdx.y) = HMATRIX(x, ky, r, m);
74  SH(tx, threadIdx.y) = (x + 16 < r) ? HMATRIX(x + 16, ky, r, m) : CUDA_VALUE(0.0);
75  } else {
76  SH(threadIdx.x, threadIdx.y) = CUDA_VALUE(0.0);
77  SH(tx, threadIdx.y) = CUDA_VALUE(0.0);
78  }
79 
80  if (y < n && k + threadIdx.x < m) {
81  int idx = (k + threadIdx.x) * n + y;
82  SVW(threadIdx.y, threadIdx.x) = (V[idx] / (WH[idx] + SMALL_VALUE_TO_ADD_DENOMINATOR));
83 
84  idx += (n << 4);
85  SVW(threadIdx.y, tx) = (k + tx < m) ? (V[idx] / (WH[idx] + SMALL_VALUE_TO_ADD_DENOMINATOR)) : CUDA_VALUE(0.0);
86  } else {
87  SVW(threadIdx.y, threadIdx.x) = CUDA_VALUE(0.0);
88  SVW(threadIdx.y, tx) = CUDA_VALUE(0.0);
89  }
90  __syncthreads();
91 
92  for(int i = 0; i < 32; i++) {
93  cudafloat h1 = SH(threadIdx.x, i);
94  cudafloat h2 = SH(tx, i);
95 
96  //sumH1 += h1;
97  //sumH2 += h2;
98 
99  cudafloat vw = SVW(threadIdx.y, i) - CUDA_VALUE(1.0);
100 
101  sum1 += h1 * vw;
102  sum2 += h2 * vw;
103  }
104  __syncthreads();
105  }
106 
107  if (y < n && x < r) {
108  int idx = IDX_WMATRIX(y, x, n, r);
109  cudafloat sumH1 = sumH[x];
110  cudafloat v = W[idx] + (W[idx] / sumH1) * sum1;
111  if (v < CUDA_VALUE(0.0)) v = CUDA_VALUE(0.0);
112  W[idx] = v;
113 
114  x += 16;
115  if (x < r) {
116  idx = IDX_WMATRIX(y, x, n, r);
117  cudafloat sumH2 = sumH[x];
118  v = W[idx] + (W[idx] / sumH2) * sum2;
119  if (v < CUDA_VALUE(0.0)) v = CUDA_VALUE(0.0);
120  W[idx] = v;
121  }
122  }
123 }
124 
125 KERNEL UpdateH_AD(cudafloat * H, cudafloat * W, cudafloat * V, cudafloat * WH, cudafloat * sumW, int n, int m, int r) {
126  __shared__ cudafloat SW(32, 32);
127  __shared__ cudafloat SVH(32, 32);
128 
129  int x = blockIdx.x * 32 + threadIdx.x;
130  int y = blockIdx.y * 32 + threadIdx.y;
131 
132  cudafloat sum1 = CUDA_VALUE(0.0);
133  //cudafloat sumW1 = CUDA_VALUE(0.0);
134  cudafloat sum2 = CUDA_VALUE(0.0);
135  //cudafloat sumW2 = CUDA_VALUE(0.0);
136 
137  for(int k = 0; k < n; k += 32) {
138  int ty = threadIdx.y + 16;
139 
140  if (y < r && k + threadIdx.x < n) {
141  int kx = k + threadIdx.x;
142  SW(threadIdx.x, threadIdx.y) = WMATRIX(kx, y, n, r);
143  SW(threadIdx.x, ty) = (y + 16 < r) ? WMATRIX(kx, y + 16, n, r) : CUDA_VALUE(0.0);
144  } else {
145  SW(threadIdx.x, threadIdx.y) = CUDA_VALUE(0.0);
146  SW(threadIdx.x, ty) = CUDA_VALUE(0.0);
147  }
148 
149  if (x < m && k + threadIdx.y < n) {
150  int idx = x * n + (k + threadIdx.y);
151  SVH(threadIdx.y, threadIdx.x) = V[idx] / (WH[idx] + SMALL_VALUE_TO_ADD_DENOMINATOR);
152 
153  idx += 16;
154  SVH(ty, threadIdx.x) = (k + ty < n) ? (V[idx] / (WH[idx] + SMALL_VALUE_TO_ADD_DENOMINATOR)) : CUDA_VALUE(0.0);
155  } else {
156  SVH(threadIdx.y, threadIdx.x) = CUDA_VALUE(0.0);
157  SVH(ty, threadIdx.x) = CUDA_VALUE(0.0);
158  }
159  __syncthreads();
160 
161  for(int i = 0; i < 32; i++) {
162  cudafloat w1 = SW(i, threadIdx.y);
163  cudafloat w2 = SW(i, ty);
164 
165  //sumW1 += w1;
166  //sumW2 += w2;
167 
168  cudafloat vw = SVH(i, threadIdx.x) - CUDA_VALUE(1.0);
169 
170  sum1 += w1 * vw;
171  sum2 += w2 * vw;
172  }
173  __syncthreads();
174  }
175 
176  if (y < r && x < m) {
177  int idx = IDX_HMATRIX(y, x, r, m);
178  cudafloat sumW1 = sumW[y];
179  cudafloat v = H[idx] + (H[idx] / sumW1) * sum1;
180  if (v < CUDA_VALUE(0.0)) v = CUDA_VALUE(0.0);
181  H[idx] = v;
182 
183  y+=16;
184  if (y < r) {
185  idx = IDX_HMATRIX(y, x, r, m);
186  cudafloat sumW2 = sumW[y];
187  v = H[idx] + (H[idx] / sumW2) * sum2;
188  if (v < CUDA_VALUE(0.0)) v = CUDA_VALUE(0.0);
189  H[idx] = v;
190  }
191  }
192 }
193 
195 
196 }
/* Cross-reference notes (recovered documentation-extraction residue, from NMFkernels.h):
   - KERNEL: defines the type of a kernel function.
   - cudafloat: float; CUDA_VALUE(X): precision-abstraction literal helper.
   - SMALL_VALUE_TO_ADD_DENOMINATOR (NMFkernels.h:32): small value added to the
     denominator of a fraction to prevent division by zero.
   Kernels defined in this file:
   - KERNEL UpdateW_AD(cudafloat *W, cudafloat *H, cudafloat *V, cudafloat *WH, cudafloat *sumH, int n, int m, int r)
   - KERNEL UpdateH_AD(cudafloat *H, cudafloat *W, cudafloat *V, cudafloat *WH, cudafloat *sumW, int n, int m, int r) */