# lf005835_7

Autor: Luiz Fernando Stein Wetzel — 28/09/2010

# Convolução com Núcleo Separável

```  1 // #include <string.h>
2 // #include <math.h>
3 #include <cuda.h>
4 #include "simple_arrays.h"
5
6
7 #ifdef FORCE_ALIGNMENT
8 #define PITCH pitch
9 #else
10 #define PITCH width
11 #endif
12
13 // Vertical Convolution: each block processes BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V image rows
14 #define BLOCK_WIDTH_V         16
15 #define BLOCK_HEIGHT_V        32
16 #define PIXELS_PER_THREAD_V   4
17 #define SMEM_WIDTH_V          BLOCK_WIDTH_V
18 #define SMEM_HEIGHT_V         240
19 #define MAX_KERNEL_DIM        (SMEM_HEIGHT_V - PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V + 1)
20 #define S_OFFSET_Y_V          ((MAX_KERNEL_DIM-1)/2)
21 #define S_MYPIXEL_X_V         threadIdx.x
22 #define Y0_V                  (blockIdx.y*BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V)
23 #define G_PIXEL_V(x, y)       ((y)*PITCH + (x))
24 #define IS_LAST_BLOCK_V       (Y0_V + BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V >= height) // fixed: BLOCK_HEIGHT_H was undefined; a V-block spans BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V rows
25 #define S_LAST_IN_BLOCK_V     (S_OFFSET_Y_V + min(PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V, height-Y0_V) - 1)
26 #define G_LAST_IN_BLOCK_V     (Y0_V + min(PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V, height-Y0_V) - 1)
27
28
29 // Block size - Horizontal
30 #define BLOCK_WIDTH_H         512
31 #define S_OFFSET_H            S_OFFSET_Y_V
32 #define SMEM_WIDTH_H          (BLOCK_WIDTH_H + 2*S_OFFSET_H)
33 #define SMEM_HEIGHT_H         1
34
35 #define X0_H                  (BLOCK_WIDTH_H * blockIdx.x)
36 #define X_H                   (X0_H + threadIdx.x)
37 #define Y_H                   blockIdx.y
38 #define G_PIXEL_H(x, y)       ((y)*PITCH + (x))
39 #define G_END_X_H             (Y_H*PITCH + width)
40 #define IS_LAST_BLOCK_H       (X0_H + BLOCK_WIDTH_H >= width)
41 #define S_LAST_IN_BLOCK_H     (S_OFFSET_H + min(BLOCK_WIDTH_H, width-X0_H) - 1)
42 #define TO_LAST_IN_BLOCK_H    (S_LAST_IN_BLOCK_H - S_OFFSET_H - threadIdx.x)
43
44
45 static void swap (float*& i1, float*& i2)  // exchange two device-buffer pointers (ping-pong between passes)
46 {
47     float* tmp = i1;
48     i1 = i2;
49     i2 = tmp;
50 }
51
52 __constant__ float Kd[MAX_KERNEL_DIM];  // kernel taps, set via cudaMemcpyToSymbol before each pass
53 __global__ void ConvHKernel(float* Id, float* Od, int width, int kernel_size)  // 1-D horizontal convolution of one row (blockIdx.y) with circular borders
54 {
55     __shared__ float S[SMEM_WIDTH_H];
56     const int S_MYPIXEL_H = (S_OFFSET_H + threadIdx.x);
57     const int G_MYPIXEL_H = G_PIXEL_H(X_H, Y_H);
58     int kr = (kernel_size - 1)/2; // kernel radius
59     float* Kc = &Kd[kr];          // kernel center
60
61     // Load pixel to shared memory (only in-range threads have a pixel of their own)
62     if(X_H < width) S[S_MYPIXEL_H] = Id[G_MYPIXEL_H];
63
64     // Load "borders" (wrap-around at the image edges)
65     if(threadIdx.x != 0 && threadIdx.x <= kr)
66     {
67         S[S_OFFSET_H - threadIdx.x] = Id[G_PIXEL_H((X0_H-threadIdx.x+width)%width, Y_H)];
68     }
69
70     if(TO_LAST_IN_BLOCK_H <= kr && TO_LAST_IN_BLOCK_H > 0)
71     {
72         S[S_MYPIXEL_H + 2*TO_LAST_IN_BLOCK_H] = Id[G_PIXEL_H((X_H + 2*TO_LAST_IN_BLOCK_H) % width, Y_H)];
73     }
74
75     __syncthreads();          // barrier must be reached by ALL threads of the block
76     if(X_H >= width) return;  // fixed: returning before the barrier made __syncthreads() divergent (UB)
77
78     // Execute convolution
79     // Move Od to current pixel
80     Od = &Od[G_MYPIXEL_H];
81     float* Is = &S[S_MYPIXEL_H];
82     float val = Is[0] * Kc[0];
83     for(int i=1; i<=kr; i++) {
84         val += Is[-i] * Kc[i];
85         val += Is[i] * Kc[-i];
86     }
87     // Save in global memory
88     *Od = val;
89 }
90
91 __global__ void ConvVKernel(float* Id, float* Od, int width, int height, int kernel_size)  // 1-D vertical convolution, PIXELS_PER_THREAD_V rows per thread, circular borders
92 {
93     __shared__ float S[SMEM_HEIGHT_V][SMEM_WIDTH_V];
94     const int X_V = (blockIdx.x*BLOCK_WIDTH_V + threadIdx.x);
95     const int Y_V = ((blockIdx.y*BLOCK_HEIGHT_V + threadIdx.y)*PIXELS_PER_THREAD_V);
96     const int S_MYPIXEL_Y_V = ((S_OFFSET_Y_V + threadIdx.y*PIXELS_PER_THREAD_V));
97     const int G_MYPIXEL_V = G_PIXEL_V(X_V, Y_V);
98
99     const bool in_x = (X_V < width); // fixed: do NOT return here — every thread must reach __syncthreads() below
100     int kr = (kernel_size - 1)/2; // kernel radius
101     float* Kc = &Kd[kr];          // kernel center
102
103     // Load pixels to shared memory (host pads rows to a multiple of PIXELS_PER_THREAD_V, so +1..+3 are safe)
104     if(in_x && Y_V < height) {
105         S[S_MYPIXEL_Y_V + 0][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 0*PITCH];
106         S[S_MYPIXEL_Y_V + 1][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 1*PITCH];
107         S[S_MYPIXEL_Y_V + 2][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 2*PITCH];
108         S[S_MYPIXEL_Y_V + 3][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 3*PITCH];
109     }
110
111     // Load "borders" (wrap-around). NOTE(review): only BLOCK_HEIGHT_V rows per side can be loaded
112     int tidef = threadIdx.y+1;    // here, so this appears to require kr <= BLOCK_HEIGHT_V — confirm for large kernels.
113     if(in_x && tidef <= kr)
114     {
115         S[S_OFFSET_Y_V - tidef][S_MYPIXEL_X_V] = Id[G_PIXEL_V(X_V, (Y0_V-tidef+height)%height)];
116     }
117
118     tidef = BLOCK_HEIGHT_V - threadIdx.y;
119     if(in_x && tidef <= kr)
120     {
121         S[S_LAST_IN_BLOCK_V+tidef][S_MYPIXEL_X_V] = Id[G_PIXEL_V(X_V, (G_LAST_IN_BLOCK_V+tidef) % height)];
122     }
123
124     __syncthreads();                       // fixed: barrier moved before the early-out so no thread skips it
125     if(!in_x || Y_V >= height) return;     // out-of-range threads leave only after the barrier
126
127     // Execute convolution
128     float val; int i;
129
130     #define CONV_V(IT)                                              \
131     val = S[S_MYPIXEL_Y_V + IT][S_MYPIXEL_X_V] * Kc[0];             \
132     for(i=1; i<=kr; i++) {                                          \
133         val += S[S_MYPIXEL_Y_V + IT - i][S_MYPIXEL_X_V] * Kc[i];    \
134         val += S[S_MYPIXEL_Y_V + IT + i][S_MYPIXEL_X_V] * Kc[-i]; } \
135     Od[G_MYPIXEL_V + IT * PITCH] = val;
136
137     CONV_V(0)
138     CONV_V(1)
139     CONV_V(2)
140     CONV_V(3)
141 }
142
143 Image32F* Conv2DSep(Image32F* Im, Image32F* kh, Image32F* kv)
144 {
145     // Validate input: 2-D image; kernels (when given) must be odd-sized and fit in constant memory
146     if(Im==NULL || Im->nd!=2 ||
147        (kh!=NULL && (kh->size > MAX_KERNEL_DIM || kh->size%2==0)) ||
148        (kv!=NULL && (kv->size > MAX_KERNEL_DIM || kv->size%2==0))) {
149            pyprintf("Invalid arguments\n");
150            return NULL;
151     }
152
153     // Alloc and copy Image to Device Memory
154
155     int height = Im->dims[0];
156     int width  = Im->dims[1];
157     // Allocate enough memory so the number of rows is a multiple of PIXELS_PER_THREAD_V
158     int aheight = ((height + PIXELS_PER_THREAD_V -1)/PIXELS_PER_THREAD_V)*PIXELS_PER_THREAD_V;
159     int size = Im->size * sizeof(float);
160     int asize = width*aheight*sizeof(float);
161
162     float *I1d = NULL, *I2d = NULL;   // NULL-init: cudaFree(NULL) is a no-op, so cleanup below is always safe
163     if(cudaSuccess != cudaMalloc((void**)&I1d, asize) ||
164        cudaSuccess != cudaMalloc((void**)&I2d, asize) ||
165        cudaSuccess != cudaMemcpy(I1d, Im->raster, size, cudaMemcpyHostToDevice) ||
166        cudaSuccess != cudaMemset((void*)I2d, 0, asize)) {  // fixed: compare against cudaSuccess like the other calls
167         pyprintf("Erro 1\n");
168         cudaFree(I1d); cudaFree(I2d);  // fixed: device buffers leaked on every early return
169         return NULL;
170     }
171     if(height != aheight ) {
172         // Zero the padding rows so the vertical pass reads defined data
173         if(cudaSuccess != cudaMemset((void*)&I1d[height*PITCH], 0, (aheight-height)*PITCH*sizeof(float)) ||
174            cudaSuccess != cudaMemset((void*)&I2d[height*PITCH], 0, (aheight-height)*PITCH*sizeof(float))) {
175             pyprintf("Erro 1.5\n");
176             cudaFree(I1d); cudaFree(I2d);
177             return NULL;
178         }
179     }
180
181     int ks, gx, gy;
182     float* Ii = I1d;  // current input buffer
183     float* Io = I2d;  // current output buffer (ping-pong via swap)
184     // Horizontal Convolution
185     if(kh != NULL && kh->size > 1) {
186         pyprintf("Executing horizontal convolution %d\n", kh->size);
187
188         ks = kh->size; // kernel size
189
190         // copy horizontal kernel to constant memory
191         if(cudaSuccess != cudaMemcpyToSymbol(Kd, kh->raster, ks*sizeof(float))) {
192             pyprintf("Erro 2\n");
193             cudaFree(I1d); cudaFree(I2d);
194             return NULL;
195         }
196
197         // Grid size: one grid row per image row
198         gx = (width+BLOCK_WIDTH_H-1)/BLOCK_WIDTH_H;
199         gy = height;
200
201         dim3 dimGrid(gx, gy);
202         dim3 dimBlock(BLOCK_WIDTH_H, 1);
203
204         // Execute Kernel (out-of-place, then swap the buffers)
205         ConvHKernel<<<dimGrid, dimBlock>>>(Ii, Io, Im->dims[1], ks);
206         swap(Ii, Io);
207     }
208     // Vertical Convolution
209     if(kv != NULL && kv->size > 1) {
210         pyprintf("Executing vertical convolution %d\n", kv->size);
211         ks = kv->size; // kernel size
212
213         // Grid size: each block covers BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V rows (see Y0_V)
214         gx = (width+BLOCK_WIDTH_V-1)/BLOCK_WIDTH_V;
215         gy = (height + BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V - 1)/(BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V); // fixed: dividing by BLOCK_HEIGHT_V alone launched 4x too many blocks, whose S_LAST_IN_BLOCK_V went negative (OOB shared writes)
216
217         pyprintf("gx=%d, gy=%d\n", gx, gy);
218         dim3 dimGrid(gx, gy);
219         dim3 dimBlock(BLOCK_WIDTH_V, BLOCK_HEIGHT_V);
220
221         // copy vertical kernel to constant memory
222         if(cudaSuccess != cudaMemcpyToSymbol(Kd, kv->raster, ks*sizeof(float))) {
223             pyprintf("Erro 3\n");
224             cudaFree(I1d); cudaFree(I2d);
225             return NULL;
226         }
227
228         // Execute Kernel (out-of-place, then swap the buffers)
229         ConvVKernel<<<dimGrid, dimBlock>>>(Ii, Io, Im->dims[1], Im->dims[0], ks);
230         swap(Ii, Io);
231     }
232
233     // Create output image (after the last swap, Ii holds the final result)
234     Image32F* ret = new Image32F(Im->nd, Im->dims);
235     if(cudaSuccess != cudaMemcpy(ret->raster, Ii, size, cudaMemcpyDeviceToHost)) {
236         delete ret;
237         pyprintf("Erro 4\n");
238         cudaFree(I1d); cudaFree(I2d);  // fixed: leaked on copy-back failure
239         return NULL;
240     }
241
242     cudaFree(I1d);
243     cudaFree(I2d);
244     return ret;
245 }
```
```OK
```
```1 Image32F* Conv2DSep(Image32F* Im, Image32F* kh, Image32F* kv);
```
```OK [C/C++ extension is up-to-date]
```

# Utilizando o módulo

``` 1 import time
2 import numpy as np
3 import lf005835_5 as conv
4 from ia636 import *
5
6 I0 = adreadgray("cameraman.tif").astype(np.float32)
7 adshow(ianormalize(I0))
8
9 k1 = np.array([0, 1, 0]).astype(np.float32)
10 k2 = np.array([1, 1, 1, 1, 1, 1, 1]).astype(np.float32)
11 I1 = Conv2DSep(I0, None, k1)
12 adshow(ianormalize(I1))
13 I2 = Conv2DSep(I0, None, k2)
14 adshow(ianormalize(I2))
15 I3 = iapconv(I0, k1)
16 adshow(ianormalize(I3))
17
18 adshow(ianormalize(I3-I1))
```