lf005835_7

Autor: Luiz Fernando Stein Wetzel
Data: 28/09/2010

Convolução com Núcleo Separável

  1 // #include <string.h>
  2 // #include <math.h>
  3 #include <cuda.h>
  4 #include "simple_arrays.h"
  5 
  6 
  7 #ifdef FORCE_ALIGNMENT
  8 #define PITCH pitch
  9 #else
 10 #define PITCH width
 11 #endif
 12 
 13 // Vertical Convolution
 14 #define BLOCK_WIDTH_V         16
 15 #define BLOCK_HEIGHT_V        32
 16 #define PIXELS_PER_THREAD_V   4
 17 #define SMEM_WIDTH_V          BLOCK_WIDTH_V
 18 #define SMEM_HEIGHT_V         240
 19 #define MAX_KERNEL_DIM        (SMEM_HEIGHT_V - PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V + 1)
 20 #define S_OFFSET_Y_V          ((MAX_KERNEL_DIM-1)/2)
 21 #define S_MYPIXEL_X_V         threadIdx.x
 22 #define Y0_V                  (blockIdx.y*BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V)
 23 #define G_PIXEL_V(x, y)       ((y)*PITCH + (x))
 24 #define IS_LAST_BLOCK_V       (Y0_V + BLOCK_HEIGHT_H >= height)
 25 #define S_LAST_IN_BLOCK_V     (S_OFFSET_Y_V + min(PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V, height-Y0_V) - 1)
 26 #define G_LAST_IN_BLOCK_V     (Y0_V + min(PIXELS_PER_THREAD_V*BLOCK_HEIGHT_V, height-Y0_V) - 1)
 27 
 28 
 29 // Block size - Horizontal
 30 #define BLOCK_WIDTH_H         512
 31 #define S_OFFSET_H            S_OFFSET_Y_V
 32 #define SMEM_WIDTH_H          (BLOCK_WIDTH_H + 2*S_OFFSET_H)
 33 #define SMEM_HEIGHT_H         1
 34 
 35 #define X0_H                  (BLOCK_WIDTH_H * blockIdx.x)
 36 #define X_H                   (X0_H + threadIdx.x)
 37 #define Y_H                   blockIdx.y
 38 #define G_PIXEL_H(x, y)       ((y)*PITCH + (x))
 39 #define G_END_X_H             (Y_H*PITCH + width)
 40 #define IS_LAST_BLOCK_H       (X0_H + BLOCK_WIDTH_H >= width)
 41 #define S_LAST_IN_BLOCK_H     (S_OFFSET_H + min(BLOCK_WIDTH_H, width-X0_H) - 1)
 42 #define TO_LAST_IN_BLOCK_H    (S_LAST_IN_BLOCK_H - S_OFFSET_H - threadIdx.x)
 43 
 44 
 45 static void swap (float*& i1, float*& i2)
 46 {
 47     float* x = i1;
 48     i1 = i2;
 49     i2 = x;
 50 }
 51 
 52 __constant__ float Kd[MAX_KERNEL_DIM];
 53 __global__ void ConvHKernel(float* Id, float* Od, int width, int kernel_size)
 54 {
 55     __shared__ float S[SMEM_WIDTH_H];
 56      const int S_MYPIXEL_H = (S_OFFSET_H + threadIdx.x);
 57      const int G_MYPIXEL_H = G_PIXEL_H(X_H, Y_H);
 58     int kr = (kernel_size - 1)/2; // kernel radius
 59     float* Kc = &Kd[kr];          // kernel center
 60 
 61     // Load pixel to shared memory
 62     if(X_H < width) S[S_MYPIXEL_H] = Id[G_MYPIXEL_H];
 63 
 64     // Load "borders"
 65     if(threadIdx.x != 0 && threadIdx.x <= kr)
 66     {
 67         S[S_OFFSET_H - threadIdx.x] = Id[G_PIXEL_H((X0_H-threadIdx.x+width)%width, Y_H)];
 68     }
 69 
 70     if(TO_LAST_IN_BLOCK_H <= kr && TO_LAST_IN_BLOCK_H > 0)
 71     {
 72         S[S_MYPIXEL_H + 2*TO_LAST_IN_BLOCK_H] = Id[G_PIXEL_H((X_H + 2*TO_LAST_IN_BLOCK_H) % width, Y_H)];
 73     }
 74 
 75     if(X_H >= width) return;
 76     __syncthreads();
 77 
 78     // Execute convolution
 79     // Move Od to current pixel
 80     Od = &Od[G_MYPIXEL_H];
 81     float* Is = &S[S_MYPIXEL_H];
 82     float val = Is[0] * Kc[0];
 83     for(int i=1; i<=kr; i++) {
 84         val += Is[-i] * Kc[i];
 85         val += Is[i] * Kc[-i];
 86     }
 87     // Save in global memory
 88     *Od = val;
 89  }
 90 
 91 __global__ void ConvVKernel(float* Id, float* Od, int width, int height, int kernel_size)
 92 {
 93     __shared__ float S[SMEM_HEIGHT_V][SMEM_WIDTH_V];
 94     const int X_V = (blockIdx.x*BLOCK_WIDTH_V + threadIdx.x);
 95     const int Y_V = ((blockIdx.y*BLOCK_HEIGHT_V + threadIdx.y)*PIXELS_PER_THREAD_V);
 96     const int S_MYPIXEL_Y_V = ((S_OFFSET_Y_V + threadIdx.y*PIXELS_PER_THREAD_V));
 97     const int G_MYPIXEL_V = G_PIXEL_V(X_V, Y_V);
 98 
 99     if(X_V >= width) return;
100     int kr = (kernel_size - 1)/2; // kernel radius
101     float* Kc = &Kd[kr];          // kernel center
102 
103     // Load pixel to shared memory
104     if(Y_V < height) {
105         S[S_MYPIXEL_Y_V + 0][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 0*PITCH];
106         S[S_MYPIXEL_Y_V + 1][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 1*PITCH];
107         S[S_MYPIXEL_Y_V + 2][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 2*PITCH];
108         S[S_MYPIXEL_Y_V + 3][S_MYPIXEL_X_V] = Id[G_MYPIXEL_V + 3*PITCH];
109     }
110 
111     // Load "borders"
112     int tidef = threadIdx.y+1;
113     if(tidef <= kr)
114     {
115         S[S_OFFSET_Y_V - tidef][S_MYPIXEL_X_V] = Id[G_PIXEL_V(X_V, (Y0_V-tidef+height)%height)];
116     }
117 
118     tidef = BLOCK_HEIGHT_V - threadIdx.y;
119     if(tidef <= kr)
120     {
121         S[S_LAST_IN_BLOCK_V+tidef][S_MYPIXEL_X_V] = Id[G_PIXEL_V(X_V, (G_LAST_IN_BLOCK_V+tidef) % height)];
122     }
123 
124     if(Y_V >= height) return;
125     __syncthreads();
126 
127     // Execute convolution
128      float val; int i;
129 
130     #define CONV_V(IT)                                              \
131     val = S[S_MYPIXEL_Y_V + IT][S_MYPIXEL_X_V] * Kc[0];             \
132     for(i=1; i<=kr; i++) {                                          \
133         val += S[S_MYPIXEL_Y_V + IT - i][S_MYPIXEL_X_V] * Kc[i];    \
134         val += S[S_MYPIXEL_Y_V + IT + i][S_MYPIXEL_X_V] * Kc[-i]; } \
135     Od[G_MYPIXEL_V + IT * PITCH] = val;
136 
137     CONV_V(0)
138     CONV_V(1)
139     CONV_V(2)
140     CONV_V(3)
141 }
142 
// 2-D separable convolution on the GPU: an optional horizontal pass
// with kernel kh followed by an optional vertical pass with kernel kv.
// Each kernel must have odd size <= MAX_KERNEL_DIM; pass NULL (or a
// size-1 kernel) to skip a pass. Borders wrap (periodic). Returns a
// newly allocated output image, or NULL on error.
Image32F* Conv2DSep(Image32F* Im, Image32F* kh, Image32F* kv)
{
    // Validate arguments.
    if(Im==NULL || Im->nd!=2 ||
       (kh!=NULL && (kh->size > MAX_KERNEL_DIM || kh->size%2==0)) ||
       (kv!=NULL && (kv->size > MAX_KERNEL_DIM || kv->size%2==0))) {
           pyprintf("Invalid arguments\n");
           return NULL;
    }

    int height = Im->dims[0];
    int width  = Im->dims[1];
    // Pad the row count up to a multiple of PIXELS_PER_THREAD_V: the
    // vertical kernel loads 4 consecutive rows per thread.
    int aheight = ((height + PIXELS_PER_THREAD_V -1)/PIXELS_PER_THREAD_V)*PIXELS_PER_THREAD_V;
    int size = Im->size * sizeof(float);
    int asize = width*aheight*sizeof(float);

    // Allocate the device ping-pong buffers and upload the image.
    // Fixed: every error path now releases device memory (the original
    // leaked I1d/I2d on all failures); the cudaMemset check is also
    // written consistently with the other calls.
    float *I1d = NULL, *I2d = NULL;
    if(cudaSuccess != cudaMalloc((void**)&I1d, asize) ||
       cudaSuccess != cudaMalloc((void**)&I2d, asize) ||
       cudaSuccess != cudaMemcpy(I1d, Im->raster, size, cudaMemcpyHostToDevice) ||
       cudaSuccess != cudaMemset((void*)I2d, 0, asize)) {
        pyprintf("Erro 1\n");
        cudaFree(I1d); cudaFree(I2d);
        return NULL;
    }
    if(height != aheight ) {
        // Zero the padding rows in both buffers so the extra loads in
        // the vertical pass read defined data.
        if(cudaSuccess != cudaMemset((void*)&I1d[height*PITCH], 0, (aheight-height)*PITCH*sizeof(float)) ||
           cudaSuccess != cudaMemset((void*)&I2d[height*PITCH], 0, (aheight-height)*PITCH*sizeof(float))) {
            pyprintf("Erro 1.5\n");
            cudaFree(I1d); cudaFree(I2d);
            return NULL;
        }
    }

    int ks, gx, gy;
    float* Ii = I1d;  // current pass input
    float* Io = I2d;  // current pass output

    // Horizontal convolution
    if(kh != NULL && kh->size > 1) {
        pyprintf("Executing horizontal convolution %d\n", kh->size);

        ks = kh->size; // kernel size

        // Upload the horizontal kernel to constant memory.
        if(cudaSuccess != cudaMemcpyToSymbol(Kd, kh->raster, ks*sizeof(float))) {
            pyprintf("Erro 2\n");
            cudaFree(I1d); cudaFree(I2d);
            return NULL;
        }

        // Grid size: one block per BLOCK_WIDTH_H columns, one row each.
        gx = (width+BLOCK_WIDTH_H-1)/BLOCK_WIDTH_H;
        gy = height;

        dim3 dimGrid(gx, gy);
        dim3 dimBlock(BLOCK_WIDTH_H, 1);

        ConvHKernel<<<dimGrid, dimBlock>>>(Ii, Io, Im->dims[1], ks);
        // Catch launch-configuration errors immediately.
        if(cudaSuccess != cudaGetLastError()) {
            pyprintf("Erro 2.5\n");
            cudaFree(I1d); cudaFree(I2d);
            return NULL;
        }
        swap(Ii, Io);
    }

    // Vertical convolution
    if(kv != NULL && kv->size > 1) {
        pyprintf("Executing vertical convolution %d\n", kv->size);
        ks = kv->size; // kernel size

        // Grid size. Fixed: each block outputs
        // BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V rows; the original divided
        // by BLOCK_HEIGHT_V only, launching 4x too many y-blocks whose
        // halo stores could index the shared tile with negative offsets.
        gx = (width+BLOCK_WIDTH_V-1)/BLOCK_WIDTH_V;
        gy = (height + BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V - 1)/(BLOCK_HEIGHT_V*PIXELS_PER_THREAD_V);

        pyprintf("gx=%d, gy=%d\n", gx, gy);
        dim3 dimGrid(gx, gy);
        dim3 dimBlock(BLOCK_WIDTH_V, BLOCK_HEIGHT_V);

        // Upload the vertical kernel to constant memory.
        if(cudaSuccess != cudaMemcpyToSymbol(Kd, kv->raster, ks*sizeof(float))) {
            pyprintf("Erro 3\n");
            cudaFree(I1d); cudaFree(I2d);
            return NULL;
        }

        ConvVKernel<<<dimGrid, dimBlock>>>(Ii, Io, Im->dims[1], Im->dims[0], ks);
        if(cudaSuccess != cudaGetLastError()) {
            pyprintf("Erro 3.5\n");
            cudaFree(I1d); cudaFree(I2d);
            return NULL;
        }
        swap(Ii, Io);
    }

    // Create the output image and copy the final buffer back.
    Image32F* ret = new Image32F(Im->nd, Im->dims);
    if(cudaSuccess != cudaMemcpy(ret->raster, Ii, size, cudaMemcpyDeviceToHost)) {
        delete ret;
        pyprintf("Erro 4\n");
        cudaFree(I1d); cudaFree(I2d);
        return NULL;
    }

    cudaFree(I1d);
    cudaFree(I2d);
    return ret;
}
OK
1 Image32F* Conv2DSep(Image32F* Im, Image32F* kh, Image32F* kv);
OK [C/C++ extension is up-to-date]

Utilizando o módulo

 1 import time
 2 import numpy as np
 3 import lf005835_5 as conv
 4 from ia636 import *
 5 
 6 I0 = adreadgray("cameraman.tif").astype(np.float32)
 7 adshow(ianormalize(I0))
 8 
 9 k1 = np.array([0, 1, 0]).astype(np.float32)
10 k2 = np.array([1, 1, 1, 1, 1, 1, 1]).astype(np.float32)
11 I1 = Conv2DSep(I0, None, k1)
12 adshow(ianormalize(I1))
13 I2 = Conv2DSep(I0, None, k2)
14 adshow(ianormalize(I2))
15 I3 = iapconv(I0, k1)
16 adshow(ianormalize(I3))
17 
18 adshow(ianormalize(I3-I1))