CA-Algorithm Source Code

  1 #include <cuda_runtime.h>
  2 
  3 #include <time.h>
  4 
  5 #include <stdio.h>
  6 #include <stdlib.h>
  7 #include <string.h>
  8 #include <sys/time.h>
  9 
 10 #include "simple_arrays.h"
 11 
 12 
 13 #define IMUL(a, b) __mul24(a, b)
 14 
 15 ////////////////////////////////////////////////////////////////////////////////
 16 // Kernel configuration
 17 ////////////////////////////////////////////////////////////////////////////////
 18 #define BLOCK_SIZE_2D       16
 19 #define REAL_BLOCK          14
 20 
 21 //------------------------------------------------------------------------------
 22 // CUDA error check and print
 23 //------------------------------------------------------------------------------
 24 void checkCUDAError(const char *msg);
 25 
 26 //------------------------------------------------------------------------------
 27 // Texture image
 28 //------------------------------------------------------------------------------
 29 texture<unsigned char, 2, cudaReadModeElementType> f_tex;
 30 texture<int, 2, cudaReadModeElementType> mins_tex;
 31 
 32 //------------------------------------------------------------------------------
 33 // Constant memory for neigh
 34 //------------------------------------------------------------------------------
 35 __constant__ int c_neigh[16];
 36 
 37 //------------------------------------------------------------------------------
 38 // Shared memory for image
 39 //------------------------------------------------------------------------------
 40 __shared__ unsigned short s_img[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
 41 __shared__ unsigned short s_lambda[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
 42 __shared__ unsigned short s_label[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
 43 
 44 //------------------------------------------------------------------------------
 45 // Round up the division a/b
 46 //------------------------------------------------------------------------------
 47 int iDivUp(int a, int b){
 48     return (a % b != 0) ? (a / b + 1) : (a / b);
 49 }
 50 
 51 //------------------------------------------------------------------------------
 52 // Generate neighborhood offsets
 53 //------------------------------------------------------------------------------
 54 void neighvector(int conn)
 55 {
 56     int neigh[16];
 57     int i = 0;
 58     switch(conn) {
 59         case 4:
 60             // for Y
 61             neigh[i++] = -1;
 62             neigh[i++] =  0;
 63             neigh[i++] =  0;
 64             neigh[i++] =  1;
 65             // for X
 66             neigh[i++] =  0;
 67             neigh[i++] = -1;
 68             neigh[i++] =  1;
 69             neigh[i++] =  0;
 70             break;
 71         case 8:
 72             // for Y
 73             neigh[i++] = -1;
 74             neigh[i++] = -1;
 75             neigh[i++] = -1;
 76             neigh[i++] =  0;
 77             neigh[i++] =  0;
 78             neigh[i++] =  1;
 79             neigh[i++] =  1;
 80             neigh[i++] =  1;
 81             // for X
 82             neigh[i++] = -1;
 83             neigh[i++] =  0;
 84             neigh[i++] =  1;
 85             neigh[i++] = -1;
 86             neigh[i++] =  1;
 87             neigh[i++] = -1;
 88             neigh[i++] =  0;
 89             neigh[i++] =  1;
 90             break;
 91     }
 92 
 93     cudaMemcpyToSymbol(c_neigh, neigh, 16*sizeof(int), 0, cudaMemcpyHostToDevice);
 94 }
 95 
 96 
 97 __device__ float cost(int fp, int fq, int px, int py, int qx, int qy)
 98 {
 99 
100     if (fp > fq)
101     {
102         int minv = tex2D(mins_tex, px, py);
103         return fp - minv;
104     }
105     else if (fq > fp)
106     {
107         int minv = tex2D(mins_tex, qx, qy);
108         return fq - minv;
109     }
110     else
111     {
112         int minv = tex2D(mins_tex, px, py);
113         int minv2 = tex2D(mins_tex, qx, qy);
114         return ((fp - minv)+(fq - minv2))/2.0;
115     }
116 
117 }
118 
119 //------------------------------------------------------------------------------
120 // Perform iterations of Ford-Bellmann inside block
121 //------------------------------------------------------------------------------
122 __global__ void cuFordBellmann(int *label, int *label_next, int *lambda, int *lambda_next, int *flag, int w, int h, int conn)
123 {
124     const int px = REAL_BLOCK*blockIdx.x + threadIdx.x - 1;
125     const int py = REAL_BLOCK*blockIdx.y + threadIdx.y - 1;
126     const int idp = py*w + px;
127     int count = 0;
128     __shared__ int bChanged;
129 
130     if (px < 0 || py < 0 || px >= w || py >= h)
131     {
132         s_img[threadIdx.y][threadIdx.x] = 0xFFFF;
133         s_lambda[threadIdx.y][threadIdx.x] = 0;
134         s_label[threadIdx.y][threadIdx.x] = 0;
135     }
136     else
137     {
138         s_img[threadIdx.y][threadIdx.x] = tex2D(f_tex,(float)px,(float)py);
139         s_lambda[threadIdx.y][threadIdx.x] = lambda[idp];
140         s_label[threadIdx.y][threadIdx.x] = label[idp];
141     }
142 
143     bChanged = 1;
144 
145     __syncthreads();
146 
147     if (px < 0 || py < 0 || px >= w || py >= h ||
148         threadIdx.x == 0 || threadIdx.x >= BLOCK_SIZE_2D - 1 ||
149         threadIdx.y == 0 || threadIdx.y >= BLOCK_SIZE_2D - 1)
150         return;
151 
152 
153     while (bChanged && count < 28)
154     {
155         bChanged = 0;
156 
157         count++;
158 
159         int fp, fq;
160         int u = 0x00FFFFFF;
161         int ux, uy;
162         int lambdap = s_lambda[threadIdx.y][threadIdx.x];
163         int idu = -1;
164 
165         fp = s_img[threadIdx.y][threadIdx.x];
166 
167         for(int pos = 0; pos < conn ; pos++)
168         {
169             int qx = px + c_neigh[pos+conn];
170             int qy = py + c_neigh[pos];
171             int sqx = threadIdx.x + c_neigh[pos+conn];
172             int sqy = threadIdx.y + c_neigh[pos];
173 
174             if (qx >= 0 && qy >= 0 && qx < w && qy < h)
175             {
176                 int lambdaq = s_lambda[sqy][sqx];
177                 fq = s_img[sqy][sqx];
178 
179                 float c = cost(fp,
180                                fq,
181                                px,
182                                py,
183                                qx,
184                                qy);
185 
186                 if (lambdaq + c < u)
187                 {
188                     u = lambdaq + c;
189                     ux = sqx;
190                     uy = sqy;
191                     idu = 1;
192                 }
193             }
194         }
195 
196         int ulabel = 0;
197         if (idu >= 0)
198             ulabel = s_label[uy][ux];
199 
200         __syncthreads();
201 
202         if (idu >= 0 && u < lambdap)
203         {
204             s_lambda[threadIdx.y][threadIdx.x] = u;
205             s_label[threadIdx.y][threadIdx.x] = ulabel;
206             *flag += 1;
207             bChanged = 1;
208         }
209 
210         __syncthreads();
211 
212     }
213 
214     lambda_next[idp] = s_lambda[threadIdx.y][threadIdx.x];
215     label_next[idp] = s_label[threadIdx.y][threadIdx.x];
216 }
217 
218 //------------------------------------------------------------------------------
219 // Initialize the lambda memory on the seeds and find neighborhood minima
220 //------------------------------------------------------------------------------
221 __global__ void cuInit(int *lambda, int *seeds, int *mins, int w, int h, int conn)
222 {
223     const int px = REAL_BLOCK*blockIdx.x + threadIdx.x - 1;
224     const int py = REAL_BLOCK*blockIdx.y + threadIdx.y - 1;
225     const int idp = py*w + px;
226 
227     if (px < 0 || py < 0 || px >= w || py >= h)
228     {
229         s_img[threadIdx.y][threadIdx.x] = 0xFFFF;
230     }
231     else
232     {
233         s_img[threadIdx.y][threadIdx.x] = tex2D(f_tex,(float)px,(float)py);
234     }
235 
236     if (px < 0 || py < 0 || px >= w || py >= h ||
237         threadIdx.x == 0 || threadIdx.x >= BLOCK_SIZE_2D - 1 ||
238         threadIdx.y == 0 || threadIdx.y >= BLOCK_SIZE_2D - 1)
239         return;
240 
241     lambda[idp] = (seeds[idp] == 0) * 0x00FFFFFF;
242 
243     int minv = 0x7FFFFFFF;
244     int fv;
245     for(int pos = 0; pos < conn ; pos++)
246     {
247         int vx = threadIdx.x + c_neigh[pos+conn];
248         int vy = threadIdx.y + c_neigh[pos];
249         fv = s_img[vy][vx];
250         if (fv < minv)
251             minv = fv;
252     }
253 
254     mins[idp] = minv;
255 
256 }
257 
258 //------------------------------------------------------------------------------
259 // Watershed by Kauffmann & Piche
260 //------------------------------------------------------------------------------
261 __host__ float ws_kauffmann(int *label, // output
262                            unsigned char *f, // input
263                            int *seeds, // seeds (regional minima)
264                            int w,  // width
265                            int h, //height
266                            int conn) // connectivity (4 or 8)
267 
268 {
269 
270     int *label_d,
271         *label_next_d,
272         *lambda_d,
273         *lambda_next_d,
274         *flag_d,
275         *mins_d;
276 
277     unsigned int timer;
278     float measuredTime;
279 
280     timeval tim;
281 
282 
283     cudaArray *f_d;
284     cudaArray *mins_a_d;
285 
286     int flag;
287 
288     int sizei = w * h * sizeof(int);
289     int sizec = w * h * sizeof(unsigned char);
290 
291     // Setup the grid hierarchy
292     dim3 dimBlock(BLOCK_SIZE_2D,BLOCK_SIZE_2D);
293     dim3 dimGrid(iDivUp(w,REAL_BLOCK),iDivUp(h,REAL_BLOCK));
294 
295     neighvector(conn);
296 
297     cudaChannelFormatDesc desc8 = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSigned);
298     checkCUDAError("cudaCreateChannelDesc8");
299 
300     cudaChannelFormatDesc desc32 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
301     checkCUDAError("cudaCreateChannelDesc32");
302 
303 
304     // Allocate memory
305     cudaMallocArray(&f_d, &desc8, w, h);
306     checkCUDAError("cudaMallocArray f_d");
307 
308     cudaMemcpyToArray(f_d, 0, 0, f, sizec, cudaMemcpyHostToDevice);
309     checkCUDAError("cudaMemcpyToArray f_d");
310 
311     cudaBindTextureToArray(f_tex, f_d);
312     checkCUDAError("cudaBindTextureToArray f_tex");
313 
314 
315     cudaMalloc((void**)&label_d, sizei);
316     cudaMemset(label_d, 0, sizei);
317 
318     cudaMalloc((void**)&label_next_d, sizei);
319     cudaMemcpy(label_next_d, seeds, sizei, cudaMemcpyHostToDevice);
320 
321     cudaMalloc((void**)&lambda_d, sizei);
322     cudaMemset(lambda_d, 0, sizei);
323 
324     cudaMalloc((void**)&lambda_next_d, sizei);
325     cudaMemset(lambda_next_d, 0, sizei);
326 
327     cudaMalloc((void**)&mins_d, sizei);
328     cudaMemset(mins_d, 0, sizei);
329 
330     cudaMalloc((void**)&flag_d, sizeof(int));
331 
332     gettimeofday(&tim, NULL);
333     double t1=tim.tv_sec+(tim.tv_usec/1000000.0);
334 
335     // Initialize the lambda image with zeros on minima
336     cuInit<<<dimGrid, dimBlock>>>(lambda_next_d, label_next_d, mins_d, w, h, conn);
337     cudaThreadSynchronize();
338 
339     cudaMallocArray(&mins_a_d, &desc32, w, h);
340     checkCUDAError("cudaMallocArray mins_a_d");
341 
342     cudaMemcpyToArray(mins_a_d, 0, 0, mins_d, sizei, cudaMemcpyDeviceToDevice);
343     checkCUDAError("cudaMemcpyToArray mins_a_d");
344 
345     cudaBindTextureToArray(mins_tex, mins_a_d);
346     checkCUDAError("cudaBindTextureToArray mins_a_d");
347 
348     // Iterate until stabilization
349     int iter = 0;
350     do{
351         iter++;
352         //pyprintf("iter\n");
353         cudaMemset(flag_d, 0, sizeof(int));
354         cudaMemcpy(lambda_d, lambda_next_d, sizei, cudaMemcpyDeviceToDevice);
355         cudaMemcpy(label_d, label_next_d, sizei, cudaMemcpyDeviceToDevice);
356 
357         cuFordBellmann<<<dimGrid, dimBlock>>>(label_d, label_next_d, lambda_d, lambda_next_d, flag_d, w, h, conn);
358 
359         cudaThreadSynchronize();
360 
361         cudaMemcpy(&flag, flag_d, sizeof(int), cudaMemcpyDeviceToHost);
362 
363     }while(flag > 0 && iter < 2000);
364 
365     //cutStopTimer(timer);
366 
367     gettimeofday(&tim, NULL);
368     double t2=tim.tv_sec+(tim.tv_usec/1000000.0);
369 
370     //pyprintf("iter: %d\n", iter);
371 
372     // Copy the labels
373     cudaMemcpy(label, label_d, sizei, cudaMemcpyDeviceToHost);
374 
375     // Free and Unbind memory
376     cudaFree(mins_d);
377     cudaFreeArray(f_d);
378     cudaFreeArray(mins_a_d);
379     cudaFree(lambda_d);
380     cudaFree(lambda_next_d);
381     cudaFree(label_d);
382     cudaFree(label_next_d);
383     cudaFree(flag_d);
384     cudaUnbindTexture(f_tex);
385     cudaUnbindTexture(mins_tex);
386 
387     return t2-t1;
388 
389 }
390 
391 //------------------------------------------------------------------------------
392 // Error Check
393 //------------------------------------------------------------------------------
394 void checkCUDAError(const char *msg)
395 {
396     cudaError_t err = cudaGetLastError();
397     if(cudaSuccess != err)
398     {
399         pyprintf("Cuda error: %s: %s. \n", msg, cudaGetErrorString(err));
400         exit(-1);
401     }
402 
403 }
404 
405 //
406 // Exported Function
407 //
408 float wskp(Image8U *A, Image32S *S, int conn, Image32S *output)
409 {
410     output->set_dims(A->nd,A->dims);
411     output->raster =(char *)new int[A->size];
412 
413 
414     float measuredTime = ws_kauffmann((int*)output->raster,
415                                 (unsigned char*)A->raster,
416                                 (int*)S->raster,
417                                 A->dims[0],
418                                 A->dims[1],
419                                 conn);
420 
421     return measuredTime;
422 }
OK
1 float wskp(Image8U *inp, Image32S *inp1, int inp2, Image32S *out);
OK [C/C++ extension is up-to-date]

Implementation Test

 1 import ismm2011_ca as m
 2 
 3 I = array([[8,6,6,6],[0,3,7,5],[2,4,0,2],[2,4,6,9]]).astype('uint8')
 4 S = mmlabel(mmregmin(I))
 5 (t,w) = m.wskp(I, S, 4)
 6 
 7 print I
 8 print w
 9 
10 f = adreadgray('cameraman.tif')
11 f = mmgradm(f)
12 f = mmhmin(f, 5)
13 S = mmlabel(mmregmin(f))
14 
15 t0 = time.time()
16 w0 = mmcwatershed(f, S, mmsecross(), 'REGIONS')
17 t1 = time.time()
18 (t, w1) = m.wskp(f, S, 4)
19 t2 = time.time()
20 
21 print 'Processing in C:   ',t1 - t0
22 print 'Processing in CUDA:',t
23 
24 mmlblshow(w0, title='w0 - sequential')
25 mmlblshow(w1, title='w1 - CUDA')
[[8 6 6 6]
 [0 3 7 5]
 [2 4 0 2]
 [2 4 6 9]]
[[1 1 2 2]
 [1 1 2 2]
 [1 2 2 2]
 [1 1 2 2]]
Processing in C:    0.00519704818726
Processing in CUDA: 0.00577402114868
Warning: downcasting image from int32 to uint16 (may lose precision)

w0 - sequential

w1 - CUDA