DW Algorithm Source Code

This algorithm has been published in

KÖRBES, A. ; VITOR, G. B. ; LOTUFO, R. ; FERREIRA, J. V. . Advances on Watershed Processing on GPU Architecture. In: 10th International Symposium on Mathematical Morphology, 2011, Intra, Lake Maggiore, Italy. MATHEMATICAL MORPHOLOGY AND ITS APPLICATIONS TO IMAGE AND SIGNAL PROCESSING. Heidelberg : Springer Berlin, 2011. v. 6671. p. 260-271.

DOI: http://dx.doi.org/10.1007/978-3-642-21569-8_23

  1 #include <cuda_runtime.h>
  2 
  3 #include <stdio.h>
  4 #include <stdlib.h>
  5 #include <string.h>
  6 
  7 #undef _POSIX_C_SOURCE
  8 #undef _XOPEN_SOURCE
  9 
 10 #include "simple_arrays.h"
 11 
 12 ////////////////////////////////////////////////////////////////////////////////
 13 // Kernel configuration
 14 ////////////////////////////////////////////////////////////////////////////////
 15 #define BORDER 0xFFFFFFF
 16 #define UNVISITED 0xFFFFFFE
 17 
 18 #define BLOCK_TILE 16  //7 //16
 19 #define REAL_TILE  14  //5 //14
 20 #define BLOCK_SIZE_1D 512 //8 //256 //512
 21 
 22 //------------------------------------------------------------------------------
 23 // Declaração de textura referencia
 24 //------------------------------------------------------------------------------
 25 texture<unsigned char, 2, cudaReadModeElementType> timg;
 26 texture<int, 2, cudaReadModeElementType> tarrow;
 27 texture<int, 1, cudaReadModeElementType> taux;
 28 
 29 //------------------------------------------------------------------------------
 30 // Constant memory for neigh
 31 //------------------------------------------------------------------------------
 32 __constant__ int c_neigh[16];
 33 
 34 
 35 //------------------------------------------------------------------------------
 36 //Arredonda a / b para o valor do vizinho superior
 37 //------------------------------------------------------------------------------
 38 int iDivUp(int a, int b){
 39     return (a % b != 0) ? (a / b + 1) : (a / b);
 40 }
 41 
 42 //------------------------------------------------------------------------------
 43 // Generate neighborhood offsets
 44 //------------------------------------------------------------------------------
 45 void neighvector(int conn)
 46 {
 47     int neigh[16];
 48     int i = 0;
 49       switch(conn)
 50       {
 51         case 4:
 52             // for Y
 53             neigh[i++] = -1;
 54             neigh[i++] =  0;
 55             neigh[i++] =  0;
 56             neigh[i++] =  1;
 57             // for X
 58             neigh[i++] =  0;
 59             neigh[i++] = -1;
 60             neigh[i++] =  1;
 61             neigh[i++] =  0;
 62         break;
 63         case 8:
 64             // for Y
 65             neigh[i++] = -1;
 66             neigh[i++] = -1;
 67             neigh[i++] = -1;
 68             neigh[i++] =  0;
 69             neigh[i++] =  0;
 70             neigh[i++] =  1;
 71             neigh[i++] =  1;
 72             neigh[i++] =  1;
 73             // for X
 74             neigh[i++] = -1;
 75             neigh[i++] =  0;
 76             neigh[i++] =  1;
 77             neigh[i++] = -1;
 78             neigh[i++] =  1;
 79             neigh[i++] = -1;
 80             neigh[i++] =  0;
 81             neigh[i++] =  1;
 82         break;
 83     }
 84 
 85     cudaMemcpyToSymbol(c_neigh, neigh, 16*sizeof(int), 0, cudaMemcpyHostToDevice);
 86 
 87 }
 88 
 89 //------------------------------------------------------------------------------
 90 // Indexa a descida
 91 //------------------------------------------------------------------------------
 92 __global__ void cuIndexDescida(   int *gArrow,
 93                                   int   *gDist,
 94                                   int  width,
 95                                   int  height,
 96                                   int  conn      )
 97 {
 98 
 99     int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
100     int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
101     int idg = y * width + x;
102 
103     int p,
104         count = 0,
105         arrow = UNVISITED,
106         dist  = UNVISITED;
107 
108     __shared__ int  smImg[BLOCK_TILE][BLOCK_TILE],
109                    smCtrl[BLOCK_TILE][BLOCK_TILE],
110                    smDist[BLOCK_TILE][BLOCK_TILE];
111 
112     smImg[threadIdx.y][threadIdx.x] = (x < width && x >= 0 && y >= 0 && y < height)? tex2D(timg,(float)x,(float)y) : BORDER;
113     smDist[threadIdx.y][threadIdx.x] = dist;
114     smCtrl[threadIdx.y][threadIdx.x] = 0;
115 
116     __syncthreads();
117 
118     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
119         threadIdx.x == 0 || threadIdx.x == BLOCK_TILE - 1 ||
120         threadIdx.y == 0 || threadIdx.y == BLOCK_TILE - 1    )
121             return;
122 
123     p = smImg[threadIdx.y][threadIdx.x];
124 
125     for(int pos=0; pos < conn ; pos++)
126     {
127         int dx = c_neigh[pos+conn];
128         int dy = c_neigh[pos];
129         int q  = smImg[threadIdx.y + dy][threadIdx.x + dx];
130 
131         if(q < p )
132         {
133             p = q;
134             arrow = -( (y+dy) * width + (x+dx) );
135             dist = 0;
136         }
137     }
138 
139     smDist[threadIdx.y][threadIdx.x] = dist;
140 
141     __syncthreads();
142 
143     p = smImg[threadIdx.y][threadIdx.x];
144 
145     if(smDist[threadIdx.y][threadIdx.x] == UNVISITED )
146     {
147         do
148         {
149             smCtrl[1][1] = 0;
150             count = 0;
151             __syncthreads();
152 
153             for(int pos=0; pos < conn ; pos++)
154             {
155                 int dx = c_neigh[pos+conn];
156                 int dy = c_neigh[pos];
157 
158                 int q = smImg[threadIdx.y + dy][threadIdx.x + dx];
159                 int d = smDist[threadIdx.y + dy][threadIdx.x + dx];
160 
161                 if(q == p && (dist-1) > d )
162                 {
163                     dist = d + 1;
164                     arrow = -( (y+dy) * width + (x+dx) );
165                     count = 1;
166                 }
167             }
168             smDist[threadIdx.y][threadIdx.x] = dist;
169             if(count == 1) smCtrl[1][1] = 1;
170             __syncthreads();
171 
172         }while(smCtrl[1][1]);
173     }
174 
175     if(arrow == UNVISITED) arrow = idg;
176 
177     gArrow[idg] = arrow;
178      gDist[idg] = dist;
179 
180 }
181 
182 
183 
184 //------------------------------------------------------------------------------
185 // Propaga e resolve a distancia do plateau
186 //------------------------------------------------------------------------------
187 __global__ void cuPropagaDescida(  int      *gArrow,
188                                    int      *gDist,
189                                    int       width,
190                                    int       height,
191                                    int       conn,
192                                    int      *flag  )
193 {
194 
195     const int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
196     const int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
197     const int idg = y * width + x;
198 
199     int p,
200         arrow=0,
201         dist =0,
202         count=0;
203 
204     //Memória compartilhada para dados com borda.
205     __shared__ int   smImg[BLOCK_TILE][BLOCK_TILE],
206                     smCtrl[BLOCK_TILE][BLOCK_TILE],
207                     smDist[BLOCK_TILE][BLOCK_TILE];
208 
209 
210     smImg[threadIdx.y][threadIdx.x]   = (x < width && x >= 0 && y >= 0 && y < height)? tex2D(timg,(float)x,(float)y) : BORDER;
211     smDist[threadIdx.y][threadIdx.x]  = (x < width && x >= 0 && y >= 0 && y < height)? tex1Dfetch(taux,idg) : BORDER;
212     smCtrl[threadIdx.y][threadIdx.x]  = 0;
213 
214     __syncthreads();
215 
216     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
217         threadIdx.x == 0 || threadIdx.x == BLOCK_TILE - 1 ||
218         threadIdx.y == 0 || threadIdx.y == BLOCK_TILE - 1    )
219             return;
220 
221 
222     p = smImg[threadIdx.y][threadIdx.x];
223     dist = smDist[threadIdx.y][threadIdx.x];
224 
225     if(smDist[threadIdx.y][threadIdx.x] > 0 )
226     {
227         do
228         {
229             smCtrl[1][1] = 0;
230             count = 0;
231             __syncthreads();
232 
233             for(int pos=0; pos < conn ; pos++)
234             {
235                 int dx = c_neigh[pos+conn];
236                 int dy = c_neigh[pos];
237 
238                 int q = smImg[threadIdx.y + dy][threadIdx.x + dx];
239                 int d = smDist[threadIdx.y + dy][threadIdx.x + dx];
240 
241                 if(q == p && (dist-1) > d)
242                 {
243                     dist = d + 1;
244                     arrow = -( (y+dy) * width + (x+dx) );
245                     count = 1;
246                 }
247             }
248             smDist[threadIdx.y][threadIdx.x] = dist;
249             if(count == 1) smCtrl[1][1] = 1;
250             __syncthreads();
251 
252         }while(smCtrl[1][1]);
253     }
254 
255     if(arrow < 0)
256     {
257         gArrow[idg] = arrow;
258         *flag += 1;
259     }
260 
261     gDist[idg] = dist;
262 
263 }
264 //----------------------------------------------------------------------------------------------
265 // Função que faz o scan verificando a rotulação dos seus vizinhos e atribuindo o de menor valor
266 //----------------------------------------------------------------------------------------------
267 
268 __global__ void cuAgrupaPixel(   int *R,
269                                  int *flag,
270                                  int conn,
271                                  int width,
272                                  int height )
273 
274 {
275 
276     int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
277     int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
278     int idg = y * width + x;
279 
280     __shared__ int  smArrow[BLOCK_TILE][BLOCK_TILE];
281 
282     smArrow[threadIdx.y][threadIdx.x] = tex2D(tarrow,(float)x,(float)y);
283 
284     __syncthreads();
285 
286     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
287         threadIdx.x == 0 || threadIdx.x == BLOCK_TILE - 1 ||
288         threadIdx.y == 0 || threadIdx.y == BLOCK_TILE - 1    )
289             return;
290 
291     int label = smArrow[threadIdx.y][threadIdx.x],
292         label2= BORDER;
293 
294     R[idg] = label;
295 
296     for(int pos=0; pos < conn ; pos++)
297     {
298         int dx = c_neigh[pos+conn];
299         int dy = c_neigh[pos];
300 
301         int label_v = smArrow[threadIdx.y + dy][threadIdx.x + dx];
302 
303         if( label_v >= 0 && label >= 0)
304             label2 = min(label2,label_v);
305 
306     }
307     __syncthreads();
308 
309     if (label2 < label)
310     {
311         atomicExch(&R[label],label2);
312         *flag += 1;
313     }
314 }
315 //----------------------------------------------------------------------------------------------
316 // Função que analisa a equivalencia dos pixels Agrupados
317 //----------------------------------------------------------------------------------------------
318 __global__ void cuPropagaIndex(  int *L,
319                                  int *R,
320                                  int dim )
321 
322 {
323 
324     // inicializando o indice dos threads com os blocos
325 
326     int id = blockIdx.x * blockDim.x + threadIdx.x ;
327 
328 
329     if(id >= dim) return;
330 
331     int label = L[id];
332 
333     if (label == id)
334     {
335         int ref = label;
336         label = tex1Dfetch(taux,ref);
337 
338         do
339         {
340             ref = label;
341             label = tex1Dfetch(taux,ref);
342 
343         }while(label != ref);
344 
345         __syncthreads();
346 
347         R[id] = label;
348     }
349 
350 
351 }
352 //----------------------------------------------------------------------------------------------
353 // Função que Atribui os rotulos
354 //----------------------------------------------------------------------------------------------
355 __global__ void cuLabel(int *L,
356                 int dim )
357 
358 {
359 
360     // inicializando o indice dos threads com os blocos
361 
362     const int id    = blockIdx.x * blockDim.x + threadIdx.x ;
363     const int ref   = tex1Dfetch(taux,id);
364 
365     if(id >= dim) return;
366     if(ref>=0) L[id] = tex1Dfetch(taux,ref);
367 
368 
369 }
370 //----------------------------------------------------------------------------------------------
371 // Função que Ajusta os vetores para o passo 4.
372 //----------------------------------------------------------------------------------------------
373 __global__ void cuAjustaVetor(int *L,
374                     int *R,
375                     int dim  )
376 
377 {
378 
379     // inicializando o indice dos threads com os blocos
380     const int id    = blockIdx.x * blockDim.x + threadIdx.x ;
381 
382     if(id >= dim) return;
383 
384     const int label = tex1Dfetch(taux,id);;
385     __syncthreads();
386 
387     L[id] = id;
388     R[id] = (label>=0)?label:-label;
389 
390 }
391 
392 //------------------------------------------------------------------------------
393 // Função Watershed
394 //------------------------------------------------------------------------------
395 extern "C" __host__ void giwatershed( int *hdataOut,
396                                       unsigned char *hdataIn,
397                                       int w,
398                                       int h,
399                                       int conn )
400 {
401     int *gArrow,
402         *gDist,
403         *gCtrl,
404          hCtrl;
405 
406     neighvector(conn);
407 
408     // Dimensionando os kernels
409     dim3 dimBlock(BLOCK_TILE,BLOCK_TILE);
410     dim3 dimGrid( iDivUp(w,REAL_TILE),iDivUp(h,REAL_TILE));
411 
412     dim3  dimBlock1(BLOCK_SIZE_1D);
413     dim3  dimGrid1(iDivUp(w*h,BLOCK_SIZE_1D));
414 
415     //Alocando memórias...
416     int sizei = w * h * sizeof(int);
417     cudaArray *cuarrayImg;
418     cudaChannelFormatDesc desc8 = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSigned);
419     cudaMallocArray(&cuarrayImg, &desc8, w, h);
420 
421     cudaArray *cuarrayArrow;
422     cudaChannelFormatDesc desc32 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
423     cudaMallocArray(&cuarrayArrow, &desc32, w, h);
424 
425     cudaMalloc((void**)&gArrow, sizei);
426     cudaMalloc((void**)&gDist , sizei);
427     cudaMalloc((void**)&gCtrl, sizeof(int));
428 
429     // >> PASSO 1 <<
430 
431     cudaMemcpyToArray(cuarrayImg, 0, 0, hdataIn, w * h * sizeof(unsigned char), cudaMemcpyHostToDevice);
432     cudaBindTextureToArray(timg,cuarrayImg);
433     cuIndexDescida<<<dimGrid, dimBlock>>>(gArrow, gDist, w, h,conn);
434 
435     // >> PASSO 2 <<
436 
437     do{
438 
439         cudaMemset(gCtrl, 0, sizeof(int));
440         cudaBindTexture(0,taux,gDist,sizei);
441         cuPropagaDescida<<<dimGrid, dimBlock>>>(gArrow, gDist, w, h, conn, gCtrl);
442         cudaMemcpy(&hCtrl,gCtrl, sizeof(int),cudaMemcpyDeviceToHost);
443 
444     }while(hCtrl);
445 
446     // >> PASSO 3 <<
447 
448     do{
449         cudaMemset(gCtrl, 0, sizeof(int));
450         cudaMemcpyToArray(cuarrayArrow, 0, 0, gArrow, sizei, cudaMemcpyDeviceToDevice);
451         cudaBindTextureToArray( tarrow,cuarrayArrow);
452         cuAgrupaPixel<<<dimGrid, dimBlock>>>(gDist, gCtrl,conn,w,h);
453         cudaMemcpy(&hCtrl,gCtrl, sizeof(int),cudaMemcpyDeviceToHost);
454 
455         if(hCtrl)
456         {
457             cudaBindTexture(0,taux,gDist,sizei);
458             cuPropagaIndex<<<dimGrid1, dimBlock1>>>(gArrow,gDist,w*h);
459             cudaUnbindTexture(taux);
460 
461             cudaBindTexture(0,taux,gDist,sizei);
462             cuLabel<<<dimGrid1, dimBlock1>>>(gArrow, w*h);
463             cudaUnbindTexture(taux);
464             cudaThreadSynchronize();
465         }
466     }while(hCtrl);
467 
468     // >> PASSO 4 <<
469 
470     cudaBindTexture(0,taux,gArrow,sizei);
471     cuAjustaVetor<<<dimGrid1, dimBlock1>>>(gArrow,gDist, w*h);
472     cudaBindTexture(0,taux,gDist,sizei);
473     cuPropagaIndex<<<dimGrid1, dimBlock1>>>(gArrow,gDist, w*h);
474     cudaThreadSynchronize();
475 
476     // Copiando resultado para o host...
477     cudaMemcpy(hdataOut,gDist, sizei,cudaMemcpyDeviceToHost);
478 
479     //Liberando as memórias...
480     cudaFreeArray(cuarrayImg);
481     cudaFreeArray(cuarrayArrow);
482 
483     cudaFree(gArrow);
484     cudaFree(gDist);
485     cudaFree(gCtrl);
486 
487     cudaUnbindTexture(timg);
488     cudaUnbindTexture(tarrow);
489     cudaUnbindTexture(taux);
490 }
491 
492 ////////////////////////////////////////////////////////////////////////////////
493 // Exported Function
494 ////////////////////////////////////////////////////////////////////////////////
495 Image32S *wsgpu(Image8U *A, int conn)
496 {
497     Image32S *output = new Image32S(A->nd, A->dims);
498 
499     giwatershed((int*)output->raster,
500                 (unsigned char*)A->raster,
501                 A->dims[0],
502                 A->dims[1],
503                 conn );
504 
505     return output;
506 }
OK
1 Image32S *wsgpu(Image8U *A, int conn);
OK [C/C++ extension is up-to-date]
 1 import ismm2011_dw as m
 2 import time
 3 
 4 I = array([[8,6,6,6],[0,3,7,5],[2,4,0,2],[2,4,6,9]]).astype('uint8')
 5 w = m.wsgpu(I, 4)
 6 
 7 print I
 8 print w
 9 
10 #f = adreadgray('cameraman.tif')
11 f = mmreadgray('app/ilhotas_gray.jpg')
12 
13 mmshow(f, title='original Image')
14 
15 f = mmgradm(f)
16 f = mmhmin(f, 5)
17 
18 t0 = time.time()
19 w0 = mmwatershed(f, mmsecross(), 'REGIONS')
20 t1 = time.time()
21 w1 = m.wsgpu(f, 4)
22 t2 = time.time()
23 
24 print 'Sequential C:', t1 - t0
25 print 'CUDA :', t2 - t1
26 
27 mmlblshow(w0, title='w0 - sequential - C')
28 mmlblshow(w1, title='w1 - CUDA')
[[8 6 6 6]
 [0 3 7 5]
 [2 4 0 2]
 [2 4 6 9]]
[[ 4  4  4 10]
 [ 4  4 10 10]
 [ 4 10 10 10]
 [ 4  4 10 10]]
Sequential C: 0.0994999408722
CUDA : 0.024425983429
Warning: downcasting image from int32 to uint16 (may lose precision)

original Image

w0 - sequential - C

w1 - CUDA