# DW Algorithm Source Code

This algorithm has been published in

KÖRBES, A. ; VITOR, G. B. ; LOTUFO, R. ; FERREIRA, J. V. . Advances on Watershed Processing on GPU Architecture. In: 10th International Symposium on Mathematical Morphology, 2011, Intra, Lake Maggiore, Italy. MATHEMATICAL MORPHOLOGY AND ITS APPLICATIONS TO IMAGE AND SIGNAL PROCESSING. Heidelberg : Springer Berlin, 2011. v. 6671. p. 260-271.

```  1 #include <cuda_runtime.h>
2
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6
7 #undef _POSIX_C_SOURCE
8 #undef _XOPEN_SOURCE
9
10 #include "simple_arrays.h"
11
12 ////////////////////////////////////////////////////////////////////////////////
13 // Kernel configuration
14 ////////////////////////////////////////////////////////////////////////////////
15 #define BORDER 0xFFFFFFF
16 #define UNVISITED 0xFFFFFFE
17
18 #define BLOCK_TILE 16  //7 //16
19 #define REAL_TILE  14  //5 //14
20 #define BLOCK_SIZE_1D 512 //8 //256 //512
21
22 //------------------------------------------------------------------------------
23 // Declaração de textura referencia
24 //------------------------------------------------------------------------------
25 texture<unsigned char, 2, cudaReadModeElementType> timg;
28
29 //------------------------------------------------------------------------------
30 // Constant memory for neigh
31 //------------------------------------------------------------------------------
32 __constant__ int c_neigh[16];
33
34
35 //------------------------------------------------------------------------------
36 //Arredonda a / b para o valor do vizinho superior
37 //------------------------------------------------------------------------------
38 int iDivUp(int a, int b){
39     return (a % b != 0) ? (a / b + 1) : (a / b);
40 }
41
42 //------------------------------------------------------------------------------
43 // Generate neighborhood offsets
44 //------------------------------------------------------------------------------
45 void neighvector(int conn)
46 {
47     int neigh[16];
48     int i = 0;
49       switch(conn)
50       {
51         case 4:
52             // for Y
53             neigh[i++] = -1;
54             neigh[i++] =  0;
55             neigh[i++] =  0;
56             neigh[i++] =  1;
57             // for X
58             neigh[i++] =  0;
59             neigh[i++] = -1;
60             neigh[i++] =  1;
61             neigh[i++] =  0;
62         break;
63         case 8:
64             // for Y
65             neigh[i++] = -1;
66             neigh[i++] = -1;
67             neigh[i++] = -1;
68             neigh[i++] =  0;
69             neigh[i++] =  0;
70             neigh[i++] =  1;
71             neigh[i++] =  1;
72             neigh[i++] =  1;
73             // for X
74             neigh[i++] = -1;
75             neigh[i++] =  0;
76             neigh[i++] =  1;
77             neigh[i++] = -1;
78             neigh[i++] =  1;
79             neigh[i++] = -1;
80             neigh[i++] =  0;
81             neigh[i++] =  1;
82         break;
83     }
84
85     cudaMemcpyToSymbol(c_neigh, neigh, 16*sizeof(int), 0, cudaMemcpyHostToDevice);
86
87 }
88
89 //------------------------------------------------------------------------------
90 // Indexa a descida
91 //------------------------------------------------------------------------------
92 __global__ void cuIndexDescida(   int *gArrow,
93                                   int   *gDist,
94                                   int  width,
95                                   int  height,
96                                   int  conn      )
97 {
98
99     int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
100     int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
101     int idg = y * width + x;
102
103     int p,
104         count = 0,
105         arrow = UNVISITED,
106         dist  = UNVISITED;
107
108     __shared__ int  smImg[BLOCK_TILE][BLOCK_TILE],
109                    smCtrl[BLOCK_TILE][BLOCK_TILE],
110                    smDist[BLOCK_TILE][BLOCK_TILE];
111
112     smImg[threadIdx.y][threadIdx.x] = (x < width && x >= 0 && y >= 0 && y < height)? tex2D(timg,(float)x,(float)y) : BORDER;
115
117
118     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
121             return;
122
124
125     for(int pos=0; pos < conn ; pos++)
126     {
127         int dx = c_neigh[pos+conn];
128         int dy = c_neigh[pos];
130
131         if(q < p )
132         {
133             p = q;
134             arrow = -( (y+dy) * width + (x+dx) );
135             dist = 0;
136         }
137     }
138
140
142
144
146     {
147         do
148         {
149             smCtrl[1][1] = 0;
150             count = 0;
152
153             for(int pos=0; pos < conn ; pos++)
154             {
155                 int dx = c_neigh[pos+conn];
156                 int dy = c_neigh[pos];
157
160
161                 if(q == p && (dist-1) > d )
162                 {
163                     dist = d + 1;
164                     arrow = -( (y+dy) * width + (x+dx) );
165                     count = 1;
166                 }
167             }
169             if(count == 1) smCtrl[1][1] = 1;
171
172         }while(smCtrl[1][1]);
173     }
174
175     if(arrow == UNVISITED) arrow = idg;
176
177     gArrow[idg] = arrow;
178      gDist[idg] = dist;
179
180 }
181
182
183
184 //------------------------------------------------------------------------------
185 // Propaga e resolve a distancia do plateau
186 //------------------------------------------------------------------------------
187 __global__ void cuPropagaDescida(  int      *gArrow,
188                                    int      *gDist,
189                                    int       width,
190                                    int       height,
191                                    int       conn,
192                                    int      *flag  )
193 {
194
195     const int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
196     const int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
197     const int idg = y * width + x;
198
199     int p,
200         arrow=0,
201         dist =0,
202         count=0;
203
205     __shared__ int   smImg[BLOCK_TILE][BLOCK_TILE],
206                     smCtrl[BLOCK_TILE][BLOCK_TILE],
207                     smDist[BLOCK_TILE][BLOCK_TILE];
208
209
210     smImg[threadIdx.y][threadIdx.x]   = (x < width && x >= 0 && y >= 0 && y < height)? tex2D(timg,(float)x,(float)y) : BORDER;
211     smDist[threadIdx.y][threadIdx.x]  = (x < width && x >= 0 && y >= 0 && y < height)? tex1Dfetch(taux,idg) : BORDER;
213
215
216     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
219             return;
220
221
224
226     {
227         do
228         {
229             smCtrl[1][1] = 0;
230             count = 0;
232
233             for(int pos=0; pos < conn ; pos++)
234             {
235                 int dx = c_neigh[pos+conn];
236                 int dy = c_neigh[pos];
237
240
241                 if(q == p && (dist-1) > d)
242                 {
243                     dist = d + 1;
244                     arrow = -( (y+dy) * width + (x+dx) );
245                     count = 1;
246                 }
247             }
249             if(count == 1) smCtrl[1][1] = 1;
251
252         }while(smCtrl[1][1]);
253     }
254
255     if(arrow < 0)
256     {
257         gArrow[idg] = arrow;
258         *flag += 1;
259     }
260
261     gDist[idg] = dist;
262
263 }
264 //----------------------------------------------------------------------------------------------
265 // Função que faz o scan verificando a rotulação dos seus vizinhos e atribuindo o de menor valor
266 //----------------------------------------------------------------------------------------------
267
268 __global__ void cuAgrupaPixel(   int *R,
269                                  int *flag,
270                                  int conn,
271                                  int width,
272                                  int height )
273
274 {
275
276     int x = REAL_TILE * blockIdx.x + threadIdx.x - 1;
277     int y = REAL_TILE * blockIdx.y + threadIdx.y - 1;
278     int idg = y * width + x;
279
280     __shared__ int  smArrow[BLOCK_TILE][BLOCK_TILE];
281
283
285
286     if (    x < 0 || y < 0 ||     x >= width || y >= height ||
289             return;
290
292         label2= BORDER;
293
294     R[idg] = label;
295
296     for(int pos=0; pos < conn ; pos++)
297     {
298         int dx = c_neigh[pos+conn];
299         int dy = c_neigh[pos];
300
302
303         if( label_v >= 0 && label >= 0)
304             label2 = min(label2,label_v);
305
306     }
308
309     if (label2 < label)
310     {
311         atomicExch(&R[label],label2);
312         *flag += 1;
313     }
314 }
315 //----------------------------------------------------------------------------------------------
316 // Função que analisa a equivalencia dos pixels Agrupados
317 //----------------------------------------------------------------------------------------------
318 __global__ void cuPropagaIndex(  int *L,
319                                  int *R,
320                                  int dim )
321
322 {
323
324     // inicializando o indice dos threads com os blocos
325
326     int id = blockIdx.x * blockDim.x + threadIdx.x ;
327
328
329     if(id >= dim) return;
330
331     int label = L[id];
332
333     if (label == id)
334     {
335         int ref = label;
336         label = tex1Dfetch(taux,ref);
337
338         do
339         {
340             ref = label;
341             label = tex1Dfetch(taux,ref);
342
343         }while(label != ref);
344
346
347         R[id] = label;
348     }
349
350
351 }
352 //----------------------------------------------------------------------------------------------
353 // Função que Atribui os rotulos
354 //----------------------------------------------------------------------------------------------
355 __global__ void cuLabel(int *L,
356                 int dim )
357
358 {
359
360     // inicializando o indice dos threads com os blocos
361
362     const int id    = blockIdx.x * blockDim.x + threadIdx.x ;
363     const int ref   = tex1Dfetch(taux,id);
364
365     if(id >= dim) return;
366     if(ref>=0) L[id] = tex1Dfetch(taux,ref);
367
368
369 }
370 //----------------------------------------------------------------------------------------------
371 // Função que Ajusta os vetores para o passo 4.
372 //----------------------------------------------------------------------------------------------
373 __global__ void cuAjustaVetor(int *L,
374                     int *R,
375                     int dim  )
376
377 {
378
379     // inicializando o indice dos threads com os blocos
380     const int id    = blockIdx.x * blockDim.x + threadIdx.x ;
381
382     if(id >= dim) return;
383
384     const int label = tex1Dfetch(taux,id);;
386
387     L[id] = id;
388     R[id] = (label>=0)?label:-label;
389
390 }
391
392 //------------------------------------------------------------------------------
393 // Função Watershed
394 //------------------------------------------------------------------------------
395 extern "C" __host__ void giwatershed( int *hdataOut,
396                                       unsigned char *hdataIn,
397                                       int w,
398                                       int h,
399                                       int conn )
400 {
401     int *gArrow,
402         *gDist,
403         *gCtrl,
404          hCtrl;
405
406     neighvector(conn);
407
408     // Dimensionando os kernels
409     dim3 dimBlock(BLOCK_TILE,BLOCK_TILE);
410     dim3 dimGrid( iDivUp(w,REAL_TILE),iDivUp(h,REAL_TILE));
411
412     dim3  dimBlock1(BLOCK_SIZE_1D);
413     dim3  dimGrid1(iDivUp(w*h,BLOCK_SIZE_1D));
414
415     //Alocando memórias...
416     int sizei = w * h * sizeof(int);
417     cudaArray *cuarrayImg;
418     cudaChannelFormatDesc desc8 = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSigned);
419     cudaMallocArray(&cuarrayImg, &desc8, w, h);
420
421     cudaArray *cuarrayArrow;
422     cudaChannelFormatDesc desc32 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
423     cudaMallocArray(&cuarrayArrow, &desc32, w, h);
424
425     cudaMalloc((void**)&gArrow, sizei);
426     cudaMalloc((void**)&gDist , sizei);
427     cudaMalloc((void**)&gCtrl, sizeof(int));
428
429     // >> PASSO 1 <<
430
431     cudaMemcpyToArray(cuarrayImg, 0, 0, hdataIn, w * h * sizeof(unsigned char), cudaMemcpyHostToDevice);
432     cudaBindTextureToArray(timg,cuarrayImg);
433     cuIndexDescida<<<dimGrid, dimBlock>>>(gArrow, gDist, w, h,conn);
434
435     // >> PASSO 2 <<
436
437     do{
438
439         cudaMemset(gCtrl, 0, sizeof(int));
440         cudaBindTexture(0,taux,gDist,sizei);
441         cuPropagaDescida<<<dimGrid, dimBlock>>>(gArrow, gDist, w, h, conn, gCtrl);
442         cudaMemcpy(&hCtrl,gCtrl, sizeof(int),cudaMemcpyDeviceToHost);
443
444     }while(hCtrl);
445
446     // >> PASSO 3 <<
447
448     do{
449         cudaMemset(gCtrl, 0, sizeof(int));
450         cudaMemcpyToArray(cuarrayArrow, 0, 0, gArrow, sizei, cudaMemcpyDeviceToDevice);
451         cudaBindTextureToArray( tarrow,cuarrayArrow);
452         cuAgrupaPixel<<<dimGrid, dimBlock>>>(gDist, gCtrl,conn,w,h);
453         cudaMemcpy(&hCtrl,gCtrl, sizeof(int),cudaMemcpyDeviceToHost);
454
455         if(hCtrl)
456         {
457             cudaBindTexture(0,taux,gDist,sizei);
458             cuPropagaIndex<<<dimGrid1, dimBlock1>>>(gArrow,gDist,w*h);
459             cudaUnbindTexture(taux);
460
461             cudaBindTexture(0,taux,gDist,sizei);
462             cuLabel<<<dimGrid1, dimBlock1>>>(gArrow, w*h);
463             cudaUnbindTexture(taux);
465         }
466     }while(hCtrl);
467
468     // >> PASSO 4 <<
469
470     cudaBindTexture(0,taux,gArrow,sizei);
471     cuAjustaVetor<<<dimGrid1, dimBlock1>>>(gArrow,gDist, w*h);
472     cudaBindTexture(0,taux,gDist,sizei);
473     cuPropagaIndex<<<dimGrid1, dimBlock1>>>(gArrow,gDist, w*h);
475
476     // Copiando resultado para o host...
477     cudaMemcpy(hdataOut,gDist, sizei,cudaMemcpyDeviceToHost);
478
479     //Liberando as memórias...
480     cudaFreeArray(cuarrayImg);
481     cudaFreeArray(cuarrayArrow);
482
483     cudaFree(gArrow);
484     cudaFree(gDist);
485     cudaFree(gCtrl);
486
487     cudaUnbindTexture(timg);
488     cudaUnbindTexture(tarrow);
489     cudaUnbindTexture(taux);
490 }
491
492 ////////////////////////////////////////////////////////////////////////////////
493 // Exported Function
494 ////////////////////////////////////////////////////////////////////////////////
495 Image32S *wsgpu(Image8U *A, int conn)
496 {
497     Image32S *output = new Image32S(A->nd, A->dims);
498
499     giwatershed((int*)output->raster,
500                 (unsigned char*)A->raster,
501                 A->dims[0],
502                 A->dims[1],
503                 conn );
504
505     return output;
506 }
```
```OK
```
```1 Image32S *wsgpu(Image8U *A, int conn);
```
```OK [C/C++ extension is up-to-date]
```
``` 1 import ismm2011_dw as m
2 import time
3
4 I = array([[8,6,6,6],[0,3,7,5],[2,4,0,2],[2,4,6,9]]).astype('uint8')
5 w = m.wsgpu(I, 4)
6
7 print I
8 print w
9
12
13 mmshow(f, title='original Image')
14
16 f = mmhmin(f, 5)
17
18 t0 = time.time()
19 w0 = mmwatershed(f, mmsecross(), 'REGIONS')
20 t1 = time.time()
21 w1 = m.wsgpu(f, 4)
22 t2 = time.time()
23
24 print 'Sequential C:', t1 - t0
25 print 'CUDA :', t2 - t1
26
27 mmlblshow(w0, title='w0 - sequential - C')
28 mmlblshow(w1, title='w1 - CUDA')
```
```[[8 6 6 6]
[0 3 7 5]
[2 4 0 2]
[2 4 6 9]]
[[ 4  4  4 10]
[ 4  4 10 10]
[ 4 10 10 10]
[ 4  4 10 10]]
Sequential C: 0.0994999408722
CUDA : 0.024425983429
Warning: downcasting image from int32 to uint16 (may lose precision)
```