# lf005835_10 - Multiplicação de Matrizes Complexas

Autor: Luiz Fernando Stein Wetzel 23/10/2010
```  1 // #include <string.h>
2 // #include <math.h>
3 #include <cuda.h>
4 #include "simple_arrays.h"
5
6 typedef int BOOL;
7 #define TRUE  1
8 #define FALSE 0
9
10
11 // complex structure
12 struct complex : public float2
13 {
14     __device__ inline complex(float2 f) : float2(f) {}
15     __device__ inline complex(float re, float im) : float2(make_float2(re, im)) {}
16     __device__ inline complex() {}                                // default constructor
17     __device__ inline float& real() { return x; }
18     __device__ inline float& imag() { return y; }
19     __device__ inline complex operator += (complex c) { real()+= c.real(); imag()+=c.imag(); return *this; }
20     __device__ inline complex operator * (complex& c) { return complex(real()*c.real() - imag()*c.imag(), real()*c.imag() + imag()*c.real()); }
21 };
22 __device__ inline complex make_complex(float re, float im) { return complex(make_float2(re, im)); }
23
24 // Error functions
25 inline BOOL print_cuda_error(cudaError_t err, int pos=0)
26 {
27     if(err != cudaSuccess)
28     {
29         pyprintf("CUDA Error: %d, position: %d %x\n", err, pos, pos);
30         return TRUE;
31     }
32     return FALSE;
33 }
34
35 inline void print_error(int pos)
36 {
37     pyprintf("NON CUDA Error: position: %d %x\n", pos, pos);
38 }
39
40 #define CHECK_CUDA(err, pos, ret) { if(print_cuda_error(err, pos)) return ret; }
41 #define ERROR(pos, ret) { print_error(pos); return ret; }
42
43 struct gdata{
44     int     m_width;
45     int     m_height;
46     int     m_pitch;
47     float*  m_data;
48     BOOL    m_iscomplex;
49
50     inline gdata() { m_data = NULL; }
51     inline BOOL init(Image32F* im)
52     {
53         if (im == NULL) ERROR(1, FALSE)
54         m_iscomplex = (im->nd == 3 && im->dims[0] == 2);
55         if (im->nd != 2 && !m_iscomplex ) ERROR(2, FALSE)
56         m_height = im->dims[m_iscomplex ? 1 : 0];
57         m_width  = im->dims[m_iscomplex ? 2 : 1];
58         size_t pitch;
59         cudaError_t err = cudaMallocPitch((void**)&m_data, &pitch, m_width*sizeof(float), m_iscomplex ? 2*m_height : m_height);
60         m_pitch = (int)pitch;
61         CHECK_CUDA(err, 3, FALSE)
62         err = cudaMemcpy2D(m_data, m_pitch, im->raster, m_width<<2, m_width<<2, m_iscomplex ? (m_height<<1) : m_height, cudaMemcpyHostToDevice);
63         CHECK_CUDA(err, 4, FALSE)
64         return TRUE;
65     }
66
67     inline BOOL init(int height, int width, BOOL iscomplex) {
68         m_iscomplex = iscomplex;
69         m_height = height;
70         m_width = width;
71         size_t pitch;
72         cudaError_t err = cudaMallocPitch((void**)&m_data, &pitch, m_width<<2, m_iscomplex ? 2*m_height : m_height);
73         m_pitch = (int)pitch;
74         // cudaMemset(m_data, m_pitch*(m_iscomplex ? 2*m_height : m_height), 0);
75         CHECK_CUDA(err, 5, FALSE)
76         return TRUE;
77     }
78
79     inline ~gdata()
80     {
81         if(m_data != NULL) cudaFree(m_data);
82     }
83
84     inline void dump()
85     {
86         pyprintf("width=%d, height=%d, pitch=%d, data=%p, iscomplex=%d\n", m_width, m_height, m_pitch, m_data, m_iscomplex);
87     }
88
89     inline Image32F* getImage32F()
90     {
91         int dims[3];
92         if(m_iscomplex) { dims[0]=2; dims[1]=m_height; dims[2]=m_width; }
93         else { dims[0]=m_height; dims[1]=m_width; }
94         Image32F* ret = new Image32F(m_iscomplex ? 3 : 2, dims);
95         cudaError_t err = cudaMemcpy2D(ret->raster, m_width<<2, m_data, m_pitch, m_width<<2, m_iscomplex ? (m_height<<1) : m_height, cudaMemcpyDeviceToHost);
96         if(err != cudaSuccess)
97         {
98             print_cuda_error(err, (long)m_data);
99             delete ret;
100             ret = NULL;
101         }
102         return ret;
103     }
104 };
105
107     int     m_width;
108     int     m_height;
109     int     m_pitch;
110     float*  m_data;
111     BOOL    m_iscomplex;
112
113     inline __device__ BOOL init(int width, int height, int pitch, float* data, BOOL iscomplex)
114     {
115         m_width = width;
116         m_height = height;
117         m_pitch = pitch;
118         m_data = data;
119         m_iscomplex = iscomplex;
120         return TRUE;
121     }
122
123     inline __device__ complex cget(int row, int col)
124     {
125         int offreal = row*(m_pitch >> 2) + col;
126         return make_complex(m_data[offreal], (m_iscomplex ?  m_data[offreal + (m_pitch >> 2) * m_height] : 0.0));
127     }
128
129     inline __device__ void cput(int row, int col, complex value)
130     {
131         int offreal = row*(m_pitch >> 2) + col;
132         m_data[offreal] = value.real();
133         if(m_iscomplex) m_data[offreal + (m_pitch >> 2) * m_height] = value.imag();
134     }
135 };
136
137 #define TILE_WIDTH 16
138 #define XOFF (blockIdx.x*TILE_WIDTH  + threadIdx.x)
139 #define YOFF (blockIdx.y*TILE_WIDTH  + threadIdx.y)
140
141 __global__ void KernelMultMatrix(int g1d_width, int g1d_height, int g1d_pitch, float* g1d_data, BOOL g1d_iscomplex,
142                                  int g2d_width, int g2d_height, int g2d_pitch, float* g2d_data, BOOL g2d_iscomplex,
143                                  int grd_width, int grd_height, int grd_pitch, float* grd_data, BOOL grd_iscomplex)
144 {
145     __shared__ complex sm1[TILE_WIDTH][TILE_WIDTH];
146     __shared__ complex sm2[TILE_WIDTH][TILE_WIDTH];
147     __shared__ complex smr[TILE_WIDTH][TILE_WIDTH];
148
152
153     gm1d.init(g1d_width, g1d_height, g1d_pitch, g1d_data, g1d_iscomplex);
154     gm2d.init(g2d_width, g2d_height, g2d_pitch, g2d_data, g2d_iscomplex);
155     gmrd.init(grd_width, grd_height, grd_pitch, grd_data, grd_iscomplex);
156
157     const int xoff = XOFF;
158     const int yoff = YOFF;
159     if(xoff >= gmrd.m_width && yoff >= gmrd.m_height) return;
161
162     for(int t=0; t < gm1d.m_width; t+=TILE_WIDTH)
163     {
164         // Carrega valores na memória compartilhada
165         if(threadIdx.x+t < gm1d.m_width && yoff < gm1d.m_height)
166         {
168         }
169         if(threadIdx.y+t < gm2d.m_height && xoff < gm2d.m_width)
170         {
172         }
173
174         int lim = min(TILE_WIDTH, gm1d.m_width - t);
176
177         // Executa multiplicação
178         if(xoff < gmrd.m_width && yoff < gmrd.m_height)
179         {
180             complex temp = make_complex(0.0, 0.0);
181             for(int i=0; i<lim; i++)
182             {
184             }
185
187         }
189     }
190
191     // copia resultado para memória global
192     if(xoff < gmrd.m_width && yoff < gmrd.m_height)
193     {
195     }
196 }
197
198 Image32F* multcompmat(Image32F* m1, Image32F* m2)
199 {
200     gdata gm1, gm2, gmr;
201     if(!gm1.init(m1) || !gm2.init(m2) || !gmr.init(gm1.m_height, gm2.m_width, TRUE)) return NULL;
202
203     gm1.dump();
204     gm2.dump();
205     gmr.dump();
206
207     pyprintf("(void*): %d int: %d long: %d, int4: %d\n", sizeof(void*), sizeof(int), sizeof(long), sizeof(int4));
208
209     // Executa Multiplicação
210     int by = (gmr.m_height + TILE_WIDTH -1)/TILE_WIDTH;
211     int bx = (gmr.m_width + TILE_WIDTH -1)/TILE_WIDTH;
212
213     pyprintf("bx= %d, by=%d\n", bx, by);
214     dim3 dimGrid(bx, by);
215     dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
216
217     /*
218     int     m_width;
219     int     m_height;
220     int     m_pitch;
221     float*  m_data;
222     BOOL    m_iscomplex;
223     */
224     KernelMultMatrix <<< dimGrid, dimBlock >>>(gm1.m_width, gm1.m_height, gm1.m_pitch, gm1.m_data, gm1.m_iscomplex,
225                                                gm2.m_width, gm2.m_height, gm2.m_pitch, gm2.m_data, gm2.m_iscomplex,
226                                                gmr.m_width, gmr.m_height, gmr.m_pitch, gmr.m_data, gmr.m_iscomplex );
227
228     return gmr.getImage32F();
229 }
```
```OK
```
```1 Image32F* multcompmat(Image32F* m1, Image32F* m2);
```
```OK

--------------------------------------------------------------------------------
Thu Oct 20 08:52:18 2011
--------------------------------------------------------------------------------
-- Configuring done
-- Generating done
-- Build files have been written to: <pkg_dir>/_build/_lf005835_10
[ 25%] Building NVCC (Device) object ./cuda_compile_generated__lf005835_10.cu.o
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
[ 50%] Swig source
Scanning dependencies of target __lf005835_10
[ 75%] Building CXX object CMakeFiles/__lf005835_10.dir/_lf005835_10PYTHON_wrap.cxx.o
[100%] Building CXX object CMakeFiles/__lf005835_10.dir<pkg_dir>/_build/simple_arrays.cpp.o
[100%] Built target __lf005835_10

```

# Utilizando o módulo

``` 1 import numpy as np
2 def ComplexToFloat(c):
3     sh = c.shape
4     nsh = (2,) + sh
5     f = np.empty(nsh, np.float32)
6     f[0] = np.real(c)
7     f[1] = np.imag(c)
8     return f
9
10 def FloatToComplex(f):
11     sh = f.shape
12     nsh = tuple(np.asarray(sh)[1:])
13     c = np.empty(nsh, np.complex64)
14     c.real = f[0]
15     c.imag = f[1]
16     return c
17
18 def MultComplex(c1, c2):
19     f1 = ComplexToFloat(c1)
20     print "c=\n", c1, "\nf=\n", f1, "\n"
21     f2 = ComplexToFloat(c2)
22     fr = multcompmat(f1, f2);
23     return FloatToComplex(fr)
24
25 # c1 = np.array([[1+1j, 1-1j], [1j,-1j]])
26 # c2 = np.array([[1, 1j], [-1j,0]])
27 c1 = np.eye(100).astype(complex)
28 c2 = np.eye(100).astype(complex)
29 c1[0, 0] = 2 + 1j
30 c2[0, 0] = 2 - 1j
31 print "c1=\n", c1
32 print "c2=\n", c2
33 print c1.dtype
34 p = MultComplex(c1, c2)
35 print "prod=\n", p
```
```c1=
[[ 2.+1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
...,
[ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
c2=
[[ 2.-1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
...,
[ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
complex128
c=
[[ 2.+1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
...,
[ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
f=
[[[ 2.  0.  0. ...,  0.  0.  0.]
[ 0.  1.  0. ...,  0.  0.  0.]
[ 0.  0.  1. ...,  0.  0.  0.]
...,
[ 0.  0.  0. ...,  1.  0.  0.]
[ 0.  0.  0. ...,  0.  1.  0.]
[ 0.  0.  0. ...,  0.  0.  1.]]

[[ 1.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
...,
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]]]

prod=
[[ 5.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
...,
[ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
[ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
```