lf005835_10 - Multiplicação de Matrizes Complexas

Autor: Luiz Fernando Stein Wetzel
Data: 23/10/2010
  1 // #include <string.h>
  2 // #include <math.h>
  3 #include <cuda.h>
  4 #include "simple_arrays.h"
  5 
  6 typedef int BOOL;
  7 #define TRUE  1
  8 #define FALSE 0
  9 
 10 
 11 // complex structure
 12 struct complex : public float2
 13 {
 14     __device__ inline complex(float2 f) : float2(f) {}
 15     __device__ inline complex(float re, float im) : float2(make_float2(re, im)) {}
 16     __device__ inline complex() {}                                // default constructor
 17     __device__ inline float& real() { return x; }
 18     __device__ inline float& imag() { return y; }
 19     __device__ inline complex operator += (complex c) { real()+= c.real(); imag()+=c.imag(); return *this; }
 20     __device__ inline complex operator * (complex& c) { return complex(real()*c.real() - imag()*c.imag(), real()*c.imag() + imag()*c.real()); }
 21 };
 22 __device__ inline complex make_complex(float re, float im) { return complex(make_float2(re, im)); }
 23 
 24 // Error functions
 25 inline BOOL print_cuda_error(cudaError_t err, int pos=0)
 26 {
 27     if(err != cudaSuccess)
 28     {
 29         pyprintf("CUDA Error: %d, position: %d %x\n", err, pos, pos);
 30         return TRUE;
 31     }
 32     return FALSE;
 33 }
 34 
 35 inline void print_error(int pos)
 36 {
 37     pyprintf("NON CUDA Error: position: %d %x\n", pos, pos);
 38 }
 39 
 40 #define CHECK_CUDA(err, pos, ret) { if(print_cuda_error(err, pos)) return ret; }
 41 #define ERROR(pos, ret) { print_error(pos); return ret; }
 42 
 43 struct gdata{
 44     int     m_width;
 45     int     m_height;
 46     int     m_pitch;
 47     float*  m_data;
 48     BOOL    m_iscomplex;
 49 
 50     inline gdata() { m_data = NULL; }
 51     inline BOOL init(Image32F* im)
 52     {
 53         if (im == NULL) ERROR(1, FALSE)
 54         m_iscomplex = (im->nd == 3 && im->dims[0] == 2);
 55         if (im->nd != 2 && !m_iscomplex ) ERROR(2, FALSE)
 56         m_height = im->dims[m_iscomplex ? 1 : 0];
 57         m_width  = im->dims[m_iscomplex ? 2 : 1];
 58         size_t pitch;
 59         cudaError_t err = cudaMallocPitch((void**)&m_data, &pitch, m_width*sizeof(float), m_iscomplex ? 2*m_height : m_height);
 60         m_pitch = (int)pitch;
 61         CHECK_CUDA(err, 3, FALSE)
 62         err = cudaMemcpy2D(m_data, m_pitch, im->raster, m_width<<2, m_width<<2, m_iscomplex ? (m_height<<1) : m_height, cudaMemcpyHostToDevice);
 63         CHECK_CUDA(err, 4, FALSE)
 64         return TRUE;
 65     }
 66 
 67     inline BOOL init(int height, int width, BOOL iscomplex) {
 68         m_iscomplex = iscomplex;
 69         m_height = height;
 70         m_width = width;
 71         size_t pitch;
 72         cudaError_t err = cudaMallocPitch((void**)&m_data, &pitch, m_width<<2, m_iscomplex ? 2*m_height : m_height);
 73         m_pitch = (int)pitch;
 74         // cudaMemset(m_data, m_pitch*(m_iscomplex ? 2*m_height : m_height), 0);
 75         CHECK_CUDA(err, 5, FALSE)
 76         return TRUE;
 77     }
 78 
 79     inline ~gdata()
 80     {
 81         if(m_data != NULL) cudaFree(m_data);
 82     }
 83 
 84     inline void dump()
 85     {
 86         pyprintf("width=%d, height=%d, pitch=%d, data=%p, iscomplex=%d\n", m_width, m_height, m_pitch, m_data, m_iscomplex);
 87     }
 88 
 89     inline Image32F* getImage32F()
 90     {
 91         int dims[3];
 92         if(m_iscomplex) { dims[0]=2; dims[1]=m_height; dims[2]=m_width; }
 93         else { dims[0]=m_height; dims[1]=m_width; }
 94         Image32F* ret = new Image32F(m_iscomplex ? 3 : 2, dims);
 95         cudaError_t err = cudaMemcpy2D(ret->raster, m_width<<2, m_data, m_pitch, m_width<<2, m_iscomplex ? (m_height<<1) : m_height, cudaMemcpyDeviceToHost);
 96         if(err != cudaSuccess)
 97         {
 98             print_cuda_error(err, (long)m_data);
 99             delete ret;
100             ret = NULL;
101         }
102         return ret;
103     }
104 };
105 
106 struct __align__(8) gdatad {
107     int     m_width;
108     int     m_height;
109     int     m_pitch;
110     float*  m_data;
111     BOOL    m_iscomplex;
112 
113     inline __device__ BOOL init(int width, int height, int pitch, float* data, BOOL iscomplex)
114     {
115         m_width = width;
116         m_height = height;
117         m_pitch = pitch;
118         m_data = data;
119         m_iscomplex = iscomplex;
120         return TRUE;
121     }
122 
123     inline __device__ complex cget(int row, int col)
124     {
125         int offreal = row*(m_pitch >> 2) + col;
126         return make_complex(m_data[offreal], (m_iscomplex ?  m_data[offreal + (m_pitch >> 2) * m_height] : 0.0));
127     }
128 
129     inline __device__ void cput(int row, int col, complex value)
130     {
131         int offreal = row*(m_pitch >> 2) + col;
132         m_data[offreal] = value.real();
133         if(m_iscomplex) m_data[offreal + (m_pitch >> 2) * m_height] = value.imag();
134     }
135 };
136 
137 #define TILE_WIDTH 16
138 #define XOFF (blockIdx.x*TILE_WIDTH  + threadIdx.x)
139 #define YOFF (blockIdx.y*TILE_WIDTH  + threadIdx.y)
140 
141 __global__ void KernelMultMatrix(int g1d_width, int g1d_height, int g1d_pitch, float* g1d_data, BOOL g1d_iscomplex,
142                                  int g2d_width, int g2d_height, int g2d_pitch, float* g2d_data, BOOL g2d_iscomplex,
143                                  int grd_width, int grd_height, int grd_pitch, float* grd_data, BOOL grd_iscomplex)
144 {
145     __shared__ complex sm1[TILE_WIDTH][TILE_WIDTH];
146     __shared__ complex sm2[TILE_WIDTH][TILE_WIDTH];
147     __shared__ complex smr[TILE_WIDTH][TILE_WIDTH];
148 
149     gdatad gm1d;
150     gdatad gm2d;
151     gdatad gmrd;
152 
153     gm1d.init(g1d_width, g1d_height, g1d_pitch, g1d_data, g1d_iscomplex);
154     gm2d.init(g2d_width, g2d_height, g2d_pitch, g2d_data, g2d_iscomplex);
155     gmrd.init(grd_width, grd_height, grd_pitch, grd_data, grd_iscomplex);
156 
157     const int xoff = XOFF;
158     const int yoff = YOFF;
159     if(xoff >= gmrd.m_width && yoff >= gmrd.m_height) return;
160     smr[threadIdx.y][threadIdx.x] = make_complex(0.0, 0.0);
161 
162     for(int t=0; t < gm1d.m_width; t+=TILE_WIDTH)
163     {
164         // Carrega valores na memória compartilhada
165         if(threadIdx.x+t < gm1d.m_width && yoff < gm1d.m_height)
166         {
167             sm1[threadIdx.y][threadIdx.x] = gm1d.cget(yoff, threadIdx.x + t);
168         }
169         if(threadIdx.y+t < gm2d.m_height && xoff < gm2d.m_width)
170         {
171             sm2[threadIdx.y][threadIdx.x] = gm2d.cget(threadIdx.y + t, xoff);
172         }
173 
174         int lim = min(TILE_WIDTH, gm1d.m_width - t);
175         __syncthreads();
176 
177         // Executa multiplicação
178         if(xoff < gmrd.m_width && yoff < gmrd.m_height)
179         {
180             complex temp = make_complex(0.0, 0.0);
181             for(int i=0; i<lim; i++)
182             {
183                 temp += (sm1[threadIdx.y][i] * sm2[i][threadIdx.x]);
184             }
185 
186             smr[threadIdx.y][threadIdx.x] += temp;
187         }
188         __syncthreads();
189     }
190 
191     // copia resultado para memória global
192     if(xoff < gmrd.m_width && yoff < gmrd.m_height)
193     {
194         gmrd.cput(yoff, xoff, smr[threadIdx.y][threadIdx.x]);
195     }
196 }
197 
198 Image32F* multcompmat(Image32F* m1, Image32F* m2)
199 {
200     gdata gm1, gm2, gmr;
201     if(!gm1.init(m1) || !gm2.init(m2) || !gmr.init(gm1.m_height, gm2.m_width, TRUE)) return NULL;
202 
203     gm1.dump();
204     gm2.dump();
205     gmr.dump();
206 
207     pyprintf("(void*): %d int: %d long: %d, int4: %d\n", sizeof(void*), sizeof(int), sizeof(long), sizeof(int4));
208 
209     // Executa Multiplicação
210     int by = (gmr.m_height + TILE_WIDTH -1)/TILE_WIDTH;
211     int bx = (gmr.m_width + TILE_WIDTH -1)/TILE_WIDTH;
212 
213     pyprintf("bx= %d, by=%d\n", bx, by);
214     dim3 dimGrid(bx, by);
215     dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
216 
217     /*
218     int     m_width;
219     int     m_height;
220     int     m_pitch;
221     float*  m_data;
222     BOOL    m_iscomplex;
223     */
224     KernelMultMatrix <<< dimGrid, dimBlock >>>(gm1.m_width, gm1.m_height, gm1.m_pitch, gm1.m_data, gm1.m_iscomplex,
225                                                gm2.m_width, gm2.m_height, gm2.m_pitch, gm2.m_data, gm2.m_iscomplex,
226                                                gmr.m_width, gmr.m_height, gmr.m_pitch, gmr.m_data, gmr.m_iscomplex );
227 
228     return gmr.getImage32F();
229 }
OK
1 Image32F* multcompmat(Image32F* m1, Image32F* m2);
OK

--------------------------------------------------------------------------------
    Thu Oct 20 08:52:18 2011
--------------------------------------------------------------------------------
-- Configuring done
-- Generating done
-- Build files have been written to: <pkg_dir>/_build/_lf005835_10
[ 25%] Building NVCC (Device) object ./cuda_compile_generated__lf005835_10.cu.o
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1031:1: warning: "_POSIX_C_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:158:1: warning: this is the location of the previous definition
In file included from /usr/include/python2.6/Python.h:8,
                 from <pkg_dir>/_build/_lf005835_10/../simple_arrays.h:1,
                 from <pkg_dir>/_build/_lf005835_10/_lf005835_10.cu:4:
/usr/include/python2.6/pyconfig.h:1040:1: warning: "_XOPEN_SOURCE" redefined
In file included from /usr/local/cuda/bin/../include/host_config.h:62,
                 from /usr/local/cuda/bin/../include/cuda_runtime.h:45,
                 from <command-line>:0:
/usr/include/features.h:160:1: warning: this is the location of the previous definition
[ 50%] Swig source
Scanning dependencies of target __lf005835_10
[ 75%] Building CXX object CMakeFiles/__lf005835_10.dir/_lf005835_10PYTHON_wrap.cxx.o
[100%] Building CXX object CMakeFiles/__lf005835_10.dir<pkg_dir>/_build/simple_arrays.cpp.o
Linking CXX shared module __lf005835_10.so
[100%] Built target __lf005835_10

Utilizando o módulo

 1 import numpy as np
 2 def ComplexToFloat(c):
 3     sh = c.shape
 4     nsh = (2,) + sh
 5     f = np.empty(nsh, np.float32)
 6     f[0] = np.real(c)
 7     f[1] = np.imag(c)
 8     return f
 9 
10 def FloatToComplex(f):
11     sh = f.shape
12     nsh = tuple(np.asarray(sh)[1:])
13     c = np.empty(nsh, np.complex64)
14     c.real = f[0]
15     c.imag = f[1]
16     return c
17 
18 def MultComplex(c1, c2):
19     f1 = ComplexToFloat(c1)
20     print "c=\n", c1, "\nf=\n", f1, "\n"
21     f2 = ComplexToFloat(c2)
22     fr = multcompmat(f1, f2);
23     return FloatToComplex(fr)
24 
25 # c1 = np.array([[1+1j, 1-1j], [1j,-1j]])
26 # c2 = np.array([[1, 1j], [-1j,0]])
27 c1 = np.eye(100).astype(complex)
28 c2 = np.eye(100).astype(complex)
29 c1[0, 0] = 2 + 1j
30 c2[0, 0] = 2 - 1j
31 print "c1=\n", c1
32 print "c2=\n", c2
33 print c1.dtype
34 p = MultComplex(c1, c2)
35 print "prod=\n", p
c1=
[[ 2.+1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 ..., 
 [ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
c2=
[[ 2.-1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 ..., 
 [ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]
complex128
c=
[[ 2.+1.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 ..., 
 [ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]] 
f=
[[[ 2.  0.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  1. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  1.  0.  0.]
  [ 0.  0.  0. ...,  0.  1.  0.]
  [ 0.  0.  0. ...,  0.  0.  1.]]

 [[ 1.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]] 

prod=
[[ 5.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  1.+0.j ...,  0.+0.j  0.+0.j  0.+0.j]
 ..., 
 [ 0.+0.j  0.+0.j  0.+0.j ...,  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...,  0.+0.j  0.+0.j  1.+0.j]]