基于CUDA的VTI介质有限差分正演模拟与逆时偏移及ADCIGs提取

简单明了“CUDA”、“C语言”、“nvcc”、“VTI介质”、“RTM”、“全孔径接收”、“照明”、“拉普拉斯滤波”、“角度域共成像点道集”、“Poynting矢量”;

在此做个备份!

直接上代码吧!

/*******************************************************
a*         2D Quasi Acoustic VTI Medium  FD & RTM   
b*  P + sv wave and get rid of sv        
c*  GPU(CUDA) ,poynting adcigs, read shot
d*
e*******************************************************
f*
g* Ps:  the Quasi Acoustic VTI function:
h*      
i*          du/dt=1/rho*dp/dx , 
j*          dw/dt=1/rho*dq/dz ,  
k*          dp/dt=rho*vpx^2*du/dx+rho*vp*vpn*dw/dz ,
l*          dq/dt=rho*vp*vpn*du/dx+rho*vp^2*dw/dz ,
m*                     vpx^2=vp^2*(1+2*epsilon);
n*                     vpn^2=vp^2*(1+2*delta);
o*
p*******************************************************
q*                           initial: 2017.2.15 Rong Tao 
r*                            adcigs: 2017.4.13 Rong Tao 
s*                            modify: 2018.1.29 Rong Tao 
t*                                  
u*                                    
v*
w*
x*
y*******************************************************
z*/

#include <stdio.h>
#include <malloc.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>

#ifdef pi
#pragma message("Already define pi !!!")
#else
#define pi 3.141592653
#endif

#define mm 4

#define Nbar 25

#define CHECK_gpu(call)  {                                          \
    const cudaError_t error = call;                             \
    if (error != cudaSuccess)  {                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);  \
        fprintf(stderr, "code: %d, reason: %s\n", error,        \
                cudaGetErrorString(error));                     \
        exit(1);                                                \
    }                                            \
}

const char *note[] = {
"\n       2D Quasi Acoustic VTI Medium  FD & RTM (CUDA, ADCIGs)         ",
"                             Author: Rong Tao @UPC                    ",
"                                                                               ",
" Quasi Acoustic Function as follows:                                  ",
"    du/dt = dp/dx                                                       ",
"    dw/dt = dq/dz                                                       ",
"    dp/dt =  vpx^2 * du/dx + vp*vpn * dw/dz                             ",
"    dq/dt = vp*vpn * du/dx +  vp^2  * dw/dz                             ",
"    vpx^2 = vp^2 * (1+2*epsilon)                                        ",
"    vpn^2 = vp^2 * (1+2*delta)                                          ",
"                                                                               ",
" Required Parameters:                                                    ",
"    Kind         =1 FD                                                    ",
"                 =2 RTM                                                   ",
"    For example:                                                          ",
"    ./a.out  1     Finite difference forward modeling[FD]                   ",
"    ./a.out  2     Reverse Time Migration[RTM]                              ",
"                                                                               ",
" Inner Parameters:                                                      ",
"    nx, dx       Horizontal Space sampling point and interval             ",
"    nz, dz       Vertical Space sampling point and interval               ",
"    nt, dt       Time sampling point and interval                         ",
"    favg         Wavelet frequency                                        ",
"    pfac         Wavelet Gain                                             ",
"    ns           The number of shots                                      ",
"    fs           First shot position[grid]                                ",
"    ds           Shots interval[grid]                                     ",
"    zs           Shots vertical position[grid]                            ",
"    nangle       The number of ADCIGs's angle                             ",
"    dangle       The interval of ADCIGs's angle                           ",
"    dAdcigs      Output file, the interval cdp(nx)                        ",
"    npml         PML Border width[grid]                                   ",
"                                                                               ",
" Optional Parameters:                                                            ",
"    wtype        kind of wavelet    =1 ricker wavelet                     ",
"                                    =2 derivative of gaussian             ",
"                                    =3 derivative of gaussian             ",
"    readShot     =true,             boolean, read obs shot                ",
"                 =false,            boolean, use accurate shot data       ",
"    writeSnap    =true,false        output snap into file or not          ",
"    ",
"    ",
NULL
};


__device__ float d0;

__global__ void get_d0(float dx,
                       float dz,
                       int nnx,
                       int nnz,
                       int npml,
                       float *vp)
/* this (d0) function for pml bndr */
{
    d0 = 10.0*vp[nnx*nnz/2]*log(100000.0)/(2.0*npml*((dx+dz)/2.0));
}
/*#define mm 4*/
__constant__ float c[mm]={1.196289,-0.0797526,0.009570313,-0.0006975447};

void mBar(float fBar)
/* show progress bar */
{
 
    int i,j,k,m;
    //for ( i=0;i< Nbar+6; i++ ) 
    //    printf("\b");
    k = Nbar*fBar; 
    m = fBar*100; 
    printf("[");
    for ( i=0;i<k;i++ ) 
        printf("=");
    for ( j=0;j<Nbar-k;j++ ) 
        printf(" ");
    printf("]%3d%%",m); 
}

void check_gpu_error (const char *msg) 
/* check GPU errors */
{
    cudaError_t err = cudaGetLastError ();
    if (cudaSuccess != err) { 
        printf("Cuda error: %s: %s\n", msg, cudaGetErrorString (err)); 
        exit(0);   
    }
}

void laplace_filter(int adj, 
                    int nz, 
                    int nx, 
                    float *in, 
                    float *out)
/**
 * linear operator
 *
 * Copyright@ Madagascar Mlaplac2
 */
{
    int iz,ix,j;
    for (j=0;j<nx*nz;j++) 
        out[j]=0.0;

    for (ix=0; ix < nx; ix++) {
        for (iz=0; iz < nz; iz++) {
            j = iz+ix*nz;
            if (iz > 0) {
                if (adj) {
                    out[j-1] -= in[j];
                    out[j]   += in[j];
                } else {
                    out[j] += in[j] - in[j-1];
                }
            }
            if (iz < nz-1) {
                if (adj) {
                    out[j+1] -= in[j];
                    out[j]   += in[j];
                } else {
                    out[j] += in[j] - in[j+1];
                }
            }
            if (ix > 0) {
                if (adj) {
                    out[j-nz] -= in[j];
                    out[j]    += in[j];
                } else {
                    out[j] += in[j] - in[j-nz];
                }
            }
            if (ix < nx-1) {
                if (adj) {
                    out[j+nz] -= in[j];
                    out[j]    += in[j];
                } else {
                    out[j] += in[j] - in[j+nz];
                }
            }
        }
    }
}

__global__ void add_source( float pfac, 
                            float xsn,  
                            float zsn, 
                            int nx,  
                            int nz,  
                            int nnx,  
                            int nnz,
                            float dt,  
                            float t,  
                            float favg,
                            int wtype, 
                            int npml,
                            int is, 
                            int ds,
                            float *P, 
                            float *Q)
/* generate ricker wavelet with time deley */
{
    int ixs,izs;
    float x_,xx_,tdelay,ts,source=0.0,fs; 
  
    tdelay = 1.0/favg;
    ts = t-tdelay;
    fs = xsn+(is-1)*ds;

    if(wtype==1)//ricker wavelet
    {
        x_ = favg*ts;
        xx_ = x_*x_;
        source=(1-2*pi*pi*(xx_))*exp(-(pi*pi*xx_));

    }else if(wtype==2){//derivative of gaussian

        x_ = (-4)*favg*favg*pi*pi/log(0.1);
        source = (-2)*pi*pi*ts*exp(-x_*ts*ts);

    }else if(wtype==3){//derivative of gaussian

        x_ = (-1)*favg*favg*pi*pi/log(0.1);
        source = exp(-x_*ts*ts);
    }

    if(t <= 2*tdelay)
    {         
        ixs = (int)( fs + 0.5) + npml - 1;
        izs = (int)(zsn + 0.5) + npml - 1;

        P[ixs*nnz+izs] += pfac * source;
        Q[ixs*nnz+izs] += pfac * source;
    }
}

__global__ void update_vel(int nx,  
                           int nz, 
                           int nnx,   
                           int nnz, 
                           int npml,
                           float dt,
                           float dx,    
                           float dz,
                           float *u0,   
                           float *w0,
                           float *u1,   
                           float *w1,
                           float *P,    
                           float *Q,
                           float *coffx1,  
                           float *coffx2,
                           float *coffz1,  
                           float *coffz2)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    int ix,iz,im;
    float dtx,dtz,xx,zz;

    ix = id/nnz;
    iz = id%nnz;

    dtx = dt/dx;
    dtz = dt/dz;

    if(id >= mm && id < nnx*nnz - mm) {

        if(ix >= mm && ix<(nnx-mm) && iz >= mm && iz<(nnz-mm)) {

            xx = 0.0;
            zz = 0.0;
            for(im = 0;im<mm;im++) {

                xx += c[im] * (P[id+(im+1)*nnz]  -  P[id-im*nnz]);
                zz += c[im] * (Q[id+im+1]        -  Q[id-im]);
            }
            u1[id] = coffx2[ix]*u0[id] - coffx1[ix]*dtx*xx;
            w1[id] = coffz2[iz]*w0[id] - coffz1[iz]*dtz*zz;
        }
    }
}

__global__ void update_stress(int nx,   
                              int nz,    
                              int nnx,    
                              int nnz,
                              float dt,    
                              float dx,    
                              float dz,
                              float *u1,    
                              float *w1,
                              float *P,    
                              float *Q,    
                              float *vp,    
                              int npml,
                              float *px1,    
                              float *px0,
                              float *pz1,    
                              float *pz0,
                              float *qx1,    
                              float *qx0,
                              float *qz1,    
                              float *qz0,
                              float *acoffx1,    
                              float *acoffx2,
                              float *acoffz1,    
                              float *acoffz2,
                              float *delta,    
                              float *epsilon,
                              int fs,    
                              int ds,    
                              int zs,    
                              int is,
                              bool SV)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    int im, ix, iz, rx, rz;
    float dtx, dtz, xx, zz, ee, dd;

    /* iso circle */
    int R=18,r=7;

    ix = id / nnz;
    iz = id % nnz;

    dtx = dt / dx;
    dtz = dt / dz;

    if(id >= mm && id<nnx*nnz-mm) {

        /* iso circle begin */
        rx = ix-(fs+(is-1)*ds+npml);
        rz = iz-(zs+npml);

        if(SV){

            if((rx*rx+rz*rz) <= R*R){
                if((rx*rx+rz*rz) <= r*r){

                    ee = 0.0;
                    dd = 0.0;

                }else{

                    ee = 0.5*(1-cos(pi*((sqrtf(rx*rx+rz*rz)-r)*4.0/(R*3.0-1))))*epsilon[id];
                    dd = 0.5*(1-cos(pi*((sqrtf(rx*rx+rz*rz)-r)*4.0/(R*3.0-1))))*delta[id]; 

                }//else

            }else{

                    ee = epsilon[id];
                    dd = delta[id];
            }

        }else{

            ee = epsilon[id];
            dd = delta[id];

        }
        /* iso circle end */

        if(ix>=mm && ix<(nnx-mm) && iz>=mm && iz<(nnz-mm)) {

            xx=0.0;
            zz=0.0;

            for(im=0; im<mm; im++) {

                xx += c[im]*(u1[id+im*nnz] - u1[id-(im+1)*nnz]);
                zz += c[im]*(w1[id+im]     - w1[id-im-1]);
            }
            px1[id] = acoffx2[ix]*px0[id] - acoffx1[ix]*vp[id]*vp[id]*(1+2*ee)*dtx*xx;
            pz1[id] = acoffz2[iz]*pz0[id] - acoffz1[iz]*vp[id]*vp[id]*sqrtf(1+2*dd)*dtz*zz;
            qx1[id] = acoffx2[ix]*qx0[id] - acoffx1[ix]*vp[id]*vp[id]*sqrtf(1+2*dd)*dtx*xx;
            qz1[id] = acoffz2[iz]*qz0[id] - acoffz1[iz]*vp[id]*vp[id]*dtz*zz;

            P[id] = px1[id] + pz1[id];
            Q[id] = qx1[id] + qz1[id];
        }
    }
}   

void pad_vv(int nx,
            int nz,
            int nnx,
            int nnz,
            int npml,
            float *ee)
/**
 * Expand the border
 */
{
    int ix,iz,id;
 
    for(id=0; id<nnx*nnz; id++) {

        ix = id/nnz;
        iz = id%nnz;
        
        /* left */
        if(ix<npml){

            ee[id] = ee[npml*nnz+iz]; 

        /* right */
        }else if(ix>=nnx-npml){

            ee[id] = ee[(nnx-npml-1)*nnz+iz];
        }
    }
    for(id=0; id<nnx*nnz; id++) {

        ix = id/nnz;
        iz = id%nnz;

        /* up */
        if(iz < npml){

            ee[id] = ee[ix*nnz+npml];

        /* bottom */
        }else if(iz >= nnz-npml){

            ee[id] = ee[ix*nnz+nnz-npml-1];
        }
    }
}

__global__ void initial_coffe(float dt,
                              int nn,
                              float *coff1,
                              float *coff2,
                              float *acoff1,
                              float *acoff2,
                              int npml)
/**
 * Calculate the PML coefficient
 *
 */
{		
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    if(id < nn+2*npml) {

        /* The front of the inner */
        if(id<npml) {   
    
            coff1[id] = 1.0/(1.0+(dt*d0*pow((npml-0.5-id)/npml,2.0))/2.0);
            coff2[id] = coff1[id]*(1.0-(dt*d0*pow((npml-0.5-id)/npml,2.0))/2.0);

            acoff1[id] = 1.0/(1.0+(dt*d0*pow(((npml-id)*1.0)/npml,2.0))/2.0);
            acoff2[id] = acoff1[id]*(1.0-(dt*d0*pow(((npml-id)*1.0)/npml,2.0))/2.0);

        /* media inner */
        }else if(id>=npml&&id<npml+nn){

            coff1[id] = 1.0;
            coff2[id] = 1.0;

            acoff1[id] = 1.0;
            acoff2[id] = 1.0;

        /* The tail of the inner */
        }else{

            coff1[id] = 1.0/(1.0+(dt*d0*pow((0.5+id-nn-npml)/npml,2.0))/2.0);
            coff2[id] = coff1[id]*(1.0-(dt*d0*pow((0.5+id-nn-npml)/npml,2.0))/2.0);

            acoff1[id] = 1.0/(1.0+(dt*d0*pow(((id-nn-npml)*1.0)/npml,2.0))/2.0);
            acoff2[id] = acoff1[id]*(1.0-(dt*d0*pow(((id-nn-npml)*1.0)/npml,2.0))/2.0);
        }	
    }       
}

__global__ void shot_record(int nnx, 
                            int nnz, 
                            int nx, 
                            int nz, 
                            int npml, 
                            int it,    
                            int nt, 
                            float *P, 
                            float *shot, 
                            bool record)
/**
 * Record or load Receiver wavefield
 *      (nx) >> (nx,nt)
 *           or
 *   (nx,nt) >> (nx)
 */
{		
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    if(id<nx) {

        /* record the wavefield */
        if(record){

            shot[it+nt*id] = P[npml+nnz*(id+npml)];

        /* load the receiver wavefield */
        }else{

            P[npml+nnz*(id+npml)] = shot[it+nt*id];
        }
    }       
}

__global__ void wavefield_bndr(int nnx,    
                               int nnz, 
                               int nx,    
                               int nz, 
                               int npml, 
                               int it, 
                               int nt, 
                               float *P, 
                               float *Q, 
                               float *P_bndr, 
                               float *Q_bndr, 
                               bool record)
/**
 * Record or backword the boundary wave field
 *
 */
{		
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    if(id<2*nx+2*nz) {

        /* save boundary */
        if(record) {

            /* up */
            if(id<nx){

                P_bndr[it*(2*nx+2*nz)+id] = P[npml-1+nnz*(id+npml)];
                Q_bndr[it*(2*nx+2*nz)+id] = Q[npml-1+nnz*(id+npml)];

            /* bottom */
            }else if(id>=nx&&id<(2*nx)){
   
                P_bndr[it*(2*nx+2*nz)+id] = P[npml+nz+1+nnz*(id-nx+npml)];
                Q_bndr[it*(2*nx+2*nz)+id] = Q[npml+nz+1+nnz*(id-nx+npml)];

            /* left */
            }else if(id>=(2*nx)&&id<(2*nx+nz)){

                P_bndr[it*(2*nx+2*nz)+id] = P[id-2*nx+npml+nnz*(npml-1)];
                Q_bndr[it*(2*nx+2*nz)+id] = Q[id-2*nx+npml+nnz*(npml-1)];

            /* right */
            }else if(id>=(2*nx+nz)){

                P_bndr[it*(2*nx+2*nz)+id] = P[id-2*nx-nz+npml+nnz*(npml+nx+1)];
                Q_bndr[it*(2*nx+2*nz)+id] = Q[id-2*nx-nz+npml+nnz*(npml+nx+1)];

            }

        /* backward porpagation boundary */
        }else{

            /* up */
            if(id<nx){

                P[npml-1+nnz*(id+npml)] = P_bndr[it*(2*nx+2*nz)+id];
                Q[npml-1+nnz*(id+npml)] = Q_bndr[it*(2*nx+2*nz)+id];

            /* bottom */
            }else if(id>=nx&&id<(2*nx)){
   
                P[npml+nz+1+nnz*(id-nx+npml)] = P_bndr[it*(2*nx+2*nz)+id];
                Q[npml+nz+1+nnz*(id-nx+npml)] = Q_bndr[it*(2*nx+2*nz)+id];

            /* left */
            }else if(id>=(2*nx)&&id<(2*nx+nz)){

                P[id-2*nx+npml+nnz*(npml-1)] = P_bndr[it*(2*nx+2*nz)+id];
                Q[id-2*nx+npml+nnz*(npml-1)] = Q_bndr[it*(2*nx+2*nz)+id];

            /* right */
            }else if(id>=(2*nx+nz)){

                P[id-2*nx-nz+npml+nnz*(npml+nx+1)] = P_bndr[it*(2*nx+2*nz)+id];
                Q[id-2*nx-nz+npml+nnz*(npml+nx+1)] = Q_bndr[it*(2*nx+2*nz)+id];

            }
        }
    }       
}

__global__ void mute_directwave(int nx,
                                int nt,
                                float dt,
                                float favg,
                                float dx,
                                float dz,
                                int fs,   
                                int ds,   
                                int zs,   
                                int is,
                                float *vp,
                                float *epsilon,
                                float *shot,
                                int tt)
/**
 *mute direct waves
 */
{
    int it = threadIdx.x+blockDim.x*blockIdx.x;

    int mu_t, mu_nt;
    float mu_x, mu_z, mu_t0;

    int ix, id;

    for(ix = 0; ix < nx; ix ++){

        id = ix*nt + it;

        mu_x = dx*abs(ix-fs-(is-1)*ds);
        mu_z = dz*zs;
        mu_t0 = sqrtf(pow(mu_x,2)+pow(mu_z,2))/(vp[1]*sqrtf(1+2*epsilon[1]));
        mu_t = (int)(2.0/(dt*favg));
        mu_nt = (int)(mu_t0/dt)+mu_t+tt;

        if((it > (int)(mu_t0/dt)-tt) && (it<mu_nt))
            shot[id] = 0.0;
    }
}

__global__ void cal_illumination(int nnx,
                                 int nnz, 
                                 int nz, 
                                 int npml, 
                                 float *illumination, 
                                 float *P, 
                                 float *Q)
/**
 * illumination matrix
 */
{
    int id = threadIdx.x+blockDim.x*blockIdx.x;
    int ix = id/nz;
    int iz = id%nz;

    if(id < nnx*nnz) {

        illumination[id] += P[iz+npml+nnz*(ix+npml)] * P[iz+npml+nnz*(ix+npml)]
                           +Q[iz+npml+nnz*(ix+npml)] * Q[iz+npml+nnz*(ix+npml)];

        if(illumination[id] <= 0.0 )
            illumination[id] = 1.0;
    }
}

__global__ void cal_migration(int nnx, 
                              int nnz, 
                              int nz, 
                              int npml, 
                              float *migration, 
                              float *s, 
                              float *g)
/**
 * RTM migration
 */
{
    int id = threadIdx.x+blockDim.x*blockIdx.x;
    int ix = id/nz;
    int iz = id%nz;

    if(id<nnx*nnz) {

        migration[id] += s[iz+npml+nnz*(ix+npml)] * g[iz+npml+nnz*(ix+npml)];
    }
}

__global__ void migration_illum(int nx, 
                                int nz, 
                                int npml, 
                                float *migration, 
                                float *illumination)
/**
 *  illuminate
 */
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    if(id<nx*nz) {

        migration[id] /= illumination[id];
    }
}


__global__ void Poynting_Adcigs(int nnz, 
                                int nx, 
                                int nz, 
                                int npml, 
                                int nangle, 
                                int dangle, 
                                float *adcigs, 
                                float *s_P, 
                                float *s_Q, 
                                float *s_u, 
                                float *s_w, 
                                float *g_P, 
                                float *g_Q, 
                                float *g_u, 
                                float *g_w)
/**
 *  poynting vector extraction ADCIGs
 */
{
    int id = threadIdx.x+blockDim.x*blockIdx.x;
    int ix = id/nz;
    int iz = id%nz;

    int ia = 0;

    float Ssx = -s_P[iz+npml+nnz*(ix+npml)]*s_u[iz+npml+nnz*(ix+npml)];
    float Ssz = -s_Q[iz+npml+nnz*(ix+npml)]*s_w[iz+npml+nnz*(ix+npml)];
    float Sgx =  g_P[iz+npml+nnz*(ix+npml)]*g_u[iz+npml+nnz*(ix+npml)];
    float Sgz =  g_Q[iz+npml+nnz*(ix+npml)]*g_w[iz+npml+nnz*(ix+npml)];

    float b1 =  Ssx*Ssx + Ssz*Ssz;
    float b2 =  Sgx*Sgx + Sgz*Sgz;
    float  a = (Ssx*Sgx + Ssz*Sgz)/(sqrtf(b1*b2)*(1 - 0.1));

    if(id<nx*nz) {

        if(a>=-1&&a<=1) {

          a = 0.5*acosf(a)*180.0/pi;
         ia = (int)(a/(dangle*1.0));
         
            if(ia<nangle) {
                adcigs[iz+nz*ia+nz*nangle*(id/nz)] 
                    += s_P[iz+npml+nnz*(ix+npml)]*g_P[iz+npml+nnz*(ix+npml)]
                      *cosf(ia*pi/180.0)*cosf(ia*pi/180.0)*cosf(ia*pi/180.0);
            }
        }
    }
}

__global__ void adcigs_illum(int nx, 
                             int nz,  
                             int nangle,  
                             int dangle,  
                             float *adcigs,  
                             float *illumination)
/**
 *  illuminate the adcigs
 */
{
    int id = threadIdx.x+blockDim.x*blockIdx.x;
    int ix = id/(nz*nangle);
    int iz = id%nz;

    if(id<nx*nz*nangle) {

        adcigs[id] /= illumination[iz+nz*ix];
    }
}

void stk_adcigs(int nx,
                int nz,
                int nangle,
                float *adcigs,
                float *migration)
/**
 * Stack adcigs to migration
 * Can suppress low-frequency random noise
 */
{
    int ix,iz,ia,id,ido;
    float stk;
    float *temp;

    temp=(float*)malloc(nz*nx*sizeof(float));

    for (ix=0; ix<nx; ix++)  {
        for (iz=0; iz<nz; iz++)  {
            stk=0.0;
            for (ia=0; ia<nangle; ia++)  {
                id = ix*nangle*nz+ia*nz+iz;
                stk += adcigs[id];
            }
            ido = ix*nz+iz;
            temp[ido] = stk;
        }
    }
    laplace_filter(1,nz,nx,temp,migration);
}

void adcigs_smiled(int nx,
                   int nz,
                   int nangle,
                   int dAdcigs,
                   float *adcigs) 
/**
 * Draw thin adcigs
 */
{
    int ix,iz,ia,id,ido;
    float *temp;

    temp = (float*)malloc(nz*nx/dAdcigs*nangle*sizeof(float));

    for (ix=0; ix<nx; ix++)  {
        for (ia=0; ia<nangle; ia++)  {
            for (iz=0; iz<nz; iz++)  {

                id=ix*nangle*nz+ia*nz+iz;

                if(ix%dAdcigs==0) {

                    ido = ix/dAdcigs*nangle*nz+ia*nz+iz;
                    temp[ido] = adcigs[id];
                    adcigs[ido] = temp[ido];
                }
            }
        }
    }
}

void readFile( char FNvelocity[],   
               char FNepsilon[],  
               char FNdelta[],
               int nx,  
               int nz,
               int nnx,  
               int nnz,
               float dx,  
               float dz,
               float favg,  
               float dt,
               float *v,  
               float *e,  
               float *d,
               int npml)
{
    int i,j,id;
    float vmax, vmin;
    float emax, emin;
    float dmax, dmin;
    float H_min, dt_max, dxz_max, C, tmp;

    FILE *fp1,*fp2,*fp3;

    if((fp1=fopen(FNvelocity,"rb"))==NULL){

        printf("error open <%s>!\n",FNvelocity);
        exit(0);
    }
    if((fp2=fopen(FNepsilon,"rb"))==NULL){

        printf("error open <%s>!\n",FNepsilon);
        exit(0);
    }
    if((fp3=fopen(FNdelta,"rb"))==NULL){

        printf("error open <%s>!\n",FNdelta);
        exit(0);
    }

    vmin = emin = dmin =  999999.9;
    vmax = emax = dmax = -999999.9;

    for(i=npml;i<nx+npml;i++) {
        for(j=npml;j<nz+npml;j++) {

            id=i*nnz+j;
                                  /* inch time 0.3 */
            fread(&v[id],4L,1,fp1);//v[id] *= 0.3;
            fread(&e[id],4L,1,fp2);
            fread(&d[id],4L,1,fp3);

            /* For Parameters Sensitivity Analysis */
            if(false) // true: active
            if(v[id]>3800){

                v[id] *= 0.85;
                e[id] *= 0.85;
                d[id] *= 0.85;
            }

            if(vmax<v[id]) vmax = v[id];
            if(vmin>v[id]) vmin = v[id];
            if(emax<e[id]) emax = e[id];
            if(emin>e[id]) emin = e[id];
            if(dmax<d[id]) dmax = d[id];
            if(dmin>d[id]) dmin = d[id];
        }
    }
    fclose(fp1);
    fclose(fp2);
    fclose(fp3);

    printf("------------------------------------\n---\n");
    printf("---   Velocity Range (%.1f - %.1f)\n",vmin,vmax);
    printf("---    Epsilon Range (%.4f - %.4f)\n",emin,emax);
    printf("---      Delta Range (%.4f - %.4f)\n---\n",dmin,dmax);
        
    /* boundary */
    pad_vv(nx,nz,nnx,nnz,npml,e);
    pad_vv(nx,nz,nnx,nnz,npml,d);
    pad_vv(nx,nz,nnx,nnz,npml,v); 
      
    H_min=dx<dz?dx:dz;
    dt_max = 0.5*H_min/vmin;
    dxz_max = vmax/favg*0.2;

    if ( dxz_max<dz || dxz_max<dx){

        printf("---   You need have to redefine DX and DZ ! \n");
        exit(0);
    }
    if ( dt_max<dt){
        printf("---   You need have to redefine DT ! \n");
        exit(0);
    }
    if ( favg >= vmin/( 5.0*(dx>dz?dx:dz) ) 
      || favg >= vmin/( 5.0*(dx>dz?dx:dz) ) ) {
        printf("---   Non-dispersion relation not satisfied! \n");
        exit(0);
    } 
    /* following 
     * Copyright@ Madagascar */
    else if ( mm == 2 )     
        C = 0.857;
    else if ( mm == 3 )     
        C = 0.8;
    else if ( mm == 4 )     
        C = 0.777;
    else if ( mm == 5 )     
        C = 0.759;

    tmp = dt*vmax*sqrtf( 1.0/(dx*dx)+1.0/(dz*dz) );
    if ( tmp >= C){ 
        printf("---   Stability condition not satisfied! tmp = %f, C = %f\n",tmp,C);
        exit(0);
    }
}


void FD( char FNvelocity[],
         char FNepsilon[],
         char FNdelta[], 
         char FNCalShot[],
         char FNSnap[],
         char FNIllumination[],
         int wtype,
         int npml,
         int nx, 
         int nz,
         float dx, 
         float dz,
         int nt, 
         float dt, 
         int ns, 
         int fs, 
         int ds, 
         int zs,
         float favg, 
         float pfac,
         bool writeSnap)
/**
 * FD
 *
 */
{
    float *v, *e, *d;
    float *vp, *epsilon, *delta;

    float *s_u0, *s_u1, *s_px0, *s_qx0, *s_px1, *s_qx1;
    float *s_w0, *s_w1, *s_pz0, *s_qz0, *s_pz1, *s_qz1;

    float *s_P, *s_Q;

    float *coffx1,*coffx2,*coffz1,*coffz2;
    float *acoffx1,*acoffx2,*acoffz1,*acoffz2;

    float *shot_Dev, *shot_Hos;
    float *illumination;

    int nnx, nnz;
    int it, is;

    float t;

    /* FILE pointer */
    FILE *fpCalShot = fopen(FNCalShot,"wb");
    FILE *fpSnap;              
    if(writeSnap) {

        fpSnap = fopen(FNSnap,"wb");
    }
    FILE *fpIllunmination = fopen(FNIllumination,"wb");

    /* whole media size */
    nnx = nx + 2*npml;
    nnz = nz + 2*npml;

    /* read the media file into memory */
    v = (float*)malloc(nnz*nnx*sizeof(float));
    e = (float*)malloc(nnz*nnx*sizeof(float));
    d = (float*)malloc(nnz*nnx*sizeof(float));
    readFile(FNvelocity,FNepsilon,FNdelta,
             nx,nz,nnx,nnz,dx,dz,favg,dt,v,e,d,npml);

    /* alloc host and device record memory */
    shot_Hos=(float*)malloc(nt*nx*sizeof(float));

    /* initialize device, default device=0; */
    cudaSetDevice(0); 
    check_gpu_error("Failed to initialize device!");
    CHECK_gpu(cudaDeviceReset());

    /* malloc the device media memory */
    cudaMalloc(&vp, nnz*nnx*sizeof(float));
    cudaMalloc(&epsilon, nnz*nnx*sizeof(float));
    cudaMalloc(&delta, nnz*nnx*sizeof(float));

    /* copy the media parameters host memory to device */
    cudaMemcpy(vp, v, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(epsilon, e, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(delta, d, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);

    /* source wavefield device memory */
    cudaMalloc(&s_u0, nnz*nnx*sizeof(float)); cudaMalloc(&s_u1, nnz*nnx*sizeof(float));
    cudaMalloc(&s_w0, nnz*nnx*sizeof(float)); cudaMalloc(&s_w1, nnz*nnx*sizeof(float));  
    cudaMalloc(&s_P, nnz*nnx*sizeof(float));  cudaMalloc(&s_Q, nnz*nnx*sizeof(float)); 
    cudaMalloc(&s_px0, nnz*nnx*sizeof(float));cudaMalloc(&s_px1, nnz*nnx*sizeof(float));  
    cudaMalloc(&s_pz0, nnz*nnx*sizeof(float));cudaMalloc(&s_pz1, nnz*nnx*sizeof(float));   
    cudaMalloc(&s_qx0, nnz*nnx*sizeof(float));cudaMalloc(&s_qx1, nnz*nnx*sizeof(float));   
    cudaMalloc(&s_qz0, nnz*nnx*sizeof(float));cudaMalloc(&s_qz1, nnz*nnx*sizeof(float));   
    
    /* boundary absorb coefficient device memory */
    cudaMalloc(&coffx1, nnx*sizeof(float));   cudaMalloc(&acoffx1, nnx*sizeof(float));     
    cudaMalloc(&coffx2, nnx*sizeof(float));   cudaMalloc(&acoffx2, nnx*sizeof(float));
    cudaMalloc(&coffz1, nnz*sizeof(float));   cudaMalloc(&acoffz1, nnz*sizeof(float));     
    cudaMalloc(&coffz2, nnz*sizeof(float));   cudaMalloc(&acoffz2, nnz*sizeof(float));

    cudaMalloc(&shot_Dev, nx*nt*sizeof(float));

    /* imaging device memory */
    cudaMalloc(&illumination, nz*nx*sizeof(float));

    /* check Nvidia GPU */
    check_gpu_error("Failed to allocate memory for variables!");

    /* calculate d0 and pml adsorb coffe */
    get_d0<<<1, 1>>>(dx, dz, nnx, nnz, npml, vp);
    initial_coffe<<<(nnx+511)/512, 512>>>(dt,nx,coffx1,coffx2,acoffx1,acoffx2,npml);
    initial_coffe<<<(nnz+511)/512, 512>>>(dt,nz,coffz1,coffz2,acoffz1,acoffz2,npml);

    /* set Imaging to zero */
    cudaMemset(illumination, 0, nz*nx*sizeof(float));

    printf("--------------------------------------------------------\n");
    printf("---");   

    clock_t time;

    /* Starting IS loop */
    for(is=1; is<=ns; is++)	 {  

        time = clock();
        printf("\n---  IS =%3d/%d ",is,ns);
        mBar(1.0*is/(1.0*ns));

        cudaMemset(s_u0, 0, nnz*nnx*sizeof(float));  cudaMemset(s_u1, 0, nnz*nnx*sizeof(float));    
        cudaMemset(s_w0, 0, nnz*nnx*sizeof(float));  cudaMemset(s_w1, 0, nnz*nnx*sizeof(float));   
        cudaMemset(s_P, 0, nnz*nnx*sizeof(float));   cudaMemset(s_Q, 0, nnz*nnx*sizeof(float));     
        cudaMemset(s_px0, 0, nnz*nnx*sizeof(float)); cudaMemset(s_px1, 0, nnz*nnx*sizeof(float)); 
        cudaMemset(s_pz0, 0, nnz*nnx*sizeof(float)); cudaMemset(s_pz1, 0, nnz*nnx*sizeof(float)); 
        cudaMemset(s_qx0, 0, nnz*nnx*sizeof(float)); cudaMemset(s_qx1, 0, nnz*nnx*sizeof(float)); 
        cudaMemset(s_qz0, 0, nnz*nnx*sizeof(float)); cudaMemset(s_qz1, 0, nnz*nnx*sizeof(float));    
        
        cudaMemset(shot_Dev, 0, nt*nx*sizeof(float));

        /* forward */
        for(it=0,t=dt; it<nt; it++,t+=dt) { 

            add_source<<<1,1>>>(pfac, fs,zs,nx,nz,nnx,nnz,dt,t,favg,wtype,npml,is,ds,s_P,s_Q);
            update_vel<<<(nnx*nnz+511)/512, 512>>>
                        (nx,nz,nnx,nnz,npml,dt,dx,dz,
                         s_u0,s_w0,s_u1,s_w1,s_P,s_Q,coffx1,coffx2,coffz1,coffz2);
            update_stress<<<(nnx*nnz+511)/512, 512>>>
                          (nx,nz,nnx,nnz,dt,dx,dz,s_u1,s_w1,s_P,s_Q,vp,npml,
                           s_px1,s_px0,s_pz1,s_pz0,s_qx1,s_qx0,s_qz1,s_qz0,
                           acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,false);
            s_u0 = s_u1;   s_w0 = s_w1; 
            s_px0 = s_px1; s_pz0 = s_pz1; 
            s_qx0 = s_qx1; s_qz0 = s_qz1; 

            shot_record<<<(nx+511)/512, 512>>>
                        (nnx, nnz, nx, nz, npml, it, nt, s_P, shot_Dev, true);
            cal_illumination<<<(nx*nz+511)/512, 512>>>
                        (nnx, nnz, nz, npml, illumination, s_P, s_Q);

            if(writeSnap && (it%300==0)) {
                cudaMemcpy(e, s_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
                for(int i = npml; i<nnx-npml; i++)
                    for(int j = npml; j<nnz-npml; j++)
                        fwrite(&e[i*nnz+j], 4L, 1, fpSnap);
            }

        }//it

        //mute_directwave<<<(nt+511)/512, 512>>>
        //                (nx,nt,dt,favg,dx,dz,fs,ds,zs,is,vp,epsilon,shot_Dev,100);

        cudaMemcpy(shot_Hos, shot_Dev, nt*nx*sizeof(float), cudaMemcpyDeviceToHost);
        fwrite(shot_Hos,sizeof(float),nt*nx,fpCalShot);

        time = clock() - time;
        printf(", %f min", ((float)time)/60.0/CLOCKS_PER_SEC);

    }//is 

    /* output multi-shot illumination */
    cudaMemcpy(e, illumination, nz*nx*sizeof(float), cudaMemcpyDeviceToHost);
    fwrite(e,sizeof(float),nx*nz,fpIllunmination);


    /* file close */
    if(writeSnap)
        fclose(fpSnap);  
    fclose(fpCalShot);
    fclose(fpIllunmination);  

    /* device memory free */
    cudaFree(coffx1);     cudaFree(acoffx1);      
    cudaFree(coffx2);     cudaFree(acoffx2);
    cudaFree(coffz1);     cudaFree(acoffz1);      
    cudaFree(coffz2);     cudaFree(acoffz2);

    cudaFree(s_u0);       cudaFree(s_u1);          
    cudaFree(s_w0);       cudaFree(s_w1);           
    cudaFree(s_P);        cudaFree(s_Q);            
    cudaFree(s_px0);      cudaFree(s_px1);        
    cudaFree(s_pz0);      cudaFree(s_pz1);         
    cudaFree(s_qx0);      cudaFree(s_qx1);         
    cudaFree(s_qz0);      cudaFree(s_qz1);         

    cudaFree(shot_Dev);

    cudaFree(illumination);

    /* host memory free */
    free(v);	
    free(e);	
    free(d);
    free(shot_Hos);  

}//FD



void RTM(char FNvelocity[],
         char FNepsilon[],
         char FNdelta[], 
         char FNObsShot[],
         char FNCalShot[],
         char FNSnap[],
         char FNMigration[],
         char FNIllumination[],
         char FNAdcigs[],
         char FNStkAdcigs[],
         char FNIntervalAdcigs[],
         int wtype,
         int npml,
         int nx, 
         int nz,
         float dx, 
         float dz,
         int nt, 
         float dt, 
         int ns, 
         int fs, 
         int ds, 
         int zs,
         float favg, 
         float pfac,
         int nangle, 
         int dangle, 
         int dAdcigs,
         bool readShot, 
         bool writeSnap)
/**
 * RTM
 *
 */
{
    float *v, *e, *d;
    float *vp, *epsilon, *delta;

    float *s_u0, *s_u1, *s_px0, *s_qx0, *s_px1, *s_qx1;
    float *s_w0, *s_w1, *s_pz0, *s_qz0, *s_pz1, *s_qz1;
    float *g_u0, *g_u1, *g_px0, *g_qx0, *g_px1, *g_qx1;
    float *g_w0, *g_w1, *g_pz0, *g_qz0, *g_pz1, *g_qz1;

    float *s_P, *s_Q, *g_P, *g_Q;

    float *coffx1,*coffx2,*coffz1,*coffz2;
    float *acoffx1,*acoffx2,*acoffz1,*acoffz2;

    float *shot_Dev, *shot_Hos, *P_bndr, *Q_bndr;
    float *migration, *illumination, *adcigs;
    float *Atemp;

    int nnx, nnz;
    int it, is;

    float t;

    /* FILE pointer */
    FILE *fpObsShot, *fpCalShot;
    if(readShot) {

        if((fpObsShot = fopen(FNObsShot,"rb"))==NULL){
            printf("error open <%s>!\n",FNObsShot);
            exit(0);
        }
    }else{

        fpCalShot = fopen(FNCalShot,"wb");
    }
    FILE *fpSnap;              
    if(writeSnap) {

        fpSnap = fopen(FNSnap,"wb");
    }
    FILE *fpMigration         = fopen(FNMigration,"wb");
    FILE *fpIllunmination     = fopen(FNIllumination,"wb");
    FILE *fpAdcigs            = fopen(FNAdcigs,"wb");
    FILE *fpStkAdcigs         = fopen(FNStkAdcigs,"wb");
    FILE *fpIntervalAdcigs    = fopen(FNIntervalAdcigs,"wb");

    /* whole media size */
    nnx = nx + 2*npml;
    nnz = nz + 2*npml;

    /* temp malloc for host adcigs */
    Atemp = (float*)malloc(nz*nx*nangle*sizeof(float));

    /* read the media file into memory */
    v = (float*)malloc(nnz*nnx*sizeof(float));
    e = (float*)malloc(nnz*nnx*sizeof(float));
    d = (float*)malloc(nnz*nnx*sizeof(float));
    readFile(FNvelocity,FNepsilon,FNdelta,
             nx,nz,nnx,nnz,dx,dz,favg,dt,v,e,d,npml);

    /* alloc host and device record memory */
    shot_Hos=(float*)malloc(nt*nx*sizeof(float));

    /* initialize device, default device=0; */
    cudaSetDevice(0); 
    check_gpu_error("Failed to initialize device!");
    CHECK_gpu(cudaDeviceReset());

    /* malloc the device media memory */
    cudaMalloc(&vp, nnz*nnx*sizeof(float));
    cudaMalloc(&epsilon, nnz*nnx*sizeof(float));
    cudaMalloc(&delta, nnz*nnx*sizeof(float));

    /* copy the media parameters host memory to device */
    cudaMemcpy(vp, v, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(epsilon, e, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(delta, d, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);

    /* source wavefield device memory */      /* receiver wavefield device memory */
    cudaMalloc(&s_u0, nnz*nnx*sizeof(float));  cudaMalloc(&g_u0, nnz*nnx*sizeof(float));   
    cudaMalloc(&s_u1, nnz*nnx*sizeof(float));  cudaMalloc(&g_u1, nnz*nnx*sizeof(float));
    cudaMalloc(&s_w0, nnz*nnx*sizeof(float));  cudaMalloc(&g_w0, nnz*nnx*sizeof(float));    
    cudaMalloc(&s_w1, nnz*nnx*sizeof(float));  cudaMalloc(&g_w1, nnz*nnx*sizeof(float));

    cudaMalloc(&s_P, nnz*nnx*sizeof(float));   cudaMalloc(&g_P, nnz*nnx*sizeof(float));     
    cudaMalloc(&s_Q, nnz*nnx*sizeof(float));   cudaMalloc(&g_Q, nnz*nnx*sizeof(float));

    cudaMalloc(&s_px0, nnz*nnx*sizeof(float)); cudaMalloc(&g_px0, nnz*nnx*sizeof(float)); 
    cudaMalloc(&s_px1, nnz*nnx*sizeof(float)); cudaMalloc(&g_px1, nnz*nnx*sizeof(float));
    cudaMalloc(&s_pz0, nnz*nnx*sizeof(float)); cudaMalloc(&g_pz0, nnz*nnx*sizeof(float)); 
    cudaMalloc(&s_pz1, nnz*nnx*sizeof(float)); cudaMalloc(&g_pz1, nnz*nnx*sizeof(float));
    cudaMalloc(&s_qx0, nnz*nnx*sizeof(float)); cudaMalloc(&g_qx0, nnz*nnx*sizeof(float));  
    cudaMalloc(&s_qx1, nnz*nnx*sizeof(float)); cudaMalloc(&g_qx1, nnz*nnx*sizeof(float));
    cudaMalloc(&s_qz0, nnz*nnx*sizeof(float)); cudaMalloc(&g_qz0, nnz*nnx*sizeof(float));  
    cudaMalloc(&s_qz1, nnz*nnx*sizeof(float)); cudaMalloc(&g_qz1, nnz*nnx*sizeof(float));

    /* boundary absorb coefficient device memory */
    cudaMalloc(&coffx1, nnx*sizeof(float));    cudaMalloc(&acoffx1, nnx*sizeof(float));    
    cudaMalloc(&coffx2, nnx*sizeof(float));    cudaMalloc(&acoffx2, nnx*sizeof(float));
    cudaMalloc(&coffz1, nnz*sizeof(float));    cudaMalloc(&acoffz1, nnz*sizeof(float));    
    cudaMalloc(&coffz2, nnz*sizeof(float));    cudaMalloc(&acoffz2, nnz*sizeof(float));

    /* boundary wavefield device memory */
    cudaMalloc(&P_bndr, nt*(2*nx+2*nz)*sizeof(float));
    cudaMalloc(&Q_bndr, nt*(2*nx+2*nz)*sizeof(float));

    cudaMalloc(&shot_Dev, nx*nt*sizeof(float));

    /* imaging device memory */
    cudaMalloc(&migration, nz*nx*sizeof(float)); 
    cudaMalloc(&illumination, nz*nx*sizeof(float));
    cudaMalloc(&adcigs, nz*nangle*nx*sizeof(float));

    /* check Nvidia GPU */
    check_gpu_error("Failed to allocate memory for variables!");

    /* calculate d0 and pml adsorb coffe */
    get_d0<<<1, 1>>>(dx, dz, nnx, nnz, npml, vp);
    initial_coffe<<<(nnx+511)/512, 512>>>(dt,nx,coffx1,coffx2,acoffx1,acoffx2,npml);
    initial_coffe<<<(nnz+511)/512, 512>>>(dt,nz,coffz1,coffz2,acoffz1,acoffz2,npml);

    /* set Imaging to zero */
    cudaMemset(migration, 0, nz*nx*sizeof(float)); 
    cudaMemset(illumination, 0, nz*nx*sizeof(float));
    cudaMemset(adcigs, 0, nz*nangle*nx*sizeof(float));

    printf("--------------------------------------------------------\n");
    printf("---");   

    clock_t time;

    /* Starting IS loop */
    for(is=1; is<=ns; is++)	 {  

        time = clock();
        printf("\n---  IS =%3d/%d ",is,ns);
        mBar(1.0*is/(1.0*ns));

        cudaMemset(s_u0, 0, nnz*nnx*sizeof(float)); cudaMemset(g_u0, 0, nnz*nnx*sizeof(float));     
        cudaMemset(s_u1, 0, nnz*nnx*sizeof(float)); cudaMemset(g_u1, 0, nnz*nnx*sizeof(float));
        cudaMemset(s_w0, 0, nnz*nnx*sizeof(float)); cudaMemset(g_w0, 0, nnz*nnx*sizeof(float));     
        cudaMemset(s_w1, 0, nnz*nnx*sizeof(float)); cudaMemset(g_w1, 0, nnz*nnx*sizeof(float));

        cudaMemset(s_P, 0, nnz*nnx*sizeof(float));  cudaMemset(g_P, 0, nnz*nnx*sizeof(float));      
        cudaMemset(s_Q, 0, nnz*nnx*sizeof(float));  cudaMemset(g_Q, 0, nnz*nnx*sizeof(float));

        cudaMemset(s_px0, 0, nnz*nnx*sizeof(float));cudaMemset(g_px0, 0, nnz*nnx*sizeof(float));   
        cudaMemset(s_px1, 0, nnz*nnx*sizeof(float));cudaMemset(g_px1, 0, nnz*nnx*sizeof(float));
        cudaMemset(s_pz0, 0, nnz*nnx*sizeof(float));cudaMemset(g_pz0, 0, nnz*nnx*sizeof(float));    
        cudaMemset(s_pz1, 0, nnz*nnx*sizeof(float));cudaMemset(g_pz1, 0, nnz*nnx*sizeof(float));
        cudaMemset(s_qx0, 0, nnz*nnx*sizeof(float));cudaMemset(g_qx0, 0, nnz*nnx*sizeof(float));    
        cudaMemset(s_qx1, 0, nnz*nnx*sizeof(float));cudaMemset(g_qx1, 0, nnz*nnx*sizeof(float));
        cudaMemset(s_qz0, 0, nnz*nnx*sizeof(float));cudaMemset(g_qz0, 0, nnz*nnx*sizeof(float));    
        cudaMemset(s_qz1, 0, nnz*nnx*sizeof(float));cudaMemset(g_qz1, 0, nnz*nnx*sizeof(float));

        cudaMemset(shot_Dev, 0, nt*nx*sizeof(float));
        cudaMemset(P_bndr, 0, nt*(2*nx+2*nz)*sizeof(float));
        cudaMemset(Q_bndr, 0, nt*(2*nx+2*nz)*sizeof(float));

        /* forward */
        for(it=0,t=dt; it<nt; it++,t+=dt) { 

            add_source<<<1,1>>>(pfac, fs,zs,nx,nz,nnx,nnz,dt,t,favg,wtype,npml,is,ds,s_P,s_Q);
            update_vel<<<(nnx*nnz+511)/512, 512>>>
                        (nx,nz,nnx,nnz,npml,dt,dx,dz,
                         s_u0,s_w0,s_u1,s_w1,s_P,s_Q,coffx1,coffx2,coffz1,coffz2);
            update_stress<<<(nnx*nnz+511)/512, 512>>>
                          (nx,nz,nnx,nnz,dt,dx,dz,s_u1,s_w1,s_P,s_Q,vp,npml,
                           s_px1,s_px0,s_pz1,s_pz0,s_qx1,s_qx0,s_qz1,s_qz0,
                           acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,true);
            s_u0 = s_u1;   s_w0 = s_w1; 
            s_px0 = s_px1; s_pz0 = s_pz1; 
            s_qx0 = s_qx1; s_qz0 = s_qz1; 

            shot_record<<<(nx+511)/512, 512>>>
                        (nnx, nnz, nx, nz, npml, it, nt, s_P, shot_Dev, true);
            wavefield_bndr<<<((2*nx+2*nz)+511)/512,512>>>
                        (nnx, nnz, nx, nz, npml, it, nt, s_P, s_Q, P_bndr, Q_bndr, true);
            cal_illumination<<<(nx*nz+511)/512, 512>>>
                        (nnx, nnz, nz, npml, illumination, s_P, s_Q);

            if(writeSnap && (it%300==0)) {
                cudaMemcpy(e, s_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
                for(int i = npml; i<nnx-npml; i++)
                    for(int j = npml; j<nnz-npml; j++)
                        fwrite(&e[i*nnz+j], 4L, 1, fpSnap);
            }

        }//it

        mute_directwave<<<(nt+511)/512, 512>>>
                        (nx,nt,dt,favg,dx,dz,fs,ds,zs,is,vp,epsilon,shot_Dev,20);

        if(readShot) {

            fread(shot_Hos,sizeof(float),nt*nx,fpObsShot);
            cudaMemcpy(shot_Dev, shot_Hos, nt*nx*sizeof(float), cudaMemcpyHostToDevice);

        } else {

            cudaMemcpy(shot_Hos, shot_Dev, nt*nx*sizeof(float), cudaMemcpyDeviceToHost);
            fwrite(shot_Hos,sizeof(float),nt*nx,fpCalShot);
        }

        /* backward */
        for(it=nt-1; it>=0; it--) { 

            /* source wavefield */
            wavefield_bndr<<<((2*nx+2*nz)+511)/512,512>>>
                            (nnx, nnz, nx, nz, npml, it, nt, s_P, s_Q, P_bndr, Q_bndr, false);
            update_vel<<<(nnx*nnz+511)/512, 512>>>
                            (nx,nz,nnx,nnz,npml,dt,dx,dz,
                             s_u0,s_w0,s_u1,s_w1,s_P,s_Q,coffx1,coffx2,coffz1,coffz2);
            update_stress<<<(nnx*nnz+511)/512, 512>>>
                            (nx,nz,nnx,nnz,dt,dx,dz,s_u1,s_w1,s_P,s_Q,vp,npml,
                             s_px1,s_px0,s_pz1,s_pz0,s_qx1,s_qx0,s_qz1,s_qz0,
                             acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,false);
            s_u0=s_u1;   s_w0=s_w1; 
            s_px0=s_px1; s_pz0=s_pz1; 
            s_qx0=s_qx1; s_qz0=s_qz1;
 
            if(writeSnap && (it%300==0)) {
                cudaMemcpy(e, s_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
                for(int i=npml;i<nnx-npml;i++)
                    for(int j=npml;j<nnz-npml;j++)
                        fwrite(&e[i*nnz+j],4L,1,fpSnap);
            }



            /* receivers wavefield */
            shot_record<<<(nx+511)/512, 512>>>
                        (nnx, nnz, nx, nz, npml, it, nt, g_P, shot_Dev, false);
            shot_record<<<(nx+511)/512, 512>>>
                        (nnx, nnz, nx, nz, npml, it, nt, g_Q, shot_Dev, false);
            update_vel<<<(nnx*nnz+511)/512, 512>>>
                        (nx,nz,nnx,nnz,npml,dt,dx,dz,
                         g_u0,g_w0,g_u1,g_w1,g_P,g_Q,coffx1,coffx2,coffz1,coffz2);
            update_stress<<<(nnx*nnz+511)/512, 512>>>
                            (nx,nz,nnx,nnz,dt,dx,dz,g_u1,g_w1,g_P,g_Q,vp,npml,
                             g_px1,g_px0,g_pz1,g_pz0,g_qx1,g_qx0,g_qz1,g_qz0,
                             acoffx1,acoffx2,acoffz1,acoffz2,
                             delta,epsilon,fs,ds,zs,is,false);
            g_u0=g_u1;   g_w0=g_w1; 
            g_px0=g_px1; g_pz0=g_pz1; 
            g_qx0=g_qx1; g_qz0=g_qz1; 

            if(writeSnap && (it%300==0)) {
                cudaMemcpy(e, g_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
                for(int i=npml;i<nnx-npml;i++)
                    for(int j=npml;j<nnz-npml;j++)
                        fwrite(&e[i*nnz+j],4L,1,fpSnap);
            }

            cal_migration<<<(nx*nz+511)/512, 512>>>
                            (nnx, nnz, nz, npml, migration, s_P, g_P);

            Poynting_Adcigs<<<(nx*nz+511)/512, 512>>>
                            (nnz, nx, nz, npml, nangle, dangle, adcigs, 
                             s_P, s_Q, s_u0, s_w0, g_P, g_Q, g_u0, g_w0);

        }//it

        time = clock() - time;
        printf(", %f min", ((float)time)/60.0/CLOCKS_PER_SEC);

    }//is 

    migration_illum<<<(nx*nz+511)/512, 512>>>(nx, nz, npml, migration, illumination);

    adcigs_illum<<<(nx*nz*nangle+511)/512, 512>>>(nx, nz, nangle, dangle, adcigs, illumination);

    /* output multi-shot migration */
    cudaMemcpy(e, migration, nz*nx*sizeof(float), cudaMemcpyDeviceToHost);
    laplace_filter(1,nz,nx,e,d);
    fwrite(d,sizeof(float),nx*nz,fpMigration);

    /* output multi-shot illumination */
    cudaMemcpy(e, illumination, nz*nx*sizeof(float), cudaMemcpyDeviceToHost);
    fwrite(e,sizeof(float),nx*nz,fpIllunmination);

    /* output multi-shot adcigs */
    cudaMemcpy(Atemp, adcigs, nz*nx*nangle*sizeof(float), cudaMemcpyDeviceToHost);
    fwrite(Atemp,sizeof(float),nz*nx*nangle,fpAdcigs);

    /* output adcigs stk migration */
    stk_adcigs(nx,nz,nangle,Atemp,d);
    fwrite(d,sizeof(float),nx*nz,fpStkAdcigs);

    /* output smiled adcigs */
    adcigs_smiled(nx,nz,nangle,dAdcigs,Atemp);
    fwrite(Atemp,sizeof(float),nz*nx/dAdcigs*nangle,fpIntervalAdcigs);


    /* file close */
    if(writeSnap)
        fclose(fpSnap);  
    if(readShot) 
        fclose(fpObsShot);  
    else
        fclose(fpCalShot);
    fclose(fpMigration);
    fclose(fpIllunmination);  
    fclose(fpAdcigs);
    fclose(fpStkAdcigs);
    fclose(fpIntervalAdcigs);

    /* device memory free */
    cudaFree(coffx1);    cudaFree(acoffx1);       
    cudaFree(coffx2);    cudaFree(acoffx2);
    cudaFree(coffz1);    cudaFree(acoffz1);  
    cudaFree(coffz2);    cudaFree(acoffz2);

    cudaFree(s_u0);    cudaFree(s_u1);           
    cudaFree(s_w0);    cudaFree(s_w1);           

    cudaFree(s_P);    cudaFree(s_Q);            

    cudaFree(s_px0);    cudaFree(s_px1);          
    cudaFree(s_pz0);    cudaFree(s_pz1);          
    cudaFree(s_qx0);    cudaFree(s_qx1);          
    cudaFree(s_qz0);    cudaFree(s_qz1);          

    cudaFree(g_u0);    cudaFree(g_u1);           
    cudaFree(g_w0);    cudaFree(g_w1);           

    cudaFree(g_P);    cudaFree(g_Q);            

    cudaFree(g_px0);    cudaFree(g_px1);          
    cudaFree(g_pz0);    cudaFree(g_pz1);          
    cudaFree(g_qx0);    cudaFree(g_qx1);          
    cudaFree(g_qz0);    cudaFree(g_qz1);          

    cudaFree(shot_Dev);

    cudaFree(P_bndr);    cudaFree(Q_bndr);        

    cudaFree(migration); 
    cudaFree(illumination);
    cudaFree(adcigs);

    /* host memory free */
    free(v);	
    free(e);	
    free(d);
    free(shot_Hos);    
    free(Atemp);

}//RTM


/**
 *     MAIN FUNCTION
 * 
 */
int main(int argc,char *argv[])
{
    #pragma message("Note: 1 for FD, 2 for RTM")
    /* clear screen */
    system("clear");

    /* this "if" for arguments line: ./a.out 1 */
    int Kind;
    if(argc == 1 ){

        for(int i=0; note[i] != NULL; i++) {

            fprintf(stderr, "%s",note[i]);
            if(i>13) {
                char ch = getchar();
                if(ch == 'q'){

                    fprintf(stderr, "\n%s exit(0): line %d \n",__FILE__,__LINE__);
                    fprintf(stderr, "%s exit(0)\n",argv[0]);
                    exit(0);
                } else
                    continue;
            } else {
                fprintf(stderr, "\n");
            }
        }
        //assert(argc-1);
        exit(0);

    }else if(argc >= 2){

        Kind = atoi(argv[1]);

        if(Kind != 1 && Kind != 2) {

            fprintf(stderr, "%s 1 for FD.\n",argv[0]);
            fprintf(stderr, "%s 2 for RTM.\n",argv[0]);
            exit(0);

        }else {

            printf("%s Starting...\n", argv[0]);
        }
    }

    /* Parameters */
    int nx, nz, nt, wtype, nangle, dangle, dAdcigs;
    int ns, ds, fs, zs, npml;
    float dx, dz, dt, pfac, favg;
    bool readShot, writeSnap;

    clock_t start, end;

    /* file */
    /* these for FD */
    char FNvelocity[90]       = {"fd/layer_v_2000.dat"};      
    char FNepsilon[90]        = {"fd/layer_e_0.0.dat"};
    char FNdelta[90]          = {"fd/layer_d_0.0.dat"}; 
    char FNCalShot[90]        = {"fd/layer_shot_v2000e0.0d0.0dir.dat"};//shot cal
    char FNSnap[90]           = {"fd/layer_snap.dat"};//snap
    char FNIllumination[90]   = {"fd/layer_illumination.dat"};

    /* these for RTM */
    char FNObsShot[90]        = {"fd/layer_shot_obs.dat"};//shot obs

    char FNMigration[90]      = {"fd/layer_migration.dat"};
    char FNAdcigs[90]         = {"fd/layer_adcigs.dat"}; 
    char FNStkAdcigs[90]      = {"fd/layer_stkadcigs.dat"};
    char FNIntervalAdcigs[90] = {"fd/layer_smiled_adcigs.dat"}; 

    
    wtype=1;/* wavelet: 1,2,3 */
    npml=20;/* pml boundary */

    readShot = true;/* true: read shot; 
                       flase: use right shot record */
    writeSnap = true;/* true: write; 
                        flase: no write snap */

    nx = 2000;              
    nz = 210;         favg=15;     pfac=1000.0;

    dx=5.0;   
    dz=5.0;   
     
    nt=11001;    
    dt=0.0005;
     
    ns=1;       
    fs=2;//nx/ns/2;      
    ds=nx/ns;
    zs=1;   

    nangle=70; 
    dangle=1;  
    dAdcigs=25;


 
    start = clock(); 


    if(Kind == 1){
        /* FD */
        FD( FNvelocity,  
            FNepsilon,  
            FNdelta, 
            FNCalShot, 
            FNSnap, 
            FNIllumination, 
            wtype, 
            npml, 
            nx, 
            nz, 
            dx, 
            dz, 
            nt, 
            dt, 
            ns, 
            fs, 
            ds, 
            zs, 
            favg, 
            pfac, 
            writeSnap);

    }else if(Kind == 2){
        /* RTM */
        RTM(FNvelocity,  
            FNepsilon,  
            FNdelta, 
            FNObsShot,  
            FNCalShot, 
            FNSnap, 
            FNMigration, 
            FNIllumination, 
            FNAdcigs, 
            FNStkAdcigs, 
            FNIntervalAdcigs,
            wtype, 
            npml, 
            nx, 
            nz, 
            dx, 
            dz, 
            nt, 
            dt, 
            ns, 
            fs, 
            ds, 
            zs, 
            favg, 
            pfac, 
            nangle, 
            dangle, 
            dAdcigs, 
            readShot, 
            writeSnap);

    } else { }


    end = clock();


    printf("\n%s Finished... \n", argv[0]);  
    printf("Total %d shots: %f (min)\n", ns, ((float)(end-start))/60.0/CLOCKS_PER_SEC);

    return 1;

}//end of main


编译:

$ nvcc -o a Toa_gpu2DvtiFdRtmAdcigsLaplace.cu
Toa_gpu2DvtiFdRtmAdcigsLaplace.cu: In function ‘int main(int, char**)’:
Toa_gpu2DvtiFdRtmAdcigsLaplace.cu:1571: 附注:#pragma message:Note: 1 for FD, 2 for RTM
不加参数运行:

$ ./a

       2D Quasi Acoustic VTI Medium  FD & RTM (CUDA, ADCIGs)         
                             Author: Rong Tao @UPC                    
                                                                               
 Quasi Acoustic Function as follows:                                  
    du/dt = dp/dx                                                       
    dw/dt = dq/dz                                                       
    dp/dt =  vpx^2 * du/dx + vp*vpn * dw/dz                             
    dq/dt = vp*vpn * du/dx +  vp^2  * dw/dz                             
    vpx^2 = vp^2 * (1+2*epsilon)                                        
    vpn^2 = vp^2 * (1+2*delta)                                          
                                                                               
 Required Parameters:                                                    
    Kind         =1 FD                                                    
                 =2 RTM                                                   
    For example:                                                          
    ./a.out  1     Finite difference forward modeling[FD]                   
    ./a.out  2     Reverse Time Migration[RTM]                              
                                                                               
 Inner Parameters:                                                      
    nx, dx       Horizontal Space sampling point and interval             
    nz, dz       Vertical Space sampling point and interval               
    nt, dt       Time sampling point and interval                         
    favg         Wavelet frequency                                        
    pfac         Wavelet Gain                                             
    ns           The number of shots                                      
    fs           First shot position[grid]                                
    ds           Shots interval[grid]                                     
    zs           Shots vertical position[grid]                            
    nangle       The number of ADCIGs's angle                             
    dangle       The interval of ADCIGs's angle                           
    dAdcigs      Output file, the interval cdp(nx)                        
    npml         PML Border width[grid]                                   
                                                                               
 Optional Parameters:                                                            
    wtype        kind of wavelet    =1 ricker wavelet                     
                                    =2 derivative of gaussian             
                                    =3 derivative of gaussian             
    readShot     =true,             boolean, read obs shot                
                 =false,            boolean, use accurate shot data       
    writeSnap    =true,false        output snap into file or not          
    
加参数运行:

$ ./a 1 #for finite difference
$ ./a 2 #for RTM


下面来一个原来编的一个,只能做逆时偏移的程序:

//a#########################################################
//a##         2D Acoustic VTI Medium RTM   
//a##  Ps : P + sv wave and get rid of sv        
//a##       GPU(CUDA) ,poynting adcigs
//a##
//a##/*a****************************************************/
//a##Function for VTI medium modeling,2017.2.13
//a##
//a## Ps:  the function of modeling following:
//a##      
//a##          du/dt=1/rho*dp/dx , 
//a##          dw/dt=1/rho*dq/dz ,  
//a##          dp/dt=rho*vpx^2*du/dx+rho*vp*vpn*dw/dz ,
//a##          dq/dt=rho*vp*vpn*du/dx+rho*vp^2*dw/dz ,
//a##                     vpx^2=vp^2*(1+2*epsilon);
//a##                     vpn^2=vp^2*(1+2*delta);
//a##*********a*********************************************/
//a##                        first code: 2017.2.15
//a##                    adcigs process: 2017.4.13
//a##  
//a##                               programming by Rong Tao 
//a#########################################################
#include<stdio.h>
#include<malloc.h>
#include<math.h>
#include<stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

#define pi 3.141592653

#define mm 4

#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
}


//__constant__ float c[mm]={1.125,-0.04166667};/*mm==2*/
//__constant__ float c[mm]={1.1718750,-0.065104167,0.0046875};/*mm==3*/
__constant__ float c[mm]={1.196289,-0.0797526,0.009570313,-0.0006975447};/*mm==4*/
//__constant__ float c[mm]={1.211243,-0.08972168,0.01384277,-0.00176566,0.0001186795};/*mm==5*/


__device__ float d0;
//a################################################################################
void check_gpu_error (const char *msg) 
/*< check GPU errors >*/
{
    cudaError_t err = cudaGetLastError ();
    if (cudaSuccess != err) { 
	printf("Cuda error: %s: %s\n", msg, cudaGetErrorString (err)); 
	exit(0);   
    }
}
/*************func**************/
void migraiton_laplace_filter(int adj, int nz, int nx, float *in, float *out)
/*< linear operator, come from Madagascar Mlaplac2>*/
{
    int iz,ix,j;
    for (j=0;j<nx*nz;j++) out[j]=0.0;
    for (ix=0; ix < nx; ix++) {
	for (iz=0; iz < nz; iz++) {
	    j = iz+ix*nz;
	    if (iz > 0) {
		if (adj) {
		    out[j-1] -= in[j];
		    out[j]   += in[j];
		} else {
		    out[j] += in[j] - in[j-1];
		}
	    }
	    if (iz < nz-1) {
		if (adj) {
		    out[j+1] -= in[j];
		    out[j]   += in[j];
		} else {
		    out[j] += in[j] - in[j+1];
		}
	    }

	    if (ix > 0) {
		if (adj) {
		    out[j-nz] -= in[j];
		    out[j]    += in[j];
		} else {
		    out[j] += in[j] - in[j-nz];
		}
	    }
	    if (ix < nx-1) {
		if (adj) {
		    out[j+nz] -= in[j];
		    out[j]    += in[j];
		} else {
		    out[j] += in[j] - in[j+nz];
		}
	    }
	}
    }
}
void laplac2_lop(int adj, int nz, int nx, float *in, float *out)
/*< linear operator >*/
{
    int iz,ix,j;

    for (ix=0; ix < nx; ix++) {
	for (iz=0; iz < nz; iz++) {
	    j = iz+ix*nz;

	    if (iz > 0) {
		if (adj) {
		    out[j-1] -= in[j];
		    out[j]   += in[j];
		} else {
		    out[j] += in[j] - in[j-1];
		}
	    }
	    if (iz < nz-1) {
		if (adj) {
		    out[j+1] -= in[j];
		    out[j]   += in[j];
		} else {
		    out[j] += in[j] - in[j+1];
		}
	    }

	    if (ix > 0) {
		if (adj) {
		    out[j-nz] -= in[j];
		    out[j]    += in[j];
		} else {
		    out[j] += in[j] - in[j-nz];
		}
	    }
	    if (ix < nx-1) {
		if (adj) {
		    out[j+nz] -= in[j];
		    out[j]    += in[j];
		} else {
		    out[j] += in[j] - in[j+nz];
		}
	    }
	}
    }
}
/*************func**************/
__global__ void add_source(float pfac,float xsn,float zsn,int nx,int nz,int nnx,int nnz,float dt,float t,
                        float favg,int wtype,int npml,int is,int ds,float *P,float *Q)
/*< generate ricker wavelet with time deley >*/
{
       int ixs,izs;
       float x_,xx_,tdelay,ts,source=0.0,fs; 
  
       tdelay=1.0/favg;
       ts=t-tdelay;
       fs=xsn+(is-1)*ds;

	if(wtype==1)//ricker wavelet
	{
          x_=favg*ts;
          xx_=x_*x_;
          source=(1-2*pi*pi*(xx_))*exp(-(pi*pi*xx_));
	}else if(wtype==2){//derivative of gaussian
          x_=(-4)*favg*favg*pi*pi/log(0.1);
          source=(-2)*pi*pi*ts*exp(-x_*ts*ts);
        }else if(wtype==3){//derivative of gaussian
          x_=(-1)*favg*favg*pi*pi/log(0.1);
          source=exp(-x_*ts*ts);
        }

       if(t<=2*tdelay)
       {         
	     ixs = (int)(fs+0.5)+npml-1;
            izs = (int)(zsn+0.5)+npml-1;
            P[ixs*nnz+izs]+=pfac*source;
            Q[ixs*nnz+izs]+=pfac*source;
       }
}
/*******************func*********************/
__global__ void update_vel(int nx,int nz,int nnx,int nnz,int npml,float dt,float dx,float dz,
                           float *u0,float *w0,float *u1,float *w1,float *P,float *Q,
                           float *coffx1,float *coffx2,float *coffz1,float *coffz2)
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;

	int ix,iz,im;
	float dtx,dtz,xx,zz;

        ix=id/nnz;
        iz=id%nnz;

		 dtx=dt/dx;
		 dtz=dt/dz;
               if(id>=mm&&id<nnx*nnz-mm)
                 {
                   if(ix>=mm&&ix<(nnx-mm)&&iz>=mm&&iz<(nnz-mm))
                    {
                     xx=0.0;
                     zz=0.0;
	             for(im=0;im<mm;im++)
                      {
                        xx+=c[im]*(P[id+(im+1)*nnz]-P[id-im*nnz]);
                        zz+=c[im]*(Q[id+im+1]      -Q[id-im]);
                      }
                     u1[id]=coffx2[ix]*u0[id]-coffx1[ix]*dtx*xx;
                     w1[id]=coffz2[iz]*w0[id]-coffz1[iz]*dtz*zz;
                   }
                 }
}
/*******************func***********************/
__global__ void update_stress(int nx,int nz,int nnx,int nnz,float dt,float dx,float dz,
                           float *u1,float *w1,float *P,float *Q,float *vp,int npml,
                           float *px1,float *px0,float *pz1,float *pz0,float *qx1,float *qx0,float *qz1,float *qz0,
                           float *acoffx1,float *acoffx2,float *acoffz1,float *acoffz2,
                           float *delta,float *epsilon,int fs,int ds,int zs,int is,bool SV)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

	int im,ix,iz,rx,rz,R=15,r=5;
	float dtx,dtz, xx,zz,ee,dd;

        ix=id/nnz;
        iz=id%nnz;

               dtx=dt/dx;
		 dtz=dt/dz;
               if(id>=mm&&id<nnx*nnz-mm)
                 {
/************************i****************************************/
/************************iso circle start*************************/
                   rx=ix-(fs+(is-1)*ds+npml);
                   rz=iz-(zs+npml);
                   if(SV){
                       if((rx*rx+rz*rz)<=R*R){
                           if((rx*rx+rz*rz)<=r*r){
                               ee = 0.0;
                               dd = 0.0;
                           }else{
                               ee = 0.5*(1-cos(pi*((sqrtf(rx*rx+rz*rz)-r)*4.0/(R*3.0-1))))*epsilon[id];
                               dd = 0.5*(1-cos(pi*((sqrtf(rx*rx+rz*rz)-r)*4.0/(R*3.0-1))))*delta[id]; 
                              }
                       }else{
                          ee=epsilon[id];
                          dd=delta[id];
                          }
                   }else{
                      ee=epsilon[id];
                      dd=delta[id];
                     }
/************************ iso circle end *************************/
/************************i****************************************/
                   if(ix>=mm&&ix<(nnx-mm)&&iz>=mm&&iz<(nnz-mm))
                     {
                     xx=0.0;
                     zz=0.0;
	             for(im=0;im<mm;im++)
                       {
                        xx+=c[im]*(u1[id+im*nnz]-u1[id-(im+1)*nnz]);
                        zz+=c[im]*(w1[id+im]    -w1[id-im-1]);
                       }
                     px1[id]=acoffx2[ix]*px0[id]-acoffx1[ix]*vp[id]*vp[id]*(1+2*ee)*dtx*xx;
                     pz1[id]=acoffz2[iz]*pz0[id]-acoffz1[iz]*vp[id]*vp[id]*sqrtf(1+2*dd)*dtz*zz;
                     qx1[id]=acoffx2[ix]*qx0[id]-acoffx1[ix]*vp[id]*vp[id]*sqrtf(1+2*dd)*dtx*xx;
                     qz1[id]=acoffz2[iz]*qz0[id]-acoffz1[iz]*vp[id]*vp[id]*dtz*zz;

                     P[id]=px1[id]+pz1[id];
                     Q[id]=qx1[id]+qz1[id];
                   }
                 }
}                      
/********************func**********************/
__global__ void get_d0(float dx,float dz,int nnx,int nnz,int npml,float *vp)
{
   d0=10.0*vp[nnx*nnz/2]*log(100000.0)/(2.0*npml*((dx+dz)/2.0));
}
/*************func*******************/
void pad_vv(int nx,int nz,int nnx,int nnz,int npml,float *ee)
{
     int ix,iz,id;
 
    for(id=0;id<nnx*nnz;id++)
     {
       ix=id/nnz;
       iz=id%nnz;
       if(ix<npml){
           ee[id]=ee[npml*nnz+iz];  //left
       }else if(ix>=nnx-npml){
           ee[id]=ee[(nnx-npml-1)*nnz+iz];//right
       }
     }
    for(id=0;id<nnx*nnz;id++)
     {
       ix=id/nnz;
       iz=id%nnz;
       if(iz<npml){
           ee[id]=ee[ix*nnz+npml];//up
       }else if(iz>=nnz-npml){
           ee[id]=ee[ix*nnz+nnz-npml-1];//down
       }
     }
}
/*************func*******************/
void read_file(char FN1[],char FN2[],char FN3[],int nx,int nz,int nnx,int nnz,float dx,float dz,float favg,float dt,
               float *v,float *e,float *d,int npml)
{
		 int i,j,id;
               float vmax,  vmin, emax, emin, dmax, dmin,  H_min, dt_max, dxz_max, C, tmp;

		
		 FILE *fp1,*fp2,*fp3;
		 if((fp1=fopen(FN1,"rb"))==NULL){printf("error open <%s>!\n",FN1);exit(0);}
		 if((fp2=fopen(FN2,"rb"))==NULL){printf("error open <%s>!\n",FN2);exit(0);}
		 if((fp3=fopen(FN3,"rb"))==NULL){printf("error open <%s>!\n",FN3);exit(0);}

               vmin= 999999.9;emin= 999999.9;dmin= 999999.9;
               vmax=-999999.9;dmax=-999999.9;emax=-999999.9;
		 for(i=npml;i<nx+npml;i++)
		 {
			 for(j=npml;j<nz+npml;j++)
			 {
                            id=i*nnz+j;
				 fread(&v[id],4L,1,fp1);//v[id] /= 3.281;
				 fread(&e[id],4L,1,fp2);//if(e[id]<0)e[id] *= -1.0;
				 fread(&d[id],4L,1,fp3);//if(d[id]<0)d[id] *= -1.0;

                            if(vmax<v[id]) vmax = v[id];
			       if(vmin>v[id]) vmin = v[id];
                            if(emax<e[id]) emax = e[id];
			       if(emin>e[id]) emin = e[id];
                            if(dmax<d[id]) dmax = d[id];
			       if(dmin>d[id]) dmin = d[id];
			 }
		 }
		 fclose(fp1);
		 fclose(fp2);
		 fclose(fp3);
        printf("------------------------------------\n---\n");
        printf("---   Vmax=%.4f, Vmin=%.4f\n",vmax,vmin);
        printf("---   Emax=%.4f, Emin=%.4f\n",emax,emin);
        printf("---   Dmax=%.4f, Dmin=%.4f\n---\n",dmax,dmin);
        /*********boundary*********/
        pad_vv(nx,nz,nnx,nnz,npml,e);
        pad_vv(nx,nz,nnx,nnz,npml,d);
        pad_vv(nx,nz,nnx,nnz,npml,v); 
      
       H_min=dx<dz?dx:dz;
       dt_max = 0.5*H_min/vmin;
       dxz_max = vmax/favg*0.2;

       if(dxz_max<dz||dxz_max<dx){printf("---   You need have to redefine DX and DZ ! \n");exit(0);}
	if(dt_max<dt){printf("---   You need have to redefine DT ! \n");exit(0);}
       if (   favg >= vmin/( 5.0*(dx>dz?dx:dz) )   ||   favg >= vmin/( 5.0*(dx>dz?dx:dz) )  )
	             {printf("---   Non-dispersion relation not satisfied! \n");exit(0);}

	else if ( mm == 2 )     C = 0.857;
	else if ( mm == 3 )     C = 0.8;
	else if ( mm == 4 )     C = 0.777;
	else if ( mm == 5 )     C = 0.759;

       tmp = dt*vmax*sqrtf( 1.0/(dx*dx)+1.0/(dz*dz) );
       if ( tmp >= C){ printf("---   Stability condition not satisfied! tmp = %f, C = %f\n",tmp,C);exit(0);}
}
/*************func*******************/
__global__ void initial_coffe(float dt,int nn,float *coff1,float *coff2,float *acoff1,float *acoff2,int npml)
{		
	 int id=threadIdx.x+blockDim.x*blockIdx.x;

           if(id<nn+2*npml)
            {
		 if(id<npml)
		 {   
			 coff1[id]=1.0/(1.0+(dt*d0*pow((npml-0.5-id)/npml,2.0))/2.0);
			 coff2[id]=coff1[id]*(1.0-(dt*d0*pow((npml-0.5-id)/npml,2.0))/2.0);

			 acoff1[id]=1.0/(1.0+(dt*d0*pow(((npml-id)*1.0)/npml,2.0))/2.0);
			 acoff2[id]=acoff1[id]*(1.0-(dt*d0*pow(((npml-id)*1.0)/npml,2.0))/2.0);

		 }else if(id>=npml&&id<npml+nn){

			 coff1[id]=1.0;
			 coff2[id]=1.0;

			 acoff1[id]=1.0;
			 acoff2[id]=1.0;

		 }else{

			 coff1[id]=1.0/(1.0+(dt*d0*pow((0.5+id-nn-npml)/npml,2.0))/2.0);
			 coff2[id]=coff1[id]*(1.0-(dt*d0*pow((0.5+id-nn-npml)/npml,2.0))/2.0);

			 acoff1[id]=1.0/(1.0+(dt*d0*pow(((id-nn-npml)*1.0)/npml,2.0))/2.0);
			 acoff2[id]=acoff1[id]*(1.0-(dt*d0*pow(((id-nn-npml)*1.0)/npml,2.0))/2.0);
		 }	
            }       
}
/*************func*******************/
__global__ void shot_record(int nnx, int nnz, int nx, int nz, int npml, int it, int nt, float *P, float *shot, bool flag)
{		
	 int id=threadIdx.x+blockDim.x*blockIdx.x;

           if(id<nx)
            {
             if(flag){
               shot[it+nt*id]=P[npml+nnz*(id+npml)];
             }else{
               P[npml+nnz*(id+npml)]=shot[it+nt*id];
              }
            }       
}
/*************func*******************/
__global__ void wavefield_bndr(int nnx, int nnz, int nx, int nz, int npml, int it, int nt, 
                               float *P, float *Q, float *P_bndr, float *Q_bndr, bool flag)
{		
	 int id=threadIdx.x+blockDim.x*blockIdx.x;

           if(id<2*nx+2*nz)
            {
            if(flag)/save boundary
             {
              if(id<nx){//up

               P_bndr[it*(2*nx+2*nz)+id]=P[npml-1+nnz*(id+npml)];
               Q_bndr[it*(2*nx+2*nz)+id]=Q[npml-1+nnz*(id+npml)];

              }else if(id>=nx&&id<(2*nx)){//down
   
               P_bndr[it*(2*nx+2*nz)+id]=P[npml+nz+1+nnz*(id-nx+npml)];
               Q_bndr[it*(2*nx+2*nz)+id]=Q[npml+nz+1+nnz*(id-nx+npml)];


              }else if(id>=(2*nx)&&id<(2*nx+nz)){//left

               P_bndr[it*(2*nx+2*nz)+id]=P[id-2*nx+npml+nnz*(npml-1)];
               Q_bndr[it*(2*nx+2*nz)+id]=Q[id-2*nx+npml+nnz*(npml-1)];

              }else if(id>=(2*nx+nz)){//right

               P_bndr[it*(2*nx+2*nz)+id]=P[id-2*nx-nz+npml+nnz*(npml+nx+1)];
               Q_bndr[it*(2*nx+2*nz)+id]=Q[id-2*nx-nz+npml+nnz*(npml+nx+1)];

                }
            }else{/add boundary
              if(id<nx){//up

               P[npml-1+nnz*(id+npml)]=P_bndr[it*(2*nx+2*nz)+id];
               Q[npml-1+nnz*(id+npml)]=Q_bndr[it*(2*nx+2*nz)+id];

              }else if(id>=nx&&id<(2*nx)){//down
   
               P[npml+nz+1+nnz*(id-nx+npml)]=P_bndr[it*(2*nx+2*nz)+id];
               Q[npml+nz+1+nnz*(id-nx+npml)]=Q_bndr[it*(2*nx+2*nz)+id];


              }else if(id>=(2*nx)&&id<(2*nx+nz)){//left

               P[id-2*nx+npml+nnz*(npml-1)]=P_bndr[it*(2*nx+2*nz)+id];
               Q[id-2*nx+npml+nnz*(npml-1)]=Q_bndr[it*(2*nx+2*nz)+id];

              }else if(id>=(2*nx+nz)){//right

               P[id-2*nx-nz+npml+nnz*(npml+nx+1)]=P_bndr[it*(2*nx+2*nz)+id];
               Q[id-2*nx-nz+npml+nnz*(npml+nx+1)]=Q_bndr[it*(2*nx+2*nz)+id];

                }
             }
            }       
}
/*************func**************/    
__global__ void mute_directwave(int nx,int nt,float dt,float favg,
                     float dx,float dz,int fs,int ds,int zs,int is,
                     float *vp,float *epsilon,float *shot,int tt)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

    int mu_t,mu_nt;
    float mu_x,mu_z,mu_t0;

    int ix=id/nt;
    int it=id%nt;

   if(id<nx*nt)
   {
        mu_x=dx*abs(ix-fs-(is-1)*ds);
        mu_z=dz*zs;
        mu_t0=sqrtf(pow(mu_x,2)+pow(mu_z,2))/(vp[1]*sqrtf(1+2*epsilon[1]));
        mu_t=(int)(2.0/(dt*favg));
        mu_nt=(int)(mu_t0/dt)+mu_t+tt;

           if((it>(int)(mu_t0/dt)-tt)&&(it<mu_nt))
              shot[id]=0.0;
   }
}
/*************func**************/    
__global__ void cal_illumination(int nnx, int nnz, int nz, int npml, float *illumination, float *P, float *Q)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;
    int ix=id/nz;
    int iz=id%nz;

   if(id<nnx*nnz)
   {
      illumination[id]+=P[iz+npml+nnz*(ix+npml)]*P[iz+npml+nnz*(ix+npml)]
                         +Q[iz+npml+nnz*(ix+npml)]*Q[iz+npml+nnz*(ix+npml)];
      if(illumination[id]==0)illumination[id]=1.0;
   }
}
/*************func**************/    
__global__ void cal_migration(int nnx, int nnz, int nz, int npml, float *migration, float *s, float *g)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;
    int ix=id/nz;
    int iz=id%nz;

   if(id<nnx*nnz)
   {
      migration[id]+=s[iz+npml+nnz*(ix+npml)]*g[iz+npml+nnz*(ix+npml)];
   }
}
/*************func**************/    
__global__ void migration_illum(int nx, int nz, int npml, float *migration, float *illumination)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;

   if(id<nx*nz)
   {
      migration[id]/=illumination[id];//*illumination[id];
   }
}
/*************func**************/    
__global__ void Poynting_Adcigs(int nnz, int nx, int nz, int npml, int na, int da, float *adcigs, 
                           float *s_P, float *s_Q, float *s_u, float *s_w, 
                           float *g_P, float *g_Q, float *g_u, float *g_w)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;
    int ix=id/nz;
    int iz=id%nz;

    int ia=0;

    float Ssx=-s_P[iz+npml+nnz*(ix+npml)]*s_u[iz+npml+nnz*(ix+npml)];
    float Ssz=-s_Q[iz+npml+nnz*(ix+npml)]*s_w[iz+npml+nnz*(ix+npml)];
    float Sgx= g_P[iz+npml+nnz*(ix+npml)]*g_u[iz+npml+nnz*(ix+npml)];
    float Sgz= g_Q[iz+npml+nnz*(ix+npml)]*g_w[iz+npml+nnz*(ix+npml)];

    float b1= Ssx*Ssx + Ssz*Ssz;
    float b2= Sgx*Sgx + Sgz*Sgz;
    float  a=(Ssx*Sgx + Ssz*Sgz)/(sqrtf(b1*b2)*(1 - 0.1));

   if(id<nx*nz)
   {
     if(a>=-1&&a<=1)
      {
         a=0.5*acosf(a)*180.0/pi;
         ia=(int)(a/(da*1.0));
         if(ia<na)
          {
             adcigs[iz+nz*ia+nz*na*(id/nz)] += s_P[iz+npml+nnz*(ix+npml)]*g_P[iz+npml+nnz*(ix+npml)]
                                                *cosf(ia*pi/180.0)*cosf(ia*pi/180.0)*cosf(ia*pi/180.0);
          }
      }
   }
}
/*************func**************/    
__global__ void adcigs_illum(int nx, int nz, int na, int da, float *adcigs, float *illumination)
{
    int id=threadIdx.x+blockDim.x*blockIdx.x;
    int ix=id/(nz*na);
    int iz=id%nz;

   if(id<nx*nz*na)
   {
      adcigs[id]/=illumination[iz+nz*ix];//*illumination[iz+nz*ix];
   }
}
/*************func**************/ 
void adcigs_laplace_filter(int nx,int nz,int na,float *adcigs,int flag)
{
   int ix,iz,ia,id,ido;
   float *in, *out,*temp;

    	 in=(float*)malloc(nz*nx*sizeof(float));
    	out=(float*)malloc(nz*nx*sizeof(float));
    	temp=(float*)malloc(nz*nx*na*sizeof(float));

   if(flag==1||flag==3)
   for (ia=0; ia<na; ia++){
       for (ix=0; ix<nx; ix++){
           for (iz=0; iz<nz; iz++){
                   id=ix*na*nz+ia*nz+iz;
                   ido=ix*nz+iz;
                   in[ido]=adcigs[id];
             }
        }
       laplac2_lop( 1, nz, nx, in,  out );
       for (ix=0; ix<nx; ix++)  {
          for (iz=0; iz<nz; iz++) {
                   id=ix*na*nz+ia*nz+iz;
                   ido=ix*nz+iz;
                   temp[id]+=out[ido];
                   if(flag==1)adcigs[id]=temp[id];
            }
        }
   }
   if(flag!=1)
   for (ia=na-1; ia>=0; ia--) {
       for (ix=0; ix<nx; ix++) {
          for (iz=0; iz<nz; iz++) {
                   id=ix*na*nz+ia*nz+iz;
                   ido=ix*nz+iz;
                   in[ido]=adcigs[id];
           }
       }
      laplac2_lop( 1, nz, nx, in,  out );
      for (ix=0; ix<nx; ix++) {
          for (iz=0; iz<nz; iz++) {
                   id=ix*na*nz+ia*nz+iz;
                   ido=ix*nz+iz;
                   temp[id]+=out[ido];
                   if(flag==2||flag==3) adcigs[id]=temp[id];
            }
       }
   }
}
/*************func**************/ 
void adcigs_chouxi(int nx,int nz,int na,int nxa,int dadcigs,float *adcigs)
{
   int ix,iz,ia,id,ido;
   float *temp;

   temp=(float*)malloc(nz*nxa*na*sizeof(float));
   for (ix=0; ix<nx; ix++)  {
       for (ia=0; ia<na; ia++)  {
           for (iz=0; iz<nz; iz++)  {
                id=ix*na*nz+ia*nz+iz;
                if(ix%dadcigs==0) {
                      ido=ix/dadcigs*na*nz+ia*nz+iz;
                      temp[ido]=adcigs[id];
                      adcigs[ido]=temp[ido];
                  }
             }
        }
   }

}
/*************func**************/ 
void stk_adcigs(int nx,int nz,int na,float *adcigs,float *migration)
{
   int ix,iz,ia,id,ido;
   float stk;

   for (ix=0; ix<nx; ix++)  {
       for (iz=0; iz<nz; iz++)  {
           stk=0.0;
           for (ia=0; ia<na; ia++)  {
                id=ix*na*nz+ia*nz+iz;
                stk+=adcigs[id];
             }
           ido=ix*nz+iz;
           migration[ido]=stk;
        }
   }

}
//a########################################################################
//a##                          Main Function                             ##
//a########################################################################
int main(int argc,char *argv[])
{
    printf("%s Starting...\n", argv[0]);

	int is, it, nx, nz, nnx, nnz, nt, wtype, na, da, dadcigs, nxa;
	int ns, ds, fs, zs, npml;
	float dx, dz, dt, t, pfac, favg;
       float *coffx1,*coffx2,*coffz1,*coffz2,*acoffx1,*acoffx2,*acoffz1,*acoffz2;
	float *v, *e, *d;
	float *vp, *epsilon, *delta;
	float *s_u0, *s_u1, *s_px0, *s_qx0, *s_px1, *s_qx1;
       float *s_w0, *s_w1, *s_pz0, *s_qz0, *s_pz1, *s_qz1;
	float *g_u0, *g_u1, *g_px0, *g_qx0, *g_px1, *g_qx1;
       float *g_w0, *g_w1, *g_pz0, *g_qz0, *g_pz1, *g_qz1;
	float *s_P, *s_Q, *g_P, *g_Q, *shot_Dev, *shot_Hos, *P_bndr, *Q_bndr;
       float *migration, *illumination, *adcigs;
       float *Atemp;



       clock_t start, end;
/*************wavelet\boundary**************/
          wtype=1;npml=20;
/********** dat document ***********/

    /*      char FN1[250]={"waxian_3layer/waxian_vel_1001_301.dat"};      
          char FN2[250]={"waxian_3layer/waxian_epsilon_1001_301.dat"};
          char FN3[250]={"waxian_3layer/waxian_delta_1001_301.dat"}; 
	   char FN4[250]={"waxian_3layer/waxian_shot_obs.dat"};
	   char FN5[250]={"waxian_3layer/waxian_snap.dat"};
	   char FN6[250]={"waxian_3layer/waxian_migration.dat"};
	   char FN7[250]={"waxian_3layer/waxian_illumination.dat"};
	   char FN8[250]={"waxian_3layer/waxian_adcigs.dat"}; 
	   char FN9[250]={"waxian_3layer/waxian_adcigs_laplace.dat"}; 
	  char FN10[250]={"waxian_3layer/waxian_adcigs_dadcigs.dat"}; 
	  char FN11[250]={"waxian_3layer/waxian_migration_stkAdcigs.dat"}; */

          char FN1[250]={"BP/BP_vel_1700_601.dat"};      
          char FN2[250]={"BP/BP_eps_1700_601.dat"};
          char FN3[250]={"BP/BP_del_1700_601.dat"}; 
          char FN4[250]={"BP/shot_obs_mute.dat"};
          char FN5[250]={"BP/snap.dat"};
          char FN6[250]={"BP/migration.dat"};
          char FN7[250]={"BP/illumination.dat"};
          char FN8[250]={"BP/adcigs.dat"}; 
          char FN9[250]={"BP/adcigs_laplace.dat"}; 
          char FN10[250]={"BP/adcigs_dadcigs.dat"}; 
          char FN11[250]={"BP/migration_stkAdcigs.dat"}; 


          nx=1700;              
          nz=601;         favg=15;     pfac=1000.0;

          dx=10.0;   
          dz=10.0;   
     
          nt=12001;    
          dt=0.001;
     
          ns=340;       
          fs=nx/ns/2;      
          ds=nx/ns;
          zs=1;  

          na=70; 
          da=1;  
          dadcigs=25;

/********aaa************/  
	 FILE *fpsnap, *fpshot, *fpmig, *fpillum, *fpadcigs, *fpadcigslaplace, *fpdadcigs,*fpstkadcigs;
        fpshot=fopen(FN4,"wb");
        fpsnap=fopen(FN5,"wb");
        fpmig=fopen(FN6,"wb");
        fpillum=fopen(FN7,"wb");
        fpadcigs=fopen(FN8,"wb");
        fpadcigslaplace=fopen(FN9,"wb");
        fpdadcigs=fopen(FN10,"wb");
        fpstkadcigs=fopen(FN11,"wb");
/*************v***************/ 
          nnx=nx+2*npml;
          nnz=nz+2*npml;
          nxa=(int)(nx/dadcigs);
/************a*************/
    	 Atemp=(float*)malloc(nz*nx*na*sizeof(float));

    	 v=(float*)malloc(nnz*nnx*sizeof(float));
    	 e=(float*)malloc(nnz*nnx*sizeof(float));
    	 d=(float*)malloc(nnz*nnx*sizeof(float));
    	 shot_Hos=(float*)malloc(nt*nx*sizeof(float));
        read_file(FN1,FN2,FN3,nx,nz,nnx,nnz,dx,dz,favg,dt,v,e,d,npml);
/****************************/

        cudaSetDevice(0);// initialize device, default device=0;
	 check_gpu_error("Failed to initialize device!");
    CHECK(cudaDeviceReset());
/****************************/
        cudaMalloc(&vp, nnz*nnx*sizeof(float));
        cudaMalloc(&epsilon, nnz*nnx*sizeof(float));
        cudaMalloc(&delta, nnz*nnx*sizeof(float));
	 cudaMemcpy(vp, v, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
	 cudaMemcpy(epsilon, e, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
	 cudaMemcpy(delta, d, nnz*nnx*sizeof(float), cudaMemcpyHostToDevice);
/****************************/
        cudaMalloc(&s_u0, nnz*nnx*sizeof(float));    cudaMalloc(&s_u1, nnz*nnx*sizeof(float));
        cudaMalloc(&s_w0, nnz*nnx*sizeof(float));    cudaMalloc(&s_w1, nnz*nnx*sizeof(float));

        cudaMalloc(&s_P, nnz*nnx*sizeof(float));     cudaMalloc(&s_Q, nnz*nnx*sizeof(float));

        cudaMalloc(&s_px0, nnz*nnx*sizeof(float));   cudaMalloc(&s_px1, nnz*nnx*sizeof(float));
        cudaMalloc(&s_pz0, nnz*nnx*sizeof(float));   cudaMalloc(&s_pz1, nnz*nnx*sizeof(float));
        cudaMalloc(&s_qx0, nnz*nnx*sizeof(float));   cudaMalloc(&s_qx1, nnz*nnx*sizeof(float));
        cudaMalloc(&s_qz0, nnz*nnx*sizeof(float));   cudaMalloc(&s_qz1, nnz*nnx*sizeof(float));

        cudaMalloc(&g_u0, nnz*nnx*sizeof(float));    cudaMalloc(&g_u1, nnz*nnx*sizeof(float));
        cudaMalloc(&g_w0, nnz*nnx*sizeof(float));    cudaMalloc(&g_w1, nnz*nnx*sizeof(float));

        cudaMalloc(&g_P, nnz*nnx*sizeof(float));     cudaMalloc(&g_Q, nnz*nnx*sizeof(float));

        cudaMalloc(&g_px0, nnz*nnx*sizeof(float));   cudaMalloc(&g_px1, nnz*nnx*sizeof(float));
        cudaMalloc(&g_pz0, nnz*nnx*sizeof(float));   cudaMalloc(&g_pz1, nnz*nnx*sizeof(float));
        cudaMalloc(&g_qx0, nnz*nnx*sizeof(float));   cudaMalloc(&g_qx1, nnz*nnx*sizeof(float));
        cudaMalloc(&g_qz0, nnz*nnx*sizeof(float));   cudaMalloc(&g_qz1, nnz*nnx*sizeof(float));

        cudaMalloc(&coffx1, nnx*sizeof(float));     cudaMalloc(&coffx2, nnx*sizeof(float));
        cudaMalloc(&coffz1, nnz*sizeof(float));     cudaMalloc(&coffz2, nnz*sizeof(float));
        cudaMalloc(&acoffx1, nnx*sizeof(float));    cudaMalloc(&acoffx2, nnx*sizeof(float));
        cudaMalloc(&acoffz1, nnz*sizeof(float));    cudaMalloc(&acoffz2, nnz*sizeof(float));

        cudaMalloc(&shot_Dev, nx*nt*sizeof(float));
        cudaMalloc(&P_bndr, nt*(2*nx+2*nz)*sizeof(float));
        cudaMalloc(&Q_bndr, nt*(2*nx+2*nz)*sizeof(float));

        cudaMalloc(&migration, nz*nx*sizeof(float)); 
        cudaMalloc(&illumination, nz*nx*sizeof(float));
        cudaMalloc(&adcigs, nz*na*nx*sizeof(float));
/******************************/
	 check_gpu_error("Failed to allocate memory for variables!");

        get_d0<<<1, 1>>>(dx, dz, nnx, nnz, npml, vp);
        initial_coffe<<<(nnx+511)/512, 512>>>(dt,nx,coffx1,coffx2,acoffx1,acoffx2,npml);
        initial_coffe<<<(nnz+511)/512, 512>>>(dt,nz,coffz1,coffz2,acoffz1,acoffz2,npml);

        cudaMemset(migration, 0, nz*nx*sizeof(float)); 
        cudaMemset(illumination, 0, nz*nx*sizeof(float));
        cudaMemset(adcigs, 0, nz*na*nx*sizeof(float));
        printf("--------------------------------------------------------\n");
        printf("---");   
        start = clock();                                  
/**********IS Loop start*******/
   for(is=1;is<=ns;is++)	
    {     
         printf("\n---   IS=%3d  ",is);

     cudaMemset(s_u0, 0, nnz*nnx*sizeof(float));     cudaMemset(s_u1, 0, nnz*nnx*sizeof(float));
     cudaMemset(s_w0, 0, nnz*nnx*sizeof(float));     cudaMemset(s_w1, 0, nnz*nnx*sizeof(float));

     cudaMemset(s_P, 0, nnz*nnx*sizeof(float));      cudaMemset(s_Q, 0, nnz*nnx*sizeof(float));

     cudaMemset(s_px0, 0, nnz*nnx*sizeof(float));    cudaMemset(s_px1, 0, nnz*nnx*sizeof(float));
     cudaMemset(s_pz0, 0, nnz*nnx*sizeof(float));    cudaMemset(s_pz1, 0, nnz*nnx*sizeof(float));
     cudaMemset(s_qx0, 0, nnz*nnx*sizeof(float));    cudaMemset(s_qx1, 0, nnz*nnx*sizeof(float));
     cudaMemset(s_qz0, 0, nnz*nnx*sizeof(float));    cudaMemset(s_qz1, 0, nnz*nnx*sizeof(float));

     cudaMemset(g_u0, 0, nnz*nnx*sizeof(float));     cudaMemset(g_u1, 0, nnz*nnx*sizeof(float));
     cudaMemset(g_w0, 0, nnz*nnx*sizeof(float));     cudaMemset(g_w1, 0, nnz*nnx*sizeof(float));

     cudaMemset(g_P, 0, nnz*nnx*sizeof(float));      cudaMemset(g_Q, 0, nnz*nnx*sizeof(float));

     cudaMemset(g_px0, 0, nnz*nnx*sizeof(float));    cudaMemset(g_px1, 0, nnz*nnx*sizeof(float));
     cudaMemset(g_pz0, 0, nnz*nnx*sizeof(float));    cudaMemset(g_pz1, 0, nnz*nnx*sizeof(float));
     cudaMemset(g_qx0, 0, nnz*nnx*sizeof(float));    cudaMemset(g_qx1, 0, nnz*nnx*sizeof(float));
     cudaMemset(g_qz0, 0, nnz*nnx*sizeof(float));    cudaMemset(g_qz1, 0, nnz*nnx*sizeof(float));

     cudaMemset(shot_Dev, 0, nt*nx*sizeof(float));
     cudaMemset(P_bndr, 0, nt*(2*nx+2*nz)*sizeof(float));
     cudaMemset(Q_bndr, 0, nt*(2*nx+2*nz)*sizeof(float));

/*a***********************************Forward*******************************************/
     for(it=0,t=dt;it<nt;it++,t+=dt)
     { 
      //if(it==0)printf(" > F >",is,it);
       /*a#####################a*/
       /*a##     Forward     ##a*/
       /*a#####################a*/
	 add_source<<<1,1>>>(pfac,fs,zs,nx,nz,nnx,nnz,dt,t,favg,wtype,npml,is,ds,s_P,s_Q);
        update_vel<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,npml,dt,dx,dz,
                                               s_u0,s_w0,s_u1,s_w1,s_P,s_Q,coffx1,coffx2,coffz1,coffz2);
        update_stress<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,dt,dx,dz,s_u1,s_w1,s_P,s_Q,vp,npml,
                                                  s_px1,s_px0,s_pz1,s_pz0,s_qx1,s_qx0,s_qz1,s_qz0,
                                                  acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,true);
        s_u0=s_u1; s_w0=s_w1; s_px0=s_px1; s_pz0=s_pz1; s_qx0=s_qx1; s_qz0=s_qz1; 

        shot_record<<<(nx+511)/512, 512>>>(nnx, nnz, nx, nz, npml, it, nt, s_P, shot_Dev, true);
        wavefield_bndr<<<((2*nx+2*nz)+511)/512,512>>>(nnx, nnz, nx, nz, npml, it, nt, s_P, s_Q, P_bndr, Q_bndr, true);
        cal_illumination<<<(nx*nz+511)/512, 512>>>(nnx, nnz, nz, npml, illumination, s_P, s_Q);

           if((is==1)&&(it%300==0))
            {
	       cudaMemcpy(e, s_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
              fwrite(e,4L,nnx*nnz,fpsnap);
            }
     }//it loop end
      mute_directwave<<<(nx*nt+511)/512, 512>>>(nx,nt,dt,favg,dx,dz,fs,ds,zs,is,vp,epsilon,shot_Dev,20);
      cudaMemcpy(shot_Hos, shot_Dev, nt*nx*sizeof(float), cudaMemcpyDeviceToHost);
      fseek(fpshot,(is-1)*nt*nx*sizeof(float),0);
      fwrite(shot_Hos,sizeof(float),nt*nx,fpshot);
/*a***********************************Backward*******************************************/
     for(it=nt-1;it>=0;it--)
     { 
     // if(it==0)printf("  B ",is,it);
       /*a#####################a*/
       /*a##  Reconstruction ##a*/
       /*a#####################a*/
        wavefield_bndr<<<((2*nx+2*nz)+511)/512,512>>>(nnx, nnz, nx, nz, npml, it, nt, s_P, s_Q, P_bndr, Q_bndr, false);
        update_vel<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,npml,dt,dx,dz,
                                               s_u0,s_w0,s_u1,s_w1,s_P,s_Q,coffx1,coffx2,coffz1,coffz2);
        update_stress<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,dt,dx,dz,s_u1,s_w1,s_P,s_Q,vp,npml,
                                                  s_px1,s_px0,s_pz1,s_pz0,s_qx1,s_qx0,s_qz1,s_qz0,
                                                  acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,false);
        s_u0=s_u1; s_w0=s_w1; s_px0=s_px1; s_pz0=s_pz1; s_qx0=s_qx1; s_qz0=s_qz1; 

         /*  if((is==1)&&(it%300==0))
            {
	       cudaMemcpy(e, s_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
              fwrite(e,4L,nnx*nnz,fpsnap);
            }*/
       /*a#####################a*/
       /*a##     Backward    ##a*/
       /*a#####################a*/
        shot_record<<<(nx+511)/512, 512>>>(nnx, nnz, nx, nz, npml, it, nt, g_P, shot_Dev, false);
        shot_record<<<(nx+511)/512, 512>>>(nnx, nnz, nx, nz, npml, it, nt, g_Q, shot_Dev, false);
        update_vel<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,npml,dt,dx,dz,
                                               g_u0,g_w0,g_u1,g_w1,g_P,g_Q,coffx1,coffx2,coffz1,coffz2);
        update_stress<<<(nnx*nnz+511)/512, 512>>>(nx,nz,nnx,nnz,dt,dx,dz,g_u1,g_w1,g_P,g_Q,vp,npml,
                                                  g_px1,g_px0,g_pz1,g_pz0,g_qx1,g_qx0,g_qz1,g_qz0,
                                                  acoffx1,acoffx2,acoffz1,acoffz2,delta,epsilon,fs,ds,zs,is,false);
        g_u0=g_u1; g_w0=g_w1; g_px0=g_px1; g_pz0=g_pz1; g_qx0=g_qx1; g_qz0=g_qz1; 

        /*   if((is==1)&&(it%300==0))
            {
	       cudaMemcpy(e, g_P, nnz*nnx*sizeof(float), cudaMemcpyDeviceToHost);
              fwrite(e,4L,nnx*nnz,fpsnap);
            }*/
        cal_migration<<<(nx*nz+511)/512, 512>>>(nnx, nnz, nz, npml, migration, s_P, g_P);

        Poynting_Adcigs<<<(nx*nz+511)/512, 512>>>(nnz, nx, nz, npml, na, da, adcigs, 
                                                       s_P, s_Q, s_u0, s_w0, g_P, g_Q, g_u0, g_w0);
     }//it loop end

   }//is loop end

   migration_illum<<<(nx*nz+511)/512, 512>>>(nx, nz, npml, migration, illumination);
   adcigs_illum<<<(nx*nz*na+511)/512, 512>>>(nx, nz, na, da, adcigs, illumination);

   printf("\n->");
   cudaMemcpy(e, migration, nz*nx*sizeof(float), cudaMemcpyDeviceToHost);
   migraiton_laplace_filter(1,nz,nx,e,d);
   fwrite(d,sizeof(float),nx*nz,fpmig);

   printf("->");
   cudaMemcpy(e, illumination, nz*nx*sizeof(float), cudaMemcpyDeviceToHost);
   fwrite(e,sizeof(float),nx*nz,fpillum);

   printf("->");
   cudaMemcpy(Atemp, adcigs, nz*nx*na*sizeof(float), cudaMemcpyDeviceToHost);
   fwrite(Atemp,sizeof(float),nz*nx*na,fpadcigs);

   printf("->");
   adcigs_laplace_filter(nx,nz,na,Atemp,2);/*1:(0-na);2:(na-0);3:(0-na-0)*/
   fwrite(Atemp,sizeof(float),nz*nx*na,fpadcigslaplace);

   printf("->");
   stk_adcigs(nx,nz,na,Atemp,d);
   fwrite(d,sizeof(float),nx*nz,fpstkadcigs);

   printf("->");
   adcigs_chouxi(nx,nz,na,nxa,dadcigs,Atemp);
   fwrite(Atemp,sizeof(float),nz*nxa*na,fpdadcigs);
   printf(" done!\n");

   end = clock();
/*********IS Loop end*********/ 		     
   printf("\n---   Complete!!!!!!!!! \n");  
   printf("total %d shots: %f (min)\n", ns, ((float)(end-start))/60.0/CLOCKS_PER_SEC);



/***********close************/ 
     fclose(fpsnap);   fclose(fpshot);  fclose(fpmig);
     fclose(fpillum);  fclose(fpadcigs);fclose(fpadcigslaplace);
     fclose(fpdadcigs); fclose(fpstkadcigs);
/***********free*************/ 
       cudaFree(coffx1);       cudaFree(coffx2);
       cudaFree(coffz1);       cudaFree(coffz2);
       cudaFree(acoffx1);      cudaFree(acoffx2);
       cudaFree(acoffz1);      cudaFree(acoffz2);

       cudaFree(s_u0);           cudaFree(s_u1);
       cudaFree(s_w0);           cudaFree(s_w1);

       cudaFree(s_P);            cudaFree(s_Q);

       cudaFree(s_px0);          cudaFree(s_px1);
       cudaFree(s_pz0);          cudaFree(s_pz1);
       cudaFree(s_qx0);          cudaFree(s_qx1);
       cudaFree(s_qz0);          cudaFree(s_qz1);

       cudaFree(g_u0);           cudaFree(g_u1);
       cudaFree(g_w0);           cudaFree(g_w1);

       cudaFree(g_P);            cudaFree(g_Q);

       cudaFree(g_px0);          cudaFree(g_px1);
       cudaFree(g_pz0);          cudaFree(g_pz1);
       cudaFree(g_qx0);          cudaFree(g_qx1);
       cudaFree(g_qz0);          cudaFree(g_qz1);

       cudaFree(shot_Dev);

       cudaFree(P_bndr);        cudaFree(Q_bndr);

       cudaFree(migration); 
       cudaFree(illumination);
       cudaFree(adcigs);
/***************host free*****************/
	free(v);	free(e);	free(d);
       free(shot_Hos);    free(Atemp);
       exit(0);
}








已标记关键词 清除标记
相关推荐
课程简介: 历经半个多月的时间,Debug亲自撸的 “企业员工角色权限管理平台” 终于完成了。如字面意思,本课程讲解的是一个真意义上的、企业级的项目实战,主要介绍了企业级应用系统中后端应用权限的管理,其中主要涵盖了六大核心业务模块、十几张数据库表。 其中的核心业务模块主要包括用户模块、部门模块、岗位模块、角色模块、菜单模块和系统日志模块;与此同时,Debug还亲自撸了额外的附属模块,包括字典管理模块、商品分类模块以及考勤管理模块等等,主要是为了更好地巩固相应的技术栈以及企业应用系统业务模块的开发流程! 核心技术栈列表: 值得介绍的是,本课程在技术栈层面涵盖了前端和后端的大部分常用技术,包括Spring Boot、Spring MVC、Mybatis、Mybatis-Plus、Shiro(身份认证与资源授权跟会话等等)、Spring AOP、防止XSS攻击、防止SQL注入攻击、过滤器Filter、验证码Kaptcha、热部署插件Devtools、POI、Vue、LayUI、ElementUI、JQuery、HTML、Bootstrap、Freemarker、一键打包部署运行工具Wagon等等,如下图所示: 课程内容与收益: 总的来说,本课程是一门具有很强实践性质的“项目实战”课程,即“企业应用员工角色权限管理平台”,主要介绍了当前企业级应用系统中员工、部门、岗位、角色、权限、菜单以及其他实体模块的管理;其中,还重点讲解了如何基于Shiro的资源授权实现员工-角色-操作权限、员工-角色-数据权限的管理;在课程的最后,还介绍了如何实现一键打包上传部署运行项目等等。如下图所示为本权限管理平台的数据库设计图: 以下为项目整体的运行效果截图: 值得一提的是,在本课程中,Debug也向各位小伙伴介绍了如何在企业级应用系统业务模块的开发中,前端到后端再到数据库,最后再到服务器的上线部署运行等流程,如下图所示:
©️2020 CSDN 皮肤主题: 酷酷鲨 设计师:CSDN官方博客 返回首页