StencilOps1D.c

 /*	Basic Sparse matrix operations for C			*/

 /*	John Pormann, NSF/ERC-ECT at Duke University		*/
 /*		jbp@erc-sparc.mc.duke.edu			*/

 /*	jcoef: Ndim [xdim ydim zdim] DiagFlag? Nextras			*/
 /*		0-D:  0 0 0					*/
 /*		1-D:  1 xd 0 0					*/
 /*		2-D:  2 xd yd 0 0					*/
 /*		3-D:  3 xd yd zd 0 0				*/

 /*	0-D is essentially just a diagonal matrix		*/

 /* Copyright John B. Pormann, 21 July 2000, all rights reserved */

 /* RCSID: $Id: StencilOps1D.c,v 1.5 2005/05/18 21:29:22 john Exp $ */


 #include "CardioWave.h"

 int GetBandwidth1D( domain_t d );

 int sprmpy0( real alpha, sparse A, vector x, real beta, vector y );
 int sprmpy1( real alpha, sparse A, vector x, real beta, vector y );
 int sprmpy2( real alpha, sparse A, vector x, real beta, vector y );
 int sprmpy3( real alpha, sparse A, vector x, real beta, vector y );

 static int wksp_sz = 0;
 static real* wksp_y = NULL;
 static real* wksp_x = NULL;

 int coefsize( int ndim ) {
 	int r = 0;
 	switch( ndim ) {
 		case 3: r = 27;
 			break;
 		case 2: r = 9;
 			break;
 		case 1: r = 3;
 			break;
 		case 0: r = 1;
 			break;
 	}
 	return( r );
 }

 /* allocate a sparse matrix of the given type */
 int spralloc( sparse* M, domain_t dtp, int type, int rows,
 		int cols, int maxnz ) {
 	if( maxnz < 1024 ) {
 		maxnz = 1024;
 	}

 	M->type   = type;
 	M->rows   = rows;
 	M->cols   = cols;
 	M->maxnz  = maxnz;
 	M->csep   = 0;
 	M->dtype  = dtp;
 	/* grab 2 message tags */
 	M->msgtag = NextMsgTag();
 	NextMsgTag();

 	/* jcoef array is of size = maxnz */
 	M->jcoef = (int*)malloc( maxnz*sizeof(int) );
 	if( M->jcoef == NULL ) {
 		return( -1 );
 	}
 	memset( (void*)(M->jcoef), 0, maxnz*sizeof(int) );

 	/* coef array is of size = maxnz */
 	/* : caller should set maxnz >= 3^Ndim */
 	M->coef  = (real*)malloc( maxnz*sizeof(real) );
 	if( M->coef == NULL ) {
 		return( -2 );
 	}
 	memset( (void*)(M->coef), 0, maxnz*sizeof(real) );

 	if( maxnz > wksp_sz ) {
 		if( wksp_y != NULL ) {
 			free( wksp_y );
 		}
 		if( wksp_x != NULL ) {
 			free( wksp_x );
 		}
 		wksp_sz = maxnz;
 		wksp_y = (real*)malloc( maxnz*sizeof(real) );
 		if( wksp_y == NULL ) {
 			return( -3 );
 		}
 		wksp_x = (real*)malloc( maxnz*sizeof(real) );
 		if( wksp_x == NULL ) {
 			return( -4 );
 		}
 	}

 	return( 0 );
 }

 int sprfree( sparse* M ) {
 	if( M->jcoef != NULL ) {
 		free( M->jcoef );
 	}
 	if( M->coef != NULL ) {
 		free( M->coef );
 	}

 	M->type   = 0;
 	M->rows   = 0;
 	M->cols   = 0;
 	M->maxnz  = 0;
 	M->jcoef  = NULL;
 	M->coef   = NULL;
 	M->dtype  = -1;
 	M->csep   = 0;
 	M->msgtag = MPI_ANY_TAG;

 	return( 0 );
 }

 int sswap( sparse* Ap, sparse* Bp ) {
 	sparse C;

 	C   = *Ap;
 	*Ap = *Bp;
 	*Bp = C;

 	return( 0 );
 }

 int scopy( sparse A, sparse B ) {
 	if( (A.type!=B.type) or (A.rows!=B.rows) or (A.cols!=B.cols)
 	    or (A.maxnz!=B.maxnz) ) {
 		return( -1 );
 	}

 	memcpy( B.jcoef, A.jcoef, A.maxnz*sizeof(int) );
 	memcpy( B.coef, A.coef, A.maxnz*sizeof(real) );

 	return( 0 );
 }

 int szero( sparse A ) {
 	int sz;

 	sz = coefsize( A.jcoef[0] );
 	memset( (void*)(A.coef), 0, sz*sizeof(real) );

 	return( 0 );
 }

 int sscale( real beta, sparse M1 ) {
 	int i,m,sz;
 	real* av = M1.coef;

 	sz = coefsize( M1.jcoef[0] );
 	m  = sz - sz%4;

 	for(i=0;i<m;i+=4) {
 		av[i]   *= beta;
 		av[i+1] *= beta;
 		av[i+2] *= beta;
 		av[i+3] *= beta;
 	}
 	for(;i<sz;i++) {
 		av[i] *= beta;
 	}

 	return( 0 );
 }

 int ssadd( real alpha, sparse A, sparse B ) {
 	int i;
 	int m,sz;
 	int rrr  = A.rows;
 	int ccc  = A.cols;
 	real* av = A.coef;
 	real* bv = B.coef;
 	real  x0,x1,x2,x3;

 	if( (rrr!=B.rows) or (ccc!=B.cols) or (A.type!=B.type) ) {
 		return( -1 );
 	}

 	sz = coefsize( A.jcoef[0] );
 	m  = sz - sz%4;

 	for(i=0;i<m;i+=4) {
 		x0 = alpha*av[i];
 		x1 = alpha*av[i+1];
 		x2 = alpha*av[i+2];
 		x3 = alpha*av[i+3];
 		bv[i]   += x0;
 		bv[i+1] += x1;
 		bv[i+2] += x2;
 		bv[i+3] += x3;
 	}
 	for(;i<sz;i++) {
 		bv[i] += alpha*av[i];
 	}

 	return( 0 );
 }

 int ssadddiag( real alpha, vector V1, sparse M2 ) {
 	/* assume V1 is size=1 */
 	switch( M2.jcoef[0] ) {
 		case 3: M2.coef[13] += alpha*V1.data[0];
 			break;
 		case 2: M2.coef[4] += alpha*V1.data[0];
 			break;
 		case 1: M2.coef[1] += alpha*V1.data[0];
 			break;
 		case 0: M2.coef[0] += alpha*V1.data[0];
 			break;
 	}

 	return( 0 );
 }

 void sprnonz( sparse A, int i, int* n, int* lst ) {
 	int nd;
 	int xd = 0, yd = 0, zd = 0;
 	int x  = 0, y  = 0, z  = 0;
 	int c;
 	int tissuesize;

 	/* unpack the dimensions from A.jcoef[] */
 	nd = A.jcoef[0];

 	tissuesize = DomainGlobalSize( A.dtype );

 	/* : note there is fall-thru for case 3 and 2 */
 	switch( nd ) {
 		case 3: xd = A.jcoef[1];
 			yd = A.jcoef[2];
 			if( NumPEs > 1 ) {
 				zd = tissuesize/(xd*yd);
 			} else {
 				zd = A.jcoef[3];
 			}
 			break;
 		case 2: xd = A.jcoef[1];
 			if( NumPEs > 1 ) {
 				yd = tissuesize/xd;
 			} else {
 				yd = A.jcoef[2];
 			}
 			break;
 		case 1: if( NumPEs > 1 ) {
 				xd = tissuesize;
 			} else {
 				xd = A.jcoef[1];
 			}
 			break;
 	}

 	/* get coordinates */
 	/* : note there is fall-thru for case 3 and 2 */
 	/* ::old code:: c = i + TissueSplit[SelfPE]; */
 	c = i + Local2Global( A.dtype, 0 );
 	switch( nd ) {
 		case 3: z = c/(xd*yd);
 			c = c%(xd*yd);
 		case 2: y = c/xd;
 			c = c%xd;
 		case 1: x = c;
 	}

 	/* store neighbors into lst */
 	c = 0;
 	lst[c] = i;
 	c++;
 	switch( nd ) {
 		case 3: /* needs work! */
 			if( z > 0 ) {
 				lst[c] = i-(xd*yd);
 				c++;
 			}
 			if( z < (zd-1) ) {
 				lst[c] = i+(xd*yd);
 				c++;
 			}
 		case 2: if( y > 0 ) {
 				if( x > 0 ) {
 					lst[c] = i-xd-1;
 					c++;
 				}
 				lst[c] = i-xd;
 				c++;
 				if( x < (xd-1) ) {
 					lst[c] = i-xd+1;
 					c++;
 				}
 			}
 			if( y < (yd-1) ) {
 				if( x > 0 ) {
 					lst[c] = i+xd-1;
 					c++;
 				}
 				lst[c] = i+xd;
 				c++;
 				if( x < (xd-1) ) {
 					lst[c] = i+xd+1;
 					c++;
 				}
 			}
 		case 1: if( x > 0 ) {
 				lst[c] = i-1;
 				c++;
 			}
 			if( x < (xd-1) ) {
 				lst[c] = i+1;
 				c++;
 			}
 	}

 	*n = c;

 	return;
 }

 int sprput( sparse m1, int i, int j, real v ) {
 	int ndim = m1.jcoef[0];
 	int xd,yd,zd,x1,x2,y1,y2,z1,z2;
 	int xdiff,ydiff,zdiff;
 	int n,flag;
 	int r = -1;

 	switch( ndim ) {
 		case 3: xd = m1.jcoef[1];
 			yd = m1.jcoef[2];
 			zd = m1.jcoef[3];
 			z1 = i/(xd*yd);
 			y1 = (i/xd)%yd;
 			x1 = i%xd;
 			z2 = j/(xd*yd);
 			y2 = (j/xd)%yd;
 			x2 = j%xd;
 			zdiff = z2 - z1;
 			ydiff = y2 - y1;
 			xdiff = x2 - x1;
 			n = 13;
 			flag = 0;
 			if( (xdiff>=-1) and (xdiff<=1) ) {
 				n += xdiff;
 				flag++;
 			}
 			if( (ydiff>=-1) and (ydiff<=1) ) {
 				n += ydiff*3;
 				flag++;
 			}
 			if( (zdiff>=-1) and (zdiff<=1) ) {
 				n += zdiff*9;
 				flag++;
 			}
 			if( flag == 3 ) {
 				m1.coef[n] = v;
 				r = 0;
 			}
 			break;
 		case 2: xd = m1.jcoef[1];
 			yd = m1.jcoef[2];
 			y1 = (i/xd)%yd;
 			x1 = i%xd;
 			y2 = (j/xd)%yd;
 			x2 = j%xd;
 			ydiff = y2 - y1;
 			xdiff = x2 - x1;
 			n = 4;
 			flag = 0;
 			if( (xdiff>=-1) and (xdiff<=1) ) {
 				n += xdiff;
 				flag++;
 			}
 			if( (ydiff>=-1) and (ydiff<=1) ) {
 				n += ydiff*3;
 				flag++;
 			}
 			if( flag == 2 ) {
 				m1.coef[n] = v;
 				r = 0;
 			}
 			break;
 		case 1: xdiff = (i-j);
 			if( (xdiff>=-1) and (xdiff<=1) ) {
 				m1.coef[1+xdiff] = v;
 				r = 0;
 			}
 			break;
 		case 0: m1.coef[0] = v;
 			r = 0;
 			break;
 	}

 	return( r );
 }

 real sprget( sparse m1, int i, int j ) {
 	int ndim = m1.jcoef[0];
 	int diff = (i-j);
 	int xd,yd,zd,x1,y1,z1,x2,y2,z2;
 	int xdiff,ydiff,zdiff;
 	int n,flag;

 	switch( ndim ) {
 		case 3: xd = m1.jcoef[1];
 			yd = m1.jcoef[2];
 			zd = m1.jcoef[3];
 			z1 = i/(xd*yd);
 			y1 = (i/xd)%yd;
 			x1 = i%xd;
 			z2 = j/(xd*yd);
 			y2 = (j/xd)%yd;
 			x2 = j%xd;
 			zdiff = z2 - z1;
 			ydiff = y2 - y1;
 			xdiff = x2 - x1;
 			n = 13;
 			flag = 0;
 			if( (xdiff>=-1) and (xdiff<=1) ) {
 				n += xdiff;
 				flag++;
 			}
 			if( (ydiff>=-1) and (ydiff<=1) ) {
 				n += ydiff*3;
 				flag++;
 			}
 			if( (zdiff>=-1) and (zdiff<=1) ) {
 				n += zdiff*9;
 				flag++;
 			}
 			if( flag == 3 ) {
 				return( m1.coef[n] );
 			}
 			break;
 		case 2: xd = m1.jcoef[1];
 			yd = m1.jcoef[2];
 			y1 = (i/xd)%yd;
 			x1 = i%xd;
 			y2 = (j/xd)%yd;
 			x2 = j%xd;
 			ydiff = y2 - y1;
 			xdiff = x2 - x1;
 			n = 4;
 			flag = 0;
 			if( (xdiff>=-1) and (xdiff<=1) ) {
 				n += xdiff;
 				flag++;
 			}
 			if( (ydiff>=-1) and (ydiff<=1) ) {
 				n += ydiff*3;
 				flag++;
 			}
 			if( flag == 2 ) {
 				return( m1.coef[n] );
 			}
 			break;
 		case 1: xdiff = (i-j);
 			if( xdiff == 0 ) {
 				return( m1.coef[1] );
 			} else if( diff == 1 ) {
 				return( m1.coef[2] );
 			} else if( diff == -1 ) {
 				return( m1.coef[0] );
 			}
 			break;
 		case 0: return( m1.coef[0] );
 			break;
 	}

 	return( 0.0 );
 }

 int sprvec( real alpha, sparse A, vector x, real beta, vector y ) {
 	int ndim;
 	int rtn = 0;

 	ndim = A.jcoef[0];
 	switch( ndim ) {
 		case 0: rtn = sprmpy0( alpha, A, x, beta, y );
 			break;
 		case 1: rtn = sprmpy1( alpha, A, x, beta, y );
 			break;
 		case 2: rtn = sprmpy2( alpha, A, x, beta, y );
 			break;
 		case 3: rtn = sprmpy3( alpha, A, x, beta, y );
 			break;
 		default:
 			rtn = -1;			
 	}

 	return( rtn );
 }

 int sprmpy0( real alpha, sparse A, vector x, real beta, vector y ) {
 	int i,r,n;
 	real* yv = y.data;
 	real* xv = x.data;
 	int df = A.jcoef[1];
 	int ne = A.jcoef[2];

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		/* temporarily zero-out the x-vector entries */
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+3];
 			wksp_x[i] = xv[n];
 			xv[n] = 0.0;
 		}
 	}
 	if( df > 0 ) {
 		/* save original values in wksp_y */
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+3];
 			wksp_y[i] = yv[n];
 		}
 	}

 	r = vvaddb( alpha*A.coef[0], x, beta, y );

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+3];
 			xv[n] = wksp_x[i];
 		}
 	}
 	if( df > 0 ) {
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+3];
 			yv[n] = alpha*wksp_x[i] + beta*wksp_y[i];
 		}
 	}

 	return( r );
 }

 int sprmpy1( real alpha, sparse A, vector x, real beta, vector y ) {
 	int   i,n;
 	int   sz = A.rows;
 	int   xd;
 	int   df = A.jcoef[2];
 	int   ne = A.jcoef[3];
 	real* xv = x.data;
 	real* yv = y.data;
 	real  mp0,mp1,mp2;
 	MPI_Request recvreqU,recvreqD;
 	MPI_Request sendreqU,sendreqD;
 	MPI_Status  recvstatU,recvstatD;
 	int NextPE?,PrevPE?;

 	NextPE? = (SelfPE+1)%NumPEs;
 	PrevPE? = (SelfPE+NumPEs-1)%NumPEs;

 	/* note that global x-dim is stored in jcoef[1] */
 	/* but we want local x-dim, so use sz instead   */
 	xd = sz;

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		/* temporarily zero-out the x-vector entries */
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+4];
 			wksp_x[i] = xv[n];
 			xv[n] = 0.0;
 		}
 	}
 	if( df > 0 ) {
 		/* save original values in wksp_y */
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+4];
 			wksp_y[i] = yv[n];
 		}
 	}

 	if( SelfPE != 0 ) {
 		MPI_Irecv( Workspace.data, 1, MPI_SSREAL, PrevPE?, A.msgtag,
 			MPI_COMM_WORLD, &recvreqD );
 		MPI_Isend( xv, 1, MPI_SSREAL, PrevPE?, A.msgtag+1,
 			MPI_COMM_WORLD, &sendreqU );
 	}
 	if( SelfPE != (NumPEs-1) ) {
 		MPI_Irecv( Workspace.data+1, 1, MPI_SSREAL, NextPE?, 
 			A.msgtag+1, MPI_COMM_WORLD, &recvreqU );
 		MPI_Isend( xv+sz-1, 1, MPI_SSREAL, NextPE?, A.msgtag,
 			MPI_COMM_WORLD, &sendreqD );
 	}

 	mp0 = alpha*A.coef[0];
 	mp1 = alpha*A.coef[1];
 	mp2 = alpha*A.coef[2];

 	for(i=1;i<(xd-1);i++) {
 		yv[i] = mp0*xv[i-1] + mp1*xv[i] + mp2*xv[i+1] + beta*yv[i];
 	}

 	/* do the boundaries - either by comm or with NBCs? */
 	if( SelfPE == 0 ) {
 		yv[0] = (mp0+mp1)*xv[0] + mp2*xv[1] + beta*yv[0];
 	} else {
 		MPI_Wait( &recvreqD, &recvstatD );
 		MPI_Request_free( &sendreqU );
 		yv[0] = mp0*Workspace.data[0] + mp1*xv[0] + mp2*xv[1]
 			+ beta*yv[0];
 	}
 	if( SelfPE == (NumPEs-1) ) {
 		i = xd-1;
 		yv[i] = mp0*xv[i-1] + (mp1+mp2)*xv[i] + beta*yv[i];
 	} else {
 		MPI_Wait( &recvreqU, &recvstatU );
 		MPI_Request_free( &sendreqD );
 		i = xd-1;
 		yv[i] = mp0*xv[i-1] + mp1*xv[i] + mp2*Workspace.data[1]
 			+ beta*yv[i];
 	}

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+4];
 			xv[n] = wksp_x[i];
 		}
 	}
 	if( df > 0 ) {
 		for(i=0;i<ne;i++) {
 			n = A.jcoef[i+4];
 			yv[n] = alpha*wksp_x[i] + beta*wksp_y[i];
 		}
 	}

 	return( 0 );
 }

 int sprmpy2( real alpha, sparse A, vector x, real beta, vector y ) {
 	int   i,j,k,o;
 	int   sz = A.rows;
 	int   xd = A.jcoef[1];
 	int   yd;
 	int   df = A.jcoef[3];
 	int   ne = A.jcoef[4];
 	real* xv = x.data;
 	real* yv = y.data;
 	real* zv = NULL;
 	real  mp0,mp1,mp2,mp3,mp4,mp5,mp6,mp7,mp8,mpx;
 	MPI_Request recvreqU,recvreqD;
 	MPI_Request sendreqU,sendreqD;
 	MPI_Status  recvstatU,recvstatD;
 	int NextPE?,PrevPE?;

 	NextPE? = (SelfPE+1)%NumPEs;
 	PrevPE? = (SelfPE+NumPEs-1)%NumPEs;

 	/* note that global y-dim is stored in jcoef[2] */
 	/* but we want local y-dim = global-ydim/NumPEs */
 	/* also local y-dim = sz/xd                     */
 	yd = sz/xd;

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		/* temporarily zero-out the x-vector entries */
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+5];
 			wksp_x[i] = xv[k];
 			xv[k] = 0.0;
 		}
 	}
 	if( df > 0 ) {
 		/* save original values in wksp_y */
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+5];
 			wksp_y[i] = yv[k];
 		}
 	}

 	if( SelfPE != 0 ) {
 		MPI_Irecv( Workspace.data, xd, MPI_SSREAL, PrevPE?, A.msgtag,
 			MPI_COMM_WORLD, &recvreqD );
 		MPI_Isend( xv, xd, MPI_SSREAL, PrevPE?, A.msgtag+1,
 			MPI_COMM_WORLD, &sendreqU );
 	}
 	if( SelfPE != (NumPEs-1) ) {
 		MPI_Irecv( Workspace.data+xd, xd, MPI_SSREAL, NextPE?, 
 			A.msgtag+1, MPI_COMM_WORLD, &recvreqU );
 		MPI_Isend( xv+sz-xd, xd, MPI_SSREAL, NextPE?, A.msgtag,
 			MPI_COMM_WORLD, &sendreqD );
 	}

 	mp0 = alpha*A.coef[0];
 	mp1 = alpha*A.coef[1];
 	mp2 = alpha*A.coef[2];
 	mp3 = alpha*A.coef[3];
 	mp4 = alpha*A.coef[4];
 	mp5 = alpha*A.coef[5];
 	mp6 = alpha*A.coef[6];
 	mp7 = alpha*A.coef[7];
 	mp8 = alpha*A.coef[8];

 	/* do the bulk of the domain - interior points */
 	k = xd;
 	for(j=1;j<(yd-1);j++) {
 		yv[k] =   beta*yv[k]
 			+ mp1*xv[k-xd]            + mp2*xv[k-xd+1] 
 			+ (mp0+mp3+mp4+mp6)*xv[k] + mp5*xv[k+1]
 			+ mp7*xv[k+xd]            + mp8*xv[k+xd+1];
 		k++;
 		for(i=1;i<(xd-1);i++) {
 			yv[k] =   beta*yv[k]
 				+ mp0*xv[k-xd-1] + mp1*xv[k-xd] + mp2*xv[k-xd+1]
 				+ mp3*xv[k-1]    + mp4*xv[k]    + mp5*xv[k+1]
 				+ mp6*xv[k+xd-1] + mp7*xv[k+xd] + mp8*xv[k+xd+1];
 			k++;
 		}
 		yv[k] =   beta*yv[k]
 			+ mp0*xv[k-xd-1] + mp1*xv[k-xd]
 			+ mp3*xv[k-1]    + (mp2+mp4+mp5+mp8)*xv[k] 
 			+ mp6*xv[k+xd-1] + mp7*xv[k+xd];
 		k++;
 	}

 	/* top edge of domain - either use comm values or NBC */
 	if( SelfPE == (NumPEs-1) ) {
 		k = sz - xd;
 		yv[k] =   beta*yv[k]
 			+ mp1*xv[k-xd] + mp2*xv[k-xd+1] 
 			+ (mp3+mp4+mp0+mp6+mp7+mp8)*xv[k] + mp5*xv[k+1];
 		k++;
 		mpx = mp4 + mp6 + mp7 + mp8;
 		for(i=1;i<(xd-1);i++) {
 			yv[k] =   beta*yv[k]
 				+ mp0*xv[k-xd-1] + mp1*xv[k-xd] + mp2*xv[k-xd+1]
 				+ mp3*xv[k-1]    + mpx*xv[k]    + mp5*xv[k+1];
 			k++;
 		}
 		yv[k] =   beta*yv[k]
 			+ mp0*xv[k-xd-1] + mp1*xv[k-xd]
 			+ mp3*xv[k-1]    + (mp4+mp5+mp2+mp6+mp7+mp8)*xv[k];
 	} else {
 		MPI_Wait( &recvreqU, &recvstatU );
 		MPI_Request_free( &sendreqD );
 		zv = Workspace.data + xd;

 		k = sz - xd;
 		o = -k;
 		yv[k] =   beta*yv[k]
 			+ mp1*xv[k-xd]            + mp2*xv[k-xd+1] 
 			+ (mp0+mp3+mp4+mp6)*xv[k] + mp5*xv[k+1]
 			+ mp7*zv[k+o]             + mp8*zv[k+o+1];
 		k++;
 		for(i=1;i<(xd-1);i++) {
 			yv[k] =   beta*yv[k]
 				+ mp0*xv[k-xd-1] + mp1*xv[k-xd] + mp2*xv[k-xd+1]
 				+ mp3*xv[k-1]    + mp4*xv[k]    + mp5*xv[k+1]
 				+ mp6*zv[k+o-1]  + mp7*zv[k+o]  + mp8*zv[k+o+1];
 			k++;
 		}
 		yv[k] =   beta*yv[k]
 			+ mp0*xv[k-xd-1] + mp1*xv[k-xd]
 			+ mp3*xv[k-1]    + (mp2+mp4+mp5+mp8)*xv[k] 
 			+ mp6*zv[k+o-1]  + mp7*zv[k+o];
 	}
 	/* bottom edge of domain - either use comm values or NBC */
 	if( SelfPE == 0 ) {
 		k = 0;
 		yv[k] =   beta*yv[k]
 			+ (mp0+mp1+mp2+mp3+mp4+mp6)*xv[k] + mp5*xv[k+1]
 			+ mp7*xv[k+xd]                    + mp8*xv[k+xd+1];
 		k++;
 		mpx = mp4 + mp0 + mp1 + mp2;
 		for(i=1;i<(xd-1);i++) {
 			yv[k] =   beta*yv[k]
 				+ mp3*xv[k-1]    + mpx*xv[k]    + mp5*xv[k+1]
 				+ mp6*xv[k+xd-1] + mp7*xv[k+xd] + mp8*xv[k+xd+1];
 			k++;
 		}
 		yv[k] =   beta*yv[k]
 			+ mp3*xv[k-1]    + (mp0+mp1+mp2+mp4+mp5+mp8)*xv[k] 
 			+ mp6*xv[k+xd-1] + mp7*xv[k+xd];
 	} else {
 		MPI_Wait( &recvreqD, &recvstatD );
 		MPI_Request_free( &sendreqU );
 		zv = Workspace.data;

 		k = 0;
 		yv[k] =   beta*yv[k]
 			+ mp1*zv[k]               + mp2*zv[k+1] 
 			+ (mp0+mp3+mp4+mp6)*xv[k] + mp5*xv[k+1]
 			+ mp7*xv[k+xd]            + mp8*xv[k+xd+1];
 		k++;
 		for(i=1;i<(xd-1);i++) {
 			yv[k] =   beta*yv[k]
 				+ mp0*zv[k-1]    + mp1*zv[k]    + mp2*zv[k+1]
 				+ mp3*xv[k-1]    + mp4*xv[k]    + mp5*xv[k+1]
 				+ mp6*xv[k+xd-1] + mp7*xv[k+xd] + mp8*xv[k+xd+1];
 			k++;
 		}
 		yv[k] =   beta*yv[k]
 			+ mp0*zv[k-1]    + mp1*zv[k]
 			+ mp3*xv[k-1]    + (mp2+mp4+mp5+mp8)*xv[k] 
 			+ mp6*xv[k+xd-1] + mp7*xv[k+xd];
 	}

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+5];
 			xv[k] = wksp_x[i];
 		}
 	}
 	if( df > 0 ) {
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+5];
 			yv[k] = alpha*wksp_x[i] + beta*wksp_y[i];
 		}
 	}

 	return( 0 );
 }

 int sprmpy3( real alpha, sparse A, vector x, real beta, vector y ) {
 	int   i,j,k,m,o;
 	real* xv = x.data;
 	real* yv = y.data;
 	real* zv;
 	int   xd = A.jcoef[1];
 	int   yd = A.jcoef[2];
 	int   zd;
 	int   df = A.jcoef[4];
 	int   ne = A.jcoef[5];
 	int   xy;
 	int   sz = A.rows;
 	real  mpx,mpy,mpz,mpd,mpdd;
 	MPI_Request recvreqU,recvreqD;
 	MPI_Request sendreqU,sendreqD;
 	MPI_Status  recvstatU,recvstatD;
 	int NextPE?,PrevPE?;
 	int pp;
 	static int counter = 0;

 	counter++;

 	/* compute z as sz/(xd*yd) */
 	xy = xd*yd;
 	zd = sz/xy;

 	NextPE? = (SelfPE+1)%NumPEs;
 	PrevPE? = (SelfPE+NumPEs-1)%NumPEs;

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		/* temporarily zero-out the x-vector entries */
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+6];
 			wksp_x[i] = xv[k];
 			xv[k] = 0.0;
 		}
 	}
 	if( df > 0 ) {
 		/* save original values in wksp_y */
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+6];
 			wksp_y[i] = yv[k];
 		}
 	}

 	if( SelfPE != 0 ) {
 		MPI_Irecv( Workspace.data, xy, MPI_SSREAL, PrevPE?, A.msgtag,
 			MPI_COMM_WORLD, &recvreqD );
 		MPI_Isend( xv, xy, MPI_SSREAL, PrevPE?, A.msgtag+1,
 			MPI_COMM_WORLD, &sendreqU );
 	}
 	if( SelfPE != (NumPEs-1) ) {
 		MPI_Irecv( Workspace.data+xy, xy, MPI_SSREAL, NextPE?, 
 			A.msgtag+1, MPI_COMM_WORLD, &recvreqU );
 		MPI_Isend( xv+sz-xy, xy, MPI_SSREAL, NextPE?, A.msgtag,
 			MPI_COMM_WORLD, &sendreqD );
 	}

 	mpx = alpha*A.coef[12];
 	mpy = alpha*A.coef[10];
 	mpz = alpha*A.coef[4];
 	mpd = alpha*A.coef[13];

 	/* : interior planes */
 	m = xy;
 	for(k=1;k<(zd-1);k++) {
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m-xy] + xv[m+xy] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] =  beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m+xd]
 				+ mpz*( xv[m+xy] + xv[m-xy] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m-xy] + xv[m+xy] );
 		m++;
 		/* interior of plane */
 		mpdd = mpd+mpx;
 		for(j=1;j<(yd-1);j++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*xv[m+1]
 				+ mpy*( xv[m+xd] + xv[m-xd] )
 				+ mpz*( xv[m+xy] + xv[m-xy] );
 			m++;
 			for(i=1;i<(xd-1);i++) {
 				yv[m] = beta*yv[m] + mpd*xv[m]
 					+ mpx*( xv[m-1] + xv[m+1] )
 					+ mpy*( xv[m-xd] + xv[m+xd] )
 					+ mpz*( xv[m+xy] + xv[m-xy] );
 				m++;
 			}
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*xv[m-1]
 				+ mpy*( xv[m-xd] + xv[m+xd] )
 				+ mpz*( xv[m+xy] + xv[m-xy] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m+xy] + xv[m-xy] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m-xd] 
 				+ mpz*( xv[m+xy] + xv[m-xy] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m+xy] + xv[m-xy] );
 		m++;
 	}

 	/* bottom plane of domain - either use comm values or NBC */
 	if( SelfPE == 0 ) {
 		m = 0;
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m+xd]
 			+ mpz*xv[m+xy];
 		m++;
 		mpdd = mpd+mpy+mpz;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] =  beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m+xd]
 				+ mpz*xv[m+xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m+xd]
 			+ mpz*xv[m+xy];
 		m++;
 		mpdd = mpd+mpz;
 		for(j=1;j<(yd-1);j++) {
 			yv[m] = beta*yv[m] + (mpd+mpx+mpz)*xv[m]
 				+ mpx*xv[m+1]
 				+ mpy*( xv[m+xd] + xv[m-xd] )
 				+ mpz*xv[m+xy];
 			m++;
 			for(i=1;i<(xd-1);i++) {
 				yv[m] = beta*yv[m] + mpdd*xv[m]
 					+ mpx*( xv[m-1] + xv[m+1] )
 					+ mpy*( xv[m-xd] + xv[m+xd] )
 					+ mpz*xv[m+xy];
 				m++;
 			}
 			yv[m] = beta*yv[m] + (mpd+mpx+mpz)*xv[m]
 				+ mpx*xv[m-1]
 				+ mpy*( xv[m-xd] + xv[m+xd] )
 				+ mpz*xv[m+xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m-xd]
 			+ mpz*xv[m+xy];
 		m++;
 		mpdd = mpd+mpy+mpz;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m-xd] 
 				+ mpz*xv[m+xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m-xd]
 			+ mpz*xv[m+xy];
 		m++;
 	} else {
 		MPI_Wait( &recvreqD, &recvstatD );
 		MPI_Request_free( &sendreqU );
 		zv = Workspace.data;

 		m = 0;
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m+xy] + zv[m] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] =  beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m+xd]
 				+ mpz*( xv[m+xy] + zv[m] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m+xy] + zv[m] );
 		m++;
 		for(j=1;j<(yd-1);j++) {
 			yv[m] = beta*yv[m] + (mpd+mpx)*xv[m]
 				+ mpx*xv[m+1]
 				+ mpy*( xv[m+xd] + xv[m-xd] )
 				+ mpz*( xv[m+xy] + zv[m] );
 			m++;
 			for(i=1;i<(xd-1);i++) {
 				yv[m] = beta*yv[m] + mpd*xv[m]
 					+ mpx*( xv[m-1] + xv[m+1] )
 					+ mpy*( xv[m-xd] + xv[m+xd] )
 					+ mpz*( xv[m+xy] + zv[m] );
 				m++;
 			}
 			yv[m] = beta*yv[m] + (mpd+mpx)*xv[m]
 				+ mpx*xv[m-1]
 				+ mpy*( xv[m-xd] + xv[m+xd] )
 				+ mpz*( xv[m+xy] + zv[m] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m+xy] + zv[m] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m-xd] 
 				+ mpz*( xv[m+xy] + zv[m] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m+xy] + zv[m] );
 		m++;
 	}
 	if( SelfPE == (NumPEs-1) ) {
 		m = sz - xy;
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m+xd]
 			+ mpz*xv[m-xy];
 		m++;
 		mpdd = mpd+mpy+mpz;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] =  beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m+xd]
 				+ mpz*xv[m-xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m+xd]
 			+ mpz*xv[m-xy];
 		m++;
 		mpdd = mpd+mpz;
 		for(j=1;j<(yd-1);j++) {
 			yv[m] = beta*yv[m] + (mpd+mpx+mpz)*xv[m]
 				+ mpx*xv[m+1]
 				+ mpy*( xv[m+xd] + xv[m-xd] )
 				+ mpz*xv[m-xy];
 			m++;
 			for(i=1;i<(xd-1);i++) {
 				yv[m] = beta*yv[m] + mpdd*xv[m]
 					+ mpx*( xv[m-1] + xv[m+1] )
 					+ mpy*( xv[m-xd] + xv[m+xd] )
 					+ mpz*xv[m-xy];
 				m++;
 			}
 			yv[m] = beta*yv[m] + (mpd+mpx+mpz)*xv[m]
 				+ mpx*xv[m-1]
 				+ mpy*( xv[m-xd] + xv[m+xd] )
 				+ mpz*xv[m-xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m-xd]
 			+ mpz*xv[m-xy];
 		m++;
 		mpdd = mpd+mpy+mpz;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m-xd] 
 				+ mpz*xv[m-xy];
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy+mpz)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m-xd]
 			+ mpz*xv[m-xy];
 		m++;
 	} else {
 		MPI_Wait( &recvreqU, &recvstatU );
 		MPI_Request_free( &sendreqD );
 		zv = Workspace.data + xy;

 		m = sz - xy;
 		o = -m;
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m-xy] + zv[m+o] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] =  beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m+xd]
 				+ mpz*( xv[m-xy] + zv[m+o] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m+xd]
 			+ mpz*( xv[m-xy] + zv[m+o] );
 		m++;
 		for(j=1;j<(yd-1);j++) {
 			yv[m] = beta*yv[m] + (mpd+mpx)*xv[m]
 				+ mpx*xv[m+1]
 				+ mpy*( xv[m+xd] + xv[m-xd] )
 				+ mpz*( xv[m-xy] + zv[m+o] );
 			m++;
 			for(i=1;i<(xd-1);i++) {
 				yv[m] = beta*yv[m] + mpd*xv[m]
 					+ mpx*( xv[m-1] + xv[m+1] )
 					+ mpy*( xv[m-xd] + xv[m+xd] )
 					+ mpz*( xv[m-xy] + zv[m+o] );
 				m++;
 			}
 			yv[m] = beta*yv[m] + (mpd+mpx)*xv[m]
 				+ mpx*xv[m-1]
 				+ mpy*( xv[m-xd] + xv[m+xd] )
 				+ mpz*( xv[m-xy] + zv[m+o] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m+1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m-xy] + zv[m+o] );
 		m++;
 		mpdd = mpd+mpy;
 		for(i=1;i<(xd-1);i++) {
 			yv[m] = beta*yv[m] + mpdd*xv[m]
 				+ mpx*( xv[m-1] + xv[m+1] )
 				+ mpy*xv[m-xd] 
 				+ mpz*( xv[m-xy] + zv[m+o] );
 			m++;
 		}
 		yv[m] = beta*yv[m] + (mpd+mpx+mpy)*xv[m]
 			+ mpx*xv[m-1]
 			+ mpy*xv[m-xd]
 			+ mpz*( xv[m-xy] + zv[m+o] );
 		m++;
 	}

 	/* take care of DBCs? */
 	if( ne > 0 ) {
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+6];
 			xv[k] = wksp_x[i];
 		}
 	}
 	if( df > 0 ) {
 		for(i=0;i<ne;i++) {
 			k = A.jcoef[i+6];
 			yv[k] = alpha*wksp_x[i] + beta*wksp_y[i];
 		}
 	}

 	return( 0 );
 }
Edit - History - Print - Recent Changes - Search
Page last modified on November 23, 2009, at 10:24 AM