////////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2008 The Regents of the University of California
//
// This file is part of Qbox
//
// Qbox is distributed under the terms of the GNU General Public License
// as published by the Free Software Foundation, either version 2 of
// the License, or (at your option) any later version.
// See the file COPYING in the root directory of this distribution
// or .
//
////////////////////////////////////////////////////////////////////////////////
//
// jade.C
//
////////////////////////////////////////////////////////////////////////////////
#include
#include
#include
#include
#include
#include // epsilon
#include
#include
#ifdef USE_MPI
#include
#endif
#ifdef SCALAPACK
#include "blacs.h"
#endif
#include "Context.h"
#include "Matrix.h"
#include "blas.h"
#include "Timer.h"
using namespace std;
int jade(int maxsweep, double tol, vector a,
DoubleMatrix& u, vector >& adiag)
{
Timer tm_comm;
const bool debug_diag_sum = false;
const double eps = numeric_limits::epsilon();
if (tol= ctxt.npcol(), all process columns are active
// otherwise, the last active process column has index num_nblocks-1
const int num_nblocks = u.n() / u.nb() + ( u.n()%u.nb() == 0 ? 0 : 1 );
const int last_active_process_col = min(ctxt.npcol()-1, num_nblocks-1);
// initialize u with the identity
u.identity();
// eigenvalue array
adiag.resize(a.size());
for ( int k = 0; k < a.size(); k++ )
adiag[k].resize(a[k]->n());
// check if the local number of rows is odd
const bool nloc_odd = ( a[0]->nloc()%2 != 0 );
// if nloc is odd, auxiliary arrays are created to host an extra column
// for both a[k] and u
vector > a_aux(a.size());
vector u_aux;
if ( nloc_odd )
{
for ( int k = 0; k < a.size(); k++ )
a_aux[k].resize(mloc);
u_aux.resize(mloc);
}
// compute local number of pairs nploc
const int nploc = (a[0]->nloc()+1)/2;
// dimension of top and bot arrays is nploc: local number of pairs
deque top(nploc), bot(nploc);
// compute total number of pairs np
int np = nploc;
ctxt.isum('r',1,1,&np,1);
//cout << ctxt.mype() << ": np=" << np << endl;
// initialize top and bot arrays
// the pair i is (top[i],bot[i])
// top[i] is the local index of the top column of pair i
// bot[i] is the local index of the bottom column of pair i
for ( int i = 0; i < nploc; i++ )
top[i] = i;
for ( int i = 0; i < nploc; i++ )
bot[nploc-i-1] = nploc+i;
// if top[i] or bot[i] == nloc,the data resides in the array a_aux or u_aux
// jglobal: global column index
// jglobal[i] is the global column index of the column residing in
// the local vector i. If nloc_odd and i==2*nploc-1, jglobal[i] == -1
vector jglobal(2*nploc,-1);
for ( int jblock = 0; jblock < a[0]->nblocks(); jblock++ )
for ( int y = 0; y < a[0]->nbs(jblock); y++ )
jglobal[y + jblock*a[0]->nb()] = a[0]->j(jblock,y);
// store addresses of columns of a and of u in acol and ucol
vector > acol(a.size());
vector ucol(2*nploc);
for ( int k = 0; k < a.size(); k++ )
{
acol[k].resize(2*nploc);
for ( int i = 0; i < a[k]->nloc(); i++ )
acol[k][i] = a[k]->valptr(i*a[k]->mloc());
// if nloc is odd, store the address of vector 2*nploc-1
if ( nloc_odd )
acol[k][2*nploc-1] = &a_aux[k][0];
}
for ( int i = 0; i < u.nloc(); i++ )
ucol[i] = u.valptr(i*u.mloc());
// if nloc is odd, store the address of vector 2*nploc-1
if ( nloc_odd )
ucol[2*nploc-1] = &u_aux[0];
//for ( int i = 0; i < acol.size(); i++ )
// cout << ctxt.mype() << ": acol[" << i << "]=" << acol[i] << endl;
//for ( int i = 0; i < ucol.size(); i++ )
// cout << ctxt.mype() << ": ucol[" << i << "]=" << ucol[i] << endl;
// the vectors of the pair (top[i],bot[i]) are located at
// addresses acol[top[i]] and acol[bot[i]]
bool done = false;
int nsweep = 0;
// allocate matrix element packed array apq
// apq[3*ipair + k*3*nploc] = apq[k][ipair]
// apq[3*ipair+1 + k*3*nploc] = app[k][ipair]
// apq[3*ipair+2 + k*3*nploc] = aqq[k][ipair]
vector apq(a.size()*3*nploc);
double diag_sum = 0.0, previous_diag_sum = 0.0;
while ( !done )
{
// sweep: process local pairs and rotate 2*np-1 times
nsweep++;
double diag_change = 0.0;
for ( int irot = 0; irot < 2*np-1; irot++ )
{
//cout << ctxt.mype() << ": top[i]: ";
//for ( int i = 0; i < nploc; i++ )
// cout << setw(3) << top[i];
//cout << endl;
//cout << ctxt.mype() << ": bot[i]: ";
//for ( int i = 0; i < nploc; i++ )
// cout << setw(3) << bot[i];
//cout << endl;
//cout << ctxt.mype() << ": jglobal[top[i]]: ";
//for ( int i = 0; i < nploc; i++ )
// cout << setw(3) << jglobal[top[i]];
//cout << endl;
//cout << ctxt.mype() << ": jglobal[bot[i]]: ";
//for ( int i = 0; i < nploc; i++ )
// cout << setw(3) << jglobal[bot[i]];
//cout << endl;
// perform Jacobi rotations for all local pairs
// compute off-diagonal matrix elements apq for all pairs
// skip the pair if one or both of the vectors is a dummy vector
// i.e. a vector having jglobal==-1
#pragma omp parallel for
for ( int k = 0; k < a.size(); k++ )
{
for ( int ipair = 0; ipair < nploc; ipair++ )
{
const int iapq = 3*ipair + k*3*nploc;
//cout << ctxt.mype() << ": computing apq for global pair "
// << jglobal[top[ipair]] << " " << jglobal[bot[ipair]] << endl;
apq[iapq] = 0.0;
apq[iapq+1] = 0.0;
apq[iapq+2] = 0.0;
if ( jglobal[top[ipair]] >= 0 && jglobal[bot[ipair]] >= 0 )
{
const double *ap = acol[k][top[ipair]];
const double *aq = acol[k][bot[ipair]];
const double *up = ucol[top[ipair]];
const double *uq = ucol[bot[ipair]];
int one = 1;
apq[iapq] = ddot(&mloc,ap,&one,uq,&one);
apq[iapq+1] = ddot(&mloc,ap,&one,up,&one);
apq[iapq+2] = ddot(&mloc,aq,&one,uq,&one);
}
}
} // for k
// apq now contains partial sums of matrix elements
tm_comm.start();
int len = apq.size();
ctxt.dsum('c',len,1,&apq[0],len);
tm_comm.stop();
// apq now contains the matrix elements
#pragma omp parallel for
for ( int ipair = 0; ipair < nploc; ipair++ )
{
if ( jglobal[top[ipair]] >= 0 &&
jglobal[bot[ipair]] >= 0 )
{
// compute rotation sine and cosine
// Cardoso-Souloumiac expressions for the rotation angle
// compute 2x2 matrix g
double g11 = 0.0, g12 = 0.0, g22 = 0.0;
for ( int k = 0; k < a.size(); k++ )
{
const int iapq = 3*ipair + k*3*nploc;
const double app = apq[iapq+1];
const double aqq = apq[iapq+2];
const double h1 = app - aqq;
const double h2 = 2.0 * apq[iapq];
g11 += h1 * h1;
g12 += h1 * h2;
g22 += h2 * h2;
}
// the matrix g is real, symmetric
// compute Jacobi rotation diagonalizing g
double c = 1.0, s = 0.0, e1 = g11, e2 = g22;
if ( g12*g12 > eps * eps * fabs(g11*g22) )
{
double tau = 0.5 * ( g22 - g11 ) / g12;
double t = 1.0 / ( fabs(tau) + sqrt(1.0 + tau*tau));
if ( tau < 0.0 ) t *= -1.0;
c = 1.0 / sqrt(1.0 + t*t);
s = t * c;
// eigenvalues
e1 -= t * g12;
e2 += t * g12;
}
// components of eigenvector associated with the largest eigenvalue
double x,y;
if ( e1 > e2 )
{
x = c;
y = -s;
}
else
{
x = s;
y = c;
}
// choose eigenvector with x positive to ensure small angle
if ( x < 0.0 )
{
x = -x;
y = -y;
}
// compute Jacobi rotation R(p,q)
c = sqrt(0.5*(x+1.0));
s = y / sqrt(2.0*(x+1.0));
// apply the rotation R(p,q)
//
// | c s |
// R(p,q) = | |
// | -s c |
// U := U * R(p,q)^T
// A := A * R(p,q)^T
// apply rotation to columns of a and u
// the drot function computes
// c*x + s*y -> x
//-s*x + c*y -> y
// call drot with args c, -s
//cout << " p=" << jglobal[top[ipair]]
// << " q=" << jglobal[bot[ipair]]
// << " g11=" << g11 << " g22=" << g22
// << " g12=" << g12 << endl;
//cout << " c=" << c << " s=" << s << endl;
int one = 1;
for ( int k = 0; k < a.size(); k++ )
{
double *ap = acol[k][top[ipair]];
double *aq = acol[k][bot[ipair]];
drot(&mloc,ap,&one,aq,&one,&c,&s);
}
double *up = ucol[top[ipair]];
double *uq = ucol[bot[ipair]];
drot(&mloc,up,&one,uq,&one,&c,&s);
// new value of off-diag element apq
double diag_change_ipair = 0.0;
for ( int k = 0; k < a.size(); k++ )
{
const int iapq = 3*ipair + k*3*nploc;
const double app = apq[iapq+1];
const double aqq = apq[iapq+2];
const double apqnew = (c*c-s*s)*apq[iapq] - c*s*(app-aqq);
// accumulate change in sum of squares of diag elements
// note negative sign: decrease in offdiag is increase in diag
diag_change_ipair -= 2.0 * ( apqnew*apqnew - apq[iapq]*apq[iapq] );
}
#pragma omp critical
diag_change -= diag_change_ipair;
}
} // for ipair
// all local pairs have been processed
// rotate top and bot arrays
if ( nploc > 0 )
{
bot.push_back(top.back());
top.pop_back();
top.push_front(bot.front());
bot.pop_front();
// make rotation skip element 0 on the first process column
// if my process column is zero, swap top[0] and top[1]
if ( ctxt.mycol() == 0 )
{
if ( nploc > 1 )
{
int tmp = top[0];
top[0] = top[1];
top[1] = tmp;
}
else
{
// if there is only one local pair, exchange top[0] and bot[0]
int tmp = top[0];
top[0] = bot[0];
bot[0] = tmp;
}
}
// exchange columns of a[k] and u
int rbufi_left, rbufi_right, sbufi_left, sbufi_right;
// send buffers contain k columns of a and one of u
int bufsize = (a.size()+1)*a[0]->mloc();
vector sbuf_left(bufsize), sbuf_right(bufsize);
vector rbuf_left(bufsize), rbuf_right(bufsize);
// on each task except mycol==npcol-1
// send jglobal[bot[nploc-1]] to the right
// if jglobal != -1 send vector bot[nploc-1] to the right
// on each task except mycol==npcol-1
// recv jglobal from the right
// if jglobal != -1 recv a vector from the right into bot[nploc-1]
// set value of jglobal[bot[nploc-1]]
// on each task except mycol==0
// send jglobal[top[0]] to the left
// if jglobal != -1 send vector top[0] to the left
// on each task except mycol==0
// recv jglobal from the left
// if jglobal != -1 recv a vector from the left into top[0]
// set value of jglobal[top[0]]
// exchange jglobal values first
tm_comm.start();
if ( ctxt.mycol() < last_active_process_col )
{
sbufi_right = jglobal[bot[nploc-1]];
ctxt.isend(1,1,&sbufi_right,1,ctxt.myrow(),ctxt.mycol()+1);
ctxt.irecv(1,1,&rbufi_right,1,ctxt.myrow(),ctxt.mycol()+1);
jglobal[bot[nploc-1]] = rbufi_right;
//cout << ctxt.mype() << ": received jglobal="
// << jglobal[bot[nploc-1]] << " from right" << endl;
}
if ( ctxt.mycol() != 0 )
{
sbufi_left = jglobal[top[0]];
ctxt.isend(1,1,&sbufi_left,1,ctxt.myrow(),ctxt.mycol()-1);
ctxt.irecv(1,1,&rbufi_left,1,ctxt.myrow(),ctxt.mycol()-1);
jglobal[top[0]] = rbufi_left;
//cout << ctxt.mype() << ": received jglobal="
// << jglobal[top[0]] << " from left" << endl;
}
// exchange column vectors
if ( ctxt.mycol() < last_active_process_col )
{
for ( int k = 0; k < a.size(); k++ )
{
memcpy(&sbuf_right[k*mloc],acol[k][bot[nploc-1]],
mloc*sizeof(double));
}
memcpy(&sbuf_right[a.size()*mloc], ucol[bot[nploc-1]],
mloc*sizeof(double) );
ctxt.dsend(bufsize,1,&sbuf_right[0],bufsize,
ctxt.myrow(),ctxt.mycol()+1);
ctxt.drecv(bufsize,1,&rbuf_right[0],bufsize,
ctxt.myrow(),ctxt.mycol()+1);
for ( int k = 0; k < a.size(); k++ )
{
memcpy(acol[k][bot[nploc-1]],&rbuf_right[k*mloc],
mloc*sizeof(double));
}
memcpy(ucol[bot[nploc-1]], &rbuf_right[a.size()*mloc],
mloc*sizeof(double) );
//cout << ctxt.mype() << ": received vector jglobal="
// << jglobal[bot[nploc-1]] << " from right" << endl;
}
if ( ctxt.mycol() != 0 )
{
for ( int k = 0; k < a.size(); k++ )
{
memcpy(&sbuf_left[k*mloc],acol[k][top[0]],mloc*sizeof(double));
}
memcpy(&sbuf_left[a.size()*mloc],ucol[top[0]],mloc*sizeof(double) );
ctxt.dsend(bufsize,1,&sbuf_left[0],bufsize,
ctxt.myrow(),ctxt.mycol()-1);
ctxt.drecv(bufsize,1,&rbuf_left[0],bufsize,
ctxt.myrow(),ctxt.mycol()-1);
for ( int k = 0; k < a.size(); k++ )
{
memcpy(acol[k][top[0]],&rbuf_left[k*mloc],mloc*sizeof(double) );
}
memcpy(ucol[top[0]],&rbuf_left[a.size()*mloc],mloc*sizeof(double) );
//cout << ctxt.mype() << ": received vector jglobal="
// << jglobal[top[0]] << " from left" << endl;
}
tm_comm.stop();
} // if nploc > 0
// end of step
} // for irot
// sweep is complete
tm_comm.start();
ctxt.dsum('r',1,1,&diag_change,1);
tm_comm.stop();
// compute sum of squares of diagonal elements
if ( debug_diag_sum )
{
// compute sum of squares of diagonal elements using current values
// (after rotation)
previous_diag_sum = diag_sum;
diag_sum = 0.0;
for ( int k = 0; k < a.size(); k++ )
{
for ( int ipair = 0; ipair < nploc; ipair++ )
{
double tmp[2] = { 0.0, 0.0 };
// compute the diagonal elements
// skip dummy vectors
int one = 1;
if ( jglobal[top[ipair]] >= 0 )
{
const double *ap = acol[k][top[ipair]];
const double *up = ucol[top[ipair]];
tmp[0] = ddot(&mloc,ap,&one,up,&one);
}
if ( jglobal[bot[ipair]] >= 0 )
{
const double *aq = acol[k][bot[ipair]];
const double *uq = ucol[bot[ipair]];
tmp[1] = ddot(&mloc,aq,&one,uq,&one);
}
// tmp now contains partial sums of app and aqq
ctxt.dsum('c',2,1,tmp,2);
// tmp now contains the diagonal elements app and aqq
diag_sum += tmp[0]*tmp[0] + tmp[1]*tmp[1];
}
}
ctxt.dsum('r',1,1,&diag_sum,1);
#ifdef DEBUG
const double diag_sum_increase = diag_sum - previous_diag_sum;
if ( ctxt.onpe0() )
cout << " jade: nsweep=" << nsweep
<< " dsum: "
<< setw(15) << setprecision(10) << diag_sum
<< " dsum_inc: "
<< setw(15) << setprecision(10) << diag_sum_increase << endl;
#endif
}
#ifdef DEBUG
if ( ctxt.onpe0() )
cout << " jade: nsweep=" << nsweep
<< " dchange: "
<< setw(15) << setprecision(10) << diag_change << endl;
#endif
done = ( ( fabs(diag_change) < tol ) || ( nsweep >= maxsweep ) );
} // while !done
#ifdef DEBUG
//print final values of jglobal[top[i]] and jglobal[bot[i]]
cout << "final jglobal[top/bot] values:" << endl;
cout << ctxt.mype() << ": top: ";
for ( int i = 0; i < nploc; i++ )
cout << " " << jglobal[top[i]];
cout << endl;
cout << ctxt.mype() << ": bot: ";
for ( int i = 0; i < nploc; i++ )
cout << " " << jglobal[bot[i]];
cout << endl;
//print final values of top and bot
cout << "final top/bot values:" << endl;
cout << ctxt.mype() << ": top: ";
for ( int i = 0; i < nploc; i++ )
cout << " " << top[i];
cout << endl;
cout << ctxt.mype() << ": bot: ";
for ( int i = 0; i < nploc; i++ )
cout << " " << bot[i];
cout << endl;
#endif
// rotate columns of a and u to restore original order
double *tmpmat = new double[nloc*mloc];
// rotate columns of a
for ( int k = 0; k < a.size(); k++ )
{
for ( int ipair = 0; ipair < nploc; ipair++ )
{
// copy columns of a[k] to temporary array tmpmat in original order
if ( jglobal[top[ipair]] >= 0 )
{
memcpy(&tmpmat[ipair*mloc], acol[k][top[ipair]], mloc*sizeof(double));
}
if ( jglobal[bot[nploc-ipair-1]] >= 0 )
{
memcpy(&tmpmat[(nploc+ipair)*mloc],acol[k][bot[nploc-ipair-1]],
mloc*sizeof(double));
}
}
// copy tmpmat back to a[k]
memcpy(acol[k][0],tmpmat,nloc*mloc*sizeof(double));
}
// rotate columns of u
for ( int ipair = 0; ipair < nploc; ipair++ )
{
// copy columns of u to temporary array tmpmat in original order
if ( jglobal[top[ipair]] >= 0 )
{
memcpy(&tmpmat[ipair*mloc], ucol[top[ipair]], mloc*sizeof(double));
}
if ( jglobal[bot[nploc-ipair-1]] >= 0 )
{
memcpy(&tmpmat[(nploc+ipair)*mloc],ucol[bot[nploc-ipair-1]],
mloc*sizeof(double));
}
}
// copy tmpmat back to u
memcpy(ucol[0],tmpmat,nloc*mloc*sizeof(double));
delete [] tmpmat;
// compute diagonal values
for ( int k = 0; k < a.size(); k++ )
{
for ( int i = 0; i < a[k]->n(); i++ )
adiag[k][i] = 0.0;
for ( int jblock = 0; jblock < a[k]->nblocks(); jblock++ )
for ( int y = 0; y < a[k]->nbs(jblock); y++ )
{
// j is the global column index
int j = a[k]->j(jblock,y);
int jjj = y + jblock*a[k]->nb();
const double *ap = a[k]->valptr(jjj*a[k]->mloc());
const double *up = u.valptr(jjj*u.mloc());
int one = 1;
adiag[k][j] = ddot(&mloc,ap,&one,up,&one);
}
// adiag[k][i] now contains the partial sums of the diagonal elements of a
tm_comm.start();
ctxt.dsum(a[k]->n(),1,&adiag[k][0],a[k]->n());
tm_comm.stop();
// adiag[k] contains the diagonal elements of a[k]
// u contains the orthogonal transformation minimizing the spread
}
#ifdef DEBUG
if ( ctxt.onpe0() )
cout << " jade: comm time: " << tm_comm.real() << endl;
#endif
return nsweep;
}