//////////////////////////////////////////////////////////////////////////////// // // Copyright (c) 2008 The Regents of the University of California // // This file is part of Qbox // // Qbox is distributed under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 2 of // the License, or (at your option) any later version. // See the file COPYING in the root directory of this distribution // or . // //////////////////////////////////////////////////////////////////////////////// // // jacobi.C // //////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include using namespace std; #ifdef USE_MPI #include #endif #ifdef SCALAPACK #include "blacs.h" #endif #include "Context.h" #include "Matrix.h" #include "blas.h" int jacobi(int maxsweep, double threshold, DoubleMatrix& a, DoubleMatrix& u, vector& e) { assert(threshold>1.e-14); const Context& ctxt = a.context(); // The input matrix is a // the orthogonal transformation is returned in u // on exit, a is diagonal, u contains eigenvectors, e contains eigenvalues assert(a.m()==a.n()); assert(u.m()==u.n()); assert(a.m()==u.m()); assert(a.mb()==u.mb()); assert(a.nb()==u.nb()); const int mloc = a.mloc(); // const int nloc = a.nloc(); // cout << ctxt.mype() << ": nloc: " << nloc << endl; // identify the last active process column // process columns beyond that column do not have any elements of a[k] // compute num_nblocks = total number of column blocks // if num_nblocks >= ctxt.npcol(), all process columns are active // otherwise, the last active process column has index num_nblocks-1 const int num_nblocks = u.n() / u.nb() + ( u.n()%u.nb() == 0 ? 0 : 1 ); const int last_active_process_col = min(ctxt.npcol()-1, num_nblocks-1); // initialize u with the identity u.identity(); // eigenvalue array e.resize(a.n()); // check if the local number of rows is odd const bool nloc_odd = ( a.nloc()%2 != 0 ); // if nloc is odd, an auxiliary array is created to host an extra column // for both a and u vector a_aux, u_aux; if ( nloc_odd ) { a_aux.resize(mloc); u_aux.resize(mloc); } // compute local number of pairs nploc const int nploc = (a.nloc()+1)/2; // dimension of top and bot arrays is nploc: local number of pairs deque top(nploc), bot(nploc); // compute total number of pairs np int np = nploc; ctxt.isum('r',1,1,&np,1); //cout << ctxt.mype() << ": np=" << np << endl; // initialize top and bot arrays // the pair i is (top[i],bot[i]) // top[i] is the local index of the top column of pair i // bot[i] is the local index of the bottom column of pair i for ( int i = 0; i < nploc; i++ ) top[i] = i; for ( int i = 0; i < nploc; i++ ) bot[nploc-i-1] = nploc+i; // if top[i] or bot[i] == nloc,the data resides in the array a_aux or u_aux // jglobal: global column index // jglobal[i] is the global column index of the column residing in // the local vector i. If nloc_odd and i==2*nploc-1, jglobal[i] == -1 vector jglobal(2*nploc,-1); for ( int jblock = 0; jblock < a.nblocks(); jblock++ ) for ( int y = 0; y < a.nbs(jblock); y++ ) jglobal[y + jblock*a.nb()] = a.j(jblock,y); // store addresses of columns of a and of u in acol and ucol vector acol(2*nploc); vector ucol(2*nploc); for ( int i = 0; i < a.nloc(); i++ ) acol[i] = a.valptr(i*a.mloc()); // if nloc is odd, store the address of vector 2*nploc-1 if ( nloc_odd ) acol[2*nploc-1] = &a_aux[0]; for ( int i = 0; i < u.nloc(); i++ ) ucol[i] = u.valptr(i*u.mloc()); // if nloc is odd, store the address of vector 2*nploc-1 if ( nloc_odd ) ucol[2*nploc-1] = &u_aux[0]; //for ( int i = 0; i < acol.size(); i++ ) // cout << ctxt.mype() << ": acol[" << i << "]=" << acol[i] << endl; //for ( int i = 0; i < ucol.size(); i++ ) // cout << ctxt.mype() << ": ucol[" << i << "]=" << ucol[i] << endl; // the vectors of the pair (top[i],bot[i]) are located at // addresses acol[top[i]] and acol[bot[i]] bool done = false; int nsweep = 0; while ( !done ) { int npairs_above_threshold = 0; // sweep: process local pairs and rotate 2*np-1 times nsweep++; for ( int irot = 0; irot < 2*np-1; irot++ ) { //cout << ctxt.mype() << ": top[i]: "; //for ( int i = 0; i < nploc; i++ ) // cout << setw(3) << top[i]; //cout << endl; //cout << ctxt.mype() << ": bot[i]: "; //for ( int i = 0; i < nploc; i++ ) // cout << setw(3) << bot[i]; //cout << endl; //cout << ctxt.mype() << ": jglobal[top[i]]: "; //for ( int i = 0; i < nploc; i++ ) // cout << setw(3) << jglobal[top[i]]; //cout << endl; //cout << ctxt.mype() << ": jglobal[bot[i]]: "; //for ( int i = 0; i < nploc; i++ ) // cout << setw(3) << jglobal[bot[i]]; //cout << endl; // perform Jacobi rotation for all local pairs // compute off-diagonal matrix elements apq for all pairs // skip the pair if one or both of the vectors is a dummy vector // i.e. a vector having jglobal==-1 vector apq(nploc); for ( int ipair = 0; ipair < nploc; ipair++ ) { //cout << ctxt.mype() << ": computing apq for global pair " // << jglobal[top[ipair]] << " " << jglobal[bot[ipair]] << endl; apq[ipair] = 0.0; if ( jglobal[top[ipair]] >= 0 && jglobal[bot[ipair]] >= 0 ) { const double *ap = acol[top[ipair]]; const double *uq = ucol[bot[ipair]]; int one = 1; int mloc = a.mloc(); //cout << ctxt.mype() << ": apq ddot: ipair=" << ipair << endl; //cout << ctxt.mype() << ": apq ddot: top=" << top[ipair] // << " bot=" << bot[ipair] << endl; //cout << ctxt.mype() << ": ap=" << ap << " uq=" << uq << endl; //for ( int i = 0; i < mloc; i++ ) // cout << ap[i] << " " << uq[i] << endl; apq[ipair] = ddot(&mloc,ap,&one,uq,&one); } } // apq now contains partial sums of apq ctxt.dsum('c',nploc,1,&apq[0],nploc); // apq now contains the off-diagonal elements apq // compute the diagonal elements and perform the rotation only on // pairs having abs(apq)>threshold // skip pairs having dummy vectors vector appqq(2*nploc); for ( int ipair = 0; ipair < nploc; ipair++ ) { appqq[2*ipair] = 0.0; appqq[2*ipair+1] = 0.0; if ( jglobal[top[ipair]] >= 0 && jglobal[bot[ipair]] >= 0 && fabs(apq[ipair]) > threshold ) { // compute diagonal matrix elements const double *ap = acol[top[ipair]]; const double *aq = acol[bot[ipair]]; const double *up = ucol[top[ipair]]; const double *uq = ucol[bot[ipair]]; // compute matrix elements app, apq, aqq int one = 1; int mloc = a.mloc(); appqq[2*ipair] = ddot(&mloc,ap,&one,up,&one); appqq[2*ipair+1] = ddot(&mloc,aq,&one,uq,&one); } } // appqq now contains partial sums of app and aqq ctxt.dsum('c',2*nploc,1,&appqq[0],2*nploc); // appqq now contains the off-diagonal elements app and aqq for ( int ipair = 0; ipair < nploc; ipair++ ) { // keep count of the number of pairs above threshold if ( jglobal[top[ipair]] >= 0 && jglobal[bot[ipair]] >= 0 && fabs(apq[ipair]) > threshold ) { npairs_above_threshold++; // compute rotation sine and cosine const double app = appqq[2*ipair]; const double aqq = appqq[2*ipair+1]; // fabs(apq[ipair]) is larger than machine epsilon // since it is larger than the threshold (which is > macheps) const double tau = ( aqq - app ) / ( 2.0 * apq[ipair]); const double sq = sqrt( 1.0 + tau*tau ); double t = 1.0 / ( fabs(tau) + sq ); if ( tau < 0.0 ) t = -t; double c = 1.0 / sqrt(1.0+t*t); double s = t * c; // the drot function computes // c*x + s*y -> x //-s*x + c*y -> y // call drot with args c, -s // change the sign of s before call to drot s = -s; //cout << " p=" << jglobal[top[ipair]] << " q=" << jglobal[bot[ipair]] // << " app=" << app << " aqq=" << aqq // << " apq=" << apq[ipair] << endl; //cout << " tau=" << tau << " c=" << c << " s=" << s << endl; // update columns of a and u double *ap = acol[top[ipair]]; double *aq = acol[bot[ipair]]; double *up = ucol[top[ipair]]; double *uq = ucol[bot[ipair]]; int one = 1; int mloc = a.mloc(); drot(&mloc,ap,&one,aq,&one,&c,&s); drot(&mloc,up,&one,uq,&one,&c,&s); //cout << " apq_check=" << ddot(&mloc,ap,&one,uq,&one); //cout << " aqp_check=" << ddot(&mloc,aq,&one,up,&one) << endl; } } // for ipair // all local pairs have been processed // rotate top and bot arrays if ( nploc > 0 ) { bot.push_back(top.back()); top.pop_back(); top.push_front(bot.front()); bot.pop_front(); // make rotation skip element 0 on the first process column // if my process column is zero, swap top[0] and top[1] if ( ctxt.mycol() == 0 ) { if ( nploc > 1 ) { int tmp = top[0]; top[0] = top[1]; top[1] = tmp; } else { // if there is only one local pair, exchange top[0] and bot[0] int tmp = top[0]; top[0] = bot[0]; bot[0] = tmp; } } int rbufi_left, rbufi_right, sbufi_left, sbufi_right; // send buffers contain a column of a and of u vector sbuf_left(2*a.mloc()), sbuf_right(2*a.mloc()); vector rbuf_left(2*a.mloc()), rbuf_right(2*a.mloc()); // on each task except mycol==npcol-1 // send jglobal[bot[nploc-1]] to the right // if jglobal != -1 send vector bot[nploc-1] to the right // on each task except mycol==npcol-1 // recv jglobal from the right // if jglobal != -1 recv a vector from the right into bot[nploc-1] // set value of jglobal[bot[nploc-1]] // on each task except mycol==0 // send jglobal[top[0]] to the left // if jglobal != -1 send vector top[0] to the left // on each task except mycol==0 // recv jglobal from the left // if jglobal != -1 recv a vector from the left into top[0] // set value of jglobal[top[0]] // exchange jglobal values first if ( ctxt.mycol() < last_active_process_col ) { sbufi_right = jglobal[bot[nploc-1]]; ctxt.isend(1,1,&sbufi_right,1,ctxt.myrow(),ctxt.mycol()+1); ctxt.irecv(1,1,&rbufi_right,1,ctxt.myrow(),ctxt.mycol()+1); jglobal[bot[nploc-1]] = rbufi_right; //cout << ctxt.mype() << ": received jglobal=" // << jglobal[bot[nploc-1]] << " from right" << endl; } if ( ctxt.mycol() != 0 ) { sbufi_left = jglobal[top[0]]; ctxt.isend(1,1,&sbufi_left,1,ctxt.myrow(),ctxt.mycol()-1); ctxt.irecv(1,1,&rbufi_left,1,ctxt.myrow(),ctxt.mycol()-1); jglobal[top[0]] = rbufi_left; //cout << ctxt.mype() << ": received jglobal=" // << jglobal[top[0]] << " from left" << endl; } // exchange column vectors if ( ctxt.mycol() < last_active_process_col ) { memcpy(&sbuf_right[0], acol[bot[nploc-1]], mloc*sizeof(double) ); memcpy(&sbuf_right[mloc], ucol[bot[nploc-1]], mloc*sizeof(double) ); ctxt.dsend(2*mloc,1,&sbuf_right[0],2*mloc,ctxt.myrow(),ctxt.mycol()+1); ctxt.drecv(2*mloc,1,&rbuf_right[0],2*mloc,ctxt.myrow(),ctxt.mycol()+1); memcpy(acol[bot[nploc-1]], &rbuf_right[0], mloc*sizeof(double) ); memcpy(ucol[bot[nploc-1]], &rbuf_right[mloc], mloc*sizeof(double) ); //cout << ctxt.mype() << ": received vector jglobal=" // << jglobal[bot[nploc-1]] << " from right" << endl; } if ( ctxt.mycol() != 0 ) { memcpy(&sbuf_left[0], acol[top[0]], mloc*sizeof(double) ); memcpy(&sbuf_left[mloc], ucol[top[0]], mloc*sizeof(double) ); ctxt.dsend(2*mloc,1,&sbuf_left[0],2*mloc,ctxt.myrow(),ctxt.mycol()-1); ctxt.drecv(2*mloc,1,&rbuf_left[0],2*mloc,ctxt.myrow(),ctxt.mycol()-1); memcpy(acol[top[0]], &rbuf_left[0], mloc*sizeof(double) ); memcpy(ucol[top[0]], &rbuf_left[mloc], mloc*sizeof(double) ); //cout << ctxt.mype() << ": received vector jglobal=" // << jglobal[top[0]] << " from left" << endl; } } // if nploc > 0 ctxt.barrier(); // end of step } // for irot // sweep is complete // accumulate number of pairs above threshold ctxt.isum('r',1,1,&npairs_above_threshold,1); done = ( ( npairs_above_threshold == 0 ) || ( nsweep >= maxsweep ) ); } // while !done // cout << " nsweep=" << nsweep << endl; // if a dummy vector was used, (i.e. if nloc_odd), the dummy vector // may end up anywhere in the array after all rotations are completed. // The array a_aux may contain a (non-dummy) vector. if ( nloc_odd ) { // find position of the dummy vector and copy a_aux onto it int idum = 0; while ( jglobal[idum] != -1 && idum < 2*nploc ) idum++; //cout << ctxt.mype() << ": idum=" << idum << endl; if ( idum != 2*nploc-1 ) { memcpy(acol[idum],&a_aux[0],mloc*sizeof(double)); memcpy(ucol[idum],&u_aux[0],mloc*sizeof(double)); } } // compute eigenvalues for ( int i = 0; i < a.n(); i++ ) e[i] = 0.0; for ( int jblock = 0; jblock < a.nblocks(); jblock++ ) for ( int y = 0; y < a.nbs(jblock); y++ ) { // j is the global column index int j = a.j(jblock,y); int jjj = y + jblock*a.nb(); const double *ap = a.valptr(jjj*a.mloc()); const double *up = u.valptr(jjj*u.mloc()); int mloc = a.mloc(); int one = 1; e[j] = ddot(&mloc,ap,&one,up,&one); } // e now contains the partial sums of the diagonal elements of a ctxt.dsum(a.n(),1,&e[0],a.n()); // e contains the eigenvalues of a // u contains the eigenvectors of a return nsweep; }