////////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2008 The Regents of the University of California
//
// This file is part of Qbox
//
// Qbox is distributed under the terms of the GNU General Public License
// as published by the Free Software Foundation, either version 2 of
// the License, or (at your option) any later version.
// See the file COPYING in the root directory of this distribution
// or .
//
////////////////////////////////////////////////////////////////////////////////
//
// SlaterDet.C
//
////////////////////////////////////////////////////////////////////////////////
#include "SlaterDet.h"
#include "FourierTransform.h"
#include "Context.h"
#include "blas.h" // daxpy
#include "Base64Transcoder.h"
#include "SharedFilePtr.h"
#include "Timer.h"
#include
#include // memcpy
#include
#include
#include
#include
using namespace std;
////////////////////////////////////////////////////////////////////////////////
SlaterDet::SlaterDet(const Context& ctxt, D3vector kpoint) : ctxt_(ctxt),
c_(ctxt)
{
// create cartesian communicator mapped on ctxt
int ndims=2;
// Note: MPI_Cart_comm uses row-major ordering. Need to give
// transposed dimensions as input arguments
int dims[2] = {ctxt.npcol(), ctxt.nprow()};
int periods[2] = { 0, 0};
int reorder = 0;
MPI_Comm comm;
MPI_Cart_create(ctxt.comm(),ndims,dims,periods,reorder,&comm);
int size, myrank;
MPI_Comm_size(comm,&size);
MPI_Comm_rank(comm,&myrank);
int coords[2];
MPI_Cart_coords(comm,myrank,2,coords);
#ifdef DEBUG
for ( int i = 0; i < size; i++ )
{
MPI_Barrier(comm);
if ( myrank == i )
cout << myrank << ": myrow=" << ctxt.myrow() << " mycol=" << ctxt.mycol()
<< " coords= " << coords[0] << ", " << coords[1] << endl;
}
#endif
// Split the cartesian communicator comm to define my_col_comm_
// MPI_Cart_create uses row-major ordering. Need to keep the second
// dimension to get a communicator corresponding to a column of ctxt
int remain_dims[2] = { 0, 1 };
MPI_Cart_sub(comm, remain_dims, &my_col_comm_);
// Free the cartesian communicator
MPI_Comm_free(&comm);
// define basis on the column subcommunicator
basis_ = new Basis(my_col_comm_,kpoint);
}
////////////////////////////////////////////////////////////////////////////////
SlaterDet::SlaterDet(const SlaterDet& rhs) : ctxt_(rhs.context()),
basis_(new Basis(*(rhs.basis_))),
my_col_comm_(rhs.my_col_comm_), c_(rhs.c_){}
////////////////////////////////////////////////////////////////////////////////
SlaterDet::~SlaterDet()
{
delete basis_;
// cout << ctxt_.mype() << ": SlaterDet::dtor: ctxt=" << ctxt_;
#ifdef TIMING
for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ )
{
double time = (*i).second.real();
double tmin = time;
double tmax = time;
ctxt_.dmin(1,1,&tmin,1);
ctxt_.dmax(1,1,&tmax,1);
if ( ctxt_.myproc()==0 )
{
cout << ""
<< endl;
}
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::resize(const UnitCell& cell, const UnitCell& refcell,
double ecut, int nst)
{
// Test in next line should be replaced by test on basis min/max indices
// to signal change in basis vectors
//if ( basis_->refcell().volume() != 0.0 && !refcell.encloses(cell) )
//{
//cout << " SlaterDet::resize: cell=" << cell;
//cout << " SlaterDet::resize: refcell=" << basis_->refcell();
//throw SlaterDetException("could not resize: cell not in refcell");
//}
try
{
// create a temporary copy of the basis
Basis btmp(*basis_);
// perform normal resize operations, possibly resetting contents of c_
basis_->resize(cell,refcell,ecut);
occ_.resize(nst);
eig_.resize(nst);
const int mb = basis_->maxlocalsize();
const int m = ctxt_.nprow() * mb;
const int nb = nst/ctxt_.npcol() + (nst%ctxt_.npcol() > 0 ? 1 : 0);
// check if the dimensions of c_ must change
if ( m!=c_.m() || nst!=c_.n() || mb!=c_.mb() || nb!=c_.nb() )
{
// make a copy of c_ before resize
ComplexMatrix ctmp(c_);
c_.resize(m,nst,mb,nb);
init();
// check if data can be copied from temporary copy
// It is assumed that nst and ecut are not changing at the same time
// Only the cases where one change at a time occurs is covered
// consider only cases where the dimensions are finite
if ( c_.m() > 0 && c_.n() > 0 )
{
// first case: only nst has changed
if ( c_.m() == ctmp.m() && c_.n() != ctmp.n() )
{
//cout << "SlaterDet::resize: c_m/n= "
// << c_.m() << "/" << c_.n() << endl;
//cout << "SlaterDet::resize: ctmp_m/n=" << ctmp.m()
// << "/" << ctmp.n() << endl;
// nst has changed, basis is unchanged
// copy coefficients up to min(n_old, n_new)
if ( c_.n() < ctmp.n() )
{
c_.getsub(ctmp,ctmp.m(),c_.n(),0,0);
}
else
{
c_.getsub(ctmp,ctmp.m(),ctmp.n(),0,0);
}
gram();
}
// second case: basis was resized, nst unchanged
if ( btmp.ecut() > 0.0 && basis_->ecut() > 0.0 &&
c_.m() != ctmp.m() && c_.n() == ctmp.n() )
{
// transform all states to real space and interpolate
int np0 = max(basis_->np(0),btmp.np(0));
int np1 = max(basis_->np(1),btmp.np(1));
int np2 = max(basis_->np(2),btmp.np(2));
//cout << " SlaterDet::resize: grid: np0/1/2: "
// << np0 << " " << np1 << " " << np2 << endl;
// FourierTransform tf1(oldbasis, new basis grid)
// FourierTransform tf2(newbasis, new basis grid)
FourierTransform ft1(btmp,np0,np1,np2);
FourierTransform ft2(*basis_,np0,np1,np2);
// allocate real-space grid
valarray > tmpr(ft1.np012loc());
// transform each state from old basis to grid to new basis
for ( int n = 0; n < nstloc(); n++ )
{
for ( int i = 0; i < tmpr.size(); i++ )
tmpr[i] = 0.0;
ft1.backward(ctmp.cvalptr(n*ctmp.mloc()),&tmpr[0]);
ft2.forward(&tmpr[0], c_.valptr(n*c_.mloc()));
}
}
}
}
}
catch ( bad_alloc )
{
cout << " bad_alloc exception caught in SlaterDet::resize" << endl;
throw;
}
#if 0
// print error in imaginary part of c(G=0)
double imag_err = g0_imag_error();
if ( ctxt_.onpe0() )
{
cout.setf(ios::scientific,ios::floatfield);
cout << " SlaterDet::resize: imag error on exit: " << imag_err << endl;
}
#endif
cleanup();
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::init(void)
{
// initialize coefficients with lowest plane waves
if ( c_.n() <= basis_->size() )
{
// initialize c_
c_.clear();
const double s2i = 1.0 / sqrt(2.0);
// for each n, find the smallest g vector and initialize
int ismallest = 0;
// on each process, basis.isort(ismallest) is the index of the smallest
// local g vector
for ( int n = 0; n < c_.n(); n++ )
{
double value = 1.0;
if ( basis().real() && n != 0 )
value = s2i;
// find process row holding the smallest g vector
// kpg2: size^2 of smallest vector on this task
// set kpg2 to largest double value if localsize == 0
double kpg2 = numeric_limits::max();
if ( ismallest < basis_->localsize() )
{
kpg2 = basis_->kpg2(basis_->isort(ismallest));
}
// cout << "smallest vector on proc " << ctxt_.mype()
// << " has norm " << kpg2 << endl;
int minrow, mincol;
ctxt_.dmin('c',' ',1,1,&kpg2,1,&minrow,&mincol,1,-1,-1);
// find column hosting state n
int pc = c_.pc(n);
int pr = minrow;
if ( pr == ctxt_.myrow() )
{
int iii = basis_->isort(ismallest);
ismallest++; // increment on entire process row
if ( pc == ctxt_.mycol() )
{
// cout << " n=" << n << " on process "
// << pr << "," << pc
// << " vector " << basis_->idx(3*iii) << " "
// << basis_->idx(3*iii+1) << " "
// << basis_->idx(3*iii+2) << " norm="
// << basis_->g2(iii) << " "
// << value << endl;
int jjj = c_.m(n) * c_.nb() + c_.y(n);
int index = iii+c_.mloc()*jjj;
c_[index] = complex (value,0.0);
}
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::compute_density(FourierTransform& ft,
double weight, double* rho) const
{
//Timer tm_ft, tm_rhosum;
// compute density of the states residing on my column of ctxt_
assert(occ_.size() == c_.n());
vector > tmp(ft.np012loc());
assert(basis_->cell().volume() > 0.0);
const double omega_inv = 1.0 / basis_->cell().volume();
const int np012loc = ft.np012loc();
if ( basis_->real() )
{
// transform two states at a time
for ( int n = 0; n < nstloc()-1; n++, n++ )
{
// global n index
const int nn = ctxt_.mycol() * c_.nb() + n;
const double fac1 = weight * omega_inv * occ_[nn];
const double fac2 = weight * omega_inv * occ_[nn+1];
if ( fac1 + fac2 > 0.0 )
{
//tm_ft.start();
ft.backward(c_.cvalptr(n*c_.mloc()),
c_.cvalptr((n+1)*c_.mloc()),&tmp[0]);
//tm_ft.stop();
const double* psi = (double*) &tmp[0];
int ii = 0;
//tm_rhosum.start();
for ( int i = 0; i < np012loc; i++ )
{
const double psi1 = psi[ii];
const double psi2 = psi[ii+1];
rho[i] += fac1 * psi1 * psi1 + fac2 * psi2 * psi2;
ii++; ii++;
}
//tm_rhosum.start();
}
}
if ( nstloc() % 2 != 0 )
{
const int n = nstloc()-1;
// global n index
const int nn = ctxt_.mycol() * c_.nb() + n;
const double fac1 = weight * omega_inv * occ_[nn];
if ( fac1 > 0.0 )
{
ft.backward(c_.cvalptr(n*c_.mloc()),&tmp[0]);
const double* psi = (double*) &tmp[0];
int ii = 0;
for ( int i = 0; i < np012loc; i++ )
{
const double psi1 = psi[ii];
rho[i] += fac1 * psi1 * psi1;
ii++; ii++;
}
}
}
}
else
{
// only one transform at a time
for ( int n = 0; n < nstloc(); n++ )
{
// global n index
const int nn = ctxt_.mycol() * c_.nb() + n;
const double fac = weight * omega_inv * occ_[nn];
if ( fac > 0.0 )
{
ft.backward(c_.cvalptr(n*c_.mloc()),&tmp[0]);
for ( int i = 0; i < np012loc; i++ )
rho[i] += fac * norm(tmp[i]);
}
}
}
// cout << "SlaterDet: compute_density: ft_bwd time: "
// << tm_ft.real() << endl;
// cout << "SlaterDet: compute_density: rhosum time: "
// << tm_rhosum.real() << endl;
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::rs_mul_add(FourierTransform& ft,
const double* v, SlaterDet& sdp) const
{
// transform states to real space, multiply states by v[r] in real space
// transform back to reciprocal space and add to sdp
// sdp[n] += v * sd[n]
vector > tmp(ft.np012loc());
vector > ctmp(2*c_.mloc());
const int np012loc = ft.np012loc();
const int mloc = c_.mloc();
double* dcp = (double*) sdp.c().valptr();
if ( basis_->real() )
{
// transform two states at a time
for ( int n = 0; n < nstloc()-1; n++, n++ )
{
ft.backward(c_.cvalptr(n*mloc),
c_.cvalptr((n+1)*mloc),&tmp[0]);
#pragma omp parallel for
for ( int i = 0; i < np012loc; i++ )
tmp[i] *= v[i];
ft.forward(&tmp[0], &ctmp[0], &ctmp[mloc]);
int len = 4 * mloc;
int inc1 = 1;
double alpha = 1.0;
daxpy(&len,&alpha,(double*)&ctmp[0],&inc1,&dcp[2*n*mloc],&inc1);
}
if ( nstloc() % 2 != 0 )
{
const int n = nstloc()-1;
ft.backward(c_.cvalptr(n*mloc),&tmp[0]);
#pragma omp parallel for
for ( int i = 0; i < np012loc; i++ )
tmp[i] *= v[i];
ft.forward(&tmp[0], &ctmp[0]);
int len = 2 * mloc;
int inc1 = 1;
double alpha = 1.0;
daxpy(&len,&alpha,(double*)&ctmp[0],&inc1,&dcp[2*n*mloc],&inc1);
}
}
else
{
// only one transform at a time
for ( int n = 0; n < nstloc(); n++ )
{
ft.backward(c_.cvalptr(n*mloc),&tmp[0]);
#pragma omp parallel for
for ( int i = 0; i < np012loc; i++ )
tmp[i] *= v[i];
ft.forward(&tmp[0], &ctmp[0]);
int len = 2 * mloc;
int inc1 = 1;
double alpha = 1.0;
daxpy(&len,&alpha,(double*)&ctmp[0],&inc1,&dcp[2*n*mloc],&inc1);
}
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::gram(void)
{
cleanup();
if ( basis_->real() )
{
// k = 0 case
// create a DoubleMatrix proxy for c_
DoubleMatrix c_proxy(c_);
DoubleMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
#if TIMING
tmap["syrk"].start();
#endif
s.syrk('l','t',2.0,c_proxy,0.0);
#if TIMING
tmap["syrk"].stop();
tmap["syr"].start();
#endif
s.syr('l',-1.0,c_proxy,0,'r');
#if TIMING
tmap["syr"].stop();
tmap["potrf"].start();
#endif
#ifdef CHOLESKY_REMAP
// create a square context for the Cholesky decomposition
// int nsq = (int) sqrt((double) ctxt_.size());
int nsq = CHOLESKY_REMAP;
Context csq(nsq,nsq);
DoubleMatrix ssq(csq,c_.n(),c_.n(),c_.nb(),c_.nb());
ssq.getsub(s,s.m(),s.n(),0,0);
ssq.potrf('l'); // Cholesky decomposition: S = L * L^T
s.getsub(ssq,s.m(),s.n(),0,0);
#else
s.potrf('l'); // Cholesky decomposition: S = L * L^T
#endif
// solve triangular system X * L^T = C
#if TIMING
tmap["potrf"].stop();
tmap["trsm"].start();
#endif
c_proxy.trsm('r','l','t','n',1.0,s);
#if TIMING
tmap["trsm"].stop();
#endif
}
else
{
// k != 0 case
ComplexMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
s.herk('l','c',1.0,c_,0.0);
s.potrf('l'); // Cholesky decomposition: S = L * L^H
// solve triangular system X * L^H = C
c_.trsm('r','l','c','n',1.0,s);
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::riccati(const SlaterDet& sd)
{
cleanup();
if ( basis_->real() )
{
// k = 0 case
DoubleMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix r(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
s.identity();
r.identity();
DoubleMatrix x(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix xm(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix t(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
// DoubleMatrix proxy for c_ and sd.c()
DoubleMatrix c_proxy(c_);
DoubleMatrix sdc_proxy(sd.c());
#if TIMING
tmap["riccati_syrk"].start();
#endif
// Factor -1.0 in next line: -0.5 from definition of s, 2.0 for G and -G
s.syrk('l','t',-1.0,c_proxy,0.5); // s = 0.5 * ( I - A )
// symmetric rank-1 update using first row of c_proxy
s.syr('l',0.5,c_proxy,0,'r');
#if TIMING
tmap["riccati_syrk"].stop();
tmap["riccati_gemm"].start();
#endif
// factor -2.0 in next line: G and -G
r.gemm('t','n',-2.0,sdc_proxy,c_proxy,1.0); // r = ( I - B )
// rank-1 update using first row of sdc_proxy() and c_proxy
r.ger(1.0,sdc_proxy,0,c_proxy,0);
#if TIMING
tmap["riccati_gemm"].stop();
#endif
xm = s;
xm.symmetrize('l');
#if TIMING
tmap["riccati_while"].start();
#endif
s.syrk('l','t',0.5,r,1.0); // s = s + 0.5 * r^T * r
s.symmetrize('l');
double diff = 1.0;
const double epsilon = 1.e-10;
const int maxiter = 5;
int iter = 0;
while ( iter < maxiter && diff > epsilon )
{
// x = s - 0.5 * ( r - xm )^T * ( r - xm )
// Note: t and r are not symmetric, x, xm, and s are symmetric
for ( int i = 0; i < t.size(); i++ )
t[i] = r[i] - xm[i];
x = s;
x.syrk('l','t',-0.5,t,1.0);
// get full matrix x
x.symmetrize('l');
for ( int i = 0; i < t.size(); i++ )
t[i] = x[i] - xm[i];
diff = t.nrm2();
xm = x;
iter++;
}
#if TIMING
tmap["riccati_while"].stop();
tmap["riccati_symm"].start();
#endif
c_proxy.symm('r','l',1.0,x,sdc_proxy,1.0);
#if TIMING
tmap["riccati_symm"].stop();
#endif
}
else
{
// k != 0 case
ComplexMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
ComplexMatrix r(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
s.identity();
r.identity();
ComplexMatrix x(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
ComplexMatrix xm(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
ComplexMatrix t(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
// s = 0.5 * ( I - A )
s.herk('l','c',-0.5,c_,0.5);
// r = ( I - B )
r.gemm('c','n',-1.0,sd.c(),c_,1.0);
xm = s;
xm.symmetrize('l');
// s = s + 0.5 * r^H * r
s.herk('l','c',0.5,r,1.0);
s.symmetrize('l');
double diff = 1.0;
const double epsilon = 1.e-10;
const int maxiter = 5;
int iter = 0;
while ( iter < maxiter && diff > epsilon )
{
// x = s - 0.5 * ( r - xm )^H * ( r - xm )
// Note: t and r are not hermitian, x, xm, and s are hermitian
for ( int i = 0; i < t.size(); i++ )
t[i] = r[i] - xm[i];
x = s;
x.herk('l','c',-0.5,t,1.0);
x.symmetrize('l');
for ( int i = 0; i < t.size(); i++ )
t[i] = x[i] - xm[i];
diff = t.nrm2();
xm = x;
iter++;
}
c_.hemm('r','l',1.0,x,sd.c(),1.0);
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::lowdin(void)
{
cleanup();
if ( basis_->real() )
{
ComplexMatrix c_tmp(c_);
DoubleMatrix c_proxy(c_);
DoubleMatrix c_tmp_proxy(c_tmp);
DoubleMatrix l(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix x(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix t(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
l.clear();
l.syrk('l','t',2.0,c_proxy,0.0);
l.syr('l',-1.0,c_proxy,0,'r');
//cout << "SlaterDet::lowdin: A=\n" << l << endl;
// Cholesky decomposition of A=Y^T Y
l.potrf('l');
// The lower triangle of l now contains the Cholesky factor of Y^T Y
//cout << "SlaterDet::lowdin: L=\n" << l << endl;
// Compute the polar decomposition of R = L^T
x.transpose(1.0,l,0.0);
// x now contains R
const double tol = 1.e-6;
const int maxiter = 3;
x.polar(tol,maxiter);
// x now contains the orthogonal polar factor U of the
// polar decomposition R = UH
//cout << " SlaterDet::lowdin: orthogonal polar factor=\n" << x << endl;
// Compute L^-1
l.trtri('l','n');
// l now contains L^-1
// Form the product L^-T U
t.gemm('t','n',1.0,l,x,0.0);
// Multiply c by L^-T U
c_proxy.gemm('n','n',1.0,c_tmp_proxy,t,0.0);
}
else
{
// complex case: not implemented
if ( ctxt_.onpe0() )
cout << " SlaterDet::lowdin: not implemented, reverting to Gram-Schmidt"
<< endl;
gram();
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::ortho_align(const SlaterDet& sd)
{
// Orthogonalize *this and align with sd
cleanup();
if ( basis_->real() )
{
ComplexMatrix c_tmp(c_);
DoubleMatrix c_proxy(c_);
DoubleMatrix sdc_proxy(sd.c());
DoubleMatrix c_tmp_proxy(c_tmp);
DoubleMatrix l(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix x(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
#if TIMING
tmap["syrk"].reset();
tmap["syrk"].start();
#endif
l.clear();
l.syrk('l','t',2.0,c_proxy,0.0);
l.syr('l',-1.0,c_proxy,0,'r');
#if TIMING
tmap["syrk"].stop();
#endif
//cout << "SlaterDet::ortho_align: A=\n" << l << endl;
// Cholesky decomposition of A=Y^T Y
#if TIMING
tmap["potrf"].reset();
tmap["potrf"].start();
#endif
l.potrf('l');
#if TIMING
tmap["potrf"].stop();
#endif
// The lower triangle of l now contains the Cholesky factor of Y^T Y
//cout << "SlaterDet::ortho_align: L=\n" << l << endl;
// Compute the polar decomposition of L^-1 B
// where B = C^T sd.C
// Compute B: store result in x
#if TIMING
tmap["gemm"].reset();
tmap["gemm"].start();
#endif
// factor -2.0 in next line: G and -G
x.gemm('t','n',2.0,c_proxy,sdc_proxy,0.0);
// rank-1 update using first row of sdc_proxy() and c_proxy
x.ger(-1.0,c_proxy,0,sdc_proxy,0);
#if TIMING
tmap["gemm"].stop();
#endif
// Form the product L^-1 B, store result in x
// triangular solve: L X = B
// trtrs: solve op(*this) * X = Z, output in Z
#if TIMING
tmap["trtrs"].reset();
tmap["trtrs"].start();
#endif
l.trtrs('l','n','n',x);
#if TIMING
tmap["trtrs"].stop();
#endif
// x now contains L^-1 B
// compute the polar decomposition of X = L^-1 B
#if TIMING
tmap["polar"].reset();
tmap["polar"].start();
#endif
const double tol = 1.e-6;
const int maxiter = 2;
x.polar(tol,maxiter);
#if TIMING
tmap["polar"].stop();
#endif
// x now contains the orthogonal polar factor X of the
// polar decomposition L^-1 B = XH
//cout << " SlaterDet::ortho_align: orthogonal polar factor=\n"
// << x << endl;
// Form the product L^-T Q
// Solve trans(L) Z = X
#if TIMING
tmap["trtrs2"].reset();
tmap["trtrs2"].start();
#endif
l.trtrs('l','t','n',x);
#if TIMING
tmap["trtrs2"].stop();
#endif
// x now contains L^-T Q
// Multiply c by L^-T Q
#if TIMING
tmap["gemm2"].reset();
tmap["gemm2"].start();
#endif
c_proxy.gemm('n','n',1.0,c_tmp_proxy,x,0.0);
#if TIMING
tmap["gemm2"].stop();
#endif
}
else
{
// complex case: not implemented
if ( ctxt_.onpe0() )
cout << " SlaterDet::lowdin: not implemented, reverting to riccati"
<< endl;
riccati(sd);
}
#if TIMING
for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ )
{
double time = (*i).second.real();
double tmin = time;
double tmax = time;
ctxt_.dmin(1,1,&tmin,1);
ctxt_.dmax(1,1,&tmax,1);
if ( ctxt_.onpe0() )
{
cout << ""
<< endl;
}
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::align(const SlaterDet& sd)
{
// Align *this with sd
if ( basis_->real() )
{
ComplexMatrix c_tmp(c_);
DoubleMatrix c_proxy(c_);
DoubleMatrix sdc_proxy(sd.c());
DoubleMatrix c_tmp_proxy(c_tmp);
DoubleMatrix x(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
DoubleMatrix t(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
// Compute the polar decomposition of B
// where B = C^T sd.C
#if TIMING
tmap["align_gemm1"].start();
#endif
// Compute B: store result in x
// factor -2.0 in next line: G and -G
x.gemm('t','n',2.0,c_proxy,sdc_proxy,0.0);
// rank-1 update using first row of sdc_proxy() and c_proxy
x.ger(-1.0,c_proxy,0,sdc_proxy,0);
#if TIMING
tmap["align_gemm1"].stop();
#endif
// x now contains B
//cout << "SlaterDet::align: B=\n" << x << endl;
// Compute the distance | c - sdc | before alignment
//for ( int i = 0; i < c_proxy.size(); i++ )
// c_tmp_proxy[i] = c_proxy[i] - sdc_proxy[i];
//cout << " SlaterDet::align: distance before: "
// << c_tmp_proxy.nrm2() << endl;
// compute the polar decomposition of B
double tol = 1.e-6;
const int maxiter = 3;
#if TIMING
tmap["align_polar"].start();
#endif
x.polar(tol,maxiter);
#if TIMING
tmap["align_while"].stop();
#endif
// x now contains the orthogonal polar factor X of the
// polar decomposition B = XH
//cout << " SlaterDet::align: orthogonal polar factor=\n" << x << endl;
#if TIMING
tmap["align_gemm2"].start();
#endif
// Multiply c by X
c_tmp_proxy = c_proxy;
c_proxy.gemm('n','n',1.0,c_tmp_proxy,x,0.0);
#if TIMING
tmap["align_gemm2"].stop();
#endif
// Compute the distance | c - sdc | after alignment
//for ( int i = 0; i < c_proxy.size(); i++ )
// c_tmp_proxy[i] = c_proxy[i] - sdc_proxy[i];
//cout << " SlaterDet::align: distance after: "
// << c_tmp_proxy.nrm2() << endl;
}
else
{
// complex case: not implemented
if ( ctxt_.onpe0() )
cout << " SlaterDet::align: not implemented, alignment skipped"
<< endl;
}
}
////////////////////////////////////////////////////////////////////////////////
complex SlaterDet::dot(const SlaterDet& sd) const
{
// dot product of Slater determinants at the same kpoint: dot = tr (V^T W)
assert(basis_->kpoint() == sd.kpoint());
if ( basis_->real() )
{
// DoubleMatrix proxy for c_ and sd.c()
const DoubleMatrix c_proxy(c_);
const DoubleMatrix sdc_proxy(sd.c());
// factor 2.0: G and -G
double d = 2.0 * c_proxy.dot(sdc_proxy);
// correct double counting of first element
double sum = 0.0;
if ( ctxt_.myrow() == 0 )
{
// compute the scalar product of the first rows of c_ and sd.c_
const double *c = c_proxy.cvalptr(0);
const double *sdc = sdc_proxy.cvalptr(0);
int len = c_proxy.nloc();
// stride of scalar product is mloc
int stride = c_proxy.mloc();
sum = ddot(&len,c,&stride,sdc,&stride);
}
ctxt_.dsum(1,1,&sum,1);
return d - sum;
}
else
{
return c_.dot(sd.c());
}
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::update_occ(int nel, int nspin)
{
// compute occupation numbers as 0.0, 1.0 or 2.0
// if nspin = 1: use 0, 1 or 2
// if nspin = 2: use 0 or 1;
assert (nel >= 0);
assert (occ_.size() == c_.n());
if ( nspin == 1 )
{
assert (nel <= 2*c_.n());
int ndouble = nel/2;
for ( int n = 0; n < ndouble; n++ )
occ_[n] = 2.0;
for ( int n = ndouble; n < ndouble+nel%2; n++ )
occ_[n] = 1.0;
for ( int n = ndouble+nel%2; n < c_.n(); n++ )
occ_[n] = 0.0;
}
else if ( nspin == 2 )
{
assert (nel <= c_.n());
for ( int n = 0; n < nel; n++ )
occ_[n] = 1.0;
for ( int n = nel; n < c_.n(); n++ )
occ_[n] = 0.0;
}
else
{
// incorrect value of nspin_
assert(false);
}
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::total_charge(void) const
{
// compute total charge from occ_[i]
double sum = 0.0;
for ( int n = 0; n < occ_.size(); n++ )
{
sum += occ_[n];
}
return sum;
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::update_occ(int nspin, double mu, double temp)
{
// compute occupation numbers using a Fermi distribution f(mu,temp)
// and the eigenvalues in eig_[i]
assert(nspin==1 || nspin==2);
assert (occ_.size() == c_.n());
assert (eig_.size() == c_.n());
if ( nspin == 1 )
{
for ( int n = 0; n < eig_.size(); n++ )
{
occ_[n] = 2.0 * fermi(eig_[n],mu,temp);
}
}
else if ( nspin == 2 )
{
for ( int n = 0; n < eig_.size(); n++ )
{
occ_[n] = fermi(eig_[n],mu,temp);
}
}
else
{
// incorrect value of nspin_
assert(false);
}
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::fermi(double e, double mu, double fermitemp)
{
// e, mu in Hartree, fermitemp in Kelvin
if ( fermitemp == 0.0 )
{
if ( e < mu ) return 1.0;
else if ( e == mu ) return 0.5;
else return 0.0;
}
const double kb = 3.1667907e-6; // Hartree/Kelvin
const double kt = kb * fermitemp;
double arg = ( e - mu ) / kt;
if ( arg < -30.0 ) return 1.0;
if ( arg > 30.0 ) return 0.0;
return 1.0 / ( 1.0 + exp ( arg ) );
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::entropy(int nspin) const
{
// return dimensionless entropy
// the contribution to the free energy is - t_kelvin * k_boltz * wf.entropy()
assert(nspin==1 || nspin==2);
const double fac = ( nspin > 1 ) ? 1.0 : 2.0;
double sum = 0.0;
for ( int n = 0; n < occ_.size(); n++ )
{
const double f = occ_[n] / fac;
if ( f > 0.0 && f < 1.0 )
{
sum -= fac * ( f * log(f) + (1.0-f) * log(1.0-f) );
}
}
return sum;
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::ortho_error(void) const
{
// deviation from orthogonality of c_
double error;
if ( basis_->real() )
{
// k = 0 case
// declare a proxy DoubleMatrix for c_
DoubleMatrix c_proxy(c_);
DoubleMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
// real symmetric rank-k update
// factor 2.0 in next line: G and -G
s.syrk('l','t',2.0,c_proxy,0.0); // compute real overlap matrix
// correct for double counting of G=0
// symmetric rank-1 update using first row of c_proxy
s.syr('l',-1.0,c_proxy,0,'r');
DoubleMatrix id(ctxt_,s.m(),s.n(),s.mb(),s.nb());
id.identity();
s -= id; // subtract identity matrix from S
error = s.nrm2();
}
else
{
// k != 0 case
ComplexMatrix s(ctxt_,c_.n(),c_.n(),c_.nb(),c_.nb());
s.herk('l','c',1.0,c_,0.0);
ComplexMatrix id(ctxt_,s.m(),s.n(),s.mb(),s.nb());
id.identity();
s -= id; // subtract identity matrix from S
error = s.nrm2();
}
return error;
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::randomize(double amplitude)
{
if ( basis_->size() == 0 )
return;
for ( int n = 0; n < c_.nloc(); n++ )
{
complex* p = c_.valptr(c_.mloc()*n);
for ( int i = 0; i < basis_->localsize(); i++ )
{
double re = drand48();
double im = drand48();
p[i] += amplitude * complex(re,im);
}
}
// gram does an initial cleanup
gram();
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::cleanup(void)
{
// set Im( c(G=0) ) to zero for real case and
// set the empty rows of the matrix c_ to zero
// The empty rows are located between i = basis_->localsize() and
// c_.mloc(). Empty rows are necessary to insure that the
// local size c_.mloc() is the same on all processes, while the
// local basis size is not.
for ( int n = 0; n < c_.nloc(); n++ )
{
complex* p = c_.valptr(c_.mloc()*n);
// reset imaginary part of G=0 component to zero
if ( basis_->real() && c_.mloc() > 0 && ctxt_.myrow() == 0 )
{
// index of G=0 element
p[0] = complex ( p[0].real(), 0.0);
}
// reset values of empty rows of c_ to zero
for ( int i = basis_->localsize(); i < c_.mloc(); i++ )
p[i] = 0.0;
}
}
////////////////////////////////////////////////////////////////////////////////
SlaterDet& SlaterDet::operator=(SlaterDet& rhs)
{
if ( this == &rhs ) return *this;
assert(ctxt_.ictxt() == rhs.context().ictxt());
c_ = rhs.c_;
occ_ = rhs.occ_;
eig_ = rhs.eig_;
return *this;
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::memsize(void) const
{
return basis_->memsize() + c_.memsize();
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::localmemsize(void) const
{
return basis_->localmemsize() + c_.localmemsize();
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::print(ostream& os, string encoding, double weight, int ispin,
int nspin) const
{
FourierTransform ft(*basis_,basis_->np(0),basis_->np(1),basis_->np(2));
vector > wftmp(ft.np012loc());
const bool real_basis = basis_->real();
const int wftmpr_size = real_basis ? ft.np012() : 2*ft.np012();
const int wftmpr_loc_size = real_basis ? ft.np012loc() : 2*ft.np012loc();
vector wftmpr(wftmpr_size);
Base64Transcoder xcdr;
if ( ctxt_.onpe0() )
{
string spin = (ispin > 0) ? "down" : "up";
os << "kpoint() << "\"\n"
<< " weight=\"" << setprecision(12) << weight << "\""
<< " size=\"" << nst() << "\">" << endl;
os << ""
<< endl;
os.setf(ios::fixed,ios::floatfield);
os.setf(ios::right,ios::adjustfield);
for ( int i = 0; i < nst(); i++ )
{
os << " " << setprecision(8) << occ_[i];
if ( i%10 == 9 )
os << endl;
}
if ( nst()%10 != 0 )
os << endl;
os << "" << endl;
}
for ( int n = 0; n < nst(); n++ )
{
// Barrier to limit the number of messages sent to task 0
// that don't have a receive posted
ctxt_.barrier();
// transform data on ctxt_.mycol()
if ( c_.pc(n) == ctxt_.mycol() )
{
//cout << " state " << n << " is stored on column "
// << ctxt_.mycol() << " local index: " << c_.y(n) << endl;
int nloc = c_.y(n); // local index
ft.backward(c_.cvalptr(c_.mloc()*nloc),&wftmp[0]);
if ( real_basis )
{
double *a = (double*) &wftmp[0];
for ( int i = 0; i < ft.np012loc(); i++ )
wftmpr[i] = a[2*i];
}
else
{
memcpy((void*)&wftmpr[0],(void*)&wftmp[0],
ft.np012loc()*sizeof(complex));
}
}
// send blocks of wftmpr to pe0
for ( int i = 0; i < ctxt_.nprow(); i++ )
{
bool iamsending = c_.pc(n) == ctxt_.mycol() && i == ctxt_.myrow();
// send size of wftmpr block
int size=-1;
if ( ctxt_.onpe0() )
{
if ( iamsending )
{
// sending to self, size not needed
}
else
ctxt_.irecv(1,1,&size,1,i,c_.pc(n));
}
else
{
if ( iamsending )
{
size = wftmpr_loc_size;
ctxt_.isend(1,1,&size,1,0,0);
}
}
// send wftmpr block
if ( ctxt_.onpe0() )
{
if ( iamsending )
{
// do nothing, data is already in place
}
else
{
int istart = ft.np0() * ft.np1() * ft.np2_first(i);
if ( !real_basis )
istart *= 2;
ctxt_.drecv(size,1,&wftmpr[istart],size,i,c_.pc(n));
}
}
else
{
if ( iamsending )
{
ctxt_.dsend(size,1,&wftmpr[0],size,0,0);
}
}
}
// process the data
if ( ctxt_.onpe0() )
{
// wftmpr is now complete on task 0
// wftmpr contains either a real of a complex array
const string element_type = real_basis ? "double" : "complex";
if ( encoding == "base64" )
{
#if PLT_BIG_ENDIAN
xcdr.byteswap_double(wftmpr_size,&wftmpr[0]);
#endif
int nbytes = wftmpr_size*sizeof(double);
int outlen = xcdr.nchars(nbytes);
char* b = new char[outlen];
assert(b!=0);
xcdr.encode(nbytes,(byte*) &wftmpr[0],b);
// Note: optional x0,y0,z0 attributes not used, default is zero
os << "" << endl;
xcdr.print(outlen,(char*) b, os);
os << "\n";
delete [] b;
}
else
{
// encoding == "text" or unknown encoding
// Note: optional x0,y0,z0 attributes not used, default is zero
os << "" << endl;
int count = 0;
for ( int k = 0; k < ft.np2(); k++ )
for ( int j = 0; j < ft.np1(); j++ )
for ( int i = 0; i < ft.np0(); i++ )
{
int index = ft.index(i,j,k);
if ( real_basis )
os << " " << wftmpr[index];
else
os << " " << wftmpr[2*index] << " " << wftmpr[2*index+1];
if ( count++%4 == 3)
os << "\n";
}
if ( count%4 != 0 )
os << "\n";
os << "\n";
}
}
}
if ( ctxt_.onpe0() )
os << "" << endl;
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::write(SharedFilePtr& sfp, string encoding, double weight,
int ispin, int nspin) const
{
FourierTransform ft(*basis_,basis_->np(0),basis_->np(1),basis_->np(2));
vector > wftmp(ft.np012loc());
const bool real_basis = basis_->real();
const int wftmpr_loc_size = real_basis ? ft.np012loc() : 2*ft.np012loc();
vector wftmpr(wftmpr_loc_size);
Base64Transcoder xcdr;
// do not write anything if nst==0
if ( nst() == 0 ) return;
#if USE_MPI
char* wbuf = 0;
size_t wbufsize = 0;
#endif
// Segment n on process iprow is sent to row (n*nprow+iprow)/(nprow)
const int nprow = ctxt_.nprow();
vector scounts(nprow), sdispl(nprow), rcounts(nprow), rdispl(nprow);
string header;
if ( ctxt_.onpe0() )
{
ostringstream ostr_hdr;
string spin = (ispin > 0) ? "down" : "up";
ostr_hdr << "kpoint() << "\"\n"
<< " weight=\"" << setprecision(12) << weight << "\""
<< " size=\"" << nst() << "\">" << endl;
ostr_hdr << ""
<< endl;
ostr_hdr.setf(ios::fixed,ios::floatfield);
ostr_hdr.setf(ios::right,ios::adjustfield);
for ( int i = 0; i < nst(); i++ )
{
ostr_hdr << " " << setprecision(8) << occ_[i];
if ( i%10 == 9 )
ostr_hdr << endl;
}
if ( nst()%10 != 0 )
ostr_hdr << endl;
ostr_hdr << "" << endl;
header = ostr_hdr.str();
}
// serialize all local columns of c and store in segments seg[n]
string seg;
for ( int n = 0; n < nstloc(); n++ )
{
seg.clear();
if ( n == 0 && ctxt_.myrow() == 0 )
seg = header;
ostringstream ostr;
//cout << " state " << n << " is stored on column "
// << ctxt_.mycol() << " local index: " << c_.y(n) << endl;
ft.backward(c_.cvalptr(c_.mloc()*n),&wftmp[0]);
if ( real_basis )
{
double *a = (double*) &wftmp[0];
for ( int i = 0; i < ft.np012loc(); i++ )
wftmpr[i] = a[2*i];
}
else
{
memcpy((void*)&wftmpr[0],(void*)&wftmp[0],
ft.np012loc()*sizeof(complex));
}
// find index of last process holding some data
int lastproc = ctxt_.nprow()-1;
while ( lastproc >= 0 && ft.np2_loc(lastproc) == 0 ) lastproc--;
assert(lastproc>=0);
// Adjust number of values on each task to have a number of values
// divisible by three. This is necessary in order to have base64
// encoding without trailing '=' characters.
// The last node in the process column may have a number of values
// not divisible by 3.
// data now resides in wftmpr, distributed on ctxt_.mycol()
// All nodes in the process column except the last have the
// same wftmpr_loc_size
// Use group-of-three redistribution algorithm to make all sizes
// multiples of 3. In the group-of-three algorithm, nodes are divided
// into groups of three nodes. In each group, the left and right members
// send 1 or 2 values to the center member so that all three members
// end up with a number of values divisible by three.
// Determine how many values must be sent to the center-of-three node
int ndiff;
const int myrow = ctxt_.myrow();
if ( myrow == 0 )
{
ndiff = wftmpr_loc_size % 3;
ctxt_.ibcast_send('c',1,1,&ndiff,1);
}
else
{
ctxt_.ibcast_recv('c',1,1,&ndiff,1,0,ctxt_.mycol());
}
// assume that all nodes have at least ndiff values
if ( myrow <= lastproc ) assert(wftmpr_loc_size >= ndiff);
// Compute number of values to be sent to neighbors
int nsend_left=0, nsend_right=0, nrecv_left=0, nrecv_right=0;
if ( myrow % 3 == 0 )
{
// mype is the left member of a group of three
// send ndiff values to the right if not on the last node
if ( myrow < lastproc )
nsend_right = ndiff;
}
else if ( myrow % 3 == 1 )
{
// mype is the center member of a group of three
if ( myrow <= lastproc )
nrecv_left = ndiff;
if ( myrow <= lastproc-1 )
nrecv_right = ndiff;
}
else if ( myrow % 3 == 2 )
{
// mype is the right member of a group of three
// send ndiff values to the left if not on the first or last node
if ( myrow <= lastproc && myrow > 0 )
nsend_left = ndiff;
}
double rbuf_left[2], rbuf_right[2], sbuf_left[2], sbuf_right[2];
int tmpr_size = wftmpr_loc_size;
if ( nsend_left > 0 )
{
for ( int i = 0; i < ndiff; i++ )
sbuf_left[i] = wftmpr[i];
ctxt_.dsend(ndiff,1,sbuf_left,ndiff,ctxt_.myrow()-1,ctxt_.mycol());
tmpr_size -= ndiff;
}
if ( nsend_right > 0 )
{
for ( int i = 0; i < ndiff; i++)
sbuf_right[i] = wftmpr[wftmpr_loc_size-ndiff+i];
ctxt_.dsend(ndiff,1,sbuf_right,ndiff,ctxt_.myrow()+1,ctxt_.mycol());
tmpr_size -= ndiff;
}
if ( nrecv_left > 0 )
{
ctxt_.drecv(ndiff,1,rbuf_left,ndiff,ctxt_.myrow()-1,ctxt_.mycol());
tmpr_size += ndiff;
}
if ( nrecv_right > 0 )
{
ctxt_.drecv(ndiff,1,rbuf_right,ndiff,ctxt_.myrow()+1,ctxt_.mycol());
tmpr_size += ndiff;
}
// check that size is a multiple of 3 (except on last node)
// cout << ctxt_.mype() << ": tmpr_size: " << tmpr_size << endl;
if ( ctxt_.myrow() != lastproc )
assert(tmpr_size%3 == 0);
vector tmpr(tmpr_size);
// Note: all nodes either receive data or send data, not both
if ( nrecv_left > 0 || nrecv_right > 0 )
{
// this node is a receiver
int index = 0;
if ( nrecv_left > 0 )
{
for ( int i = 0; i < ndiff; i++ )
tmpr[index++] = rbuf_left[i];
}
for ( int i = 0; i < wftmpr_loc_size; i++ )
tmpr[index++] = wftmpr[i];
if ( nrecv_right > 0 )
{
for ( int i = 0; i < ndiff; i++ )
tmpr[index++] = rbuf_right[i];
}
assert(index==tmpr_size);
}
else if ( nsend_left > 0 || nsend_right > 0 )
{
// this node is a sender
int index = 0;
int istart=0, iend=wftmpr_loc_size;
if ( nsend_left > 0 )
istart = ndiff;
if ( nsend_right > 0 )
iend = wftmpr_loc_size - ndiff;
for ( int i = istart; i < iend; i++ )
tmpr[index++] = wftmpr[i];
assert(index==tmpr_size);
}
else
{
// no send and no recv
for ( int i = 0; i < wftmpr_loc_size; i++ )
tmpr[i] = wftmpr[i];
assert(tmpr_size==wftmpr_loc_size);
}
// All nodes (except the last) now have a number of values
// divisible by 3 in tmpr[]
if ( ctxt_.myrow()!=lastproc ) assert(tmpr_size%3==0);
// convert local data to base64 and write to outfile
// tmpr contains either a real or a complex array
const string element_type = real_basis ? "double" : "complex";
if ( encoding == "base64" )
{
#if PLT_BIG_ENDIAN
xcdr.byteswap_double(tmpr_size,&tmpr[0]);
#endif
int nbytes = tmpr_size*sizeof(double);
int outlen = xcdr.nchars(nbytes);
char* b = new char[outlen];
assert(b!=0);
xcdr.encode(nbytes,(byte*) &tmpr[0],b);
// Note: optional x0,y0,z0 attributes not used, default is zero
if ( ctxt_.myrow() == 0 )
{
// if on first row, write grid function header
ostr << "" << endl;
}
xcdr.print(outlen,(char*) b, ostr);
if ( ctxt_.myrow() == lastproc )
ostr << "\n";
delete [] b;
}
else
{
// encoding == "text" or unknown encoding
// Note: optional x0,y0,z0 attributes not used, default is zero
if ( ctxt_.myrow() == 0 )
{
// if on first row, write grid function header
ostr << "" << endl;
}
int count = 0;
for ( int k = 0; k < ft.np2_loc(); k++ )
for ( int j = 0; j < ft.np1(); j++ )
for ( int i = 0; i < ft.np0(); i++ )
{
int index = ft.index(i,j,k);
if ( real_basis )
ostr << " " << tmpr[index];
else
ostr << " " << tmpr[2*index] << " " << tmpr[2*index+1];
if ( count++%4 == 3)
ostr << "\n";
}
if ( count%4 != 0 )
ostr << "\n";
if ( ctxt_.myrow() == lastproc )
ostr << "\n";
}
// copy contents of ostr stringstream to segment
seg += ostr.str();
// cout << ctxt_.mype() << ": segment " << n << " size: " << seg.size()
// << endl;
// seg is defined
#if USE_MPI
// redistribute segments to tasks within each process column
for ( int i = 0; i < nprow; i++ )
{
scounts[i] = 0;
sdispl[i] = 0;
rcounts[i] = 0;
rdispl[i] = 0;
}
int idest = (n*nprow+ctxt_.myrow())/nstloc();
scounts[idest] = seg.size();
// send sendcounts to all procs
MPI_Alltoall(&scounts[0],1,MPI_INT,&rcounts[0],1,MPI_INT,my_col_comm_);
// dimension receive buffer
int rbufsize = rcounts[0];
rdispl[0] = 0;
for ( int i = 1; i < ctxt_.nprow(); i++ )
{
rbufsize += rcounts[i];
rdispl[i] = rdispl[i-1] + rcounts[i-1];
}
char* rbuf = new char[rbufsize];
int err = MPI_Alltoallv((void*)seg.data(),&scounts[0],&sdispl[0],
MPI_CHAR,rbuf,&rcounts[0],&rdispl[0],MPI_CHAR,my_col_comm_);
if ( err != 0 )
cout << ctxt_.mype()
<< " SlaterDet::write: error in MPI_Alltoallv" << endl;
if ( rbufsize > 0 )
{
// append rbuf to wbuf
char* tmp;
try
{
tmp = new char[wbufsize+rbufsize];
}
catch ( bad_alloc )
{
cout << ctxt_.mype() << " bad_alloc in wbuf append "
<< " n=" << n
<< " rbufsize=" << rbufsize
<< " wbufsize=" << wbufsize << endl;
}
memcpy(tmp,wbuf,wbufsize);
memcpy(tmp+wbufsize,rbuf,rbufsize);
delete [] wbuf;
wbuf = tmp;
wbufsize += rbufsize;
}
delete [] rbuf;
#else
sfp.file().write(seg.data(),seg.size());
#endif
}
#if USE_MPI
// wbuf now contains the data to be written in the correct order
ctxt_.barrier();
// compute offsets
sfp.sync();
MPI_Offset off;
long long int local_offset,current_offset;
current_offset = sfp.offset();
// compute local offset of next write
long long int local_size = wbufsize;
MPI_Scan(&local_size, &local_offset, 1,
MPI_LONG_LONG, MPI_SUM, ctxt_.comm());
// add base and correct for inclusive scan by subtracting local_size
local_offset += current_offset - local_size;
off = local_offset;
MPI_Status status;
// write wbuf from all tasks using computed offset
int len = wbufsize;
int err = MPI_File_write_at_all(sfp.file(),off,(void*)wbuf,len,
MPI_CHAR,&status);
if ( err != 0 )
cout << ctxt_.mype()
<< " error in MPI_File_write_at_all: err=" << err << endl;
sfp.set_offset(local_offset+len);
sfp.sync();
delete [] wbuf;
if ( ctxt_.onpe0() )
{
string s("\n");
int err = MPI_File_write_at(sfp.file(),sfp.mpi_offset(),(void*) s.data(),
s.size(),MPI_CHAR,&status);
if ( err != 0 )
cout << ctxt_.mype()
<< " error in MPI_File_write, slater_determinant trailer:"
<< " err=" << err << endl;
sfp.advance(s.size());
}
#else
sfp.file() << "\n";
#endif // USE_MPI
}
////////////////////////////////////////////////////////////////////////////////
void SlaterDet::info(ostream& os) const
{
if ( ctxt_.onpe0() )
{
os << "kpoint() << "\""
<< " size=\"" << nst() << "\">" << endl;
os << " sdcontext: " << ctxt_.nprow() << "x" << ctxt_.npcol() << endl;
//os << " sdcontext: " << ctxt_ << endl;
os << " basis size: " << basis_->size() << endl;
os << " c dimensions: "
<< c_.m() << "x" << c_.n()
<< " (" << c_.mb() << "x" << c_.nb() << " blocks)" << endl;
os << " "
<< endl;
os << " " << endl;
os << "" << endl;
}
}
////////////////////////////////////////////////////////////////////////////////
ostream& operator<<(ostream& os, SlaterDet& sd)
{
sd.print(os,"text",0.0,0,1);
return os;
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::empty_row_error(void)
{
// sum norm of the empty rows of the matrix c_
double sum = 0.0;
for ( int n = 0; n < c_.nloc(); n++ )
{
complex* p = c_.valptr(c_.mloc()*n);
for ( int i = basis_->localsize(); i < c_.mloc(); i++ )
{
sum += norm(p[i]);
}
}
ctxt_.dsum(1,1,&sum,1);
return sum;
}
////////////////////////////////////////////////////////////////////////////////
double SlaterDet::g0_imag_error(void)
{
// sum norm of the imaginary part of c(G=0).
double sum = 0.0;
for ( int n = 0; n < c_.nloc(); n++ )
{
complex* p = c_.valptr(c_.mloc()*n);
if ( basis_->real() && c_.mloc() > 0 && ctxt_.myrow() == 0 )
{
// index of G=0 element
double tim = p[0].imag();
sum += tim*tim;
}
}
ctxt_.dsum(1,1,&sum,1);
return sum;
}