////////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 The Regents of the University of California
//
// This file is part of Qbox
//
// Qbox is distributed under the terms of the GNU General Public License
// as published by the Free Software Foundation, either version 2 of
// the License, or (at your option) any later version.
// See the file COPYING in the root directory of this distribution
// or .
//
////////////////////////////////////////////////////////////////////////////////
//
// ExchangeOperator.C
//
////////////////////////////////////////////////////////////////////////////////
#include
#include
#include
#include
#include
#include "VectorLess.h"
#include "ExchangeOperator.h"
#include "Bisection.h"
using namespace std;
#define Tag_NumberOfStates 1
#define Tag_Occupation 2
#define Tag_Exchange 3
#define Tag_Forces 4
#define Tag_States 5
////////////////////////////////////////////////////////////////////////////////
ExchangeOperator::ExchangeOperator( Sample& s, double HFCoeff)
: s_(s), wf0_(s.wf), dwf0_(s.wf), wfc_(s.wf), HFCoeff_(HFCoeff),
gcontext_(s.wf.sd(0,0)->context())
{
eex_ = 0.0; // exchange energy
rcut_ = 1.0; // constant of support function for exchange integration
sigma_exhf_.resize(6);
// column communicator
vcomm_ = s_.wf.sd(0,0)->basis().comm();
// check if the only kpoint is the gamma point:
gamma_only_ = ( s_.wf.nkp()==1 ) && ( s_.wf.sd(0,0)->basis().real() );
if ( gamma_only_ )
{
// create a real basis for the pair densities
vbasis_ = new Basis(vcomm_, D3vector(0.0,0.0,0.0));
}
else
{
// create a complex basis
//!! should avoid the finite k trick to get a complex basis at gamma
vbasis_ = new Basis(vcomm_, D3vector(0.00000001,0.00000001,0.00000001));
}
// the size of the basis for the pair density should be
// twice the size of the wave function basis
vbasis_->resize( s_.wf.cell(),s_.wf.refcell(),4.0*s_.wf.ecut());
// set the size for the r grid to be a product of small primes
np0v_ = vbasis_->np(0)+2;
np1v_ = vbasis_->np(1)+2;
np2v_ = vbasis_->np(2)+2;
while (!vbasis_->factorizable(np0v_)) np0v_ += 2;
while (!vbasis_->factorizable(np1v_)) np1v_ += 2;
while (!vbasis_->factorizable(np2v_)) np2v_ += 2;
if ( gamma_only_ )
{
// create Fourier transform object wavefunctions
wft_ = new FourierTransform( s_.wf.sd(0,0)->basis(),np0v_,np1v_,np2v_);
}
const int ngloc = vbasis_->localsize();
// create Fourier transform object for densities
vft_ = new FourierTransform(*vbasis_,np0v_,np1v_,np2v_);
np012loc_ = vft_->np012loc();
// allocate memory for densities in G space
rhog1_.resize(ngloc);
rhog2_.resize(ngloc);
// allocate memory for densities in r space
rhor1_.resize(np012loc_);
rhor2_.resize(np012loc_);
// if not only at gamma, allocate arrays q+G
if ( !gamma_only_ )
{
// allocate memory for |q+G| and related quantities
qpG21_.resize(ngloc);
qpG22_.resize(ngloc);
qpG2i1_.resize(ngloc);
qpG2i2_.resize(ngloc);
}
// get both local and maximum amount of states on a proc
{
if ( s_.wf.nspin()==1 )
{
SlaterDet& sd = *(s_.wf.sd(0,0));
nLocalStates_=sd.nstloc();
MPI_Allreduce(&nLocalStates_,&nMaxLocalStates_,1,
MPI_INT,MPI_MAX,gcontext_.comm());
}
else
{
SlaterDet& sd_up = *(s_.wf.sd(0,0));
SlaterDet& sd_dn = *(s_.wf.sd(1,0));
nLocalStates_= sd_up.nstloc() > sd_dn.nstloc() ?
sd_up.nstloc() : sd_dn.nstloc();
MPI_Allreduce(&nLocalStates_,&nMaxLocalStates_,1,
MPI_INT,MPI_MAX,gcontext_.comm());
}
}
// allocate memory for exchange energies
exchange_ki_.resize(nMaxLocalStates_);
exchange_kj_.resize(nMaxLocalStates_);
send_buf_exchange_.resize(nMaxLocalStates_);
send_buf_occupation_.resize(nMaxLocalStates_);
// allocate memory for exchange
exchange_.resize(s_.wf.nkp());
for ( int iKp=0; iKp 0.0;
// if only at gamma
if ( gamma_only_ )
{
tmp_.resize(np012loc_);
// allocate bisection object
if ( use_bisection_ )
{
bisection_.resize(s_.wf.nspin());
uc_.resize(s_.wf.nspin());
for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
{
bisection_[ispin] = new Bisection(*s_.wf.sd(ispin,0),s_.ctrl.blHF);
const ComplexMatrix& c = s_.wf.sd(ispin,0)->c();
uc_[ispin] = new DoubleMatrix(c.context(),c.n(),c.n(),c.nb(),c.nb());
}
}
}
// allocate memory for occupation numbers of kpoint iKpi
occ_ki_.resize(nMaxLocalStates_);
occ_kj_.resize(nMaxLocalStates_);
// allocate memory for the real space expression of the forces
dstatei_.resize(nMaxLocalStates_);
dstatej_.resize(nMaxLocalStates_);
for ( int i = 0; i < nMaxLocalStates_; i++ )
{
dstatei_[i].resize(np012loc_);
dstatej_[i].resize(np012loc_);
}
// allocate memory for the copy of forces of kpoint iKpi
{
force_kpi_.resize( nMaxLocalStates_ * mlocMax );
send_buf_forces_.resize( nMaxLocalStates_ * mlocMax );
}
}
////////////////////////////////////////////////////////////////////////////////
ExchangeOperator::~ExchangeOperator()
{
if ( ( s_.wf.nkp()==1 ) && ( s_.wf.sd(0,0)->basis().real() ) )
{
// delete Fourier transform objects on states and forces
delete wft_;
}
// delete Fourier transform and basis for pair densities
delete vft_;
delete vbasis_;
if ( use_bisection_ )
for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
{
delete bisection_[ispin];
delete uc_[ispin];
}
}
////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::update_energy(bool compute_stress)
{
if ( gamma_only_ )
return eex_ = compute_exchange_at_gamma_(s_.wf, 0, compute_stress);
else
return eex_ = compute_exchange_for_general_case_(&s_, 0, compute_stress);
}
////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::update_operator(bool compute_stress)
{
dwf0_.clear();
// compute exchange energy and derivatives
if ( gamma_only_ )
eex_ = compute_exchange_at_gamma_(s_.wf, &dwf0_, compute_stress);
else
eex_ = compute_exchange_for_general_case_(&s_, &dwf0_, compute_stress);
// wf0_ is kept as a reference state
wf0_ = s_.wf;
// return exchange energy
return eex_;
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::apply_VXC_(double mix, Wavefunction& wf_ref,
Wavefunction& dwf_ref, Wavefunction& dwf)
{
// dwf += mix * ( |dwf_ref> + |wf_ref>
// - |wf_ref> )
for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
{
const int nst = s_.wf.nst(ispin);
for ( int ikp = 0; ikp < s_.wf.nkp(); ikp++ )
{
const Context &ctxt = s_.wf.sd(ispin,ikp)->c().context();
if ( s_.wf.sd(ispin,ikp)->basis().real() )
{
#if 0
update_sigma();
DoubleMatrix dc_proxy(dwf.sd(ispin,ikp)->c());
DoubleMatrix dcref_proxy(dwf_ref.sd(ispin,ikp)->c());
dc_proxy += dcref_proxy;
#else
DoubleMatrix matproj1(ctxt,nst,nst);
DoubleMatrix matproj2(ctxt,nst,nst);
DoubleMatrix matenergy(ctxt,nst,nst);
DoubleMatrix c_proxy(s_.wf.sd(ispin,ikp)->c());
DoubleMatrix dc_proxy(dwf.sd(ispin,ikp)->c());
DoubleMatrix cref_proxy(wf_ref.sd(ispin,ikp)->c());
DoubleMatrix dcref_proxy(dwf_ref.sd(ispin,ikp)->c());
// matproj1 = => matproj1
matproj1.gemm('t','n',2.0,cref_proxy,c_proxy,0.0);
matproj1.ger(-1.0,cref_proxy,0,c_proxy,0);
// dwf += mix * |dwf_ref> * matproj1
dc_proxy.gemm('n','n',mix,dcref_proxy,matproj1,1.0);
// matenergy =
matenergy.gemm('t','n',2.0,dcref_proxy,cref_proxy,0.0);
matenergy.ger(-1.0,dcref_proxy,0,cref_proxy,0);
// matproj2 = - matenergy * matproj1
matproj2.gemm('n','n',-1.0,matenergy,matproj1,0.0);
// matproj2 +=
matproj2.gemm('t','n',2.0,dcref_proxy,c_proxy,1.0);
matproj2.ger(-1.0,dcref_proxy,0,c_proxy,0);
// |dwf> += mix * |wf_ref> * matproj2
dc_proxy.gemm('n','n',mix,cref_proxy,matproj2,1.0);
#endif
}
else // complex wave functions
{
ComplexMatrix matproj1(ctxt,nst,nst);
ComplexMatrix matproj2(ctxt,nst,nst);
ComplexMatrix matenergy(ctxt,nst,nst);
ComplexMatrix &c(s_.wf.sd(ispin,ikp)->c());
ComplexMatrix &dc(dwf.sd(ispin,ikp)->c());
ComplexMatrix &cref(wf_ref.sd(ispin,ikp)->c());
ComplexMatrix &dcref(dwf_ref.sd(ispin,ikp)->c());
// matproj1 =
matproj1.gemm('c','n',1.0,cref,c,0.0);
// dwf += mix * |dwf_ref> * matproj1
dc.gemm('n','n',mix,dcref,matproj1,1.0);
// matenergy =
matenergy.gemm('c','n',1.0,dcref,cref,0.0);
// matproj2 = - matenergy * matproj1
matproj2.gemm('n','n',-1.0,matenergy,matproj1,0.0);
// matproj2 +=
matproj2.gemm('c','n',1.0,dcref,c,1.0);
// |dpsi> += mix * |psi_ref> * matproj2
dc.gemm('n','n',mix,cref,matproj2,1.0);
}
} // ikp
} // ispin
}
////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::apply_operator(Wavefunction& dwf)
{
// apply sigmaHF to s_.wf and store result in dwf
// use the reference function wf0_ and reference sigma(wf) dwf0_
apply_VXC_(1.0, wf0_, dwf0_, dwf);
return eex_;
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::add_stress(valarray& sigma_exc)
{
// add current value of the HF stress tensor to sigma_exc
sigma_exc += sigma_exhf_;
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::cell_moved(void)
{
vbasis_->resize( s_.wf.cell(),s_.wf.refcell(),4.0*s_.wf.ecut());
}
////////////////////////////////////////////////////////////////////////////////
// Exchange functions
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::compute_exchange_for_general_case_( Sample* s,
Wavefunction* dwf, bool compute_stress)
{
if ( compute_stress )
{
cout << " stress at general k-point not implemented" << endl;
gcontext_.abort(1);
}
Timer tm;
tm.start();
const Wavefunction& wf = s->wf;
const double omega = wf.cell().volume();
const int nkpoints = wf.nkp();
const int nspin = wf.nspin();
// determine the spin factor for the pair densities:
// 0.5 if 1 spin, 1 if nspin==2
const double spinFactor = 0.5 * nspin;
const double exfac = - ( 4.0 * M_PI / omega ) * spinFactor;
// initialize total exchange energy
double exchange_sum = 0.0;
double extot = 0.0;
// initialize stress
sigma_exhf_ = 0.0;
// loop on spins
for ( int ispin=0; ispin < nspin; ispin++ )
{
// initialize the numerical correction
// to the exchange integrals over kpoints j
vector numerical_correction(nkpoints);
for ( int iKpi = 0; iKpi < nkpoints; iKpi++ )
numerical_correction[iKpi]=0.0;
// loop over the kpoints
for ( int iKpi = 0; iKpi < nkpoints; iKpi++ )
{
SlaterDet& sdi = *(wf.sd(ispin,iKpi));
ComplexMatrix& ci = sdi.c();
FourierTransform* wfti_ = new FourierTransform(sdi.basis(),
np0v_,np1v_,np2v_);
SlaterDet& dsdi = *(dwf->sd(ispin,iKpi));
ComplexMatrix& dci = dsdi.c();
nStatesKpi_=sdi.nstloc();
// occupation numbers for kpoint i
const double* occ = sdi.occ_ptr();
for ( int i = 0; i *p = ci.cvalptr(0);
for ( int i = 0, bound = nStatesKpi_ * ci.mloc(); i < bound; i++ )
state_kpi_[i]=p[i];
if (dwf)
{
for ( int i = 0, bound = nStatesKpi_ * dci.mloc(); i < bound; i++ )
force_kpi_[i] = 0.0;
}
// initialize communications for the permutations
InitPermutation();
// Start rotation of the states of kpoint i from this point
for ( int iRotationStep=0; iRotationStepbackward( &state_kpi_[i*ci.mloc()], &(statei_[i])[0] );
}
CompleteSendingStates(iRotationStep);
for ( int i = 0, bound = nStatesKpi_ * ci.mloc(); i < bound; i++ )
send_buf_states_[i]=state_kpi_[i];
if (dwf)
{
for ( int i = 0; i < nStatesKpi_; i++ )
for ( int j = 0; j < np012loc_; j++ )
dstatei_[i][j]=0.0;
}
// set number of states of next permutation step
// (contained in nNextStatesKpi_)
SetNextPermutationStateNumber();
// start states permutation
StartStatesPermutation(ci.mloc());
CompleteReceivingOccupations(iRotationStep);
// second loop over kpoints
for ( int iKpj = iKpi; iKpj < nkpoints; iKpj++ )
{
SlaterDet& sdj = *(wf.sd(ispin,iKpj));
FourierTransform* wftj_ = new FourierTransform(sdj.basis(),
np0v_,np1v_,np2v_);
ComplexMatrix& cj = sdj.c();
for ( int i = 0; i < sdj.nstloc(); i++ )
wftj_->backward(cj.cvalptr(i*cj.mloc()),&(statej_[i])[0]);
SlaterDet& dsdj = *(dwf->sd(ispin,iKpj));
ComplexMatrix& dcj = dsdj.c();
if (dwf)
{
for ( int i = 0; i < dsdj.nstloc(); i++ )
for ( int j = 0; j < np012loc_; j++ )
dstatej_[i][j]=0.0;
}
// compute the differences dk between kpoints i and j
// and their symmetric equivalent
// dk1 = kpi-kpj
D3vector dk1 = wf.kpoint(iKpi) - wf.kpoint(iKpj);
// set dk1 in absolute reciprocal coordinates to get q1
D3vector q1 = dk1.x*sdi.basis().cell().b(0)
+ dk1.y*sdi.basis().cell().b(1)
+ dk1.z*sdi.basis().cell().b(2);
// dk2 = kpi+kpj
D3vector dk2 = wf.kpoint(iKpi) + wf.kpoint(iKpj);
// set dk2 in absolute reciprocal coordinates to get q2
D3vector q2 = dk2.x*sdi.basis().cell().b(0)
+ dk2.y*sdi.basis().cell().b(1)
+ dk2.z*sdi.basis().cell().b(2);
// compute values of |q1+G| and |q2+G|
double SumExpQpG2 = 0.0;
const int ngloc = vbasis_->localsize();
for ( int ig = 0; ig < ngloc; ig++ )
{
D3vector G(vbasis_->gx(ig+ngloc*0),
vbasis_->gx(ig+ngloc*1),
vbasis_->gx(ig+ngloc*2));
// compute G+q for each qi and find the value of the
// correction term: sum_(G,q) exp(-rcut_^2*|G+q|^2)/|G+q|^2
// => compute the square norm and inverse of q1+G
qpG21_[ig] = ( G + q1 ) * ( G + q1 );
qpG2i1_[ig] = ( qpG21_[ig] > 0.0 ) ? 1.0 / qpG21_[ig] : 0.0;
// => compute the square norm and inverse of q2+G
qpG22_[ig] = ( G + q2 ) * ( G + q2 );
qpG2i2_[ig] = ( qpG22_[ig] > 0.0 ) ? 1.0 / qpG22_[ig] : 0.0;
// if iKpi=0 (first k point)
// compute the numerical part of the correction
if ( iRotationStep==0 )
{
const double rc2 = rcut_*rcut_;
SumExpQpG2 += (exp(-rc2*qpG21_[ig]) * qpG2i1_[ig] );
SumExpQpG2 += (exp(-rc2*qpG22_[ig]) * qpG2i2_[ig] );
}
}
// Add weighted contribution to numerical correction:
// add the term sum_G exp(-a*|q+G|^2)/|q+G|^2 to the numerical
// correction. Works only if this is the first iKpoint.
//
// divide weight by 2 as we implicitly counted kpoint j and symmetric
if ( iRotationStep==0 ) // && ( iKpi==0 )
{
if ( iKpi==iKpj )
{
numerical_correction[iKpi] += SumExpQpG2 * wf.weight(iKpj)/2.0;
}
else
{
numerical_correction[iKpi] += SumExpQpG2 * wf.weight(iKpj)/2.0;
numerical_correction[iKpj] += SumExpQpG2 * wf.weight(iKpi)/2.0;
}
}
// get occupation numbers for kpoint j
const double* occ = sdj.occ_ptr();
for ( int i = 0; iforward(&rhor1_[0], &rhog1_[0]);
vft_->forward(&rhor2_[0], &rhog2_[0]);
// initialize contrib of the states psi(kj,j) to the exchange
// energy associated to state psi(ki,i)
double ex_ki_i_kj_j = 0.0;
for ( int ig = 0; ig < ngloc; ig++ )
{
// Add the values of |rho1(G)|^2/|G+q1|^2
// and |rho2(G)|^2/|G+q2|^2 to the exchange energy.
// This does not take the point G=q=0 into account
// as qpG2i = 0.
const double t1 = norm(rhog1_[ig]) * qpG2i1_[ig];
const double t2 = norm(rhog2_[ig]) * qpG2i2_[ig];
ex_ki_i_kj_j += t1;
ex_ki_i_kj_j += t2;
if ( dwf )
{
// compute rhog1_[G]/|G+q1|^2 and rhog2_[G]/|G+q1|^2
rhog1_[ig] *= qpG2i1_[ig];
rhog2_[ig] *= qpG2i2_[ig];
}
}
if ( dwf )
{
// Backtransform rhog[G]/|q+G|^2
vft_->backward(&rhog1_[0], &rhor1_[0]);
vft_->backward(&rhog2_[0], &rhor2_[0]);
}
// if iKpi=iKpj, add this contribution to the
// exchange energy of state psi(ki,i)
if ( iKpi==iKpj )
{
// case iKpi=iKpj
// count only for psi(ki,i)
//
// compute the weights:
//
// => divide the weight of kpoint j by 2 as we implicitly
// counted both kj and -kj in ex_ki_i_kj_j.
// => take in account the occupation of the state psi(kj,j)
// (divide by 2 to remove spin factor if only one spin).
// => multiply by the constants of computation
double weight = - 4.0 * M_PI / omega * wf.weight(iKpj) /
2.0 * occ_kj_[j] * spinFactor;
// add contribution to exchange energy
exchange_sum += ex_ki_i_kj_j * weight * wf.weight(iKpi) *
0.5 * occ_ki_[i];
if (dwf)
{
// acumulate weighted contributions
// Psi_j,kj(r) * TF( rhog[G]/|q+G|^2 ) and symmetric
// in dpsi_i. We take now into account the mixing coeff
weight *= HFCoeff_;
for ( int ir = 0; ir < np012loc_; ir++ )
{
dstatei_[i][ir] += ( statej_[j][ir] * rhor1_[ir] +
conj(statej_[j][ir] ) * rhor2_[ir]) * weight;
}
}
}
// if iKpi/=iKpj, add this contrib to the exch energy of both
// states psi(ki,i) and psi(kj,j)
// (as ex(ki,i,kj,j)=ex(kj,j,ki,i))
// this way, we can avoid computing ex(ki,i,kj,j) for kjiKpi
// count for both psi(ki,i) and psi(kj,j)
//
// compute the weights:
//
// => divide the weight of kpoints j (resp. i) by 2 as we
// implicitly counted both kj and -kj (resp ki and -ki)
// in ex_ki_i_kj_j.
// => take in account the occupation of the state psi(kj,j)
// (resp psi(ki,i)), and divide by 2 to remove spin factor
// => multiply by the constants of computation
double weighti = - 4.0 * M_PI / omega * wf.weight(iKpi) /
2.0 * occ_ki_[i] * spinFactor;
double weightj = - 4.0 * M_PI / omega * wf.weight(iKpj) /
2.0 * occ_kj_[j] * spinFactor;
// add contribution to exchange energy
exchange_sum += ex_ki_i_kj_j * weightj * wf.weight(iKpi) *
0.5 * occ_ki_[i];
exchange_sum += ex_ki_i_kj_j * weighti * wf.weight(iKpj) *
0.5 * occ_kj_[j];
if (dwf)
{
// acumulate weighted contributions in dpsi_j and dpsi_j.
// the correspondances between rho12 and rho 21 are given by
//
// /
// rho1_12(r) = | psi2*(r')*psi1(r')/|r-r'|dr'
// /
//
// /
// rho2_12(r) = | psi2(r')*psi1(r')/|r-r'|dr'
// /
//
// => rho1_21 = rho1_12*
// => rho2_21 = rho2_12
//
// We take also into account the mixing coefficient.
//
weighti *= HFCoeff_;
weightj *= HFCoeff_;
for ( int ir = 0; ir < np012loc_; ir++ )
{
dstatei_[i][ir] += ( statej_[j][ir] * rhor1_[ir] +
conj( statej_[j][ir] ) * rhor2_[ir] ) * weightj;
dstatej_[j][ir] += ( statei_[i][ir] * conj(rhor1_[ir]) +
conj( statei_[i][ir] )* rhor2_[ir] ) * weighti;
}
}
}
}
} // for j
} // for i
if (dwf)
{
// add the g space contributions to each state derivative of kpj
for ( int i = 0; i < dsdj.nstloc(); i++ )
{
// compute the g space state derivative contribution
wftj_->forward(&(dstatej_[i])[0],&buffer_dstate_[0]);
// add the g the result to the state derivative of kpj
complex *p=dcj.valptr(i*dcj.mloc());
for ( int j=0; jforward(&(dstatei_[i])[0], &buffer_dstate_[0]);
// sum up contributions in send buffer
complex *p1=&force_kpi_[i*dci.mloc()];
complex *p2=&send_buf_forces_[i*dci.mloc()];
for ( int j=0; jmype() == 0 )
{
const double div_corr_2 = - exfac * rcut_ * rcut_ * occ_ki_[i] *
wf.weight(iKpi);
div_corr += div_corr_2;
const double e_div_corr_2 = -0.5 * div_corr_2 * occ_ki_[i];
exchange_sum += e_div_corr_2 * wf.weight(iKpi);
// add here contributions of div_corr_2 to stress
const double div_corr_3 = - exfac * integ/vbz * occ_ki_[i];
div_corr += div_corr_3;
const double e_div_corr_3 = -0.5 * div_corr_3 * occ_ki_[i];
exchange_sum += e_div_corr_3 * wf.weight(iKpi);
// no contribution to stress from div_corr_3
}
// contribution of divergence corrections to forces on wave functions
if (dwf)
{
// sum the partial contributions to the correction for state i
gcontext_.dsum('C', 1, 1, &div_corr, 1);
// add correction to the derivatives of state i
complex *ps=ci.valptr(i*ci.mloc());
complex *pf1=dci.valptr(i*dci.mloc());
complex *pf2=&force_kpi_[i*dci.mloc()];
for ( int j = 0; j < dci.mloc(); j++ )
pf1[j] += pf2[j] - ps[j] * div_corr * HFCoeff_;
}
} // for i
delete wfti_;
} // for iKpi
} // for ispin
// reduce the total energy
gcontext_.dsum(1, 1, &exchange_sum, 1);
extot = exchange_sum;
extot *= HFCoeff_;
tm.stop();
#ifdef DEBUG
if ( gcontext_.onpe0() )
{
cout << setprecision(10);
cout << " total exchange computation time: " << tm.real()
<< " s" << endl;
}
#endif
return extot;
}
////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::compute_exchange_at_gamma_(const Wavefunction &wf,
Wavefunction* dwf, bool compute_stress)
{
Timer tm;
Timer tmb;
wfc_ = wf;
cout << setprecision(10);
const double omega = wfc_.cell().volume();
const int nspin = wfc_.nspin();
// spin factor for the pair densities: 0.5 if 1 spin, 1 if nspin==2
const double spinFactor=0.5*nspin;
// total exchange energy
double exchange_sum = 0.0;
double extot = 0.0;
sigma_exhf_ = 0.0;
const double *const g_x = vbasis_->gx_ptr(0);
const double *const g_y = vbasis_->gx_ptr(1);
const double *const g_z = vbasis_->gx_ptr(2);
for ( int ispin = 0; ispin < wfc_.nspin(); ispin++ )
{
SlaterDet& sd = *(wfc_.sd(ispin,0));
ComplexMatrix& c = sd.c();
const int nst = sd.nst();
// if using bisection, localize the wave functions
if ( use_bisection_ )
{
tmb.start();
int maxsweep = 50;
if ( s_.ctrl.debug.find("BISECTION_MAXSWEEP") != string::npos )
{
// override tolerance for bisection
istringstream is(s_.ctrl.debug);
string s;
is >> s >> maxsweep;
if ( gcontext_.onpe0() )
cout << " override bisection maxsweep value: maxsweep = "
<< maxsweep << endl;
assert(maxsweep >= 0);
}
double tol = 1.0;
if ( s_.ctrl.debug.find("BISECTION_TOL") != string::npos )
{
// override tolerance for bisection
istringstream is(s_.ctrl.debug);
string s;
is >> s >> tol;
if ( gcontext_.onpe0() )
cout << " override bisection tol value: tol = " << tol << endl;
assert(tol > 0.0);
}
#if TIMING
Timer tmbtransf;
tmbtransf.start();
#endif
bisection_[ispin]->compute_transform(*wfc_.sd(ispin,0),maxsweep,tol);
#if TIMING
tmbtransf.stop();
Timer tmbcomploc;
tmbcomploc.start();
#endif
bisection_[ispin]->compute_localization(s_.ctrl.btHF);
#if TIMING
tmbcomploc.stop();
#endif
// copy of localization vector from Bisection object
localization_ = bisection_[ispin]->localization();
#if TIMING
Timer tmbsize, tmbpair;
tmbsize.start();
#endif
if ( gcontext_.onpe0() )
{
cout << " ExchangeOperator: bisection size: ispin=" << ispin
<< ": " << bisection_[ispin]->total_size() << endl;
}
#if TIMING
tmbsize.stop();
tmbpair.start();
#endif
if ( gcontext_.onpe0() )
{
cout << " ExchangeOperator: pair fraction: ispin=" << ispin
<< ": " << bisection_[ispin]->pair_fraction() << endl;
}
#if TIMING
tmbpair.stop();
#endif
// copy the orthogonal transformation u to uc_[ispin]
*uc_[ispin] = bisection_[ispin]->u();
bool distribute = s_.ctrl.debug.find("BISECTION_NODIST") == string::npos;
if ( distribute )
{
// define a permutation ordering states by increasing degree
// permute states according to the order defined by the
// localization vector
// compute the degree of the vertices of the exchange graph
// using the localization vector
#if TIMING
Timer tmb_ov;
tmb_ov.start();
#endif
vector degree(nst);
for ( int i = 0; i < nst; i++ )
{
int count = 0;
for ( int j = 0; j < nst; j++ )
{
if ( bisection_[ispin]->overlap(localization_,i,j) )
count++;
}
degree[i] = count;
}
#if TIMING
tmb_ov.stop();
if ( gcontext_.onpe0() )
{
cout << setprecision(3);
cout << " ExchangeOperator: bisection overlap time: "
<< tmb_ov.real() << " s" << endl;
}
#endif
// permutation index
vector index(nst);
for ( int j = 0; j < index.size(); j++ )
index[j] = j;
// Create function object for comparison of degree
VectorLess degree_less(degree);
sort(index.begin(), index.end(), degree_less);
// At this point degree[index[i]] <= degree[index[j]] if i < j
for ( int i = 0; i < index.size()-1; i++ )
assert(degree[index[i]] <= degree[index[i+1]]);
#if DEBUG
if ( gcontext_.onpe0() )
{
cout << "degree order after sort:" << endl;
for ( int j = 0; j < index.size(); j++ )
cout << j << " -> " << index[j]
<< " " << degree[index[j]]
<< endl;
}
#endif
// distribute the states to process columns in round robin fashion
// Assume that the states are initially ordered by increasing degree
// i.e. degree(index_[i]) < degree(index_[j]) if i < j
const int nb = uc_[ispin]->nb();
vector distrib_index(nst);
int ibase = 0;
int icol = 0;
for ( int i = 0; i < nst; i++ )
{
// check if next slot is beyond n
if ( ibase + icol * nb >= nst )
{
// restart in column 0 with incremented ibase
icol = 0;
ibase++;
}
distrib_index[ibase + icol * nb] = i;
icol++;
}
// combine index[i] and distrib_index[i]
vector itmp(index.size());
for ( int i = 0; i < index.size(); i++ )
itmp[i] = index[distrib_index[i]];
index = itmp;
#if DEBUG
if ( gcontext_.onpe0() )
{
cout << "index after round robin distrib:" << endl;
for ( int j = 0; j < index.size(); j++ )
cout << j << " -> " << index[j] << endl;
}
#endif
// apply the permutation defined by index to the localization vector
vector loc_tmp(localization_.size());
for ( int i = 0; i < index.size(); i++ )
loc_tmp[i] = localization_[index[i]];
localization_ = loc_tmp;
// apply the permutation defined by index to the occupation vector
vector occ_tmp(nst);
for ( int i = 0; i < index.size(); i++ )
occ_tmp[i] = sd.occ(index[i]);
sd.set_occ(occ_tmp);
// compute a pivot vector containing a sequence of transpositions
// equivalent to the permutation defined by index[i]
// compute the inverse of index
vector index_inv(index.size());
for ( int i = 0; i < index.size(); i++ )
index_inv[index[i]] = i;
assert(index_inv.size()==index.size());
vector pivot;
for ( int i = 0; i < index.size(); i++ )
{
int j = index_inv[i];
int tmp = index[i];
index[i] = index[j];
index[j] = tmp;
// update elements of index_inv
index_inv[index[i]] = i;
index_inv[index[j]] = j;
pivot.push_back(j);
}
assert(pivot.size()==index.size());
#if DEBUG
if ( gcontext_.onpe0() )
{
cout << "pivot:" << endl;
for ( int j = 0; j < pivot.size(); j++ )
cout << j << " -> " << pivot[j] << endl;
}
#endif
// create a local pivot vector on this process (size uc->nloc())
// this vector must be replicated on all tasks of the
// process grid columns
const int nloc = uc_[ispin]->nloc();
vector locpivot(nloc);
// fill the local pivot vector on all tasks
// add 1 to index values for lapack fortran index convention
for ( int j=0; j < nloc; j++ )
{
int jglobal = uc_[ispin]->jglobal(j);
locpivot[j] = pivot[jglobal]+1;
}
#if 0
for ( int ipe = 0; ipe < uc.context().size(); ipe++ )
{
uc.context().barrier();
if ( ipe == uc.context().mype() )
{
cout << "locpivot:" << endl;
for ( int j = 0; j < locpivot.size(); j++ )
cout << ipe << ": " << j << " -> " << locpivot[j] << endl;
}
}
#endif
#if 0
if ( u_->context().size() == 1 )
{
// cout << " uc before perm: " << endl;
// cout << uc;
// local permutation
assert(locpivot.size()==u_->n());
double *p = uc.valptr(0);
const int mloc = uc.mloc();
for ( int i = locpivot.size()-1; i >= 0; i-- )
{
const int j = locpivot[i]-1;
// swap columns i and j of u_->c()
cout << " swap " << i << " " << j << endl;
for ( int k = 0; k < mloc; k++ )
{
double tmp = p[i*mloc+k];
p[i*mloc+k] = p[j*mloc+k];
p[j*mloc+k] = tmp;
}
}
// cout << " uc after perm: " << endl;
// cout << uc;
}
#endif
#if TIMING
Timer tmblapiv;
tmblapiv.start();
#endif
// apply the permutation to the columns of uc
uc_[ispin]->lapiv('B','C',&locpivot[0]);
#if TIMING
tmblapiv.stop();
if ( gcontext_.onpe0() )
{
cout << setprecision(3);
cout << " ExchangeOperator: bisection size time: "
<< tmbsize.real() << " s" << endl;
cout << setprecision(3);
cout << " ExchangeOperator: bisection pair time: "
<< tmbpair.real() << " s" << endl;
cout << setprecision(3);
cout << " ExchangeOperator: bisection lapiv time: "
<< tmblapiv.real() << " s" << endl;
}
#endif
#if DEBUG
// recompute the degree of the vertices of the exchange graph
for ( int i = 0; i < nst; i++ )
{
int count = 0;
for ( int j = 0; j < nst; j++ )
{
if ( bisection_[ispin]->overlap(localization_,i,j) )
count++;
}
degree[i] = count;
}
if ( gcontext_.onpe0() )
{
cout << "degree after permutation:" << endl;
for ( int j = 0; j < degree.size(); j++ )
cout << " deg[" << j << "] = " << degree[j] << endl;
}
// print localization vectors and overlaps after distribution
if ( gcontext_.onpe0() )
{
int sum = 0;
for ( int i = 0; i < nst; i++ )
{
int count = 0;
for ( int j = 0; j < nst; j++ )
{
if ( bisection_[ispin]->overlap(localization_,i,j) )
count++;
}
cout << "localization[" << i << "]: "
<< localization_[i] << " "
<< bitset<30>(localization_[i])
<< " overlaps: "
<< count << endl;
sum += count;
}
cout << " Bisection::localize: total overlaps: " << sum << " / "
<< nst*nst << " = " << ((double) sum)/(nst*nst) << endl;
}
#endif
}
else
{
if ( gcontext_.onpe0() )
cout << " ExchangeOperator: bisection distribution disabled" << endl;
} // if distribute
#if TIMING
Timer tmbfwd;
tmbfwd.start();
#endif
bisection_[ispin]->forward(*uc_[ispin], *wfc_.sd(ispin,0));
#if TIMING
tmbfwd.stop();
#endif
tmb.stop();
if ( gcontext_.onpe0() )
{
#if TIMING
cout << setprecision(3);
cout << " ExchangeOperator: bisection compute transform time: "
<< tmbtransf.real() << " s" << endl;
cout << setprecision(3);
cout << " ExchangeOperator: bisection compute localization time: "
<< tmbcomploc.real() << " s" << endl;
cout << setprecision(3);
cout << " ExchangeOperator: bisection forward time: "
<< tmbfwd.real() << " s" << endl;
#endif
cout << setprecision(3);
cout << " ExchangeOperator: bisection time: "
<< tmb.real() << " s" << endl;
}
} // if use_bisection_
tm.start();
// compute exchange
// real-space local states -> statej_[i][ir]
// loop over states 2 by 2
for ( int i = 0; i < sd.nstloc()-1; i+=2 )
{
wft_->backward(c.cvalptr(i*c.mloc()), c.cvalptr((i+1)*c.mloc()),
&tmp_[0]);
double *p = (double *)&tmp_[0];
#pragma omp parallel for
for ( int ir = 0; ir < np012loc_; ir++ )
{
statej_[i][ir]=p[2*ir];
statej_[i+1][ir]=p[2*ir+1];
}
}
if ( sd.nstloc() % 2 == 1 )
{
int i = sd.nstloc()-1;
wft_->backward(c.cvalptr(i*c.mloc()), &(statej_[i])[0]);
}
SlaterDet& dsd = *(dwf->sd(ispin,0));
ComplexMatrix& dc = dsd.c();
if (dwf)
{
// reset real space derivatives dstatej_[i][ir]
for ( int i = 0; i < dsd.nstloc(); i++ )
for ( int j = 0; j < np012loc_; j++ )
dstatej_[i][j] = 0.0;
}
const int ngloc = vbasis_->localsize();
// correction term: sum_(G) exp(-rcut_^2*|G|^2)/|G|^2
const double exfac = - ( 4.0 * M_PI / omega ) * spinFactor;
double SumExpG2 = 0.0;
double sigma_sumexp[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
const double *g2 = vbasis_->g2_ptr();
const double *g2i = vbasis_->g2i_ptr();
const double rc2 = rcut_*rcut_;
for ( int ig = 0; ig < ngloc; ig++ )
{
// factor 2.0: real basis
const double tg2i = g2i[ig];
double t = 2.0 * exp( - rc2 * g2[ig] ) * tg2i;
SumExpG2 += t;
if ( compute_stress )
{
const double tgx = g_x[ig];
const double tgy = g_y[ig];
const double tgz = g_z[ig];
// factor 2.0: derivative of G^2
const double fac = t * 2.0 * ( rc2 + tg2i );
sigma_sumexp[0] += fac * tgx * tgx;
sigma_sumexp[1] += fac * tgy * tgy;
sigma_sumexp[2] += fac * tgz * tgz;
sigma_sumexp[3] += fac * tgx * tgy;
sigma_sumexp[4] += fac * tgy * tgz;
sigma_sumexp[5] += fac * tgz * tgx;
}
}
// local occupation numbers
const double* occ = sd.occ_ptr();
for ( int i = 0; i < sd.nstloc(); i++ )
occ_kj_[i]=occ[c.jglobal(i)];
// number of states to be sent
nStatesKpi_ = sd.nstloc();
// occupation numbers of circulating states
for ( int i = 0; i < nStatesKpi_; i++ )
occ_ki_[i] = occ_kj_[i];
// copy local states into circulating states
const complex *p = c.cvalptr(0);
for ( int i = 0; i < nStatesKpi_ * c.mloc(); i++ )
state_kpi_[i]=p[i];
// initialize circulating derivatives
if (dwf)
{
for ( int i = 0; i < nStatesKpi_ * dc.mloc(); i++ )
force_kpi_[i]=0.0;
}
// initiate send nStatesKpi_ and receive nNextStatesKpi_
InitPermutation();
#if LOAD_MATRIX
// collect number of processed pairs in array load_matrix
vector load_matrix(gcontext_.npcol()*gcontext_.npcol(),0);
#endif
// Start rotation of circulating states
for ( int iRotationStep = 0; iRotationStep first_member_of_pair;
vector second_member_of_pair;
// flag indicating that a circulating state has been used
vector useState(nStatesKpi_,0);
// finish receiving occupations in occ_ki_[]
CompleteReceivingOccupations(iRotationStep);
// loop over circulating states
for ( int i = 0; i < nStatesKpi_; i++ )
{
// original column index of circulating state i
int iColI = gcontext_.mycol() - iRotationStep;
iColI = ( iColI < 0 ) ? iColI + gcontext_.npcol() : iColI;
// global index of circulating state i
int iGlobI = c.jglobal(iColI,i);
// loop over fixed states
for ( int j = 0; j < sd.nstloc(); j++ )
{
// check if there is something to compute for this pair
if ( occ_ki_[i]!=0.0 || occ_kj_[j]!=0.0 )
{
// global index of fixed state j
int iGlobJ = c.jglobal(j);
// determine the overlap between those two states
bool overlap_ij = ( !use_bisection_ ||
bisection_[ispin]->overlap(localization_,iGlobI,iGlobJ) );
// use the chess board condition to
// optimize the distribution of work on
// each process:
// - if i and j have different parity, use
// the condition i=j
int parity_i = iGlobI & 1;
int parity_j = iGlobJ & 1;
if ( parity_i == parity_j )
{
if ( iGlobI >= iGlobJ && overlap_ij )
{
first_member_of_pair.push_back( i );
second_member_of_pair.push_back( j );
nPair++;
// circulating state i is used
useState[i] = 1;
}
}
else
{
if ( iGlobI < iGlobJ && overlap_ij )
{
first_member_of_pair.push_back( i );
second_member_of_pair.push_back( j );
nPair++;
// circulating state i is used
useState[i] = 1;
}
}
}
}
}
// pair list is complete
#if LOAD_MATRIX
// collect nPair statistics if on row 0
if ( gcontext_.myrow() == 0 )
load_matrix[iRotationStep*gcontext_.npcol()+gcontext_.mycol()] = nPair;
#endif
// complete receiving states
// note: this does nothing if iRotationStep == 0
CompleteReceivingStates(iRotationStep);
// circulating states in state_kpi_[i+j*mloc] can now be used
// compute real space circulating states
if ( nPair > 0 )
{
int i = 0;
int j = 1;
// try to associate states 2 by 2
while ( i < nStatesKpi_ )
{
// seek the first next used state
while ( i < nStatesKpi_ && !useState[i] ) i++;
// seek the second next used state
j=i+1;
while ( j < nStatesKpi_ && !useState[j] ) j++;
// if i is a valid index
if ( i < nStatesKpi_ )
{
// if j is a valid index
if ( j < nStatesKpi_ )
{
// back transform the couple (i,j)
wft_->backward(&state_kpi_[i*c.mloc()],
&state_kpi_[j*c.mloc()], &tmp_[0]);
// copy the result in state[i] and state[j]
double *p = (double *)&tmp_[0];
#pragma omp parallel for
for ( int ir = 0; ir < np012loc_; ir++ )
{
statei_[i][ir]=p[2*ir];
statei_[j][ir]=p[2*ir+1];
}
}
// else, if there is only one state to transform
else
{
wft_->backward(&state_kpi_[i*c.mloc()], &(statei_[i])[0]);
}
}
// increment indices
i = j + 1;
}
}
// finish sending states in send_buf_states_
CompleteSendingStates(iRotationStep);
// send_buf_states_ can now be reused
// copy the states to be sent in the send buffer
for ( int i = 0; i < nStatesKpi_ * c.mloc(); i++ )
send_buf_states_[i] = state_kpi_[i];
if (dwf)
{
for ( int i = 0; i < nStatesKpi_; i++ )
for ( int j = 0; j < np012loc_; j++ )
dstatei_[i][j] = 0.0;
}
// nNextStatesKpi: number of states of next permutation step
SetNextPermutationStateNumber();
// start sending states in send_buf_states_
StartStatesPermutation(c.mloc());
// loop over pairs 2 by 2
if ( nPair > 0 )
{
double ex_sum_1, ex_sum_2;
double sigma_sum_1[6], sigma_sum_2[6];
int iPair;
for ( iPair=0; iPairforward(&rhor1_[0], &rhog1_[0], &rhog2_[0]);
// compute contributions to the exchange energy and forces on wfs
ex_sum_1 = 0.0;
ex_sum_2 = 0.0;
if ( compute_stress )
{
for ( int i = 0; i < 6; i++ )
{
sigma_sum_1[i] = 0.0;
sigma_sum_2[i] = 0.0;
}
}
for ( int ig = 0; ig < ngloc; ig++ )
{
// Add the values of |rho1(G)|^2/|G+q1|^2
// and |rho2(G)|^2/|G+q2|^2 to the exchange energy.
// note: g2i[G=0] == 0
// factor 2.0: real basis
const double tg2i = g2i[ig];
const double t1 = 2.0 * norm(rhog1_[ig]) * tg2i;
const double t2 = 2.0 * norm(rhog2_[ig]) * tg2i;
ex_sum_1 += t1;
ex_sum_2 += t2;
if (dwf)
{
// compute rhog1_[G]/|G+q1|^2 and rhog2_[G]/|G+q1|^2
rhog1_[ig] *= tg2i;
rhog2_[ig] *= tg2i;
}
if ( compute_stress )
{
const double tgx = g_x[ig];
const double tgy = g_y[ig];
const double tgz = g_z[ig];
// factor 2.0: derivative of 1/G^2
const double fac1 = 2.0 * t1 * tg2i;
sigma_sum_1[0] += fac1 * tgx * tgx;
sigma_sum_1[1] += fac1 * tgy * tgy;
sigma_sum_1[2] += fac1 * tgz * tgz;
sigma_sum_1[3] += fac1 * tgx * tgy;
sigma_sum_1[4] += fac1 * tgy * tgz;
sigma_sum_1[5] += fac1 * tgz * tgx;
const double fac2 = 2.0 * t2 * tg2i;
sigma_sum_2[0] += fac2 * tgx * tgx;
sigma_sum_2[1] += fac2 * tgy * tgy;
sigma_sum_2[2] += fac2 * tgz * tgz;
sigma_sum_2[3] += fac2 * tgx * tgy;
sigma_sum_2[4] += fac2 * tgy * tgz;
sigma_sum_2[5] += fac2 * tgz * tgx;
}
}
if (dwf)
{
// Backtransform rhog[G]/|q+G|^2
vft_->backward(&rhog1_[0], &rhog2_[0], &rhor1_[0]);
}
// accumulate contributions to the exchange energy
// first pair: (i1,j1)
const double fac1 = 0.5 * exfac * occ_ki_[i1] * occ_kj_[j1];
if ( ( i1==j1 ) && ( iRotationStep==0 ) )
{
exchange_sum += fac1 * ex_sum_1;
sigma_exhf_[0] += fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
sigma_exhf_[1] += fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
sigma_exhf_[2] += fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
sigma_exhf_[3] += fac1 * ( -sigma_sum_1[3] ) / omega;
sigma_exhf_[4] += fac1 * ( -sigma_sum_1[4] ) / omega;
sigma_exhf_[5] += fac1 * ( -sigma_sum_1[5] ) / omega;
if (dwf)
{
const double weight = exfac * occ_kj_[j1] * HFCoeff_;
double *dp = (double *) &dstatei_[i1][0];
double *pj = (double *) &statej_[j1][0];
double *pr = (double *) &rhor1_[0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
dp[ip] += pj[ip] * pr[ip] * weight;
}
}
else
{
exchange_sum += 2.0 * fac1 * ex_sum_1;
sigma_exhf_[0] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
sigma_exhf_[1] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
sigma_exhf_[2] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
sigma_exhf_[3] += 2.0 * fac1 * ( -sigma_sum_1[3] ) / omega;
sigma_exhf_[4] += 2.0 * fac1 * ( -sigma_sum_1[4] ) / omega;
sigma_exhf_[5] += 2.0 * fac1 * ( -sigma_sum_1[5] ) / omega;
if (dwf)
{
double weighti = exfac * occ_ki_[i1] * HFCoeff_;
double weightj = exfac * occ_kj_[j1] * HFCoeff_;
double *dpi = (double *) &dstatei_[i1][0];
double *dpj = (double *) &dstatej_[j1][0];
double *pi = (double *) &statei_[i1][0];
double *pj = (double *) &statej_[j1][0];
double *pr = (double *) &rhor1_[0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
dpi[ip] += pj[ip] * pr[ip] * weightj;
dpj[ip] += pi[ip] * pr[ip] * weighti;
}
}
}
// second pair
const double fac2 = 0.5 * exfac * occ_ki_[i2] * occ_kj_[j2];
if ( ( i2==j2 ) && ( iRotationStep==0 ) )
{
exchange_sum += fac2 * ex_sum_2;
sigma_exhf_[0] += fac2 * (ex_sum_2 - sigma_sum_2[0]) / omega;
sigma_exhf_[1] += fac2 * (ex_sum_2 - sigma_sum_2[1]) / omega;
sigma_exhf_[2] += fac2 * (ex_sum_2 - sigma_sum_2[2]) / omega;
sigma_exhf_[3] += fac2 * ( -sigma_sum_2[3] ) / omega;
sigma_exhf_[4] += fac2 * ( -sigma_sum_2[4] ) / omega;
sigma_exhf_[5] += fac2 * ( -sigma_sum_2[5] ) / omega;
if (dwf)
{
const double weight = exfac * occ_kj_[j2] * HFCoeff_;
double *dp = (double *) &dstatei_[i2][0];
double *pj = (double *) &statej_[j2][0];
double *pr = (double *) &rhor1_[0];
pr = pr + 1;
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
dp[ip] += pj[ip] * pr[ip] * weight;
}
}
else
{
exchange_sum += 2.0 * fac2 * ex_sum_2;
sigma_exhf_[0] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[0]) / omega;
sigma_exhf_[1] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[1]) / omega;
sigma_exhf_[2] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[2]) / omega;
sigma_exhf_[3] += 2.0 * fac2 * ( -sigma_sum_2[3] ) / omega;
sigma_exhf_[4] += 2.0 * fac2 * ( -sigma_sum_2[4] ) / omega;
sigma_exhf_[5] += 2.0 * fac2 * ( -sigma_sum_2[5] ) / omega;
if (dwf)
{
double weighti = exfac * occ_ki_[i2] * HFCoeff_;
double weightj = exfac * occ_kj_[j2] * HFCoeff_;
double *dpi = (double *) &dstatei_[i2][0];
double *dpj = (double *) &dstatej_[j2][0];
double *pi = (double *) &statei_[i2][0];
double *pj = (double *) &statej_[j2][0];
double *pr = (double *) &rhor1_[0];
pr = pr + 1;
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
dpi[ip] += pj[ip] * pr[ip] * weightj;
dpj[ip] += pi[ip] * pr[ip] * weighti;
}
}
}
} // iPair
// last pair if necessary
if ( iPair < first_member_of_pair.size() )
{
int i1 = first_member_of_pair[iPair];
int j1 = second_member_of_pair[iPair];
// rhor1_ = psi_i1 * psi_j1
double *p = (double *)&rhor1_[0];
double *pi1 = (double *)&statei_[i1][0];
double *pj1 = (double *)&statej_[j1][0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
p[ip] = pi1[ip] * pj1[ip];
p[ip+1] = 0.0;
}
vft_->forward(&rhor1_[0], &rhog1_[0]);
ex_sum_1 = 0.0;
if ( compute_stress )
{
for ( int i = 0; i < 6; i++ )
sigma_sum_1[i] = 0.0;
}
for ( int ig = 0; ig < ngloc; ig++ )
{
// Add the values of |rho1(G)|^2/|G|^2
// and |rho2(G)|^2/|G|^2 to the exchange energy.
// note: g2i[G=0] == 0
// factor 2.0: real basis
const double tg2i = g2i[ig];
const double t1 = 2.0 * norm(rhog1_[ig]) * tg2i;
ex_sum_1 += t1;
if (dwf)
{
rhog1_[ig] *= tg2i;
}
if ( compute_stress )
{
const double tgx = g_x[ig];
const double tgy = g_y[ig];
const double tgz = g_z[ig];
// factor 2.0: derivative of 1/G^2
const double fac = 2.0 * t1 * tg2i;
sigma_sum_1[0] += fac * tgx * tgx;
sigma_sum_1[1] += fac * tgy * tgy;
sigma_sum_1[2] += fac * tgz * tgz;
sigma_sum_1[3] += fac * tgx * tgy;
sigma_sum_1[4] += fac * tgy * tgz;
sigma_sum_1[5] += fac * tgz * tgx;
}
}
if (dwf)
{
// Backtransform rhog[G]/|q+G|^2
vft_->backward(&rhog1_[0], &rhor1_[0]);
}
const double fac1 = 0.5 * exfac * occ_ki_[i1] * occ_kj_[j1];
if ( ( i1==j1 ) && ( iRotationStep==0 ) )
{
exchange_sum += fac1 * ex_sum_1;
sigma_exhf_[0] += fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
sigma_exhf_[1] += fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
sigma_exhf_[2] += fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
sigma_exhf_[3] += fac1 * ( -sigma_sum_1[3] ) / omega;
sigma_exhf_[4] += fac1 * ( -sigma_sum_1[4] ) / omega;
sigma_exhf_[5] += fac1 * ( -sigma_sum_1[5] ) / omega;
if (dwf)
{
const double weight = exfac * occ_kj_[j1] * HFCoeff_;
double *dp = (double *) &dstatei_[i1][0];
double *pj = (double *) &statej_[j1][0];
double *pr = (double *) &rhor1_[0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
dp[ip] += pj[ip] * pr[ip] * weight;
}
}
else
{
exchange_sum += 2.0 * fac1 * ex_sum_1;
sigma_exhf_[0] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
sigma_exhf_[1] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
sigma_exhf_[2] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
sigma_exhf_[3] += 2.0 * fac1 * ( -sigma_sum_1[3] ) / omega;
sigma_exhf_[4] += 2.0 * fac1 * ( -sigma_sum_1[4] ) / omega;
sigma_exhf_[5] += 2.0 * fac1 * ( -sigma_sum_1[5] ) / omega;
if (dwf)
{
double weighti = exfac * occ_ki_[i1] * HFCoeff_;
double weightj = exfac * occ_kj_[j1] * HFCoeff_;
double *dpi = (double *) &dstatei_[i1][0];
double *dpj = (double *) &dstatej_[j1][0];
double *pi = (double *) &statei_[i1][0];
double *pj = (double *) &statej_[j1][0];
double *pr = (double *) &rhor1_[0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
dpi[ip] += pj[ip] * pr[ip] * weightj;
dpj[ip] += pi[ip] * pr[ip] * weighti;
}
}
}
} // if last pair
} // if nPair > 0
// End of loop over pairs
if (dwf)
{
// finish receiving forces in force_kpi_[]
CompleteReceivingForces(iRotationStep);
// finish sending forces in send_buf_forces_[]
CompleteSendingForces(iRotationStep);
// add locally computed contributions to circulated forces
{
// copy force_kpi to to send_buf_forces
for ( int i = 0; i < nStatesKpi_; i++ )
{
complex *ps = &send_buf_forces_[i*dc.mloc()];
complex *pf = &force_kpi_[i*dc.mloc()];
for ( int j = 0; j < dc.mloc(); j++)
ps[j] = pf[j];
}
// backtransform computed forces to G coordinates and
// add to send buffer
if ( nPair > 0 )
{
int i = 0;
int j = 1;
// try to associate states 2 by 2
while ( i < nStatesKpi_ )
{
// find the first next used state
while ( i < nStatesKpi_ && !useState[i] ) i++;
// find the second next used state
j = i+1;
while ( j < nStatesKpi_ && !useState[j] ) j++;
// if i is a valid index
if ( i < nStatesKpi_ )
{
// if j is a valid index
if ( j < nStatesKpi_ )
{
// store dstate[i] + i * dstate[j] in tmp_
double *p = (double *)&tmp_[0];
double *dpr = (double *)&dstatei_[i][0];
double *dpi = (double *)&dstatei_[j][0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
p[ip]=dpr[ip];
p[ip+1]=dpi[ip];
}
// transform the pair of states
wft_->forward( &(tmp_)[0], &buffer_forces_1_[0],
&buffer_forces_2_[0] );
// accumulate contributions in send buffer
complex *ps1=&send_buf_forces_[i*dc.mloc()];
complex *ps2=&send_buf_forces_[j*dc.mloc()];
for ( int k = 0; k < dc.mloc(); k++)
{
ps1[k] += buffer_forces_1_[k];
ps2[k] += buffer_forces_2_[k];
}
}
else
{
// there is only one force to transform
wft_->forward(&(dstatei_[i])[0], &buffer_forces_1_[0]);
// accumulate contributions in send buffer
complex *ps1=&send_buf_forces_[i*dc.mloc()];
for ( int k = 0; k < dc.mloc(); k++ )
ps1[k] += buffer_forces_1_[k];
}
i = j+1;
}
}
}
}
// end back transform and addition of locally computed
// forces to the send buffer
StartForcesPermutation(dc.mloc());
} // if dwf
CompleteSendingOccupations(iRotationStep);
StartOccupationsPermutation();
// set the new number of local states
nStatesKpi_ = nNextStatesKpi_;
} // iRotationStep
// end of rotation of the states of kpoint i from this point
#if LOAD_MATRIX
// collect load_matrix
gcontext_.isum('R', load_matrix.size(), 1, &load_matrix[0],
load_matrix.size());
if ( gcontext_.onpe0() )
{
cout << " ExchangeOperator: load_matrix" << endl;
const int nst = s_.wfc_.nst(ispin);
int spreadsum = 0;
for ( int irot = 0; irot < gcontext_.npcol(); irot++ )
{
int rowsum = 0;
int wmin = nst*nst;
int wmax = 0;
for ( int icol = 0; icol < gcontext_.npcol(); icol++ )
{
int w = load_matrix[irot*gcontext_.npcol()+icol];
cout << " " << setw(5) << w;
rowsum += w;
wmin = min(wmin,w);
wmax = max(wmax,w);
}
int spread = abs(wmin-wmax);
cout << " " << setw(5) << rowsum
<< " spread: " << spread << endl;
spreadsum += spread;
}
cout << endl;
// print sums of columns
int rowcolsum = 0;
for ( int icol = 0; icol < gcontext_.npcol(); icol++ )
{
int colsum = 0;
for ( int irot = 0; irot < gcontext_.npcol(); irot++ )
{
int w = load_matrix[irot*gcontext_.npcol()+icol];
colsum += w;
}
cout << " " << setw(5) << colsum;
rowcolsum += colsum;
}
cout << " " << setw(5) << rowcolsum
<< " spread: " << spreadsum << endl;
cout << " pair fraction: " << ((double) rowcolsum)/(0.5*nst*(nst+1))
<< endl;
}
#endif
// wait for all communications to be completed
// complete all permutations except forces
CompleteReceivingStates(1);
CompleteSendingStates(1);
CompleteReceivingOccupations(1);
CompleteSendingOccupations(1);
if (dwf)
{
// complete forces permutation
CompleteReceivingForces(1);
CompleteSendingForces(1);
}
FreePermutation();
// transform accumulated real-space forces to G space
// loop over pairs of states
for ( int i = 0; i < nStatesKpi_-1; i+=2 )
{
double *p = (double *)&tmp_[0];
double *dpr = (double *)&dstatej_[i][0];
double *dpi = (double *)&dstatej_[i+1][0];
#pragma omp parallel for
for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
{
p[ip]=dpr[ip];
p[ip+1]=dpi[ip];
}
// transform the pair of forces
wft_->forward(&(tmp_)[0], &buffer_forces_1_[0], &buffer_forces_2_[0]);
// accumulate contributions into dc
complex *p1=dc.valptr(i*dc.mloc());
complex *p2=dc.valptr((i+1)*dc.mloc());
complex *pf1=&force_kpi_[i*dc.mloc()];
complex *pf2=&force_kpi_[(i+1)*dc.mloc()];
for ( int j = 0; j < dc.mloc(); j++ )
{
p1[j] = buffer_forces_1_[j] + pf1[j];
p2[j] = buffer_forces_2_[j] + pf2[j];
}
}
// transform the last state if necessary
if ( nStatesKpi_ % 2 == 1 )
{
int i = nStatesKpi_ - 1;
// transform the force
wft_->forward(&(dstatej_[i])[0], &buffer_forces_1_[0]);
// accumulate contributions into dc
complex *p1=dc.valptr(i*dc.mloc());
complex *pf1=&force_kpi_[i*dc.mloc()];
for ( int j = 0; j < dc.mloc(); j++ )
p1[j] = buffer_forces_1_[j] + pf1[j];
}
// dc now contains the forces
// correct the energy of state i
for ( int i = 0; i < sd.nstloc(); i++ )
{
// divergence corrections
double div_corr = 0.0;
// SumExpG2 contribution
const double div_corr_1 = exfac * SumExpG2 * occ_ki_[i];
div_corr += div_corr_1;
const double e_div_corr_1 = -0.5 * div_corr_1 * occ_ki_[i];
exchange_sum += e_div_corr_1;
const double fac1 = 0.5 * exfac * occ_ki_[i] * occ_ki_[i];
sigma_exhf_[0] += ( e_div_corr_1 + fac1 * sigma_sumexp[0] ) / omega;
sigma_exhf_[1] += ( e_div_corr_1 + fac1 * sigma_sumexp[1] ) / omega;
sigma_exhf_[2] += ( e_div_corr_1 + fac1 * sigma_sumexp[2] ) / omega;
sigma_exhf_[3] += ( fac1 * sigma_sumexp[3] ) / omega;
sigma_exhf_[4] += ( fac1 * sigma_sumexp[4] ) / omega;
sigma_exhf_[5] += ( fac1 * sigma_sumexp[5] ) / omega;
// rcut*rcut divergence correction
if ( vbasis_->mype() == 0 )
{
const double div_corr_2 = - exfac * rcut_ * rcut_ * occ_ki_[i];
div_corr += div_corr_2;
const double e_div_corr_2 = -0.5 * div_corr_2 * occ_ki_[i];
exchange_sum += e_div_corr_2;
sigma_exhf_[0] += e_div_corr_2 / omega;
sigma_exhf_[1] += e_div_corr_2 / omega;
sigma_exhf_[2] += e_div_corr_2 / omega;
}
// analytical part
const double integ = 4.0 * M_PI * sqrt(M_PI) / ( 2.0 * rcut_ );
const double vbz = pow(2.0*M_PI,3.0) / omega;
if ( vbasis_->mype() == 0 )
{
const double div_corr_3 = - exfac * integ/vbz * occ_ki_[i];
div_corr += div_corr_3;
const double e_div_corr_3 = -0.5 * div_corr_3 * occ_ki_[i];
exchange_sum += e_div_corr_3;
// no contribution to stress
}
// contribution of divergence corrections to forces on wave functions
if (dwf)
{
// sum the partial contributions to the correction for state i
gcontext_.dsum('C', 1, 1, &div_corr, 1);
// add correction to the derivatives of state i
complex *ps=c.valptr(i*c.mloc());
complex *pf=dc.valptr(i*dc.mloc());
for ( int j = 0; j < dc.mloc(); j++ )
pf[j] -= ps[j] * div_corr * HFCoeff_;
}
} // for i
if ( use_bisection_ )
{
bisection_[ispin]->backward(*uc_[ispin], *dwf->sd(ispin,0));
}
} // for ispin
// sum all contributions to the exchange energy
gcontext_.dsum(1, 1, &exchange_sum, 1);
extot = exchange_sum;
extot *= HFCoeff_;
// accumulate stress tensor contributions
gcontext_.dsum(6,1,&sigma_exhf_[0],6);
// scale stress tensor with HF coefficient
sigma_exhf_ *= HFCoeff_;
tm.stop();
#ifdef DEBUG
if ( gcontext_.onpe0() )
{
cout << setprecision(3);
cout << " total exchange computation time: " << tm.real()
<< " s" << endl;
if ( compute_stress )
{
cout << " exchange stress (a.u.) " << endl;
cout.setf(ios::fixed,ios::floatfield);
cout.setf(ios::right,ios::adjustfield);
cout << setprecision(8);
cout << " \n"
<< " " << setw(12)
<< sigma_exhf_[0] << " \n"
<< " " << setw(12)
<< sigma_exhf_[1] << " \n"
<< " " << setw(12)
<< sigma_exhf_[2] << " \n"
<< " " << setw(12)
<< sigma_exhf_[3] << " \n"
<< " " << setw(12)
<< sigma_exhf_[4] << " \n"
<< " " << setw(12)
<< sigma_exhf_[5] << " \n"
<< " \n"
<< endl;
}
}
#endif
// return total exchange in Hartree, scaled by HF coefficient
return extot;
}
////////////////////////////////////////////////////////////////////////////////
// Communication functions
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::InitPermutation(void)
{
// determine to whom we send and from whom we receive
colSendTo_ = ( gcontext_.mycol() < gcontext_.npcol() - 1 ) ?
gcontext_.mycol() + 1 : 0;
colRecvFr_ = ( gcontext_.mycol() > 0 ) ?
gcontext_.mycol() - 1 : gcontext_.npcol() - 1;
iSendTo_ = gcontext_.pmap( vbasis_->mype(), colSendTo_);
iRecvFr_ = gcontext_.pmap( vbasis_->mype(), colRecvFr_);
// Get communicator for this context
comm_ = gcontext_.comm();
// Init communication for the number of states
MPI_Send_init((void *) &nStatesKpi_, 1, MPI_INT,
iSendTo_, Tag_NumberOfStates, comm_, &send_request_NumberOfStates_ );
MPI_Recv_init((void *) &nNextStatesKpi_, 1, MPI_INT,
iRecvFr_, Tag_NumberOfStates, comm_, &recv_request_NumberOfStates_ );
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::FreePermutation(void)
{
// free permanent communications
MPI_Request_free(&send_request_NumberOfStates_);
MPI_Request_free(&recv_request_NumberOfStates_);
}
////////////////////////////////////////////////////////////////////////////////
// Permutation of the state numbers
// this function sets nNextStatesKpi_
void ExchangeOperator::SetNextPermutationStateNumber(void)
{
// send the number of states to be send
MPI_Start(&send_request_NumberOfStates_);
MPI_Start(&recv_request_NumberOfStates_);
// wait for the number of states to receive to be transmitted
{
MPI_Status status_recv;
MPI_Status status_send;
MPI_Wait( &recv_request_NumberOfStates_, &status_recv);
MPI_Wait( &send_request_NumberOfStates_, &status_send);
}
}
////////////////////////////////////////////////////////////////////////////////
// send the states in send_buf_states to the next column
// recieve the states from the previous column in state_kpi_
void ExchangeOperator::StartStatesPermutation(int mloc)
{
// send the states
if ( nStatesKpi_>0 )
{
wait_send_states_=1;
MPI_Isend((void *) &send_buf_states_[0], 2*nStatesKpi_*mloc,
MPI_DOUBLE, iSendTo_, Tag_States, comm_, &send_request_States_ );
}
else
{
wait_send_states_=0;
}
// receive the states
if ( nNextStatesKpi_>0 )
{
wait_recv_states_=1;
MPI_Irecv((void *) &state_kpi_[0], 2*nNextStatesKpi_*mloc,
MPI_DOUBLE, iRecvFr_, Tag_States, comm_, &recv_request_States_ );
}
else
{
wait_recv_states_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteReceivingStates(int iRotationStep)
{
// do something only if iRotationStep>0
if ( iRotationStep != 0 && wait_recv_states_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &recv_request_States_, &status);
// init reception flag
wait_recv_states_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteSendingStates(int iRotationStep)
{
// do something only if iRotationStep>0
if ( iRotationStep != 0 && wait_send_states_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &send_request_States_, &status);
// init reception flag
wait_send_states_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
// StartForcesPermutation
// send the forces in send_buf_forces to the next column
// receive the forces from the previous column in force_kpi_
void ExchangeOperator::StartForcesPermutation(int mloc)
{
// send the forces
if ( nStatesKpi_>0 )
{
wait_send_forces_=1;
MPI_Isend((void *) &send_buf_forces_[0], 2*nStatesKpi_*mloc,
MPI_DOUBLE, iSendTo_, Tag_Forces, comm_, &send_request_Forces_ );
}
else
{
wait_send_forces_=0;
}
// receive the forces
if ( nNextStatesKpi_>0 )
{
wait_recv_forces_=1;
MPI_Irecv((void *) &force_kpi_[0], 2*nNextStatesKpi_*mloc,
MPI_DOUBLE, iRecvFr_, Tag_Forces, comm_, &recv_request_Forces_ );
}
else
{
wait_recv_forces_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteReceivingForces(int iRotationStep)
{
// do something only if iRotationStep>0
if ( iRotationStep != 0 && wait_recv_forces_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &recv_request_Forces_, &status);
// init reception flag
wait_recv_forces_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteSendingForces(int iRotationStep)
{
// do something only if iRotationStep>0
if ( iRotationStep != 0 && wait_send_forces_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &send_request_Forces_, &status);
// init reception flag
wait_send_forces_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
// StartOccupationsPermutation
// store then send the occupations in send_buf_occupation_ to the next column
// receive the occupations from the previous column in occ_ki_
void ExchangeOperator::StartOccupationsPermutation(void)
{
// store energies in the send buffer
for ( int i=0; i0 )
{
wait_send_occupations_=1;
MPI_Isend((void *) &send_buf_occupation_[0], nStatesKpi_,
MPI_DOUBLE, iSendTo_, Tag_Occupation, comm_, &send_request_Occupation_ );
}
else
{
wait_send_occupations_=0;
}
// receive the occupations
if ( nNextStatesKpi_>0 )
{
wait_recv_occupations_=1;
MPI_Irecv((void *) &occ_ki_[0], nNextStatesKpi_,
MPI_DOUBLE, iRecvFr_, Tag_Occupation, comm_, &recv_request_Occupation_ );
}
else
{
wait_recv_occupations_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteReceivingOccupations(int iRotationStep)
{
if ( iRotationStep != 0 && wait_recv_occupations_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &recv_request_Occupation_, &status);
// init reception flag
wait_recv_occupations_=0;
}
}
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::CompleteSendingOccupations(int iRotationStep)
{
if ( iRotationStep != 0 && wait_send_occupations_ )
{
// wait for the reception
MPI_Status status;
MPI_Wait( &send_request_Occupation_, &status);
// init reception flag
wait_send_occupations_=0;
}
}