ExchangeOperator.C 76.7 KB
Newer Older
Francois Gygi committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 The Regents of the University of California
//
// This file is part of Qbox
//
// Qbox is distributed under the terms of the GNU General Public License
// as published by the Free Software Foundation, either version 2 of
// the License, or (at your option) any later version.
// See the file COPYING in the root directory of this distribution
// or <http://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////////////////////
//
// ExchangeOperator.C
//
////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <fstream>
#include <iomanip>
22 23 24 25
#include <bitset>
#include <algorithm>
#include "VectorLess.h"

Francois Gygi committed
26
#include "ExchangeOperator.h"
27
#include "Bisection.h"
Francois Gygi committed
28 29 30 31 32 33 34 35 36 37 38

using namespace std;

#define Tag_NumberOfStates 1
#define Tag_Occupation 2
#define Tag_Exchange 3
#define Tag_Forces 4
#define Tag_States 5

////////////////////////////////////////////////////////////////////////////////
ExchangeOperator::ExchangeOperator( Sample& s, double HFCoeff)
39 40
: s_(s), wf0_(s.wf), dwf0_(s.wf), wfc_(s.wf), HFCoeff_(HFCoeff),
gcontext_(s.wf.sd(0,0)->context())
Francois Gygi committed
41 42 43 44
{
  eex_ = 0.0; // exchange energy
  rcut_ = 1.0;  // constant of support function for exchange integration

45 46
  sigma_exhf_.resize(6);

47 48
  // column communicator
  vcomm_ = s_.wf.sd(0,0)->basis().comm();
Francois Gygi committed
49 50 51 52 53 54 55

  // check if the only kpoint is the gamma point:
  gamma_only_ = ( s_.wf.nkp()==1 ) && ( s_.wf.sd(0,0)->basis().real() );

  if ( gamma_only_ )
  {
    // create a real basis for the pair densities
56
    vbasis_ = new Basis(vcomm_, D3vector(0.0,0.0,0.0));
Francois Gygi committed
57 58 59 60 61
  }
  else
  {
    // create a complex basis
    //!! should avoid the finite k trick to get a complex basis at gamma
62
    vbasis_ = new Basis(vcomm_, D3vector(0.00000001,0.00000001,0.00000001));
Francois Gygi committed
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
  }
  // the size of the basis for the pair density should be
  // twice the size of the wave function basis
  vbasis_->resize( s_.wf.cell(),s_.wf.refcell(),4.0*s_.wf.ecut());

  // set the size for the r grid to be a product of small primes
  np0v_ = vbasis_->np(0)+2;
  np1v_ = vbasis_->np(1)+2;
  np2v_ = vbasis_->np(2)+2;
  while (!vbasis_->factorizable(np0v_)) np0v_ += 2;
  while (!vbasis_->factorizable(np1v_)) np1v_ += 2;
  while (!vbasis_->factorizable(np2v_)) np2v_ += 2;

  if ( gamma_only_ )
  {
    // create Fourier transform object wavefunctions
    wft_ = new FourierTransform(  s_.wf.sd(0,0)->basis(),np0v_,np1v_,np2v_);
  }

  const int ngloc = vbasis_->localsize();
  // create Fourier transform object for densities
  vft_ = new FourierTransform(*vbasis_,np0v_,np1v_,np2v_);

  np012loc_ = vft_->np012loc();

  // allocate memory for densities in G space
  rhog1_.resize(ngloc);
  rhog2_.resize(ngloc);

  // allocate memory for densities in r space
  rhor1_.resize(np012loc_);
  rhor2_.resize(np012loc_);

  // if not only at gamma, allocate arrays q+G
  if ( !gamma_only_ )
  {
    // allocate memory for |q+G| and related quantities
    qpG21_.resize(ngloc);
    qpG22_.resize(ngloc);
    qpG2i1_.resize(ngloc);
    qpG2i2_.resize(ngloc);
  }

  // get both local and maximum amount of states on a proc
  {
    if ( s_.wf.nspin()==1 )
    {
      SlaterDet& sd = *(s_.wf.sd(0,0));
      nLocalStates_=sd.nstloc();
      MPI_Allreduce(&nLocalStates_,&nMaxLocalStates_,1,
        MPI_INT,MPI_MAX,gcontext_.comm());
    }
    else
    {
      SlaterDet& sd_up = *(s_.wf.sd(0,0));
      SlaterDet& sd_dn = *(s_.wf.sd(1,0));
      nLocalStates_= sd_up.nstloc() > sd_dn.nstloc() ?
                     sd_up.nstloc() : sd_dn.nstloc();
      MPI_Allreduce(&nLocalStates_,&nMaxLocalStates_,1,
        MPI_INT,MPI_MAX,gcontext_.comm());
    }
  }

  // allocate memory for exchange energies
  exchange_ki_.resize(nMaxLocalStates_);
  exchange_kj_.resize(nMaxLocalStates_);
  send_buf_exchange_.resize(nMaxLocalStates_);
  send_buf_occupation_.resize(nMaxLocalStates_);

  // allocate memory for exchange
  exchange_.resize(s_.wf.nkp());
  for ( int iKp=0; iKp<s_.wf.nkp(); iKp++ )
  {
    // allocate memory for exchange energies of states of this kpoint.
    exchange_[iKp].resize(nMaxLocalStates_);
  }
  // get maximum number of g coeff per states
  int mlocMax=0;
  for ( int iKpi = 0; iKpi < s_.wf.nkp(); iKpi++ )
  {
    SlaterDet& sdi = *(s_.wf.sd(0,iKpi));
    ComplexMatrix& ci = sdi.c();
    if (mlocMax<ci.mloc()) mlocMax=ci.mloc();
  }

  // allocate memory for the copy of states of kpoint iKpi
  {
    state_kpi_.resize( nMaxLocalStates_ * mlocMax );
    send_buf_states_.resize( nMaxLocalStates_ * mlocMax );
  }

  // allocate buffers ( different if only at gamma or not )
  if ( gamma_only_ )
  {
    buffer_forces_1_.resize( mlocMax );
    buffer_forces_2_.resize( mlocMax );
  }
  else
  {
    buffer_dstate_.resize( mlocMax );
  }

  // allocate memory for the r coordinate expression
  // of each state of kpi and kpj
  statei_.resize(nMaxLocalStates_);
  statej_.resize(nMaxLocalStates_);
  for ( int i = 0; i < nMaxLocalStates_; i++ )
  {
    statei_[i].resize(np012loc_);
    statej_[i].resize(np012loc_);
  }

175 176
  use_bisection_ = s.ctrl.btHF > 0.0;

Francois Gygi committed
177 178 179 180 181
  // if only at gamma
  if ( gamma_only_ )
  {
    tmp_.resize(np012loc_);
    // allocate bisection object
182 183 184 185 186 187 188 189 190 191 192
    if ( use_bisection_ )
    {
      bisection_.resize(s_.wf.nspin());
      uc_.resize(s_.wf.nspin());
      for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
      {
        bisection_[ispin] = new Bisection(*s_.wf.sd(ispin,0),s_.ctrl.blHF);
        const ComplexMatrix& c = s_.wf.sd(ispin,0)->c();
        uc_[ispin] = new DoubleMatrix(c.context(),c.n(),c.n(),c.nb(),c.nb());
      }
    }
Francois Gygi committed
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
  }

  // allocate memory for occupation numbers of kpoint iKpi
  occ_ki_.resize(nMaxLocalStates_);
  occ_kj_.resize(nMaxLocalStates_);

  // allocate memory for the real space expression of the forces
  dstatei_.resize(nMaxLocalStates_);
  dstatej_.resize(nMaxLocalStates_);
  for ( int i = 0; i < nMaxLocalStates_; i++ )
  {
    dstatei_[i].resize(np012loc_);
    dstatej_[i].resize(np012loc_);
  }

  // allocate memory for the copy of forces of kpoint iKpi
  {
    force_kpi_.resize( nMaxLocalStates_ * mlocMax );
    send_buf_forces_.resize( nMaxLocalStates_ * mlocMax );
  }
}

////////////////////////////////////////////////////////////////////////////////
ExchangeOperator::~ExchangeOperator()
{
  if ( ( s_.wf.nkp()==1 ) && ( s_.wf.sd(0,0)->basis().real() ) )
  {
    // delete Fourier transform objects on states and forces
    delete wft_;
  }
  // delete Fourier transform and basis for pair densities
  delete vft_;
  delete vbasis_;
226 227 228 229 230 231
  if ( use_bisection_ )
    for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
    {
      delete bisection_[ispin];
      delete uc_[ispin];
    }
Francois Gygi committed
232 233 234
}

////////////////////////////////////////////////////////////////////////////////
235
double ExchangeOperator::update_energy(bool compute_stress)
Francois Gygi committed
236 237
{
  if ( gamma_only_ )
238
    return eex_ = compute_exchange_at_gamma_(s_.wf, 0, compute_stress);
Francois Gygi committed
239
  else
240
    return eex_ = compute_exchange_for_general_case_(&s_, 0, compute_stress);
Francois Gygi committed
241 242 243
}

////////////////////////////////////////////////////////////////////////////////
244
double ExchangeOperator::update_operator(bool compute_stress)
Francois Gygi committed
245 246 247 248 249
{
  dwf0_.clear();

  // compute exchange energy and derivatives
  if ( gamma_only_ )
250
    eex_ = compute_exchange_at_gamma_(s_.wf, &dwf0_, compute_stress);
Francois Gygi committed
251
  else
252
    eex_ = compute_exchange_for_general_case_(&s_, &dwf0_, compute_stress);
Francois Gygi committed
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320

  // wf0_ is kept as a reference state
  wf0_ = s_.wf;

  // return exchange energy
  return eex_;
}

////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::apply_VXC_(double mix, Wavefunction& wf_ref,
  Wavefunction& dwf_ref, Wavefunction& dwf)
{
  // dwf += mix * ( |dwf_ref> <wf_ref|s_.wf> + |wf_ref><dwf_ref|s_.wf>
  // - |wf_ref><wf_ref|sigma_HF|wf_ref><wf_ref|s_.wf> )
  for ( int ispin = 0; ispin < s_.wf.nspin(); ispin++ )
  {
    const int nst = s_.wf.nst(ispin);
    for ( int ikp = 0; ikp < s_.wf.nkp(); ikp++ )
    {
      const Context &ctxt = s_.wf.sd(ispin,ikp)->c().context();
      if ( s_.wf.sd(ispin,ikp)->basis().real() )
      {
#if 0
        update_sigma();
        DoubleMatrix dc_proxy(dwf.sd(ispin,ikp)->c());
        DoubleMatrix dcref_proxy(dwf_ref.sd(ispin,ikp)->c());
        dc_proxy += dcref_proxy;
#else
        DoubleMatrix matproj1(ctxt,nst,nst);
        DoubleMatrix matproj2(ctxt,nst,nst);
        DoubleMatrix matenergy(ctxt,nst,nst);

        DoubleMatrix c_proxy(s_.wf.sd(ispin,ikp)->c());
        DoubleMatrix dc_proxy(dwf.sd(ispin,ikp)->c());
        DoubleMatrix cref_proxy(wf_ref.sd(ispin,ikp)->c());
        DoubleMatrix dcref_proxy(dwf_ref.sd(ispin,ikp)->c());

        // matproj1 = <wf_ref|wf> => matproj1
        matproj1.gemm('t','n',2.0,cref_proxy,c_proxy,0.0);
        matproj1.ger(-1.0,cref_proxy,0,c_proxy,0);

        // dwf += mix * |dwf_ref> * matproj1
        dc_proxy.gemm('n','n',mix,dcref_proxy,matproj1,1.0);

        // matenergy = <dpsi_ref|psi_ref>
        matenergy.gemm('t','n',2.0,dcref_proxy,cref_proxy,0.0);
        matenergy.ger(-1.0,dcref_proxy,0,cref_proxy,0);

        // matproj2 = - matenergy * matproj1
        matproj2.gemm('n','n',-1.0,matenergy,matproj1,0.0);

        // matproj2 += <dwf_ref|wf>
        matproj2.gemm('t','n',2.0,dcref_proxy,c_proxy,1.0);
        matproj2.ger(-1.0,dcref_proxy,0,c_proxy,0);

        // |dwf> += mix * |wf_ref> * matproj2
        dc_proxy.gemm('n','n',mix,cref_proxy,matproj2,1.0);
#endif
      }
      else // complex wave functions
      {
        ComplexMatrix matproj1(ctxt,nst,nst);
        ComplexMatrix matproj2(ctxt,nst,nst);
        ComplexMatrix matenergy(ctxt,nst,nst);

        ComplexMatrix &c(s_.wf.sd(ispin,ikp)->c());
        ComplexMatrix &dc(dwf.sd(ispin,ikp)->c());
        ComplexMatrix &cref(wf_ref.sd(ispin,ikp)->c());
Francois Gygi committed
321
        ComplexMatrix &dcref(dwf_ref.sd(ispin,ikp)->c());
Francois Gygi committed
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345

        // matproj1 = <wf_ref|wf>
        matproj1.gemm('c','n',1.0,cref,c,0.0);

        // dwf += mix * |dwf_ref> * matproj1
        dc.gemm('n','n',mix,dcref,matproj1,1.0);

        // matenergy = <dwf_ref|wf_ref>
        matenergy.gemm('c','n',1.0,dcref,cref,0.0);

        // matproj2 = - matenergy * matproj1
        matproj2.gemm('n','n',-1.0,matenergy,matproj1,0.0);

        // matproj2 += <dwf_ref|wf>
        matproj2.gemm('c','n',1.0,dcref,c,1.0);

        // |dpsi> += mix * |psi_ref> * matproj2
        dc.gemm('n','n',mix,cref,matproj2,1.0);
      }
    } // ikp
  } // ispin
}

////////////////////////////////////////////////////////////////////////////////
346
double ExchangeOperator::apply_operator(Wavefunction& dwf)
Francois Gygi committed
347 348 349 350 351 352 353
{
  // apply sigmaHF to s_.wf and store result in dwf
  // use the reference function wf0_ and reference sigma(wf) dwf0_
  apply_VXC_(1.0, wf0_, dwf0_, dwf);
  return eex_;
}

354 355 356 357 358 359 360
////////////////////////////////////////////////////////////////////////////////
void ExchangeOperator::add_stress(valarray<double>& sigma_exc)
{
  // add current value of the HF stress tensor to sigma_exc
  sigma_exc += sigma_exhf_;
}

Francois Gygi committed
361
////////////////////////////////////////////////////////////////////////////////
362 363 364 365 366 367
void ExchangeOperator::cell_moved(void)
{
  vbasis_->resize( s_.wf.cell(),s_.wf.refcell(),4.0*s_.wf.ecut());
}

////////////////////////////////////////////////////////////////////////////////
Francois Gygi committed
368 369 370 371 372
// Exchange functions
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
double ExchangeOperator::compute_exchange_for_general_case_( Sample* s,
373
  Wavefunction* dwf, bool compute_stress)
Francois Gygi committed
374
{
375 376 377
  if ( compute_stress )
  {
    cout << " stress at general k-point not implemented" << endl;
378
    gcontext_.abort(1);
379
  }
380 381 382 383

  Timer tm;
  tm.start();

Francois Gygi committed
384 385 386 387 388 389 390 391
  const Wavefunction& wf = s->wf;

  const double omega = wf.cell().volume();
  const int nkpoints = wf.nkp();
  const int nspin = wf.nspin();

  // determine the spin factor for the pair densities:
  // 0.5 if 1 spin, 1 if nspin==2
392 393
  const double spinFactor = 0.5 * nspin;
  const double exfac = - ( 4.0 * M_PI / omega ) * spinFactor;
Francois Gygi committed
394 395

  // initialize total exchange energy
396
  double exchange_sum = 0.0;
Francois Gygi committed
397 398
  double extot = 0.0;

399 400 401
  // initialize stress
  sigma_exhf_ = 0.0;

Francois Gygi committed
402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
  // loop on spins
  for ( int ispin=0; ispin < nspin; ispin++ )
  {
    // initialize the numerical correction
    // to the exchange integrals over kpoints j
    vector<double> numerical_correction(nkpoints);
    for ( int iKpi = 0; iKpi < nkpoints; iKpi++ )
      numerical_correction[iKpi]=0.0;

    // loop over the kpoints
    for ( int iKpi = 0; iKpi < nkpoints; iKpi++ )
    {
      SlaterDet& sdi = *(wf.sd(ispin,iKpi));
      ComplexMatrix& ci = sdi.c();
      FourierTransform* wfti_ = new FourierTransform(sdi.basis(),
        np0v_,np1v_,np2v_);
      SlaterDet& dsdi = *(dwf->sd(ispin,iKpi));
      ComplexMatrix& dci = dsdi.c();

      nStatesKpi_=sdi.nstloc();

423
      // occupation numbers for kpoint i
Francois Gygi committed
424 425 426 427
      const double* occ = sdi.occ_ptr();
      for ( int i = 0; i<nStatesKpi_; i++ )
        occ_ki_[i]=occ[ci.jglobal(i)];

428
      // copy of the local states at kpoint iKpi
Francois Gygi committed
429 430 431 432 433 434 435
      const complex<double> *p = ci.cvalptr(0);
      for ( int i = 0, bound = nStatesKpi_ * ci.mloc(); i < bound; i++ )
        state_kpi_[i]=p[i];

      if (dwf)
      {
        for ( int i = 0, bound = nStatesKpi_ * dci.mloc(); i < bound; i++ )
436
          force_kpi_[i] = 0.0;
Francois Gygi committed
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
      }

      // initialize communications for the permutations
      InitPermutation();

      //  Start rotation of the states of kpoint i from this point
      for ( int iRotationStep=0; iRotationStep<gcontext_.npcol();
            iRotationStep++ )
      {
        CompleteReceivingStates(iRotationStep);

        // compute the r coordinate expression of each state of kpi
        for ( int i = 0; i < nStatesKpi_; i++ )
        {
          wfti_->backward( &state_kpi_[i*ci.mloc()], &(statei_[i])[0] );
        }

        CompleteSendingStates(iRotationStep);

        for ( int i = 0, bound = nStatesKpi_ * ci.mloc(); i < bound; i++ )
          send_buf_states_[i]=state_kpi_[i];

        if (dwf)
        {
          for ( int i = 0; i < nStatesKpi_; i++ )
462 463
            for ( int j = 0; j < np012loc_; j++ )
              dstatei_[i][j]=0.0;
Francois Gygi committed
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
        }

        // set number of states of next permutation step
        // (contained in nNextStatesKpi_)
        SetNextPermutationStateNumber();

        // start states permutation
        StartStatesPermutation(ci.mloc());

        CompleteReceivingOccupations(iRotationStep);

        // second loop over kpoints
        for ( int iKpj = iKpi; iKpj < nkpoints; iKpj++ )
        {
          SlaterDet& sdj = *(wf.sd(ispin,iKpj));
          FourierTransform* wftj_ = new FourierTransform(sdj.basis(),
            np0v_,np1v_,np2v_);
          ComplexMatrix& cj = sdj.c();

          for ( int i = 0; i < sdj.nstloc(); i++ )
            wftj_->backward(cj.cvalptr(i*cj.mloc()),&(statej_[i])[0]);

          SlaterDet& dsdj = *(dwf->sd(ispin,iKpj));
          ComplexMatrix& dcj = dsdj.c();

          if (dwf)
          {
            for ( int i = 0; i < dsdj.nstloc(); i++ )
492 493
              for ( int j = 0; j < np012loc_; j++ )
                dstatej_[i][j]=0.0;
Francois Gygi committed
494 495 496 497 498 499 500 501 502
          }

          // compute the differences dk between kpoints i and j
          // and their symmetric equivalent
          // dk1 = kpi-kpj
          D3vector dk1 =   wf.kpoint(iKpi) - wf.kpoint(iKpj);

          // set dk1 in absolute reciprocal coordinates to get q1
          D3vector q1 = dk1.x*sdi.basis().cell().b(0)
503 504
                     +  dk1.y*sdi.basis().cell().b(1)
                     +  dk1.z*sdi.basis().cell().b(2);
Francois Gygi committed
505 506 507 508 509 510

          // dk2 = kpi+kpj
          D3vector dk2 =   wf.kpoint(iKpi) + wf.kpoint(iKpj);

          // set dk2 in absolute reciprocal coordinates to get q2
          D3vector q2 = dk2.x*sdi.basis().cell().b(0)
511 512
                     +  dk2.y*sdi.basis().cell().b(1)
                     +  dk2.z*sdi.basis().cell().b(2);
Francois Gygi committed
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533

          // compute values of |q1+G| and |q2+G|
          double SumExpQpG2 = 0.0;
          const int ngloc = vbasis_->localsize();
          for ( int ig = 0; ig < ngloc; ig++ )
          {
            D3vector G(vbasis_->gx(ig+ngloc*0),
                       vbasis_->gx(ig+ngloc*1),
                       vbasis_->gx(ig+ngloc*2));

            // compute G+q for each qi and find the value of the
            // correction term: sum_(G,q) exp(-rcut_^2*|G+q|^2)/|G+q|^2
            // => compute the square norm and inverse of q1+G
            qpG21_[ig]  = ( G + q1 ) * ( G + q1 );
            qpG2i1_[ig] = ( qpG21_[ig] > 0.0 ) ? 1.0 / qpG21_[ig] : 0.0;

            // => compute the square norm and inverse of q2+G
            qpG22_[ig]  = ( G + q2 ) * ( G + q2 );
            qpG2i2_[ig] = ( qpG22_[ig] > 0.0 ) ? 1.0 / qpG22_[ig] : 0.0;

            // if iKpi=0 (first k point)
534
            // compute the numerical part of the correction
535
            if ( iRotationStep==0 )
Francois Gygi committed
536
            {
537 538 539
              const double rc2 = rcut_*rcut_;
              SumExpQpG2 += (exp(-rc2*qpG21_[ig]) * qpG2i1_[ig] );
              SumExpQpG2 += (exp(-rc2*qpG22_[ig]) * qpG2i2_[ig] );
Francois Gygi committed
540 541 542 543 544 545 546 547
            }
          }

          // Add weighted contribution to numerical correction:
          // add the term sum_G exp(-a*|q+G|^2)/|q+G|^2 to the numerical
          // correction. Works only if this is the first iKpoint.
          //
          // divide weight by 2 as we implicitly counted kpoint j and symmetric
548
          if ( iRotationStep==0 ) // && ( iKpi==0 )
Francois Gygi committed
549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
          {
            if ( iKpi==iKpj )
            {
              numerical_correction[iKpi] += SumExpQpG2 * wf.weight(iKpj)/2.0;
            }
            else
            {
              numerical_correction[iKpi] += SumExpQpG2 * wf.weight(iKpj)/2.0;
              numerical_correction[iKpj] += SumExpQpG2 * wf.weight(iKpi)/2.0;
            }
          }

          // get occupation numbers for kpoint j
          const double* occ = sdj.occ_ptr();
          for ( int i = 0; i<sdj.nstloc(); i++ )
            occ_kj_[i]=occ[cj.jglobal(i)];

          // loop over the states at kpoint i
          for ( int i = 0; i < nStatesKpi_; i++ )
          {
            // loop over the states at kpoint j
            for ( int j = 0; j < sdj.nstloc(); j++ )
            {
              // check if something to compute for this pair
              if ( ( occ_ki_[i]!=0.0 && wf.weight(iKpi)!=0.0 )
                || ( occ_kj_[j]!=0.0 && wf.weight(iKpj)!=0.0 ) )
              {
                // compute the pair densities
                // rhor = statei_(r)' * statej_(r)
                for ( int ir = 0; ir < np012loc_; ir++ )
                {
                  rhor1_[ir] = conj( statej_[j][ir] ) * statei_[i][ir];
                  rhor2_[ir] =       statej_[j][ir]   * statei_[i][ir];
                }

                // Fourier transform the codensity to obtain rho(G).
                vft_->forward(&rhor1_[0], &rhog1_[0]);
                vft_->forward(&rhor2_[0], &rhog2_[0]);

                // initialize contrib of the states psi(kj,j) to the exchange
                // energy associated to state psi(ki,i)
590
                double ex_ki_i_kj_j = 0.0;
Francois Gygi committed
591

592
                for ( int ig = 0; ig < ngloc; ig++ )
Francois Gygi committed
593
                {
594 595 596 597 598 599 600 601 602 603
                  // Add the values of |rho1(G)|^2/|G+q1|^2
                  // and |rho2(G)|^2/|G+q2|^2 to the exchange energy.
                  // This does not take the point G=q=0 into account
                  // as qpG2i = 0.
                  const double t1 = norm(rhog1_[ig]) * qpG2i1_[ig];
                  const double t2 = norm(rhog2_[ig]) * qpG2i2_[ig];
                  ex_ki_i_kj_j += t1;
                  ex_ki_i_kj_j += t2;

                  if ( dwf )
Francois Gygi committed
604 605 606 607 608
                  {
                    // compute rhog1_[G]/|G+q1|^2 and rhog2_[G]/|G+q1|^2
                    rhog1_[ig] *= qpG2i1_[ig];
                    rhog2_[ig] *= qpG2i2_[ig];
                  }
609 610 611
                }
                if ( dwf )
                {
Francois Gygi committed
612 613 614 615 616
                  // Backtransform rhog[G]/|q+G|^2
                  vft_->backward(&rhog1_[0], &rhor1_[0]);
                  vft_->backward(&rhog2_[0], &rhor2_[0]);
                }

617
                // if iKpi=iKpj, add this contribution to the
Francois Gygi committed
618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634
                // exchange energy of state psi(ki,i)
                if ( iKpi==iKpj )
                {
                  // case iKpi=iKpj
                  // count only for psi(ki,i)
                  //
                  // compute the weights:
                  //
                  // => divide the weight of kpoint j by 2 as we implicitly
                  //    counted both kj and -kj in ex_ki_i_kj_j.
                  // => take in account the occupation of the state psi(kj,j)
                  //    (divide by 2 to remove spin factor if only one spin).
                  // => multiply by the constants of computation
                  double weight = - 4.0 * M_PI / omega * wf.weight(iKpj) /
                    2.0 * occ_kj_[j] * spinFactor;

                  // add contribution to exchange energy
635 636
                  exchange_sum += ex_ki_i_kj_j * weight * wf.weight(iKpi) *
                                  0.5 * occ_ki_[i];
Francois Gygi committed
637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674

                  if (dwf)
                  {
                    // acumulate weighted contributions
                    // Psi_j,kj(r) * TF( rhog[G]/|q+G|^2 ) and symmetric
                    // in dpsi_i. We take now into account the mixing coeff
                    weight *= HFCoeff_;
                    for ( int ir = 0; ir < np012loc_; ir++ )
                    {
                      dstatei_[i][ir] += ( statej_[j][ir] * rhor1_[ir] +
                        conj(statej_[j][ir] ) * rhor2_[ir]) * weight;
                    }
                  }
                }

                // if iKpi/=iKpj, add this contrib to the exch energy of both
                // states psi(ki,i) and psi(kj,j)
                // (as ex(ki,i,kj,j)=ex(kj,j,ki,i))
                // this way, we can avoid computing ex(ki,i,kj,j) for kj<ki.
                else
                {
                  // case iKpj>iKpi
                  // count for both psi(ki,i) and psi(kj,j)
                  //
                  // compute the weights:
                  //
                  // => divide the weight of kpoints j (resp. i) by 2 as we
                  //    implicitly counted both kj and -kj (resp ki and -ki)
                  //    in ex_ki_i_kj_j.
                  // => take in account the occupation of the state psi(kj,j)
                  //    (resp psi(ki,i)), and divide by 2 to remove spin factor
                  // => multiply by the constants of computation
                  double weighti = - 4.0 * M_PI / omega * wf.weight(iKpi) /
                    2.0 * occ_ki_[i] * spinFactor;
                  double weightj = - 4.0 * M_PI / omega * wf.weight(iKpj) /
                    2.0 * occ_kj_[j] * spinFactor;

                  // add contribution to exchange energy
675 676 677 678
                  exchange_sum += ex_ki_i_kj_j * weightj * wf.weight(iKpi) *
                                  0.5 * occ_ki_[i];
                  exchange_sum += ex_ki_i_kj_j * weighti * wf.weight(iKpj) *
                                  0.5 * occ_kj_[j];
Francois Gygi committed
679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709

                  if (dwf)
                  {
                    // acumulate weighted contributions in dpsi_j and dpsi_j.
                    // the correspondances between rho12 and rho 21 are given by
                    //
                    //                      /
                    // rho1_12(r) = | psi2*(r')*psi1(r')/|r-r'|dr'
                    //                      /
                    //
                    //                      /
                    // rho2_12(r) = | psi2(r')*psi1(r')/|r-r'|dr'
                    //                      /
                    //
                    // => rho1_21  = rho1_12*
                    // => rho2_21  = rho2_12
                    //
                    // We take also into account the mixing coefficient.
                    //
                    weighti *= HFCoeff_;
                    weightj *= HFCoeff_;
                    for ( int ir = 0; ir < np012loc_; ir++ )
                    {
                      dstatei_[i][ir] += ( statej_[j][ir] * rhor1_[ir] +
                        conj( statej_[j][ir] ) * rhor2_[ir] ) * weightj;
                      dstatej_[j][ir] += ( statei_[i][ir] * conj(rhor1_[ir]) +
                        conj( statei_[i][ir] )* rhor2_[ir] ) * weighti;
                    }
                  }
                }
              }
710 711
            } // for j
          } // for i
Francois Gygi committed
712 713 714 715 716 717 718

          if (dwf)
          {
            // add the g space contributions to each state derivative of kpj
            for ( int i = 0; i < dsdj.nstloc(); i++ )
            {
              // compute the g space state derivative contribution
719
              wftj_->forward(&(dstatej_[i])[0],&buffer_dstate_[0]);
Francois Gygi committed
720 721 722

              // add the g the result to the state derivative of kpj
              complex<double> *p=dcj.valptr(i*dcj.mloc());
723 724
              for ( int j=0; j<dcj.mloc(); j++ )
                p[j]+=buffer_dstate_[j];
Francois Gygi committed
725 726 727
            }
          }
          delete wftj_;
728
        } // for iKpj
Francois Gygi committed
729 730

        // End of loop over kpoints j
731
        // finish to rotate the columns
Francois Gygi committed
732 733 734 735 736 737 738 739

        if (dwf)
        {
          CompleteReceivingForces(iRotationStep);

          CompleteSendingForces(iRotationStep);

          // add the g coordinate expression contributions to each
740
          // state derivative of kpi, store everything in the send buffer
Francois Gygi committed
741 742 743 744 745 746 747 748
          for ( int i=0; i<nStatesKpi_; i++ )
          {
            // transform contribution to g coordinates
            wfti_->forward(&(dstatei_[i])[0], &buffer_dstate_[0]);

            // sum up contributions in send buffer
            complex<double> *p1=&force_kpi_[i*dci.mloc()];
            complex<double> *p2=&send_buf_forces_[i*dci.mloc()];
749 750
            for ( int j=0; j<dci.mloc(); j++ )
              p2[j] = p1[j] + buffer_dstate_[j];
Francois Gygi committed
751 752 753 754 755 756 757 758 759 760 761 762 763
          }

          // Start forces permutation
          StartForcesPermutation(dci.mloc());
        }

        CompleteSendingOccupations(iRotationStep);

        // start occupations permutation
        StartOccupationsPermutation();

        // set the new number of local states
        nStatesKpi_ = nNextStatesKpi_;
764
      } // end iRotationStep
Francois Gygi committed
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786

      //  end of rotation of the states of kpoint i from this point

      // wait for all communications to be completed
      {
        // complete all permutations except forces
        CompleteReceivingStates(1);
        CompleteSendingStates(1);
        CompleteReceivingOccupations(1);
        CompleteSendingOccupations(1);

        if (dwf)
        {
          // complete forces permutation
          CompleteReceivingForces(1);
          CompleteSendingForces(1);
        }

        // Terminate permutation
        FreePermutation();
      }

787
      // divergence corrections
788
      const double integ = 4.0 * M_PI * sqrt(M_PI) / ( 2.0 * rcut_ );
Francois Gygi committed
789 790 791 792
      const double vbz = pow(2.0*M_PI,3.0) / omega;

      for ( int i = 0; i < sdi.nstloc(); i++ )
      {
793
        double div_corr = 0.0;
Francois Gygi committed
794

Francois Gygi committed
795
        const double div_corr_1 = exfac * numerical_correction[iKpi] *
Francois Gygi committed
796
                                  occ_ki_[i];
797
        div_corr += div_corr_1;
798
        const double e_div_corr_1 = -div_corr_1;
799
        exchange_sum += e_div_corr_1 * wf.weight(iKpi);
800
        // add here contributions to stress from div_corr_1;
Francois Gygi committed
801

802
        // rcut*rcut divergence correction
803
        if ( vbasis_->mype() == 0 )
804 805
        {
          const double div_corr_2 = - exfac * rcut_ * rcut_ * occ_ki_[i] *
806
                                    wf.weight(iKpi);
807 808
          div_corr += div_corr_2;
          const double e_div_corr_2 = -0.5 * div_corr_2 * occ_ki_[i];
809
          exchange_sum += e_div_corr_2 * wf.weight(iKpi);
810 811
          // add here contributions of div_corr_2 to stress

812 813
          const double div_corr_3 = - exfac * integ/vbz * occ_ki_[i];

814 815
          div_corr += div_corr_3;
          const double e_div_corr_3 = -0.5 * div_corr_3 * occ_ki_[i];
816
          exchange_sum += e_div_corr_3 * wf.weight(iKpi);
817
          // no contribution to stress from div_corr_3
Francois Gygi committed
818
        }
819 820

        // contribution of divergence corrections to forces on wave functions
Francois Gygi committed
821 822
        if (dwf)
        {
823
          // sum the partial contributions to the correction for state i
824
          gcontext_.dsum('C', 1, 1, &div_corr, 1);
Francois Gygi committed
825

826
          // add correction to the derivatives of state i
Francois Gygi committed
827 828 829 830
          complex<double> *ps=ci.valptr(i*ci.mloc());
          complex<double> *pf1=dci.valptr(i*dci.mloc());
          complex<double> *pf2=&force_kpi_[i*dci.mloc()];
          for ( int j = 0; j < dci.mloc(); j++ )
831
            pf1[j] += pf2[j] - ps[j] * div_corr * HFCoeff_;
Francois Gygi committed
832
        }
833 834

      } // for i
Francois Gygi committed
835
      delete wfti_;
836 837
    } // for iKpi
  } // for ispin
Francois Gygi committed
838 839

  // reduce the total energy
840 841 842
  gcontext_.dsum(1, 1, &exchange_sum, 1);
  extot = exchange_sum;
  extot *= HFCoeff_;
Francois Gygi committed
843

844
  tm.stop();
845
#ifdef DEBUG
846
  if ( gcontext_.onpe0() )
Francois Gygi committed
847 848
  {
    cout << setprecision(10);
849 850
    cout << " total exchange computation time: " << tm.real()
         << " s" << endl;
Francois Gygi committed
851
  }
852
#endif
Francois Gygi committed
853

854
  return extot;
Francois Gygi committed
855 856 857
}

////////////////////////////////////////////////////////////////////////////////
858
double ExchangeOperator::compute_exchange_at_gamma_(const Wavefunction &wf,
859
  Wavefunction* dwf, bool compute_stress)
Francois Gygi committed
860 861 862 863
{
  Timer tm;
  Timer tmb;

864 865
  wfc_ = wf;

Francois Gygi committed
866
  cout << setprecision(10);
867 868
  const double omega = wfc_.cell().volume();
  const int nspin = wfc_.nspin();
Francois Gygi committed
869 870 871 872 873

  // spin factor for the pair densities: 0.5 if 1 spin, 1 if nspin==2
  const double spinFactor=0.5*nspin;

  // total exchange energy
874
  double exchange_sum = 0.0;
Francois Gygi committed
875 876
  double extot = 0.0;

877 878 879 880 881
  sigma_exhf_ = 0.0;
  const double *const g_x = vbasis_->gx_ptr(0);
  const double *const g_y = vbasis_->gx_ptr(1);
  const double *const g_z = vbasis_->gx_ptr(2);

882
  for ( int ispin = 0; ispin < wfc_.nspin(); ispin++ )
Francois Gygi committed
883
  {
884 885 886
    SlaterDet& sd = *(wfc_.sd(ispin,0));
    ComplexMatrix& c = sd.c();
    const int nst = sd.nst();
Francois Gygi committed
887

888 889
    // if using bisection, localize the wave functions
    if ( use_bisection_ )
Francois Gygi committed
890
    {
891
      tmb.start();
892 893 894 895 896 897 898 899 900 901 902 903
      int maxsweep = 50;
      if ( s_.ctrl.debug.find("BISECTION_MAXSWEEP") != string::npos )
      {
        // override tolerance for bisection
        istringstream is(s_.ctrl.debug);
        string s;
        is >> s >> maxsweep;
        if ( gcontext_.onpe0() )
          cout << " override bisection maxsweep value: maxsweep = "
               << maxsweep << endl;
        assert(maxsweep >= 0);
      }
904
      double tol = 1.0;
905 906 907 908 909 910 911 912 913 914
      if ( s_.ctrl.debug.find("BISECTION_TOL") != string::npos )
      {
        // override tolerance for bisection
        istringstream is(s_.ctrl.debug);
        string s;
        is >> s >> tol;
        if ( gcontext_.onpe0() )
          cout << " override bisection tol value: tol = " << tol << endl;
        assert(tol > 0.0);
      }
915 916 917 918
#if TIMING
      Timer tmbtransf;
      tmbtransf.start();
#endif
919
      bisection_[ispin]->compute_transform(*wfc_.sd(ispin,0),maxsweep,tol);
920 921 922 923 924
#if TIMING
      tmbtransf.stop();
      Timer tmbcomploc;
      tmbcomploc.start();
#endif
925
      bisection_[ispin]->compute_localization(s_.ctrl.btHF);
926 927 928
#if TIMING
      tmbcomploc.stop();
#endif
929 930 931
      // copy of localization vector from Bisection object
      localization_ = bisection_[ispin]->localization();

932 933 934 935
#if TIMING
      Timer tmbsize, tmbpair;
      tmbsize.start();
#endif
936
      if ( gcontext_.onpe0() )
Francois Gygi committed
937
      {
938 939
          cout << " ExchangeOperator: bisection size: ispin=" << ispin
               << ": " << bisection_[ispin]->total_size() << endl;
940 941 942 943 944 945 946
      }
#if TIMING
      tmbsize.stop();
      tmbpair.start();
#endif
      if ( gcontext_.onpe0() )
      {
947 948
          cout << " ExchangeOperator: pair fraction:  ispin=" << ispin
               << ": " << bisection_[ispin]->pair_fraction() << endl;
Francois Gygi committed
949
      }
950 951 952
#if TIMING
      tmbpair.stop();
#endif
953 954 955 956 957 958 959 960 961 962 963 964 965

      // copy the orthogonal transformation u to uc_[ispin]
      *uc_[ispin] = bisection_[ispin]->u();

      bool distribute = s_.ctrl.debug.find("BISECTION_NODIST") == string::npos;
      if ( distribute )
      {
        // define a permutation ordering states by increasing degree
        // permute states according to the order defined by the
        // localization vector

        // compute the degree of the vertices of the exchange graph
        // using the localization vector
966 967 968 969
#if TIMING
        Timer tmb_ov;
        tmb_ov.start();
#endif
970 971 972 973 974 975 976 977 978 979 980
        vector<int> degree(nst);
        for ( int i = 0; i < nst; i++ )
        {
          int count = 0;
          for ( int j = 0; j < nst; j++ )
          {
            if ( bisection_[ispin]->overlap(localization_,i,j) )
              count++;
          }
          degree[i] = count;
        }
981 982 983 984 985 986 987 988 989
#if TIMING
        tmb_ov.stop();
        if ( gcontext_.onpe0() )
        {
          cout << setprecision(3);
          cout << " ExchangeOperator: bisection overlap time: "
             << tmb_ov.real() << " s" << endl;
        }
#endif
990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055

        // permutation index
        vector<int> index(nst);
        for ( int j = 0; j < index.size(); j++ )
          index[j] = j;

        // Create function object for comparison of degree
        VectorLess<int> degree_less(degree);
        sort(index.begin(), index.end(), degree_less);
        // At this point degree[index[i]] <= degree[index[j]] if i < j
        for ( int i = 0; i < index.size()-1; i++ )
          assert(degree[index[i]] <= degree[index[i+1]]);

#if DEBUG
        if ( gcontext_.onpe0() )
        {
          cout << "degree order after sort:" << endl;
          for ( int j = 0; j < index.size(); j++ )
            cout << j << " -> " << index[j]
                 << "  " << degree[index[j]]
                 << endl;
        }
#endif

        // distribute the states to process columns in round robin fashion
        // Assume that the states are initially ordered by increasing degree
        // i.e. degree(index_[i]) < degree(index_[j]) if i < j

        const int nb = uc_[ispin]->nb();

        vector<int> distrib_index(nst);
        int ibase = 0;
        int icol = 0;
        for ( int i = 0; i < nst; i++ )
        {
          // check if next slot is beyond n
          if ( ibase + icol * nb >= nst )
          {
            // restart in column 0 with incremented ibase
            icol = 0;
            ibase++;
          }
          distrib_index[ibase + icol * nb] = i;
          icol++;
        }

        // combine index[i] and distrib_index[i]
        vector<int> itmp(index.size());
        for ( int i = 0; i < index.size(); i++ )
          itmp[i] = index[distrib_index[i]];
        index = itmp;

#if DEBUG
        if ( gcontext_.onpe0() )
        {
          cout << "index after round robin distrib:" << endl;
          for ( int j = 0; j < index.size(); j++ )
             cout << j << " -> " << index[j] << endl;
        }
#endif
        // apply the permutation defined by index to the localization vector
        vector<long int> loc_tmp(localization_.size());
        for ( int i = 0; i < index.size(); i++ )
          loc_tmp[i] = localization_[index[i]];
        localization_ = loc_tmp;

1056 1057 1058 1059 1060 1061
        // apply the permutation defined by index to the occupation vector
        vector<double> occ_tmp(nst);
        for ( int i = 0; i < index.size(); i++ )
          occ_tmp[i] = sd.occ(index[i]);
        sd.set_occ(occ_tmp);

1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
        // compute a pivot vector containing a sequence of transpositions
        // equivalent to the permutation defined by index[i]
        // compute the inverse of index
        vector<int> index_inv(index.size());
        for ( int i = 0; i < index.size(); i++ )
          index_inv[index[i]] = i;
        assert(index_inv.size()==index.size());

        vector<int> pivot;
        for ( int i = 0; i < index.size(); i++ )
        {
          int j = index_inv[i];

          int tmp = index[i];
          index[i] = index[j];
          index[j] = tmp;

          // update elements of index_inv
          index_inv[index[i]] = i;
          index_inv[index[j]] = j;

          pivot.push_back(j);
        }
        assert(pivot.size()==index.size());
#if DEBUG
        if ( gcontext_.onpe0() )
        {
          cout << "pivot:" << endl;
          for ( int j = 0; j < pivot.size(); j++ )
            cout << j << " -> " << pivot[j] << endl;
        }
#endif

        // create a local pivot vector on this process (size uc->nloc())
        // this vector must be replicated on all tasks of the
        // process grid columns
        const int nloc = uc_[ispin]->nloc();
        vector<int> locpivot(nloc);
        // fill the local pivot vector on all tasks
        // add 1 to index values for lapack fortran index convention
        for ( int j=0; j < nloc; j++ )
        {
          int jglobal = uc_[ispin]->jglobal(j);
          locpivot[j] = pivot[jglobal]+1;
        }
#if 0
        for ( int ipe = 0; ipe < uc.context().size(); ipe++ )
        {
          uc.context().barrier();
          if ( ipe == uc.context().mype() )
          {
            cout << "locpivot:" << endl;
            for ( int j = 0; j < locpivot.size(); j++ )
              cout << ipe << ": " << j << " -> " << locpivot[j] << endl;
          }
        }
#endif

#if 0
        if ( u_->context().size() == 1 )
        {
          // cout << " uc before perm: " << endl;
          // cout << uc;
          // local permutation
          assert(locpivot.size()==u_->n());
          double *p = uc.valptr(0);
          const int mloc = uc.mloc();
          for ( int i = locpivot.size()-1; i >= 0; i-- )
          {
            const int j = locpivot[i]-1;
            // swap columns i and j of u_->c()
            cout << " swap " << i << " " << j << endl;
            for ( int k = 0; k < mloc; k++ )
            {
              double tmp = p[i*mloc+k];
              p[i*mloc+k] = p[j*mloc+k];
              p[j*mloc+k] = tmp;
            }
          }
          // cout << " uc after perm: " << endl;
          // cout << uc;
        }
#endif

1146 1147 1148 1149
#if TIMING
        Timer tmblapiv;
        tmblapiv.start();
#endif
1150 1151
        // apply the permutation to the columns of uc
        uc_[ispin]->lapiv('B','C',&locpivot[0]);
1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
#if TIMING
        tmblapiv.stop();
        if ( gcontext_.onpe0() )
        {
          cout << setprecision(3);
          cout << " ExchangeOperator: bisection size time: "
             << tmbsize.real() << " s" << endl;
          cout << setprecision(3);
          cout << " ExchangeOperator: bisection pair time: "
             << tmbpair.real() << " s" << endl;
          cout << setprecision(3);
          cout << " ExchangeOperator: bisection lapiv time: "
             << tmblapiv.real() << " s" << endl;
        }
#endif
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218

#if DEBUG
        // recompute the degree of the vertices of the exchange graph
        for ( int i = 0; i < nst; i++ )
        {
          int count = 0;
          for ( int j = 0; j < nst; j++ )
          {
            if ( bisection_[ispin]->overlap(localization_,i,j) )
              count++;
          }
          degree[i] = count;
        }

        if ( gcontext_.onpe0() )
        {
          cout << "degree after permutation:" << endl;
          for ( int j = 0; j < degree.size(); j++ )
            cout << " deg[" << j << "] = " << degree[j] << endl;
        }

        // print localization vectors and overlaps after distribution
        if ( gcontext_.onpe0() )
        {
          int sum = 0;
          for ( int i = 0; i < nst; i++ )
          {
            int count = 0;
            for ( int j = 0; j < nst; j++ )
            {
              if ( bisection_[ispin]->overlap(localization_,i,j) )
                count++;
            }
            cout << "localization[" << i << "]: "
                 << localization_[i] << " "
                 << bitset<30>(localization_[i])
                 << "  overlaps: "
                 << count << endl;
            sum += count;
          }
          cout << " Bisection::localize: total overlaps: " << sum << " / "
               << nst*nst << " = " << ((double) sum)/(nst*nst) << endl;
        }
#endif

      }
      else
      {
        if ( gcontext_.onpe0() )
          cout << " ExchangeOperator: bisection distribution disabled" << endl;
      } // if distribute

1219 1220 1221 1222
#if TIMING
      Timer tmbfwd;
      tmbfwd.start();
#endif
1223
      bisection_[ispin]->forward(*uc_[ispin], *wfc_.sd(ispin,0));
1224 1225 1226
#if TIMING
      tmbfwd.stop();
#endif
1227 1228 1229 1230

      tmb.stop();
      if ( gcontext_.onpe0() )
      {
1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241
#if TIMING
        cout << setprecision(3);
        cout << " ExchangeOperator: bisection compute transform time: "
           << tmbtransf.real() << " s" << endl;
        cout << setprecision(3);
        cout << " ExchangeOperator: bisection compute localization time: "
           << tmbcomploc.real() << " s" << endl;
        cout << setprecision(3);
        cout << " ExchangeOperator: bisection forward time: "
           << tmbfwd.real() << " s" << endl;
#endif
1242 1243
        cout << setprecision(3);
        cout << " ExchangeOperator: bisection time: "
Francois Gygi committed
1244
           << tmb.real() << " s" << endl;
1245 1246
      }
    } // if use_bisection_
Francois Gygi committed
1247

1248
    tm.start();
Francois Gygi committed
1249

1250
    // compute exchange
Francois Gygi committed
1251 1252 1253 1254 1255 1256 1257 1258

    // real-space local states -> statej_[i][ir]
    // loop over states 2 by 2
    for ( int i = 0; i < sd.nstloc()-1; i+=2 )
    {
      wft_->backward(c.cvalptr(i*c.mloc()), c.cvalptr((i+1)*c.mloc()),
                     &tmp_[0]);
      double *p = (double *)&tmp_[0];
Francois Gygi committed
1259 1260
#pragma omp parallel for
      for ( int ir = 0; ir < np012loc_; ir++ )
Francois Gygi committed
1261
      {
Francois Gygi committed
1262 1263
        statej_[i][ir]=p[2*ir];
        statej_[i+1][ir]=p[2*ir+1];
Francois Gygi committed
1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284
      }
    }

    if ( sd.nstloc() % 2 == 1 )
    {
      int i = sd.nstloc()-1;
      wft_->backward(c.cvalptr(i*c.mloc()), &(statej_[i])[0]);
    }

    SlaterDet& dsd = *(dwf->sd(ispin,0));
    ComplexMatrix& dc = dsd.c();

    if (dwf)
    {
      // reset real space derivatives dstatej_[i][ir]
      for ( int i = 0; i < dsd.nstloc(); i++ )
        for ( int j = 0; j < np012loc_; j++ )
          dstatej_[i][j] = 0.0;
    }

    const int ngloc = vbasis_->localsize();
1285 1286 1287 1288
    // correction term: sum_(G) exp(-rcut_^2*|G|^2)/|G|^2
    const double exfac = - ( 4.0 * M_PI / omega ) * spinFactor;
    double SumExpG2 = 0.0;
    double sigma_sumexp[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
Francois Gygi committed
1289 1290
    const double *g2 = vbasis_->g2_ptr();
    const double *g2i = vbasis_->g2i_ptr();
1291
    const double rc2 = rcut_*rcut_;
Francois Gygi committed
1292 1293
    for ( int ig = 0; ig < ngloc; ig++ )
    {
1294 1295 1296 1297
      // factor 2.0: real basis
      const double tg2i = g2i[ig];
      double t = 2.0 * exp( - rc2 * g2[ig] ) * tg2i;
      SumExpG2 += t;
Francois Gygi committed
1298

1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
      if ( compute_stress )
      {
        const double tgx = g_x[ig];
        const double tgy = g_y[ig];
        const double tgz = g_z[ig];
        // factor 2.0: derivative of G^2
        const double fac = t * 2.0 * ( rc2 + tg2i );
        sigma_sumexp[0] += fac * tgx * tgx;
        sigma_sumexp[1] += fac * tgy * tgy;
        sigma_sumexp[2] += fac * tgz * tgz;
        sigma_sumexp[3] += fac * tgx * tgy;
        sigma_sumexp[4] += fac * tgy * tgz;
        sigma_sumexp[5] += fac * tgz * tgx;
      }
    }
Francois Gygi committed
1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381

    // local occupation numbers
    const double* occ = sd.occ_ptr();
    for ( int i = 0; i < sd.nstloc(); i++ )
      occ_kj_[i]=occ[c.jglobal(i)];

    // number of states to be sent
    nStatesKpi_ = sd.nstloc();

    // occupation numbers of circulating states
    for ( int i = 0; i < nStatesKpi_; i++ )
      occ_ki_[i] = occ_kj_[i];

    // copy local states into circulating states
    const complex<double> *p = c.cvalptr(0);
    for ( int i = 0; i < nStatesKpi_ * c.mloc(); i++ )
      state_kpi_[i]=p[i];

    // initialize circulating derivatives
    if (dwf)
    {
      for ( int i = 0; i < nStatesKpi_ * dc.mloc(); i++ )
        force_kpi_[i]=0.0;
    }

    // initiate send nStatesKpi_ and receive nNextStatesKpi_
    InitPermutation();

#if LOAD_MATRIX
    // collect number of processed pairs in array load_matrix
    vector<int> load_matrix(gcontext_.npcol()*gcontext_.npcol(),0);
#endif

    // Start rotation of circulating states
    for ( int iRotationStep = 0; iRotationStep<gcontext_.npcol();
          iRotationStep++ )
    {
      // generate a list of pairs of overlapping states
      int nPair = 0;
      vector<int> first_member_of_pair;
      vector<int> second_member_of_pair;

      // flag indicating that a circulating state has been used
      vector<int> useState(nStatesKpi_,0);

      // finish receiving occupations in occ_ki_[]
      CompleteReceivingOccupations(iRotationStep);

      // loop over circulating states
      for ( int i = 0; i < nStatesKpi_; i++ )
      {
        // original column index of circulating state i
        int iColI = gcontext_.mycol() - iRotationStep;
        iColI = ( iColI < 0 ) ? iColI + gcontext_.npcol() : iColI;

        // global index of circulating state i
        int iGlobI = c.jglobal(iColI,i);

        // loop over fixed states
        for ( int j = 0; j < sd.nstloc(); j++ )
        {
          // check if there is something to compute for this pair
          if ( occ_ki_[i]!=0.0 || occ_kj_[j]!=0.0 )
          {
            // global index of fixed state j
            int iGlobJ = c.jglobal(j);

            // determine the overlap between those two states
1382 1383
            bool overlap_ij = ( !use_bisection_ ||
              bisection_[ispin]->overlap(localization_,iGlobI,iGlobJ) );
Francois Gygi committed
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462

            // use the chess board condition to
            // optimize the distribution of work on
            // each process:
            // - if i and j have different parity, use
            // the condition i<j
            // - if i and j have same parity, use
            // the condition i>=j
            int parity_i = iGlobI & 1;
            int parity_j = iGlobJ & 1;

            if ( parity_i == parity_j )
            {
              if ( iGlobI >= iGlobJ && overlap_ij )
              {
                first_member_of_pair.push_back( i );
                second_member_of_pair.push_back( j );
                nPair++;

                // circulating state i is used
                useState[i] = 1;
              }
            }
            else
            {
              if ( iGlobI < iGlobJ && overlap_ij )
              {
                first_member_of_pair.push_back( i );
                second_member_of_pair.push_back( j );
                nPair++;

                // circulating state i is used
                useState[i] = 1;
              }
            }
          }
        }
      }
      // pair list is complete

#if LOAD_MATRIX
      // collect nPair statistics if on row 0
      if ( gcontext_.myrow() == 0 )
        load_matrix[iRotationStep*gcontext_.npcol()+gcontext_.mycol()] = nPair;
#endif

      // complete receiving states
      // note: this does nothing if iRotationStep == 0
      CompleteReceivingStates(iRotationStep);
      // circulating states in state_kpi_[i+j*mloc] can now be used

      // compute real space circulating states
      if ( nPair > 0 )
      {
        int i = 0;
        int j = 1;

        // try to associate states 2 by 2
        while ( i < nStatesKpi_ )
        {
          // seek the first next used state
          while ( i < nStatesKpi_ && !useState[i] ) i++;

          // seek the second next used state
          j=i+1;
          while ( j < nStatesKpi_ && !useState[j] ) j++;

          // if i is a valid index
          if ( i < nStatesKpi_ )
          {
            // if j is a valid index
            if ( j < nStatesKpi_ )
            {
              // back transform the couple (i,j)
              wft_->backward(&state_kpi_[i*c.mloc()],
                &state_kpi_[j*c.mloc()], &tmp_[0]);

              // copy the result in state[i] and state[j]
              double *p = (double *)&tmp_[0];
Francois Gygi committed
1463 1464
#pragma omp parallel for
              for ( int ir = 0; ir < np012loc_; ir++ )
Francois Gygi committed
1465
              {
Francois Gygi committed
1466 1467
                statei_[i][ir]=p[2*ir];
                statei_[j][ir]=p[2*ir+1];
Francois Gygi committed
1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505
              }
            }

            // else, if there is only one state to transform
            else
            {
              wft_->backward(&state_kpi_[i*c.mloc()], &(statei_[i])[0]);
            }
          }

          // increment indices
          i = j + 1;
        }
      }

      // finish sending states in send_buf_states_
      CompleteSendingStates(iRotationStep);
      // send_buf_states_ can now be reused

      // copy the states to be sent in the send buffer
      for ( int i = 0; i < nStatesKpi_ * c.mloc(); i++ )
        send_buf_states_[i] = state_kpi_[i];

      if (dwf)
      {
        for ( int i = 0; i < nStatesKpi_; i++ )
          for ( int j = 0; j < np012loc_; j++ )
            dstatei_[i][j] = 0.0;
      }

      // nNextStatesKpi: number of states of next permutation step
      SetNextPermutationStateNumber();
      // start sending states in send_buf_states_
      StartStatesPermutation(c.mloc());

      // loop over pairs 2 by 2
      if ( nPair > 0 )
      {
1506 1507
        double ex_sum_1, ex_sum_2;
        double sigma_sum_1[6], sigma_sum_2[6];
Francois Gygi committed
1508 1509 1510 1511 1512 1513 1514 1515 1516 1517
        int iPair;
        for ( iPair=0; iPair<first_member_of_pair.size()-1; iPair+=2 )
        {
          int i1 = first_member_of_pair[iPair];
          int j1 = second_member_of_pair[iPair];
          int i2 = first_member_of_pair[iPair+1];
          int j2 = second_member_of_pair[iPair+1];

          // compute the pair densities
          // rhor = conjg(statei_(r)) * statej_(r)
1518
          // note: gamma point, densities are real
Francois Gygi committed
1519 1520 1521 1522 1523 1524 1525 1526
          {
            // rhor1_ = psi_i1 * psi_j1 + i * psi_i2 * psi_j2
            double *p   = (double *)&rhor1_[0];
            double *pi1 = (double *)&statei_[i1][0];
            double *pi2 = (double *)&statei_[i2][0];
            double *pj1 = (double *)&statej_[j1][0];
            double *pj2 = (double *)&statej_[j2][0];

Francois Gygi committed
1527 1528
#pragma omp parallel for
            for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1529 1530 1531 1532 1533 1534 1535 1536 1537
            {
              p[ip]   = pi1[ip] * pj1[ip];
              p[ip+1] = pi2[ip] * pj2[ip];
            }
          }

          // Fourier transform the pair density
          vft_->forward(&rhor1_[0], &rhog1_[0], &rhog2_[0]);

1538 1539 1540 1541
          // compute contributions to the exchange energy and forces on wfs
          ex_sum_1 = 0.0;
          ex_sum_2 = 0.0;
          if ( compute_stress )
Francois Gygi committed
1542
          {
1543
            for ( int i = 0; i < 6; i++ )
Francois Gygi committed
1544
            {
1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560
              sigma_sum_1[i] = 0.0;
              sigma_sum_2[i] = 0.0;
            }
          }

          for ( int ig = 0; ig < ngloc; ig++ )
          {
            // Add the values of |rho1(G)|^2/|G+q1|^2
            // and |rho2(G)|^2/|G+q2|^2 to the exchange energy.
            // note: g2i[G=0] == 0
            // factor 2.0: real basis
            const double tg2i = g2i[ig];
            const double t1 = 2.0 * norm(rhog1_[ig]) * tg2i;
            const double t2 = 2.0 * norm(rhog2_[ig]) * tg2i;
            ex_sum_1 += t1;
            ex_sum_2 += t2;
Francois Gygi committed
1561

1562 1563
            if (dwf)
            {
Francois Gygi committed
1564
              // compute rhog1_[G]/|G+q1|^2 and rhog2_[G]/|G+q1|^2
1565 1566
              rhog1_[ig] *= tg2i;
              rhog2_[ig] *= tg2i;
Francois Gygi committed
1567 1568
            }

1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591
            if ( compute_stress )
            {
              const double tgx = g_x[ig];
              const double tgy = g_y[ig];
              const double tgz = g_z[ig];
              // factor 2.0: derivative of 1/G^2
              const double fac1 = 2.0 * t1 * tg2i;
              sigma_sum_1[0] += fac1 * tgx * tgx;
              sigma_sum_1[1] += fac1 * tgy * tgy;
              sigma_sum_1[2] += fac1 * tgz * tgz;
              sigma_sum_1[3] += fac1 * tgx * tgy;
              sigma_sum_1[4] += fac1 * tgy * tgz;
              sigma_sum_1[5] += fac1 * tgz * tgx;

              const double fac2 = 2.0 * t2 * tg2i;
              sigma_sum_2[0] += fac2 * tgx * tgx;
              sigma_sum_2[1] += fac2 * tgy * tgy;
              sigma_sum_2[2] += fac2 * tgz * tgz;
              sigma_sum_2[3] += fac2 * tgx * tgy;
              sigma_sum_2[4] += fac2 * tgy * tgz;
              sigma_sum_2[5] += fac2 * tgz * tgx;
            }
          }
Francois Gygi committed
1592

1593 1594
          if (dwf)
          {
Francois Gygi committed
1595 1596 1597 1598 1599
            // Backtransform rhog[G]/|q+G|^2
            vft_->backward(&rhog1_[0], &rhog2_[0], &rhor1_[0]);
          }

          // accumulate contributions to the exchange energy
1600 1601
          // first pair: (i1,j1)
          const double fac1 = 0.5 * exfac * occ_ki_[i1] * occ_kj_[j1];
Francois Gygi committed
1602 1603
          if ( ( i1==j1 ) && ( iRotationStep==0 ) )
          {
1604 1605 1606 1607 1608 1609 1610 1611
            exchange_sum += fac1 * ex_sum_1;

            sigma_exhf_[0] += fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
            sigma_exhf_[1] += fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
            sigma_exhf_[2] += fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
            sigma_exhf_[3] += fac1 * ( -sigma_sum_1[3] ) / omega;
            sigma_exhf_[4] += fac1 * ( -sigma_sum_1[4] ) / omega;
            sigma_exhf_[5] += fac1 * ( -sigma_sum_1[5] ) / omega;
Francois Gygi committed
1612 1613 1614

            if (dwf)
            {
1615
              const double weight = exfac * occ_kj_[j1] * HFCoeff_;
Francois Gygi committed
1616 1617 1618
              double *dp = (double *) &dstatei_[i1][0];
              double *pj = (double *) &statej_[j1][0];
              double *pr = (double *) &rhor1_[0];
Francois Gygi committed
1619 1620
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1621 1622 1623 1624 1625
                dp[ip] += pj[ip] * pr[ip] * weight;
            }
          }
          else
          {
1626 1627 1628 1629 1630 1631 1632 1633
            exchange_sum += 2.0 * fac1 * ex_sum_1;

            sigma_exhf_[0] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
            sigma_exhf_[1] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
            sigma_exhf_[2] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
            sigma_exhf_[3] += 2.0 * fac1 * ( -sigma_sum_1[3] ) / omega;
            sigma_exhf_[4] += 2.0 * fac1 * ( -sigma_sum_1[4] ) / omega;
            sigma_exhf_[5] += 2.0 * fac1 * ( -sigma_sum_1[5] ) / omega;
Francois Gygi committed
1634 1635 1636

            if (dwf)
            {
1637 1638
              double weighti = exfac * occ_ki_[i1] * HFCoeff_;
              double weightj = exfac * occ_kj_[j1] * HFCoeff_;
Francois Gygi committed
1639 1640 1641 1642 1643 1644
              double *dpi = (double *) &dstatei_[i1][0];
              double *dpj = (double *) &dstatej_[j1][0];
              double *pi = (double *) &statei_[i1][0];
              double *pj = (double *) &statej_[j1][0];
              double *pr = (double *) &rhor1_[0];

Francois Gygi committed
1645 1646
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1647 1648 1649 1650 1651 1652 1653 1654
              {
                dpi[ip] += pj[ip] * pr[ip] * weightj;
                dpj[ip] += pi[ip] * pr[ip] * weighti;
              }
            }
          }

          // second pair
1655
          const double fac2 = 0.5 * exfac * occ_ki_[i2] * occ_kj_[j2];
Francois Gygi committed
1656 1657
          if ( ( i2==j2 ) && ( iRotationStep==0 ) )
          {
1658 1659 1660 1661 1662 1663 1664 1665
            exchange_sum += fac2 * ex_sum_2;

            sigma_exhf_[0] += fac2 * (ex_sum_2 - sigma_sum_2[0]) / omega;
            sigma_exhf_[1] += fac2 * (ex_sum_2 - sigma_sum_2[1]) / omega;
            sigma_exhf_[2] += fac2 * (ex_sum_2 - sigma_sum_2[2]) / omega;
            sigma_exhf_[3] += fac2 * ( -sigma_sum_2[3] ) / omega;
            sigma_exhf_[4] += fac2 * ( -sigma_sum_2[4] ) / omega;
            sigma_exhf_[5] += fac2 * ( -sigma_sum_2[5] ) / omega;
Francois Gygi committed
1666 1667 1668

            if (dwf)
            {
1669
              const double weight = exfac * occ_kj_[j2] * HFCoeff_;
Francois Gygi committed
1670 1671 1672 1673 1674
              double *dp = (double *) &dstatei_[i2][0];
              double *pj = (double *) &statej_[j2][0];
              double *pr = (double *) &rhor1_[0];
              pr = pr + 1;

Francois Gygi committed
1675 1676
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1677 1678 1679 1680 1681
                dp[ip] += pj[ip] * pr[ip] * weight;
            }
          }
          else
          {
1682 1683 1684 1685 1686 1687 1688 1689
            exchange_sum += 2.0 * fac2 * ex_sum_2;

            sigma_exhf_[0] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[0]) / omega;
            sigma_exhf_[1] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[1]) / omega;
            sigma_exhf_[2] += 2.0 * fac2 * (ex_sum_2 - sigma_sum_2[2]) / omega;
            sigma_exhf_[3] += 2.0 * fac2 * ( -sigma_sum_2[3] ) / omega;
            sigma_exhf_[4] += 2.0 * fac2 * ( -sigma_sum_2[4] ) / omega;
            sigma_exhf_[5] += 2.0 * fac2 * ( -sigma_sum_2[5] ) / omega;
Francois Gygi committed
1690 1691 1692

            if (dwf)
            {
1693 1694
              double weighti = exfac * occ_ki_[i2] * HFCoeff_;
              double weightj = exfac * occ_kj_[j2] * HFCoeff_;
Francois Gygi committed
1695 1696 1697 1698 1699 1700 1701
              double *dpi = (double *) &dstatei_[i2][0];
              double *dpj = (double *) &dstatej_[j2][0];
              double *pi = (double *) &statei_[i2][0];
              double *pj = (double *) &statej_[j2][0];
              double *pr = (double *) &rhor1_[0];
              pr = pr + 1;

Francois Gygi committed
1702 1703
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721
              {
                dpi[ip] += pj[ip] * pr[ip] * weightj;
                dpj[ip] += pi[ip] * pr[ip] * weighti;
              }
            }
          }
        } // iPair

        // last pair if necessary
        if ( iPair < first_member_of_pair.size() )
        {
          int i1 = first_member_of_pair[iPair];
          int j1 = second_member_of_pair[iPair];

          // rhor1_ = psi_i1 * psi_j1
          double *p   = (double *)&rhor1_[0];
          double *pi1 = (double *)&statei_[i1][0];
          double *pj1 = (double *)&statej_[j1][0];
Francois Gygi committed
1722 1723
#pragma omp parallel for
          for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1724 1725 1726 1727 1728 1729 1730
          {
            p[ip]   = pi1[ip] * pj1[ip];
            p[ip+1] = 0.0;
          }

          vft_->forward(&rhor1_[0], &rhog1_[0]);

1731 1732
          ex_sum_1 = 0.0;
          if ( compute_stress )
Francois Gygi committed
1733
          {
1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745
            for ( int i = 0; i < 6; i++ )
              sigma_sum_1[i] = 0.0;
          }
          for ( int ig = 0; ig < ngloc; ig++ )
          {
            // Add the values of |rho1(G)|^2/|G|^2
            // and |rho2(G)|^2/|G|^2 to the exchange energy.
            // note: g2i[G=0] == 0
            // factor 2.0: real basis
            const double tg2i = g2i[ig];
            const double t1 = 2.0 * norm(rhog1_[ig]) * tg2i;
            ex_sum_1 += t1;
Francois Gygi committed
1746

1747 1748 1749
            if (dwf)
            {
              rhog1_[ig] *= tg2i;
Francois Gygi committed
1750 1751
            }

1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766
            if ( compute_stress )
            {
              const double tgx = g_x[ig];
              const double tgy = g_y[ig];
              const double tgz = g_z[ig];
              // factor 2.0: derivative of 1/G^2
              const double fac = 2.0 * t1 * tg2i;
              sigma_sum_1[0] += fac * tgx * tgx;
              sigma_sum_1[1] += fac * tgy * tgy;
              sigma_sum_1[2] += fac * tgz * tgz;
              sigma_sum_1[3] += fac * tgx * tgy;
              sigma_sum_1[4] += fac * tgy * tgz;
              sigma_sum_1[5] += fac * tgz * tgx;
            }
          }
Francois Gygi committed
1767

1768 1769
          if (dwf)
          {
Francois Gygi committed
1770 1771 1772 1773
            // Backtransform rhog[G]/|q+G|^2
            vft_->backward(&rhog1_[0], &rhor1_[0]);
          }

1774
          const double fac1 = 0.5 * exfac * occ_ki_[i1] * occ_kj_[j1];
Francois Gygi committed
1775 1776
          if ( ( i1==j1 ) && ( iRotationStep==0 ) )
          {
1777
            exchange_sum += fac1 * ex_sum_1;
Francois Gygi committed
1778

1779 1780 1781 1782 1783 1784
            sigma_exhf_[0] += fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
            sigma_exhf_[1] += fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
            sigma_exhf_[2] += fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
            sigma_exhf_[3] += fac1 * ( -sigma_sum_1[3] ) / omega;
            sigma_exhf_[4] += fac1 * ( -sigma_sum_1[4] ) / omega;
            sigma_exhf_[5] += fac1 * ( -sigma_sum_1[5] ) / omega;
Francois Gygi committed
1785 1786 1787

            if (dwf)
            {
1788
              const double weight = exfac * occ_kj_[j1] * HFCoeff_;
Francois Gygi committed
1789 1790 1791
              double *dp = (double *) &dstatei_[i1][0];
              double *pj = (double *) &statej_[j1][0];
              double *pr = (double *) &rhor1_[0];
Francois Gygi committed
1792 1793
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1794 1795 1796 1797 1798
                dp[ip] += pj[ip] * pr[ip] * weight;
            }
          }
          else
          {
1799
            exchange_sum += 2.0 * fac1 * ex_sum_1;
Francois Gygi committed
1800

1801 1802 1803 1804 1805 1806
            sigma_exhf_[0] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[0]) / omega;
            sigma_exhf_[1] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[1]) / omega;
            sigma_exhf_[2] += 2.0 * fac1 * (ex_sum_1 - sigma_sum_1[2]) / omega;
            sigma_exhf_[3] += 2.0 * fac1 * ( -sigma_sum_1[3] ) / omega;
            sigma_exhf_[4] += 2.0 * fac1 * ( -sigma_sum_1[4] ) / omega;
            sigma_exhf_[5] += 2.0 * fac1 * ( -sigma_sum_1[5] ) / omega;
Francois Gygi committed
1807 1808 1809

            if (dwf)
            {
1810 1811
              double weighti = exfac * occ_ki_[i1] * HFCoeff_;
              double weightj = exfac * occ_kj_[j1] * HFCoeff_;
Francois Gygi committed
1812 1813 1814 1815 1816 1817
              double *dpi = (double *) &dstatei_[i1][0];
              double *dpj = (double *) &dstatej_[j1][0];
              double *pi = (double *) &statei_[i1][0];
              double *pj = (double *) &statej_[j1][0];
              double *pr = (double *) &rhor1_[0];

Francois Gygi committed
1818 1819
#pragma omp parallel for
              for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873
              {
                dpi[ip] += pj[ip] * pr[ip] * weightj;
                dpj[ip] += pi[ip] * pr[ip] * weighti;
              }
            }
          }
        } // if last pair
      } // if nPair > 0
      // End of loop over pairs

      if (dwf)
      {
        // finish receiving forces in force_kpi_[]
        CompleteReceivingForces(iRotationStep);
        // finish sending forces in send_buf_forces_[]
        CompleteSendingForces(iRotationStep);

        // add locally computed contributions to circulated forces
        {
          // copy force_kpi to to send_buf_forces
          for ( int i = 0; i < nStatesKpi_; i++ )
          {
            complex<double> *ps = &send_buf_forces_[i*dc.mloc()];
            complex<double> *pf = &force_kpi_[i*dc.mloc()];
            for ( int j = 0; j < dc.mloc(); j++)
              ps[j] = pf[j];
          }
          // backtransform computed forces to G coordinates and
          // add to send buffer
          if ( nPair > 0 )
          {
            int i = 0;
            int j = 1;

            // try to associate states 2 by 2
            while ( i < nStatesKpi_ )
            {
              // find the first next used state
              while ( i < nStatesKpi_ && !useState[i] ) i++;

              // find the second next used state
              j = i+1;
              while ( j < nStatesKpi_ && !useState[j] ) j++;

              // if i is a valid index
              if ( i < nStatesKpi_ )
              {
                // if j is a valid index
                if ( j < nStatesKpi_ )
                {
                  // store dstate[i] + i * dstate[j] in tmp_
                  double *p = (double *)&tmp_[0];
                  double *dpr = (double *)&dstatei_[i][0];
                  double *dpi = (double *)&dstatei_[j][0];
Francois Gygi committed
1874 1875
#pragma omp parallel for
                  for ( int ip = 0; ip < 2*np012loc_; ip+=2 )
Francois Gygi committed
1876
                  {
Francois Gygi committed
1877 1878
                    p[ip]=dpr[ip];
                    p[ip+1]=dpi[ip];
Francois Gygi committed
1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928
                  }

                  // transform the pair of states
                  wft_->forward( &(tmp_)[0], &buffer_forces_1_[0],
                    &buffer_forces_2_[0] );

                  // accumulate contributions in send buffer
                  complex<double> *ps1=&send_buf_forces_[i*dc.mloc()];
                  complex<double> *ps2=&send_buf_forces_[j*dc.mloc()];
                  for ( int k = 0; k < dc.mloc(); k++)
                  {
                    ps1[k] += buffer_forces_1_[k];
                    ps2[k] += buffer_forces_2_[k];
                  }
                }
                else
                {
                  // there is only one force to transform
                  wft_->forward(&(dstatei_[i])[0], &buffer_forces_1_[0]);

                  // accumulate contributions in send buffer
                  complex<double> *ps1=&send_buf_forces_[i*dc.mloc()];
                  for ( int k = 0; k < dc.mloc(); k++ )
                    ps1[k] += buffer_forces_1_[k];
                }
                i = j+1;
              }
            }
          }
        }
        // end back transform and addition of locally computed
        // forces to the send buffer
        StartForcesPermutation(dc.mloc());
      } // if dwf

      CompleteSendingOccupations(iRotationStep);
      StartOccupationsPermutation();

      // set the new number of local states
      nStatesKpi_ = nNextStatesKpi_;
    } // iRotationStep
    // end of rotation of the states of kpoint i from this point

#if LOAD_MATRIX
    // collect load_matrix
    gcontext_.isum('R', load_matrix.size(), 1, &load_matrix[0],
                   load_matrix.size());
    if ( gcontext_.onpe0() )
    {
      cout << " ExchangeOperator: load_matrix" << endl;
1929
      const int nst = s_.wfc_.nst(ispin);
Francois Gygi committed
1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940