//////////////////////////////////////////////////////////////////////////////// // // Copyright (c) 2008 The Regents of the University of California // // This file is part of Qbox // // Qbox is distributed under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 2 of // the License, or (at your option) any later version. // See the file COPYING in the root directory of this distribution // or . // //////////////////////////////////////////////////////////////////////////////// // // NonLocalPotential.C // //////////////////////////////////////////////////////////////////////////////// #include "NonLocalPotential.h" #include "Species.h" #include "blas.h" #include using namespace std; #if USE_MASSV extern "C" void vsincos(double *x, double *y, double *z, int *n); #endif //////////////////////////////////////////////////////////////////////////////// NonLocalPotential::~NonLocalPotential(void) { #if TIMING for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ ) { double time = (*i).second.real(); double tmin = time; double tmax = time; ctxt_.dmin(1,1,&tmin,1); ctxt_.dmax(1,1,&tmax,1); if ( ctxt_.myproc()==0 ) { cout << "" << endl; } } #endif } //////////////////////////////////////////////////////////////////////////////// void NonLocalPotential::init(void) { const int ngwl = basis_.localsize(); nsp = atoms_.nsp(); nop.resize(nsp); lloc.resize(nsp); lproj.resize(nsp); na.resize(nsp); npr.resize(nsp); nprna.resize(nsp); wt.resize(nsp); twnl.resize(nsp); dtwnl.resize(nsp); nquad.resize(nsp); rquad.resize(nsp); wquad.resize(nsp); nspnl = 0; for ( int is = 0; is < nsp; is++ ) { Species *s = atoms_.species_list[is]; npr[is] = 0; nprna[is] = 0; if ( s->non_local() ) { nspnl++; na[is] = atoms_.na(is); nop[is] = s->nop(); lloc[is] = s->llocal(); nquad[is] = s->nquad(); // compute total number of projectors npr[is] // KB potentials have nlm projectors // semilocal potentials have nlm*nquad projectors if ( s->nquad() == 0 ) { npr[is] = s->nlm(); } else { npr[is] = s->nlm() * nquad[is]; } nprna[is] = npr[is] * na[is]; // l value for projector ipr lproj[is].resize(npr[is]); twnl[is].resize(npr[is]*ngwl); dtwnl[is].resize(npr[is]*6*ngwl); // quadrature abcissae and weights rquad[is].resize(nquad[is]); wquad[is].resize(nquad[is]); enum quadrature_rule_type { TRAPEZOID, MODIF_TRAPEZOID, TRAPEZOID_WITH_RIGHT_ENDPOINT, SIMPSON }; const quadrature_rule_type quad_rule = TRAPEZOID; //const quadrature_rule_type quad_rule = MODIF_TRAPEZOID; //const quadrature_rule_type quad_rule = TRAPEZOID_WITH_RIGHT_ENDPOINT; //const quadrature_rule_type quad_rule = SIMPSON; if ( quad_rule == TRAPEZOID ) { // trapezoidal rule with interior points only // (end points are zero) const double h = s->rquad() / (nquad[is]+1); for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { rquad[is][iquad] = (iquad+1) * h; wquad[is][iquad] = h; } //cout << " NonLocalPotential::init: trapezoidal rule (interior)" // << endl; } else if ( quad_rule == MODIF_TRAPEZOID ) { // use modified trapezoidal rule with interior points, and include // correction for first derivative at r=0 as // h^2/12 f'(0) where f'(0) is estimated with f(h)/h // i.e. add correction h/12) * f(h) // See Davis & Rabinowitz, p. 132 const double h = s->rquad() / (nquad[is]+1); for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { rquad[is][iquad] = (iquad+1) * h; wquad[is][iquad] = h; } wquad[is][0] += h / 12.0; //cout << " NonLocalPotential::init: modified trapezoidal rule" // << endl; } else if ( quad_rule == TRAPEZOID_WITH_RIGHT_ENDPOINT ) { const double h = s->rquad() / nquad[is]; for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { rquad[is][iquad] = (iquad+1) * h; wquad[is][iquad] = h; } wquad[is][nquad[is]-1] = 0.5 * h; //cout << " NonLocalPotential::init: trapezoidal rule, right endpoint" // << endl; } else if ( quad_rule == SIMPSON ) { // must have 2n+1 points assert(nquad[is]%2==1); const double h = s->rquad() / (nquad[is]-1); for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { rquad[is][iquad] = iquad * h; if ( ( iquad == 0 ) || ( iquad == nquad[is]-1 ) ) wquad[is][iquad] = h / 3.0; else if ( iquad % 2 == 0 ) wquad[is][iquad] = h * 2.0 / 3.0; else wquad[is][iquad] = h * 4.0 / 3.0; } //cout << " NonLocalPotential::init: Simpson rule" << endl; } else { assert(false); } // compute weights wt[is][ipr] wt[is].resize(npr[is]); // compute lproj[is][ipr] int ipr_base = 0; for ( int iop = 0; iop < nop[is]; iop++ ) { int l = s->l(iop); if ( l != lloc[is] ) { if ( nquad[is] == 0 ) { // Kleinman-Bylander form // wt[is][ipr] // index = ipr_base+m for ( int m = 0; m < 2*l+1; m++ ) { const int ipr = ipr_base + m; wt[is][ipr] = s->wsg(iop); lproj[is][ipr] = l; } ipr_base += 2*l+1; } else { for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { const double r = rquad[is][iquad]; double v,dv,vl,dvl; s->dvpsr(l,r,v,dv); s->dvpsr(lloc[is],r,vl,dvl); // wt[is][iquad+ipr*nquad] for ( int m = 0; m < 2*l+1; m++ ) { const int ipr = ipr_base + iquad + nquad[is] * m; wt[is][ipr] = ( v - vl ) * wquad[is][iquad]; lproj[is][ipr] = l; } } ipr_base += (2*l+1) * nquad[is]; } } } assert(ipr_base==npr[is]); } // if s->non_local() } } //////////////////////////////////////////////////////////////////////////////// void NonLocalPotential::update_twnl(void) { // update arrays twnl[is][ipr][ig], dtwnl[is][ipr][j][ig], // following a change of cell dimensions // It is assumed that basis_ has been updated // It is assumed that nsp, npr[is], nquad[is] did not change since init tmap["update_twnl"].start(); const int ngwl = basis_.localsize(); const double pi = M_PI; const double fpi = 4.0 * pi; const double s14pi = sqrt(1.0/fpi); const double s34pi = sqrt(3.0/fpi); const double s54pi = sqrt(5.0/fpi); const double s74pi = sqrt(7.0/fpi); const double s20pi = sqrt(20.0*pi); const double s20pi3 = sqrt(20.0*pi/3.0); const double s3 = sqrt(3.0); const double s32 = sqrt(1.5); const double s52 = sqrt(2.5); const double s15 = sqrt(15.0); const double *kpg = basis_.kpg_ptr(); const double *kpgi = basis_.kpgi_ptr(); const double *kpg_x = basis_.kpgx_ptr(0); const double *kpg_y = basis_.kpgx_ptr(1); const double *kpg_z = basis_.kpgx_ptr(2); // compute twnl and dtwnl for ( int is = 0; is < nsp; is++ ) { Species *s = atoms_.species_list[is]; int ilm = 0; for ( int iop = 0; iop < nop[is]; iop++ ) { int l = s->l(iop); if ( l != lloc[is] ) { if ( l == 0 ) { if ( nquad[is] == 0 ) { // Kleinman-Bylander // twnl[is][ipr][ig] const int ipr = ilm; // index = ig + ngwl*ipr, i.e. index = ig double *t0 = &twnl[is][ngwl*ipr]; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ), ipr = 0 // i.e. index = ig + ij * ngwl double *dt0_xx = &dtwnl[is][ngwl*(0+6*ipr)]; double *dt0_yy = &dtwnl[is][ngwl*(1+6*ipr)]; double *dt0_zz = &dtwnl[is][ngwl*(2+6*ipr)]; double *dt0_xy = &dtwnl[is][ngwl*(3+6*ipr)]; double *dt0_yz = &dtwnl[is][ngwl*(4+6*ipr)]; double *dt0_xz = &dtwnl[is][ngwl*(5+6*ipr)]; // Special case k=G=0 is ok since kpgi[0] = 0.0 at k=G=0 for ( int ig = 0; ig < ngwl; ig++ ) { double v,dv; s->dvnlg(iop,kpg[ig],v,dv); t0[ig] = s14pi * v; const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tmp = kpgi[ig] * s14pi * dv; dt0_xx[ig] = tmp * tgx * tgx; dt0_yy[ig] = tmp * tgy * tgy; dt0_zz[ig] = tmp * tgz * tgz; dt0_xy[ig] = tmp * tgx * tgy; dt0_yz[ig] = tmp * tgy * tgz; dt0_xz[ig] = tmp * tgx * tgz; } } else { // semi-local for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { // twnl[is][ipr][ig] // ipr = iquad + nquad[is]*ilm, where ilm=0 // = iquad // index = ig + ngwl*iquad double *t0 = &twnl[is][ngwl*iquad]; // dtwnl[is][ipr][j][ngwl] // index = ig + ngwl * ( ij + 6 * iquad) double *dt0_xx = &dtwnl[is][ngwl*(0+6*iquad)]; double *dt0_yy = &dtwnl[is][ngwl*(1+6*iquad)]; double *dt0_zz = &dtwnl[is][ngwl*(2+6*iquad)]; double *dt0_xy = &dtwnl[is][ngwl*(3+6*iquad)]; double *dt0_yz = &dtwnl[is][ngwl*(4+6*iquad)]; double *dt0_xz = &dtwnl[is][ngwl*(5+6*iquad)]; const double r = rquad[is][iquad]; // G=0 element must be treated separately if k=0 if ( basis_.real() && ctxt_.myrow() == 0) { // compute G=0 element separately to get // sin(Gr)/(Gr) limit -> 1 const int ig = 0; // this task holds the G=0 element // I(l=0) = 4 pi j_l(G r) r // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm // j_0(Gr) = sin(Gr) / (Gr) == 1.0 // Ylm = s14pi // next line: sin(Gr)/(Gr) replaced by 1 t0[ig] = fpi * s14pi * r; // dtwnl = fpi s14pi G_i G_j / G (r cos(Gr)/G -sin(Gr)/G^2) // dtwnl = 0.0; dt0_xx[ig] = 0.0; dt0_yy[ig] = 0.0; dt0_zz[ig] = 0.0; dt0_xy[ig] = 0.0; dt0_yz[ig] = 0.0; dt0_xz[ig] = 0.0; } else { // Use the normal procedure const int ig = 0; // I(l=0) = 4 pi j_l(G r) r // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm // j_0(Gr) * r = sin(Gr) / G // Ylm = s14pi const double arg = kpg[ig] * r; // Note: for G=0, gi[0] = 0 const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double ts = sin(arg); const double tc = cos(arg); t0[ig] = fpi * s14pi * ts * tgi; // dtwnl = fpi s14pi k+G_i k+G_j / k+G // (r cos(k+Gr)/k+G -sin(k+Gr)/k+G^2) const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi); dt0_xx[ig] = tmp * tgx * tgx; dt0_yy[ig] = tmp * tgy * tgy; dt0_zz[ig] = tmp * tgz * tgz; dt0_xy[ig] = tmp * tgx * tgy; dt0_yz[ig] = tmp * tgy * tgz; dt0_xz[ig] = tmp * tgx * tgz; } for ( int ig = 1; ig < ngwl; ig++ ) { // I(l=0) = 4 pi j_l(|k+G| r) r // twnl[is][ipr][l][ig] = 4 pi j_0(|k+G|r_i) r_i Ylm // j_0(|k+G|r) * r = sin(|k+G|r) / |k+G| // Ylm = s14pi const double arg = kpg[ig] * r; const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double ts = sin(arg); const double tc = cos(arg); t0[ig] = fpi * s14pi * ts * tgi; // dtwnl = fpi s14pi (k+G)_i (k+G)_j / // |k+G| (r cos(|k+G|r)/|k+G| -sin(|k+G|r)/|k+G|^2) const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi); dt0_xx[ig] = tmp * tgx * tgx; dt0_yy[ig] = tmp * tgy * tgy; dt0_zz[ig] = tmp * tgz * tgz; dt0_xy[ig] = tmp * tgx * tgy; dt0_yz[ig] = tmp * tgy * tgz; dt0_xz[ig] = tmp * tgx * tgz; } } // for iquad } ilm += 2*l+1; } else if ( l == 1 ) { if ( nquad[is] == 0 ) { // Kleinman-Bylander // twnl[is][ipr][ig] // ipr = ilm const int ipr1 = ilm; const int ipr2 = ilm+1; const int ipr3 = ilm+2; // index = ig + ngwl*ilm double *t1 = &twnl[is][ngwl*ipr1]; double *t2 = &twnl[is][ngwl*ipr2]; double *t3 = &twnl[is][ngwl*ipr3]; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ) double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)]; double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)]; double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)]; double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)]; double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)]; double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)]; double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)]; double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)]; double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)]; double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)]; double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)]; double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)]; double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)]; double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)]; double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)]; double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)]; double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)]; double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)]; for ( int ig = 0; ig < ngwl; ig++ ) { double v,dv; const double tg = kpg[ig]; s->dvnlg(iop,tg,v,dv); const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgx2 = tgx * tgx; const double tgy2 = tgy * tgy; const double tgz2 = tgz * tgz; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double y1 = s34pi * tgx * tgi; const double y2 = s34pi * tgy * tgi; const double y3 = s34pi * tgz * tgi; t1[ig] = y1 * v; t2[ig] = y2 * v; t3[ig] = y3 * v; const double fac1 = - y1 * ( v - tg * dv ) * tgi2; // m=x dt1_xx[ig] = fac1 * tgx2 + v * y1; dt1_yy[ig] = fac1 * tgy2; dt1_zz[ig] = fac1 * tgz2; dt1_xy[ig] = fac1 * tgx * tgy; dt1_yz[ig] = fac1 * tgy * tgz; dt1_xz[ig] = fac1 * tgx * tgz; const double fac2 = - y2 * ( v - tg * dv ) * tgi2; // m=y dt2_xx[ig] = fac2 * tgx2; dt2_yy[ig] = fac2 * tgy2 + v * y2; dt2_zz[ig] = fac2 * tgz2; dt2_xy[ig] = fac2 * tgx * tgy + v * y1; dt2_yz[ig] = fac2 * tgy * tgz; dt2_xz[ig] = fac2 * tgx * tgz; const double fac3 = - y3 * ( v - tg * dv ) * tgi2; // m=z dt3_xx[ig] = fac3 * tgx2; dt3_yy[ig] = fac3 * tgy2; dt3_zz[ig] = fac3 * tgz2 + v * y3; dt3_xy[ig] = fac3 * tgx * tgy; dt3_yz[ig] = fac3 * tgy * tgz + v * y2; dt3_xz[ig] = fac3 * tgx * tgz + v * y1; } } else { // semi-local for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { // twnl[is][ipr][ig] // index = ig + ngwl*(iquad+nquad[is]*ilm) const int ipr1 = iquad+nquad[is]*ilm; const int ipr2 = iquad+nquad[is]*(ilm+1); const int ipr3 = iquad+nquad[is]*(ilm+2); double *t1 = &twnl[is][ngwl*ipr1]; double *t2 = &twnl[is][ngwl*ipr2]; double *t3 = &twnl[is][ngwl*ipr3]; // dtwnl[is][ipr][j][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ) double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)]; double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)]; double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)]; double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)]; double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)]; double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)]; double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)]; double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)]; double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)]; double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)]; double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)]; double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)]; double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)]; double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)]; double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)]; double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)]; double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)]; double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)]; const double r = rquad[is][iquad]; for ( int ig = 0; ig < ngwl; ig++ ) { double v = 0.0, dv = 0.0; // j_1(Gr) = (1/(Gr))*(sin(Gr)/(Gr)-cos(Gr)) const double tg = kpg[ig]; const double z = tg * r; if ( z != 0.0 ) { const double zi = 1.0 / z; const double c = cos(z); const double s = sin(z); const double j1 = ( s * zi - c ) * zi; const double dj1 = ( 2.0 * z * c + ( z*z - 2.0 ) * s ) * zi*zi*zi; // v = 4 pi j1(Gr) r v = fpi * j1 * r; // dv = d/dG v = 4 pi dj1(Gr)/d(Gr) d(Gr)/dG r // = 4 pi dj1 r^2 dv = fpi * dj1 * r * r; } const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgx2 = tgx * tgx; const double tgy2 = tgy * tgy; const double tgz2 = tgz * tgz; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double y1 = s34pi * tgx * tgi; const double y2 = s34pi * tgy * tgi; const double y3 = s34pi * tgz * tgi; t1[ig] = y1 * v; t2[ig] = y2 * v; t3[ig] = y3 * v; const double fac1 = - y1 * ( v - tg * dv ) * tgi2; // m=x dt1_xx[ig] = fac1 * tgx2 + v * y1; dt1_yy[ig] = fac1 * tgy2; dt1_zz[ig] = fac1 * tgz2; dt1_xy[ig] = fac1 * tgx * tgy; dt1_yz[ig] = fac1 * tgy * tgz; dt1_xz[ig] = fac1 * tgx * tgz; const double fac2 = - y2 * ( v - tg * dv ) * tgi2; // m=y dt2_xx[ig] = fac2 * tgx2; dt2_yy[ig] = fac2 * tgy2 + v * y2; dt2_zz[ig] = fac2 * tgz2; dt2_xy[ig] = fac2 * tgx * tgy + v * y1; dt2_yz[ig] = fac2 * tgy * tgz; dt2_xz[ig] = fac2 * tgx * tgz; const double fac3 = - y3 * ( v - tg * dv ) * tgi2; // m=z dt3_xx[ig] = fac3 * tgx2; dt3_yy[ig] = fac3 * tgy2; dt3_zz[ig] = fac3 * tgz2 + v * y3; dt3_xy[ig] = fac3 * tgx * tgy; dt3_yz[ig] = fac3 * tgy * tgz + v * y2; dt3_xz[ig] = fac3 * tgx * tgz + v * y1; } // ig } } ilm += 2*l+1; } else if ( l == 2 ) { if ( nquad[is] == 0 ) { // Kleinman-Bylander const int ipr4 = ilm; const int ipr5 = ilm+1; const int ipr6 = ilm+2; const int ipr7 = ilm+3; const int ipr8 = ilm+4; double *t4 = &twnl[is][ngwl*ipr4]; double *t5 = &twnl[is][ngwl*ipr5]; double *t6 = &twnl[is][ngwl*ipr6]; double *t7 = &twnl[is][ngwl*ipr7]; double *t8 = &twnl[is][ngwl*ipr8]; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ) double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)]; double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)]; double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)]; double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)]; double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)]; double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)]; double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)]; double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)]; double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)]; double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)]; double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)]; double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)]; double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)]; double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)]; double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)]; double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)]; double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)]; double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)]; double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)]; double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)]; double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)]; double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)]; double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)]; double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)]; double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)]; double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)]; double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)]; double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)]; double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)]; double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)]; for ( int ig = 0; ig < ngwl; ig++ ) { double v,dv; const double tg = kpg[ig]; s->dvnlg(iop,tg,v,dv); const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgx2 = tgx * tgx; const double tgy2 = tgy * tgy; const double tgz2 = tgz * tgz; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double tgxx = tgx2 * tgi2; const double tgyy = tgy2 * tgi2; const double tgzz = tgz2 * tgi2; const double tgxy = tgx * tgy * tgi2; const double tgyz = tgy * tgz * tgi2; const double tgxz = tgx * tgz * tgi2; const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 ); const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy ); const double y6 = s54pi * s3 * tgxy; const double y7 = s54pi * s3 * tgyz; const double y8 = s54pi * s3 * tgxz; const double y1x = s34pi * tgx * tgi; const double y1y = s34pi * tgy * tgi; const double y1z = s34pi * tgz * tgi; const double dx_xx = y1x * tgxx - y1x; const double dx_yy = y1x * tgyy; const double dx_zz = y1x * tgzz; const double dx_xy = y1x * tgxy; const double dx_yz = y1x * tgyz; const double dx_xz = y1x * tgxz; const double dy_xx = y1y * tgxx; const double dy_yy = y1y * tgyy - y1y; const double dy_zz = y1y * tgzz; const double dy_xy = y1y * tgxy - y1x; const double dy_yz = y1y * tgyz; const double dy_xz = y1y * tgxz; const double dz_xx = y1z * tgxx; const double dz_yy = y1z * tgyy; const double dz_zz = y1z * tgzz - y1z; const double dz_xy = y1z * tgxy; const double dz_yz = y1z * tgyz - y1y; const double dz_xz = y1z * tgxz - y1x; t4[ig] = y4 * v; t5[ig] = y5 * v; t6[ig] = y6 * v; t7[ig] = y7 * v; t8[ig] = y8 * v; // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 ) dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx); dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy); dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz); dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy); dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz); dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz); // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2 dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx); dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy); dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz); dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy); dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz); dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz); // y6 = s54pi sqrt(3) x y / r^2 dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx); dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy); dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz); dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy); dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz); dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz); // y7 = s54pi sqrt(3) y z / r^2 dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx); dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy); dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz); dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy); dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz); dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz); // y8 = s54pi sqrt(3) z x / r^2 dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx); dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy); dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz); dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy); dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz); dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz); } } else { // semi-local for ( int iquad = 0; iquad < nquad[is]; iquad++ ) { // twnl[is][ipr][ig] // ipr = iquad+nquad[is]*ilm // index = ig + ngwl*ipr const int ipr4 = iquad+nquad[is]*ilm; const int ipr5 = iquad+nquad[is]*(ilm+1); const int ipr6 = iquad+nquad[is]*(ilm+2); const int ipr7 = iquad+nquad[is]*(ilm+3); const int ipr8 = iquad+nquad[is]*(ilm+4); double *t4 = &twnl[is][ngwl*ipr4]; double *t5 = &twnl[is][ngwl*ipr5]; double *t6 = &twnl[is][ngwl*ipr6]; double *t7 = &twnl[is][ngwl*ipr7]; double *t8 = &twnl[is][ngwl*ipr8]; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ) double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)]; double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)]; double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)]; double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)]; double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)]; double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)]; double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)]; double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)]; double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)]; double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)]; double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)]; double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)]; double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)]; double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)]; double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)]; double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)]; double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)]; double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)]; double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)]; double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)]; double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)]; double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)]; double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)]; double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)]; double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)]; double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)]; double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)]; double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)]; double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)]; double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)]; const double r = rquad[is][iquad]; for ( int ig = 0; ig < ngwl; ig++ ) { double v = 0.0, dv = 0.0; // j_2(z) = (3/z^3-1/z) sin(z) - 3/z^2 cos(z) // j_2(z) = (1/z)*((3/z^2-1)*sin(z) - (3/z) cos(z) ) const double tg = kpg[ig]; const double z = tg * r; if ( z != 0.0 ) { const double zi = 1.0 / z; const double c = cos(z); const double s = sin(z); const double j2 = ((3.0*zi*zi - 1.0) * s - 3.0*zi * c ) * zi; const double z2 = z * z; const double dj2 = ( (4.0 * z2 - 9.0) * s + z*(9.0-z2) * c ) / (z2*z2) ; // v = 4 pi j2(Gr) r v = fpi * j2 * r; // dv = d/dG v = 4 pi dj2(Gr)/d(Gr) d(Gr)/dG r // = 4 pi dj2 r^2 dv = fpi * dj2 * r * r; } const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgx2 = tgx * tgx; const double tgy2 = tgy * tgy; const double tgz2 = tgz * tgz; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double tgxx = tgx2 * tgi2; const double tgyy = tgy2 * tgi2; const double tgzz = tgz2 * tgi2; const double tgxy = tgx * tgy * tgi2; const double tgyz = tgy * tgz * tgi2; const double tgxz = tgx * tgz * tgi2; const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 ); const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy ); const double y6 = s54pi * s3 * tgxy; const double y7 = s54pi * s3 * tgyz; const double y8 = s54pi * s3 * tgxz; const double y1x = s34pi * tgx * tgi; const double y1y = s34pi * tgy * tgi; const double y1z = s34pi * tgz * tgi; const double dx_xx = y1x * tgxx - y1x; const double dx_yy = y1x * tgyy; const double dx_zz = y1x * tgzz; const double dx_xy = y1x * tgxy; const double dx_yz = y1x * tgyz; const double dx_xz = y1x * tgxz; const double dy_xx = y1y * tgxx; const double dy_yy = y1y * tgyy - y1y; const double dy_zz = y1y * tgzz; const double dy_xy = y1y * tgxy - y1x; const double dy_yz = y1y * tgyz; const double dy_xz = y1y * tgxz; const double dz_xx = y1z * tgxx; const double dz_yy = y1z * tgyy; const double dz_zz = y1z * tgzz - y1z; const double dz_xy = y1z * tgxy; const double dz_yz = y1z * tgyz - y1y; const double dz_xz = y1z * tgxz - y1x; t4[ig] = y4 * v; t5[ig] = y5 * v; t6[ig] = y6 * v; t7[ig] = y7 * v; t8[ig] = y8 * v; // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 ) dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx); dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy); dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz); dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy); dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz); dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz); // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2 dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx); dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy); dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz); dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy); dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz); dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz); // y6 = s54pi sqrt(3) x y / r^2 dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx); dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy); dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz); dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy); dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz); dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz); // y7 = s54pi sqrt(3) y z / r^2 dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx); dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy); dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz); dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy); dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz); dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz); // y8 = s54pi sqrt(3) z x / r^2 dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx); dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy); dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz); dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy); dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz); dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz); } // ig } // iquad } ilm += 2*l+1; } else if ( l == 3 ) { // only implemented for Kleiman-Bylander type assert(nquad[is] == 0); // Kleinman-Bylander const int ipr09 = ilm; const int ipr10 = ilm + 1; const int ipr11 = ilm + 2; const int ipr12 = ilm + 3; const int ipr13 = ilm + 4; const int ipr14 = ilm + 5; const int ipr15 = ilm + 6; double *t09 = &twnl[is][ngwl * ipr09]; double *t10 = &twnl[is][ngwl * ipr10]; double *t11 = &twnl[is][ngwl * ipr11]; double *t12 = &twnl[is][ngwl * ipr12]; double *t13 = &twnl[is][ngwl * ipr13]; double *t14 = &twnl[is][ngwl * ipr14]; double *t15 = &twnl[is][ngwl * ipr15]; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr ) double *dt09_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr09 )]; double *dt09_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr09 )]; double *dt09_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr09 )]; double *dt09_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr09 )]; double *dt09_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr09 )]; double *dt09_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr09 )]; double *dt10_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr10 )]; double *dt10_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr10 )]; double *dt10_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr10 )]; double *dt10_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr10 )]; double *dt10_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr10 )]; double *dt10_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr10 )]; double *dt11_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr11 )]; double *dt11_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr11 )]; double *dt11_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr11 )]; double *dt11_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr11 )]; double *dt11_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr11 )]; double *dt11_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr11 )]; double *dt12_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr12 )]; double *dt12_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr12 )]; double *dt12_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr12 )]; double *dt12_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr12 )]; double *dt12_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr12 )]; double *dt12_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr12 )]; double *dt13_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr13 )]; double *dt13_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr13 )]; double *dt13_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr13 )]; double *dt13_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr13 )]; double *dt13_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr13 )]; double *dt13_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr13 )]; double *dt14_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr14 )]; double *dt14_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr14 )]; double *dt14_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr14 )]; double *dt14_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr14 )]; double *dt14_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr14 )]; double *dt14_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr14 )]; double *dt15_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr15 )]; double *dt15_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr15 )]; double *dt15_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr15 )]; double *dt15_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr15 )]; double *dt15_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr15 )]; double *dt15_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr15 )]; for ( int ig = 0; ig < ngwl; ig++ ) { double v, dv; const double tg = kpg[ig]; s->dvnlg(iop,tg,v,dv); const double tgx = kpg_x[ig]; const double tgy = kpg_y[ig]; const double tgz = kpg_z[ig]; const double tgx2 = tgx * tgx; const double tgy2 = tgy * tgy; const double tgz2 = tgz * tgz; const double tgx3 = tgx * tgx2; const double tgy3 = tgy * tgy2; const double tgz3 = tgz * tgz2; const double tgi = kpgi[ig]; const double tgi2 = tgi * tgi; const double tgi3 = tgi * tgi2; const double tgxx = tgx2 * tgi2; const double tgyy = tgy2 * tgi2; const double tgzz = tgz2 * tgi2; const double tgxy = tgx * tgy * tgi2; const double tgyz = tgy * tgz * tgi2; const double tgxz = tgx * tgz * tgi2; const double tgxxx = tgx3 * tgi3; const double tgyyy = tgy3 * tgi3; const double tgzzz = tgz3 * tgi3; const double tgxyy = tgx * tgy2 * tgi3; const double tgxzz = tgx * tgz2 * tgi3; const double tgyxx = tgy * tgx2 * tgi3; const double tgyzz = tgy * tgz2 * tgi3; const double tgzxx = tgz * tgx2 * tgi3; const double tgzyy = tgz * tgy2 * tgi3; const double tgxyz = tgx * tgy * tgz * tgi3; const double factor = 0.5 * s74pi; const double y09 = factor * s52 * ( 3.0 * tgyxx - tgyyy ); const double y10 = factor * s15 * 2.0 * tgxyz; const double y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy ); const double y12 = factor * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) ); const double y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy ); const double y14 = factor * s15 * ( tgzxx - tgzyy ); const double y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy ); // derivative of x^3/r^3 w.r.t. x, y, and z const double dx_xxx = 3.0 * tgxx * ( 1.0 - tgxx ) * tgi; const double dy_xxx = -3.0 * tgxx * tgxy * tgi; const double dz_xxx = -3.0 * tgxx * tgxz * tgi; // derivative of y^3/r^3 w.r.t. x, y, and z const double dx_yyy = -3.0 * tgyy * tgxy * tgi; const double dy_yyy = 3.0 * tgyy * ( 1.0 - tgyy ) * tgi; const double dz_yyy = -3.0 * tgyy * tgyz * tgi; // derivative of z^3/r^3 w.r.t. x, y, and z const double dx_zzz = -3.0 * tgzz * tgxz * tgi; const double dy_zzz = -3.0 * tgzz * tgyz * tgi; const double dz_zzz = 3.0 * tgzz * ( 1.0 - tgzz ) * tgi; // derivative of xy^2/r^3 w.r.t. x, y, and z const double dx_xyy = tgyy * ( 1.0 - 3.0 * tgxx ) * tgi; const double dy_xyy = tgxy * ( 2.0 - 3.0 * tgyy ) * tgi; const double dz_xyy = -3.0 * tgyy * tgxz * tgi; // derivative of xz^2/r^3 w.r.t. x, y, and z const double dx_xzz = tgzz * ( 1.0 - 3.0 * tgxx ) * tgi; const double dy_xzz = -3.0 * tgzz * tgxy * tgi; const double dz_xzz = tgxz * ( 2.0 - 3.0 * tgzz ) * tgi; // derivative of yx^2/r^3 w.r.t. x, y, and z const double dx_yxx = tgxy * ( 2.0 - 3.0 * tgxx ) * tgi; const double dy_yxx = tgxx * ( 1.0 - 3.0 * tgyy ) * tgi; const double dz_yxx = -3.0 * tgxx * tgyz * tgi; // derivative of yz^2/r^3 w.r.t. x, y, and z const double dx_yzz = -3.0 * tgzz * tgxy * tgi; const double dy_yzz = tgzz * ( 1.0 - 3.0 * tgyy ) * tgi; const double dz_yzz = tgyz * ( 2.0 - 3.0 * tgzz ) * tgi; // derivative of zx^2/r^3 w.r.t. x, y, and z const double dx_zxx = tgxz * ( 2.0 - 3.0 * tgxx ) * tgi; const double dy_zxx = -3.0 * tgxx * tgyz * tgi; const double dz_zxx = tgxx * ( 1.0 - 3.0 * tgzz ) * tgi; // derivative of zy^2/r^3 w.r.t. x, y, and z const double dx_zyy = -3.0 * tgyy * tgxz * tgi; const double dy_zyy = tgyz * ( 2.0 - 3.0 * tgyy ) * tgi; const double dz_zyy = tgyy * ( 1.0 - 3.0 * tgzz ) * tgi; // derivative of xyz/r^3 w.r.t. x, y, and z const double dx_xyz = tgyz * ( 1 - 3.0 * tgxx ) * tgi; const double dy_xyz = tgxz * ( 1 - 3.0 * tgyy ) * tgi; const double dz_xyz = tgxy * ( 1 - 3.0 * tgzz ) * tgi; // derivatives of spherical harmonics // y9 = factor * s52 * ( 3.0 * tgyxx - tgyyy ); const double dx_y09 = factor * s52 * ( 3.0 * dx_yxx - dx_yyy ); const double dy_y09 = factor * s52 * ( 3.0 * dy_yxx - dy_yyy ); const double dz_y09 = factor * s52 * ( 3.0 * dz_yxx - dz_yyy ); // y10 = factor * s15 * 2.0 * tgxyz; const double dx_y10 = factor * s15 * 2.0 * dx_xyz; const double dy_y10 = factor * s15 * 2.0 * dy_xyz; const double dz_y10 = factor * s15 * 2.0 * dz_xyz; // y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy ); const double dx_y11 = factor * s32 * ( 4.0 * dx_yzz - dx_yxx - dx_yyy ); const double dy_y11 = factor * s32 * ( 4.0 * dy_yzz - dy_yxx - dy_yyy ); const double dz_y11 = factor * s32 * ( 4.0 * dz_yzz - dz_yxx - dz_yyy ); // y12 = factor * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) ); const double dx_y12 = factor * ( 2.0 * dx_zzz - 3.0 * ( dx_zxx + dx_zyy ) ); const double dy_y12 = factor * ( 2.0 * dy_zzz - 3.0 * ( dy_zxx + dy_zyy ) ); const double dz_y12 = factor * ( 2.0 * dz_zzz - 3.0 * ( dz_zxx + dz_zyy ) ); // y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy ); const double dx_y13 = factor * s32 * ( 4.0 * dx_xzz - dx_xxx - dx_xyy ); const double dy_y13 = factor * s32 * ( 4.0 * dy_xzz - dy_xxx - dy_xyy ); const double dz_y13 = factor * s32 * ( 4.0 * dz_xzz - dz_xxx - dz_xyy ); // y14 = factor * s15 * ( tgzxx - tgzyy ); const double dx_y14 = factor * s15 * ( dx_zxx - dx_zyy ); const double dy_y14 = factor * s15 * ( dy_zxx - dy_zyy ); const double dz_y14 = factor * s15 * ( dz_zxx - dz_zyy ); // y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy ); const double dx_y15 = factor * s52 * ( dx_xxx - 3.0 * dx_xyy ); const double dy_y15 = factor * s52 * ( dy_xxx - 3.0 * dy_xyy ); const double dz_y15 = factor * s52 * ( dz_xxx - 3.0 * dz_xyy ); t09[ig] = y09 * v; t10[ig] = y10 * v; t11[ig] = y11 * v; t12[ig] = y12 * v; t13[ig] = y13 * v; t14[ig] = y14 * v; t15[ig] = y15 * v; // contribution to stress tensor // sigma_ij = 0.5 * (xi * dj + xj * di) dt09_xx[ig] = -( v * dx_y09 * tgx - y09 * dv * tg * tgxx ); dt09_yy[ig] = -( v * dy_y09 * tgy - y09 * dv * tg * tgyy ); dt09_zz[ig] = -( v * dz_y09 * tgz - y09 * dv * tg * tgzz ); dt09_xy[ig] = -( 0.5 * v * ( dx_y09 * tgy + dy_y09 * tgx ) - y09 * dv * tg * tgxy ); dt09_yz[ig] = -( 0.5 * v * ( dy_y09 * tgz + dz_y09 * tgy ) - y09 * dv * tg * tgyz ); dt09_xz[ig] = -( 0.5 * v * ( dx_y09 * tgz + dz_y09 * tgx ) - y09 * dv * tg * tgxz ); dt10_xx[ig] = -( v * dx_y10 * tgx - y10 * dv * tg * tgxx ); dt10_yy[ig] = -( v * dy_y10 * tgy - y10 * dv * tg * tgyy ); dt10_zz[ig] = -( v * dz_y10 * tgz - y10 * dv * tg * tgzz ); dt10_xy[ig] = -( 0.5 * v * ( dx_y10 * tgy + dy_y10 * tgx ) - y10 * dv * tg * tgxy ); dt10_yz[ig] = -( 0.5 * v * ( dy_y10 * tgz + dz_y10 * tgy ) - y10 * dv * tg * tgyz ); dt10_xz[ig] = -( 0.5 * v * ( dx_y10 * tgz + dz_y10 * tgx ) - y10 * dv * tg * tgxz ); dt11_xx[ig] = -( v * dx_y11 * tgx - y11 * dv * tg * tgxx ); dt11_yy[ig] = -( v * dy_y11 * tgy - y11 * dv * tg * tgyy ); dt11_zz[ig] = -( v * dz_y11 * tgz - y11 * dv * tg * tgzz ); dt11_xy[ig] = -( 0.5 * v * ( dx_y11 * tgy + dy_y11 * tgx ) - y11 * dv * tg * tgxy ); dt11_yz[ig] = -( 0.5 * v * ( dy_y11 * tgz + dz_y11 * tgy ) - y11 * dv * tg * tgyz ); dt11_xz[ig] = -( 0.5 * v * ( dx_y11 * tgz + dz_y11 * tgx ) - y11 * dv * tg * tgxz ); dt12_xx[ig] = -( v * dx_y12 * tgx - y12 * dv * tg * tgxx ); dt12_yy[ig] = -( v * dy_y12 * tgy - y12 * dv * tg * tgyy ); dt12_zz[ig] = -( v * dz_y12 * tgz - y12 * dv * tg * tgzz ); dt12_xy[ig] = -( 0.5 * v * ( dx_y12 * tgy + dy_y12 * tgx ) - y12 * dv * tg * tgxy ); dt12_yz[ig] = -( 0.5 * v * ( dy_y12 * tgz + dz_y12 * tgy ) - y12 * dv * tg * tgyz ); dt12_xz[ig] = -( 0.5 * v * ( dx_y12 * tgz + dz_y12 * tgx ) - y12 * dv * tg * tgxz ); dt13_xx[ig] = -( v * dx_y13 * tgx - y13 * dv * tg * tgxx ); dt13_yy[ig] = -( v * dy_y13 * tgy - y13 * dv * tg * tgyy ); dt13_zz[ig] = -( v * dz_y13 * tgz - y13 * dv * tg * tgzz ); dt13_xy[ig] = -( 0.5 * v * ( dx_y13 * tgy + dy_y13 * tgx ) - y13 * dv * tg * tgxy ); dt13_yz[ig] = -( 0.5 * v * ( dy_y13 * tgz + dz_y13 * tgy ) - y13 * dv * tg * tgyz ); dt13_xz[ig] = -( 0.5 * v * ( dx_y13 * tgz + dz_y13 * tgx ) - y13 * dv * tg * tgxz ); dt14_xx[ig] = -( v * dx_y14 * tgx - y14 * dv * tg * tgxx ); dt14_yy[ig] = -( v * dy_y14 * tgy - y14 * dv * tg * tgyy ); dt14_zz[ig] = -( v * dz_y14 * tgz - y14 * dv * tg * tgzz ); dt14_xy[ig] = -( 0.5 * v * ( dx_y14 * tgy + dy_y14 * tgx ) - y14 * dv * tg * tgxy ); dt14_yz[ig] = -( 0.5 * v * ( dy_y14 * tgz + dz_y14 * tgy ) - y14 * dv * tg * tgyz ); dt14_xz[ig] = -( 0.5 * v * ( dx_y14 * tgz + dz_y14 * tgx ) - y14 * dv * tg * tgxz ); dt15_xx[ig] = -( v * dx_y15 * tgx - y15 * dv * tg * tgxx ); dt15_yy[ig] = -( v * dy_y15 * tgy - y15 * dv * tg * tgyy ); dt15_zz[ig] = -( v * dz_y15 * tgz - y15 * dv * tg * tgzz ); dt15_xy[ig] = -( 0.5 * v * ( dx_y15 * tgy + dy_y15 * tgx ) - y15 * dv * tg * tgxy ); dt15_yz[ig] = -( 0.5 * v * ( dy_y15 * tgz + dz_y15 * tgy ) - y15 * dv * tg * tgyz ); dt15_xz[ig] = -( 0.5 * v * ( dx_y15 * tgz + dz_y15 * tgx ) - y15 * dv * tg * tgxz ); } ilm += 2 * l + 1; } // l == 3 else { assert(false); } } // l != lloc[is] } // for l assert(ilm == s->nlm()); } tmap["update_twnl"].stop(); } //////////////////////////////////////////////////////////////////////////////// double NonLocalPotential::energy(bool compute_hpsi, SlaterDet& dsd, bool compute_forces, vector >& fion_enl, bool compute_stress, valarray& sigma_enl) { const vector& occ = sd_.occ(); const int ngwl = basis_.localsize(); // define atom block size const int na_block_size = 32; valarray gr(na_block_size*ngwl); // gr[ig+ia*ngwl] valarray cgr(na_block_size*ngwl); // cgr[ig+ia*ngwl] valarray sgr(na_block_size*ngwl); // sgr[ig+ia*ngwl] vector > tau; atoms_.get_positions(tau); double enl = 0.0; double tsum[6] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; for ( int is = 0; is < fion_enl.size(); is++ ) for ( int i = 0; i < fion_enl[is].size(); i++ ) fion_enl[is][i] = 0.0; if ( nspnl == 0 ) return 0.0; const double omega = basis_.cell().volume(); assert(omega != 0.0); const double omega_inv = 1.0 / omega; for ( int is = 0; is < nsp; is++ ) { Species *s = atoms_.species_list[is]; if ( npr[is] > 0 ) // species is is non-local { valarray tmpfion(3*na[is]); tmpfion = 0.0; // define number of atom blocks const int na_blocks = na[is] / na_block_size + ( na[is] % na_block_size == 0 ? 0 : 1 ); valarray anl_loc(npr[is]*na_block_size*2*ngwl); const int nstloc = sd_.nstloc(); // fnl_loc[ipra][n] // fnl is real if basis is real, complex otherwise const int fnl_loc_size = basis_.real() ? npr[is]*na_block_size*nstloc : 2*npr[is]*na_block_size*nstloc; valarray fnl_loc(fnl_loc_size); valarray fnl_buf(fnl_loc_size); for ( int ia_block = 0; ia_block < na_blocks; ia_block++ ) { // process projectors of atoms in block ia_block const int iastart = ia_block * na_block_size; const int iaend = (ia_block+1) * na_block_size < na[is] ? (ia_block+1) * na_block_size : na[is]; const int ia_block_size = iaend - iastart; // compute cgr[is][ia][ig], sgr[is][ia][ig] tmap["comp_eigr"].start(); int k = 3; double mone = -1.0, zero = 0.0; char cn='n'; // next line: const cast is ok since dgemm_ does not modify argument double* kpgx = const_cast(basis_.kpgx_ptr(0)); dgemm(&cn,&cn,(int*)&ngwl,(int*)&ia_block_size,&k,&mone, kpgx,(int*)&ngwl, &tau[is][3*iastart],&k, &zero,&gr[0],(int*)&ngwl); int len = ia_block_size * ngwl; #if USE_MASSV vsincos(&sgr[0],&cgr[0],&gr[0],&len); #else for ( int i = 0; i < len; i++ ) { const double arg = gr[i]; sgr[i] = sin(arg); cgr[i] = cos(arg); } #endif tmap["comp_eigr"].stop(); // compute anl_loc tmap["comp_anl"].start(); for ( int ipr = 0; ipr < npr[is]; ipr++ ) { // twnl[is][ig+ngwl*ipr] const double * t = &twnl[is][ngwl*ipr]; const int l = lproj[is][ipr]; // anl_loc[ig+ipra*ngwl] for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl]; const double* c = &cgr[ia*ngwl]; const double* s = &sgr[ia*ngwl]; if ( l == 0 ) { for ( int ig = 0; ig < ngwl; ig++ ) { a[2*ig] = t[ig] * c[ig]; a[2*ig+1] = t[ig] * s[ig]; } } else if ( l == 1 ) { for ( int ig = 0; ig < ngwl; ig++ ) { /* Next line: -i * eigr */ /* -i * (a+i*b) = b - i*a */ a[2*ig] = t[ig] * s[ig]; a[2*ig+1] = -t[ig] * c[ig]; } } else if ( l == 2 ) { for ( int ig = 0; ig < ngwl; ig++ ) { // Next line: (-) sign for -eigr a[2*ig] = -t[ig] * c[ig]; a[2*ig+1] = -t[ig] * s[ig]; } } else if ( l == 3 ) { for ( int ig = 0; ig < ngwl; ig++ ) { // Next line: i * eigr // i * (a+i*b) = -b + i*a a[2*ig] = -t[ig] * s[ig]; a[2*ig+1] = t[ig] * c[ig]; } } } } // ipr tmap["comp_anl"].stop(); // array anl_loc is complete // compute fnl[npra][nstloc] = anl^T * c int nprnaloc = ia_block_size * npr[is]; if ( basis_.real() ) { double one=1.0; char ct='t'; int twongwl = 2 * ngwl; int c_lda = 2*sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); tmap["fnl_gemm"].start(); dgemm(&ct,&cn,&nprnaloc,(int*)&nstloc,&twongwl,&one, &anl_loc[0],&twongwl, (double*)c, &c_lda, &zero,&fnl_loc[0],&nprnaloc); tmap["fnl_gemm"].stop(); // correct for double counting if ctxt_.myrow() == 0 if ( ctxt_.myrow() == 0 ) { // rank-one update // dger(m,n,alpha,x,incx,y,incy,a,lda); // a += alpha * x * transpose(y) // x = first row of anl_loc // y^T = first row of c double alpha = -0.5; dger(&nprnaloc,(int*)&nstloc,&alpha,&anl_loc[0],&twongwl, (double*)c,&c_lda,&fnl_loc[0],&nprnaloc); } } else { // fnl is complex complex cone=1.0; complex czero=0.0; char cc='c'; int c_lda = sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); tmap["fnl_zemm"].start(); zgemm(&cc,&cn,&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone, (complex*) &anl_loc[0],(int*)&ngwl, (complex*) c, &c_lda,&czero, (complex*)&fnl_loc[0],&nprnaloc); tmap["fnl_zemm"].stop(); } #if USE_MPI tmap["fnl_allreduce"].start(); // Allreduce fnl partial sum MPI_Comm basis_comm = basis_.comm(); int fnl_size = basis_.real() ? nprnaloc*nstloc : 2*nprnaloc*nstloc; MPI_Allreduce(&fnl_loc[0],&fnl_buf[0],fnl_size, MPI_DOUBLE,MPI_SUM,basis_comm); tmap["fnl_allreduce"].stop(); // factor 2.0 in next line is: counting G, -G if ( basis_.real() ) fnl_loc = 2.0 * fnl_buf; else fnl_loc = fnl_buf; #else // factor 2.0 in next line is: counting G, -G if ( basis_.real() ) fnl_loc *= 2.0; #endif // if the species has multiple projectors, that are not orthogonal // multiply with D matrix if ( s->has_dmatrix() ) { if ( basis_.real() ) { // helper variables double one = 1.0; double zero = 0.0; const size_t dsize = nprnaloc * nprnaloc; // // construct D matrix // // allocate array valarray dmatrix(zero,dsize); // loop over all elements for ( int i = 0; i < nprnaloc; i++ ) { // extract atom, l, and m corresponding to this index const int iatom = i % ia_block_size; const int ipr = i / ia_block_size; for ( int j = 0; j <= i; j++ ) { // extract atom, l, and m corresponding to this index const int jatom = j % ia_block_size; const int jpr = j / ia_block_size; // D matrix is diagonal in atom, l, and m if ( iatom != jatom ) continue; const double val = s->dmatrix(ipr,jpr); dmatrix[nprnaloc * i + j] = val; // matrix is symmetric if ( i != j ) dmatrix[nprnaloc * j + i] = val; } } // multiply F'_In = D_IJ F_Jn dgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0], &nprnaloc,&fnl_loc[0],&nprnaloc,&zero,&fnl_buf[0],&nprnaloc); } else { // helper variables complex one = 1.0; complex zero = 0.0; const size_t dsize = nprnaloc * nprnaloc; // // construct D matrix // // allocate array valarray < complex > dmatrix(zero,dsize); // loop over all elements for ( int i = 0; i < nprnaloc; i++ ) { // extract atom, l, and m corresponding to this index const int iatom = i % ia_block_size; const int ipr = i / ia_block_size; for ( int j = 0; j <= i; j++ ) { // extract atom, l, and m corresponding to this index const int jatom = j % ia_block_size; const int jpr = j / ia_block_size; // D matrix is diagonal in atom, l, and m if ( iatom != jatom ) continue; const double val = s->dmatrix(ipr,jpr); dmatrix[nprnaloc * i + j] = val; // matrix is symmetric if ( i != j ) dmatrix[nprnaloc * j + i] = val; } } // multiply F'_In = D_IJ F_Jn zgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0], &nprnaloc,(complex*) &fnl_loc[0],&nprnaloc,&zero, (complex*) &fnl_buf[0],&nprnaloc); } } else { fnl_buf = fnl_loc; } // accumulate Enl contribution const int nbase = ctxt_.mycol() * sd_.c().nb(); if ( basis_.real() ) { for ( int ipr = 0; ipr < npr[is]; ipr++ ) { const double fac = wt[is][ipr] * omega_inv; for ( int n = 0; n < nstloc; n++ ) { const double facn = fac * occ[n + nbase]; for ( int ia = 0; ia < ia_block_size; ia++ ) { const int i = ia + ipr*ia_block_size + n * nprnaloc; //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia // << ",n=" << n << "]: " << fnl_loc[i] << endl; enl += facn * fnl_loc[i] * fnl_buf[i]; fnl_loc[i] = fac * fnl_buf[i]; } } } } else { // fnl is complex for ( int ipr = 0; ipr < npr[is]; ipr++ ) { const double fac = wt[is][ipr] * omega_inv; for ( int n = 0; n < nstloc; n++ ) { const double facn = fac * occ[n + nbase]; for ( int ia = 0; ia < ia_block_size; ia++ ) { const int i = ia + ipr*ia_block_size + n * nprnaloc; //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia // << ",n=" << n << "]: " << fnl_loc[i] << endl; const double f_re = fnl_loc[2*i]; const double f_im = fnl_loc[2*i+1]; const double fb_re = fnl_buf[2*i]; const double fb_im = fnl_buf[2*i+1]; enl += facn * (f_re*fb_re + f_im*fb_im); fnl_loc[2*i] = fac * fb_re; fnl_loc[2*i+1] = fac * fb_im; } } } } if ( compute_hpsi ) { if ( basis_.real() ) { tmap["enl_hpsi"].start(); // compute cp += anl * fnl complex* cp = dsd.c().valptr(); int cp_lda = 2*dsd.c().mloc(); int twongwl = 2 * ngwl; double one = 1.0; dgemm(&cn,&cn,&twongwl,(int*)&nstloc,&nprnaloc,&one, &anl_loc[0],&twongwl, &fnl_loc[0],&nprnaloc, &one,(double*)cp, &cp_lda); tmap["enl_hpsi"].stop(); } else { tmap["enl_hpsi"].start(); // compute cp += anl * fnl complex* cp = dsd.c().valptr(); complex cone = 1.0; int cp_lda = dsd.c().mloc(); zgemm(&cn,&cn,(int*)&ngwl,(int*)&nstloc,&nprnaloc,&cone, (complex*) &anl_loc[0],(int*)&ngwl, (complex*) &fnl_loc[0],&nprnaloc, &cone,cp, &cp_lda); tmap["enl_hpsi"].stop(); } } // ionic forces if ( compute_forces ) { tmap["enl_fion"].start(); valarray dfnl_loc(fnl_loc_size); for ( int j = 0; j < 3; j++ ) { const double *const kpgxj = basis_.kpgx_ptr(j); // compute anl_loc for ( int ipr = 0; ipr < npr[is]; ipr++ ) { // twnl[is][ig+ngwl*ipr] const double * t = &twnl[is][ngwl*ipr]; const int l = lproj[is][ipr]; // anl_loc[ig+ipra*ngwl] for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl]; const double* c = &cgr[ia*ngwl]; const double* s = &sgr[ia*ngwl]; if ( l == 0 ) { for ( int ig = 0; ig < ngwl; ig++ ) { const double tt = kpgxj[ig] * t[ig]; // Next lines: -i * ( a + ib ) = b - ia a[2*ig] = tt * s[ig]; a[2*ig+1] = -tt * c[ig]; } } else if ( l == 1 ) { for ( int ig = 0; ig < ngwl; ig++ ) { // Next lines: (-i)**2 * ( a + ib ) = - a - ib const double tt = - kpgxj[ig] * t[ig]; a[2*ig] = tt * c[ig]; a[2*ig+1] = tt * s[ig]; } } else if ( l == 2 ) { for ( int ig = 0; ig < ngwl; ig++ ) { // Next lines: (-i) * - ( a + ib ) = i*(a+ib) = - b + ia const double tt = kpgxj[ig] * t[ig]; a[2*ig] = -tt * s[ig]; a[2*ig+1] = tt * c[ig]; } } else if ( l == 3 ) { for ( int ig = 0; ig < ngwl; ig++ ) { // Next lines: (-i)**4 * ( a + ib ) = a + ib const double tt = kpgxj[ig] * t[ig]; a[2*ig] = tt * c[ig]; a[2*ig+1] = tt * s[ig]; } } } } // ipr // array anl_loc is complete // compute dfnl[npra][nstloc] = anl^T * c if ( basis_.real() ) { // factor 2.0 in next line is: counting G, -G // Note: no need to correct for double counting of the // G=0 component which is always zero double two=2.0; char ct='t'; const int twongwl = 2 * ngwl; const int nprnaloc = ia_block_size * npr[is]; int c_lda = 2*sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two, &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda, &zero,&dfnl_loc[0],(int*)&nprnaloc); } else { complex cone=1.0; complex czero=0.0; char cc='c'; const int nprnaloc = ia_block_size * npr[is]; int c_lda = sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone, (complex*) &anl_loc[0],(int*)&ngwl, (complex*) c,(int*)&c_lda, &czero,(complex*)&dfnl_loc[0],(int*)&nprnaloc); } // accumulate non-local contributions to forces if ( basis_.real() ) { for ( int ipr = 0; ipr < npr[is]; ipr++ ) { for ( int n = 0; n < nstloc; n++ ) { // Factor 2.0 in next line from derivative of |Fnl|^2 const double facn = 2.0 * occ[n + nbase]; for ( int ia = 0; ia < ia_block_size; ia++ ) { const int ia_global = ia + iastart; const int i = ia + ipr*ia_block_size + n * nprnaloc; //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia // << ",n=" << n << "]: " << fnl_loc[i] << endl; tmpfion[3*ia_global+j] -= facn * fnl_loc[i] * dfnl_loc[i]; } } } } else { for ( int ipr = 0; ipr < npr[is]; ipr++ ) { for ( int n = 0; n < nstloc; n++ ) { // Factor 2.0 in next line from derivative of |Fnl|^2 const double facn = 2.0 * occ[n + nbase]; for ( int ia = 0; ia < ia_block_size; ia++ ) { const int ia_global = ia + iastart; const int i = ia + ipr*ia_block_size + n * nprnaloc; //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia // << ",n=" << n << "]: " << fnl_loc[i] << endl; double f_re = fnl_loc[2*i]; double f_im = fnl_loc[2*i+1]; double df_re = dfnl_loc[2*i]; double df_im = dfnl_loc[2*i+1]; tmpfion[3*ia_global+j] -= facn * ( f_re * df_re + f_im * df_im ); } } } } } // j tmap["enl_fion"].stop(); } // compute_forces if ( compute_stress ) { tmap["enl_sigma"].start(); valarray dfnl_loc(fnl_loc_size); for ( int ij = 0; ij < 6; ij++ ) { // compute anl_loc int ipr = 0; while ( ipr < npr[is] ) { // twnl[is][ig+ngwl*ipr] const int l = lproj[is][ipr]; if ( l == 0 ) { // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ipr)) // ipr = iquad + nquad[is] * ilm, where ilm = 0 const double *const dt0 = &dtwnl[is][ngwl*(ij+6*ipr)]; for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a0 = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl]; const double* c = &cgr[ia*ngwl]; const double* s = &sgr[ia*ngwl]; for ( int ig = 0; ig < ngwl; ig++ ) { const double d0 = dt0[ig]; // Next lines: -i * ( a + ib ) = b - ia a0[2*ig] = d0 * c[ig]; a0[2*ig+1] = d0 * s[ig]; } } } else if ( l == 1 ) { const int ipr1 = ipr; const int ipr2 = ipr + 1; const int ipr3 = ipr + 2; // dtwnl[is][ipr][ij][ngwl] // index = ig + ngwl * ( ij + 6 * iprx )) const double *dt1 = &dtwnl[is][ngwl*(ij+6*ipr1)]; const double *dt2 = &dtwnl[is][ngwl*(ij+6*ipr2)]; const double *dt3 = &dtwnl[is][ngwl*(ij+6*ipr3)]; for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a1 = &anl_loc[2*(ia+ipr1*ia_block_size)*ngwl]; double* a2 = &anl_loc[2*(ia+ipr2*ia_block_size)*ngwl]; double* a3 = &anl_loc[2*(ia+ipr3*ia_block_size)*ngwl]; const double* c = &cgr[ia*ngwl]; const double* s = &sgr[ia*ngwl]; for ( int ig = 0; ig < ngwl; ig++ ) { const double d1 = dt1[ig]; const double d2 = dt2[ig]; const double d3 = dt3[ig]; // Next line: (-i)^l factor is -i // Next line: -i * eigr // -i * (a+i*b) = b - i*a const double tc = -c[ig]; // -cosgr[ia][ig] const double ts = s[ig]; // singr[ia][ig] a1[2*ig] = d1 * ts; a1[2*ig+1] = d1 * tc; a2[2*ig] = d2 * ts; a2[2*ig+1] = d2 * tc; a3[2*ig] = d3 * ts; a3[2*ig+1] = d3 * tc; } } } else if ( l == 2 ) { const int ipr4 = ipr; const int ipr5 = ipr + 1; const int ipr6 = ipr + 2; const int ipr7 = ipr + 3; const int ipr8 = ipr + 4; // dtwnl[is][ipr][iquad][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr )) const double *dt4 = &dtwnl[is][ngwl*(ij+6*ipr4)]; const double *dt5 = &dtwnl[is][ngwl*(ij+6*ipr5)]; const double *dt6 = &dtwnl[is][ngwl*(ij+6*ipr6)]; const double *dt7 = &dtwnl[is][ngwl*(ij+6*ipr7)]; const double *dt8 = &dtwnl[is][ngwl*(ij+6*ipr8)]; for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a4 = &anl_loc[2*(ia+ipr4*ia_block_size)*ngwl]; double* a5 = &anl_loc[2*(ia+ipr5*ia_block_size)*ngwl]; double* a6 = &anl_loc[2*(ia+ipr6*ia_block_size)*ngwl]; double* a7 = &anl_loc[2*(ia+ipr7*ia_block_size)*ngwl]; double* a8 = &anl_loc[2*(ia+ipr8*ia_block_size)*ngwl]; const double* c = &cgr[ia*ngwl]; const double* s = &sgr[ia*ngwl]; for ( int ig = 0; ig < ngwl; ig++ ) { const double d4 = dt4[ig]; const double d5 = dt5[ig]; const double d6 = dt6[ig]; const double d7 = dt7[ig]; const double d8 = dt8[ig]; // Next lines: (-i)^2 * ( a + ib ) = - ( a + ib ) const double tc = -c[ig]; // -cosgr[ia][ig] const double ts = -s[ig]; // -singr[ia][ig] a4[2*ig] = d4 * tc; a4[2*ig+1] = d4 * ts; a5[2*ig] = d5 * tc; a5[2*ig+1] = d5 * ts; a6[2*ig] = d6 * tc; a6[2*ig+1] = d6 * ts; a7[2*ig] = d7 * tc; a7[2*ig+1] = d7 * ts; a8[2*ig] = d8 * tc; a8[2*ig+1] = d8 * ts; } } } else if ( l == 3 ) { const int ipr09 = ipr; const int ipr10= ipr + 1; const int ipr11= ipr + 2; const int ipr12= ipr + 3; const int ipr13= ipr + 4; const int ipr14= ipr + 5; const int ipr15= ipr + 6; // dtwnl[is][ipr][iquad][ij][ngwl] // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr )) const double *dt09 = &dtwnl[is][ngwl * ( ij + 6 * ipr09 )]; const double *dt10 = &dtwnl[is][ngwl * ( ij + 6 * ipr10 )]; const double *dt11 = &dtwnl[is][ngwl * ( ij + 6 * ipr11 )]; const double *dt12 = &dtwnl[is][ngwl * ( ij + 6 * ipr12 )]; const double *dt13 = &dtwnl[is][ngwl * ( ij + 6 * ipr13 )]; const double *dt14 = &dtwnl[is][ngwl * ( ij + 6 * ipr14 )]; const double *dt15 = &dtwnl[is][ngwl * ( ij + 6 * ipr15 )]; for ( int ia = 0; ia < ia_block_size; ia++ ) { double* a09 = &anl_loc[2 * ( ia + ipr09 * ia_block_size ) * ngwl]; double* a10 = &anl_loc[2 * ( ia + ipr10 * ia_block_size ) * ngwl]; double* a11 = &anl_loc[2 * ( ia + ipr11 * ia_block_size ) * ngwl]; double* a12 = &anl_loc[2 * ( ia + ipr12 * ia_block_size ) * ngwl]; double* a13 = &anl_loc[2 * ( ia + ipr13 * ia_block_size ) * ngwl]; double* a14 = &anl_loc[2 * ( ia + ipr14 * ia_block_size ) * ngwl]; double* a15 = &anl_loc[2 * ( ia + ipr15 * ia_block_size ) * ngwl]; const double* c = &cgr[ia * ngwl]; const double* s = &sgr[ia * ngwl]; for ( int ig = 0; ig < ngwl; ig++ ) { const double d09 = dt09[ig]; const double d10 = dt10[ig]; const double d11 = dt11[ig]; const double d12 = dt12[ig]; const double d13 = dt13[ig]; const double d14 = dt14[ig]; const double d15 = dt15[ig]; // Next lines: (-i)^2 * ( a + ib ) = - ( a + ib ) const double tc = c[ig]; // cosgr[ia][ig] const double ts = -s[ig]; // -singr[ia][ig] a09[2 * ig] = d09 * ts; a09[2 * ig + 1] = d09 * tc; a10[2 * ig] = d10 * ts; a10[2 * ig + 1] = d10 * tc; a11[2 * ig] = d11 * ts; a11[2 * ig + 1] = d11 * tc; a12[2 * ig] = d12 * ts; a12[2 * ig + 1] = d12 * tc; a13[2 * ig] = d13 * ts; a13[2 * ig + 1] = d13 * tc; a14[2 * ig] = d14 * ts; a14[2 * ig + 1] = d14 * tc; a15[2 * ig] = d15 * ts; a15[2 * ig + 1] = d15 * tc; } } } else { assert(false); } // l ipr += 2*l+1; } // while ipr // array anl_loc is complete // compute dfnl[npra][nstloc] = anl^T * c if ( basis_.real() ) { // factor 2.0 in next line is: counting G, -G // Note: no need to correct for double counting of the // G=0 component which is always zero double two=2.0; char ct='t'; const int twongwl = 2 * ngwl; const int nprnaloc = ia_block_size * npr[is]; int c_lda = 2*sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two, &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda, &zero,&dfnl_loc[0],(int*)&nprnaloc); } else { complex cone=1.0; complex czero=0.0; char cc='c'; const int nprnaloc = ia_block_size * npr[is]; int c_lda = sd_.c().mloc(); const complex* c = sd_.c().cvalptr(); zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone, (complex*)&anl_loc[0],(int*)&ngwl, (complex*)c,(int*)&c_lda, &czero,(complex*)&dfnl_loc[0],(int*)&nprnaloc); } // accumulate non-local contributions to sigma_ij if ( basis_.real() ) { for ( int n = 0; n < nstloc; n++ ) { // Factor 2.0 in next line from derivative of |Fnl|^2 const double facn = 2.0 * occ[n + nbase]; for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ ) { const int i = ipra + n * nprnaloc; tsum[ij] += facn * fnl_loc[i] * dfnl_loc[i]; } } } else { for ( int n = 0; n < nstloc; n++ ) { // Factor 2.0 in next line from derivative of |Fnl|^2 const double facn = 2.0 * occ[n + nbase]; for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ ) { const int i = ipra + n * nprnaloc; double f_re = fnl_loc[2*i]; double f_im = fnl_loc[2*i+1]; double df_re = dfnl_loc[2*i]; double df_im = dfnl_loc[2*i+1]; tsum[ij] += facn * ( f_re * df_re + f_im * df_im ); } } } } // ij tmap["enl_sigma"].stop(); } // compute_stress } // ia_block if ( compute_forces ) { ctxt_.dsum(3*na[is],1,&tmpfion[0],3*na[is]); for ( int ia = 0; ia < na[is]; ia++ ) { fion_enl[is][3*ia+0] += tmpfion[3*ia]; fion_enl[is][3*ia+1] += tmpfion[3*ia+1]; fion_enl[is][3*ia+2] += tmpfion[3*ia+2]; } } } // npr[is]>0 } // is // reduction of enl across rows ctxt_.dsum('r',1,1,&enl,1); sigma_enl = 0.0; if ( compute_stress ) { ctxt_.dsum(6,1,&tsum[0],6); sigma_enl[0] = ( enl + tsum[0] ) * omega_inv; sigma_enl[1] = ( enl + tsum[1] ) * omega_inv; sigma_enl[2] = ( enl + tsum[2] ) * omega_inv; sigma_enl[3] = + tsum[3] * omega_inv; sigma_enl[4] = + tsum[4] * omega_inv; sigma_enl[5] = + tsum[5] * omega_inv; } return enl; }