Commit 0af317c0 authored by Francois Gygi's avatar Francois Gygi
Browse files

Merge branch 'develop'

parents 4c1362ae 7fcfac04
......@@ -54,12 +54,14 @@ void JDWavefunctionStepper::update(Wavefunction& dwf)
{
for ( int ikp_loc = 0; ikp_loc < wf_.nkp_loc(); ++ikp_loc )
{
if ( wf_.sd(isp_loc,ikp_loc)->basis().real() )
SlaterDet* sd = wf_.sd(isp_loc,ikp_loc);
SlaterDet* dsd = dwf.sd(isp_loc,ikp_loc);
if ( sd->basis().real() )
{
// compute A = Y^T H Y and descent direction HY - YA
// proxy real matrices c, cp
DoubleMatrix c_proxy(wf_.sd(isp_loc,ikp_loc)->c());
DoubleMatrix cp_proxy(dwf.sd(isp_loc,ikp_loc)->c());
DoubleMatrix c_proxy(sd->c());
DoubleMatrix cp_proxy(dsd->c());
DoubleMatrix a(c_proxy.context(),c_proxy.n(),c_proxy.n(),
c_proxy.nb(),c_proxy.nb());
......@@ -74,8 +76,8 @@ void JDWavefunctionStepper::update(Wavefunction& dwf)
else // real
{
// compute A = Y^T H Y and descent direction HY - YA
ComplexMatrix& c = wf_.sd(isp_loc,ikp_loc)->c();
ComplexMatrix& cp = dwf.sd(isp_loc,ikp_loc)->c();
ComplexMatrix& c = sd->c();
ComplexMatrix& cp = dsd->c();
ComplexMatrix a(c.context(),c.n(),c.n(),c.nb(),c.nb());
// (Y,HY)
......
......@@ -150,7 +150,7 @@ CXXFLAGS += -DVERSION='"$(VERSION)"'
$(LD) $(DFLAGS) -o $@ $^ $(LDFLAGS)
testVWN: testVWN.o VWNFunctional.o LDAFunctional.o
$(LD) $(DFLAGS) -o $@ $^ $(LDFLAGS)
testMatrix: testMatrix.o Matrix.o Context.o
testMatrix: testMatrix.o Matrix.o Context.o MPIdata.o
$(LD) $(DFLAGS) -o $@ $^ $(LDFLAGS)
testjacobi: testjacobi.o jacobi.o Matrix.o Context.o
$(LD) $(DFLAGS) -o $@ $^ $(LDFLAGS)
......
......@@ -22,10 +22,10 @@
#include <limits>
#include <iostream>
using namespace std;
#ifdef USE_MPI
#include <mpi.h>
#endif
#include "MPIdata.h"
#include "Context.h"
#ifdef SCALAPACK
......@@ -86,6 +86,8 @@ using namespace std;
#define zdscal zdscal_
#define dcopy dcopy_
#define ddot ddot_
#define dnrm2 dnrm2_
#define dznrm2 dznrm2_
#define zdotu zdotu_
#define zdotc zdotc_
#define daxpy daxpy_
......@@ -124,7 +126,7 @@ using namespace std;
extern "C"
{
int numroc(int*, int*, int*, int*, int*);
int numroc(const int*, const int*, const int*, const int*, const int*);
#ifdef SCALAPACK
// PBLAS
void pdsymm(const char*, const char*, const int*, const int*, const double*,
......@@ -298,6 +300,8 @@ extern "C"
void dcopy(const int *, const double*, const int *, double*, const int*);
double ddot(const int *, const double *, const int *,
const double *, const int *);
double dnrm2(const int *, const double *, const int *);
double dznrm2(const int *, const complex<double> *, const int *);
complex<double> zdotc(const int *, const complex<double>*, const int *,
const complex<double>*, const int *);
complex<double> zdotu(const int *, const complex<double>*, const int *,
......@@ -385,13 +389,46 @@ extern "C"
complex<double>* work, int* lwork, int* info);
}
////////////////////////////////////////////////////////////////////////////////
// numroc0: ScaLAPACK numroc function specialized for the case isrcproc=0
// i.e. the process holding the first row/col of the matrix is proc 0
int numroc0(int n, int nb, int iproc, int nprocs)
{
// n number of rows/cols of the distributed matrix
// nb block size
// iproc coordinate of the process whose local array size is being computed
// iproc = 0..nprocs
// nprocs number of processes over which the matrix is distributed
// nblocks = total number of whole nb blocks
int n_whole_blocks = n / nb;
// minimum number of rows or cols a process can have
int nroc = ( n_whole_blocks / nprocs ) * nb;
// number of extra blocks needed
int n_extra_blocks = n_whole_blocks % nprocs;
// adjust numroc depending on iproc
if ( iproc < n_extra_blocks )
nroc += nb;
else if ( iproc == n_extra_blocks )
nroc += n % nb;
return nroc;
}
////////////////////////////////////////////////////////////////////////////////
int DoubleMatrix::mloc(int irow) const
{
return numroc0(m_,mb_,irow,nprow_);
}
#ifndef SCALAPACK
int numroc(int* a, int* b, int* c, int* d, int* e)
////////////////////////////////////////////////////////////////////////////////
int DoubleMatrix::nloc(int icol) const
{
return *a;
return numroc0(n_,nb_,icol,npcol_);
}
#endif
////////////////////////////////////////////////////////////////////////////////
// reference constructor create a proxy for a ComplexMatrix rhs
......@@ -1244,13 +1281,43 @@ void ComplexMatrix::symmetrize(char uplo)
////////////////////////////////////////////////////////////////////////////////
double DoubleMatrix::nrm2(void) const
{
return sqrt(dot(*this));
double sum=0.;
double tsum=0.;
if ( active_ )
{
int ione=1;
// dnrm2 returns sqrt(sum_i x[i]*x[i])
tsum = dnrm2(&size_,val,&ione);
tsum = tsum*tsum;
}
#ifdef SCALAPACK
if ( active_ )
MPI_Allreduce(&tsum, &sum, 1, MPI_DOUBLE, MPI_SUM, ctxt_.comm() );
#else
sum=tsum;
#endif
return sqrt(sum);
}
////////////////////////////////////////////////////////////////////////////////
double ComplexMatrix::nrm2(void) const
{
return abs(dot(*this));
double sum=0.;
double tsum=0.;
if ( active_ )
{
int ione=1;
// dznrm2 returns sqrt(sum_i conjg(x[i])*x[i])
tsum = dznrm2(&size_,val,&ione);
tsum = tsum*tsum;
}
#ifdef SCALAPACK
if ( active_ )
MPI_Allreduce(&tsum, &sum, 1, MPI_DOUBLE, MPI_SUM, ctxt_.comm() );
#else
sum=tsum;
#endif
return sqrt(sum);
}
////////////////////////////////////////////////////////////////////////////////
......
......@@ -27,6 +27,8 @@
#include <complex>
#include <cstring> // memcpy
int numroc0(int n, int nb, int iproc, int nprocs);
class ComplexMatrix;
class DoubleMatrix
......@@ -63,7 +65,9 @@ class DoubleMatrix
int mb(void) const { return mb_; } // size of blocks
int nb(void) const { return nb_; } // size of blocks
int mloc(void) const { return mloc_; } // size of local array
int mloc(int irow) const; // size of local array on row irow
int nloc(void) const { return nloc_; } // size of local array
int nloc(int icol) const; // size of local array on column icol
int size(void) const { return size_; } // size of local array
int localsize(void) const { return mloc_*nloc_; } // local size of val
double memsize(void) const { return (double)m_*(double)n_*sizeof(double); }
......
......@@ -32,6 +32,7 @@
#define daxpy daxpy_
#define ddot ddot_
#define dnrm2 dnrm2_
#define dznrm2 dznrm2_
#define drot drot_
#define dasum dasum_
#define dsbmv dsbmv_
......@@ -63,6 +64,7 @@ void daxpy(int *n, double *alpha, double *x, int *incx,
double ddot(const int *n, const double *a, const int *inca,
const double *b, const int *incb);
double dnrm2(const int *n, const double *a, const int *inca);
double dznrm2(const int *n, const std::complex<double> *a, const int *inca);
void drot(int*, double*, int*, double*, int*, double*, double*);
void dgemm(char *ta, char *tb, int *m, int *n, int *k,
double *alpha, double *a, int *lda, double *b, int *ldb,
......
......@@ -19,5 +19,5 @@
#include "release.h"
std::string release(void)
{
return std::string("rel1_73_1");
return std::string("rel1_73_1dev");
}
......@@ -37,6 +37,7 @@
#include <valarray>
using namespace std;
#include "MPIdata.h"
#include "Timer.h"
#include <mpi.h>
......@@ -107,6 +108,9 @@ int main(int argc, char **argv)
MPI_Bcast(&nb_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ta, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Bcast(&tb, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
MPIdata::set(nprow,npcol,1,1);
{
if ( ta == 'N' ) ta = 'n';
if ( tb == 'N' ) tb = 'n';
......@@ -167,9 +171,20 @@ int main(int argc, char **argv)
}
tm.start();
double gemm_time = MPI_Wtime();
c.gemm(ta,tb,1.0,a,b,0.0);
gemm_time = MPI_Wtime() - gemm_time;
if ( ctxt.onpe0() )
cout << "gemm_time: " << gemm_time << endl;
tm.stop();
Context csq(MPI_COMM_WORLD,1,1);
DoubleMatrix ssq(csq,c.n(),c.n(),c.nb(),c.nb());
double getsub_time = MPI_Wtime();
ssq.getsub(c,c.m(),c.n(),0,0);
getsub_time = MPI_Wtime() - getsub_time;
if ( ctxt.onpe0() )
cout << "getsub_time: " << getsub_time << endl;
if ( tcheck )
{
......@@ -215,19 +230,42 @@ int main(int argc, char **argv)
exit(1);
}
}
cout << " results checked" << endl;
}
cout << " CPU/Real: " << setw(8) << tm.cpu()
<< " / " << setw(8) << tm.real();
if ( tm.real() > 0.0 )
// print dgemm timing
const int kmax = ( ta == 'n' ) ? a.n() : a.m();
double buf[2];
buf[0] = tm.cpu();
buf[1] = tm.real();
double cpu, real;
for ( int irow = 0; irow < nprow; irow++ )
{
int kmax = ( ta == 'n' ) ? a.n() : a.m();
cout << " MFlops: "
<< (2.0e-6*m_c*n_c*kmax) / tm.real() << endl;
for ( int icol = 0; icol < npcol; icol++ )
{
ctxt.barrier();
if ( irow != 0 && icol != 0 )
{
if ( ctxt.onpe0() )
ctxt.drecv(1,1,buf,2,irow,icol);
else
if ( irow == ctxt.myrow() && icol == ctxt.mycol() )
ctxt.dsend(1,1,buf,2,0,0);
}
if ( ctxt.onpe0() )
{
double tcpu = buf[0];
double treal = buf[1];
cout << "(" << setw(3) << irow << "," << setw(3) << icol << ") "
<< "CPU/Real: " << setw(8) << tcpu << " / " << setw(8) << treal;
if ( treal > 0.0 )
cout << " MFlops: " << (2.0e-6*m_c*n_c*kmax) / treal;
cout << endl;
}
}
}
double norma=a.nrm2();
if(mype == 0)cout<<"Norm(a)="<<norma<<endl;
......
......@@ -57,16 +57,24 @@ int main(int argc, char **argv)
int npr = atoi(argv[15]);
int npc = atoi(argv[16]);
MPIdata::set(npr,npc);
cout << MPIdata::rank() << ": npr=" << npr << " npc=" << npc << endl;
Timer tm;
if ( MPIdata::onpe0() )
{
cout << " testSlaterDet" << endl;
cout << " args: " << a << " " << b << " " << c << " "
<< ecut << " " << nst << " "
<< kpoint << " " << npr << " " << npc << endl;
}
std::map<std::string,Timer> tmap;
{ // Context scope
Context ctxt(MPIdata::sd_comm(),npr,npc);
if ( ctxt.myproc() == 0 )
cout << " npr=" << npr << " npc=" << npc << endl;
SlaterDet sd(ctxt,kpoint);
cout << sd.context();
sd.resize(cell,cell,ecut,nst);
if ( ctxt.myproc() == 0 )
......@@ -87,11 +95,11 @@ int main(int argc, char **argv)
if ( ctxt.myproc() == 0 )
cout << " Orthogonality error after rand " << err << endl;
tm.reset();
tm.start();
tmap["gram"].reset();
tmap["gram"].start();
sd.gram();
tm.stop();
cout << " Gram: CPU/Real: " << tm.cpu() << " / " << tm.real() << endl;
tmap["gram"].stop();
err = sd.ortho_error();
if ( ctxt.myproc() == 0 )
cout << " Gram orthogonality error " << err << endl;
......@@ -102,11 +110,10 @@ int main(int argc, char **argv)
sdm.c() = sd.c();
sd.randomize(0.1/sd.basis().size());
tm.reset();
tm.start();
tmap["riccati"].reset();
tmap["riccati"].start();
sd.riccati(sdm);
tm.stop();
cout << " riccati: CPU/Real: " << tm.cpu() << " / " << tm.real() << endl;
tmap["riccati"].stop();
err = sd.ortho_error();
if ( ctxt.myproc() == 0 )
cout << " Riccati orthogonality error " << err << endl;
......@@ -122,8 +129,6 @@ int main(int argc, char **argv)
<< sd.localmemsize() / 1048576.0 << endl;
}
//cout << " ekin: " << setprecision(8) << sd.ekin(occ) << endl;
// compute charge density in real space
FourierTransform ft(sd.basis(),
2*sd.basis().np(0), 2*sd.basis().np(1), 2*sd.basis().np(2));
......@@ -131,20 +136,16 @@ int main(int argc, char **argv)
vector<complex<double> > f(ft.np012loc());
vector<double> rho(ft.np012loc());
Timer tmrho;
tmrho.reset();
tmrho.start();
if ( ctxt.myproc() == 0 )
cout << " density" << endl;
cout << MPIdata::rank() << ": compute_density..." << endl;
// update_occ(nel,nspin)
sd.update_occ(2*nst,1);
double weight = 1.0;
cout << MPIdata::rank() << ": sd.nstloc()=" << sd.nstloc() << endl;
sd.compute_density(ft,weight,&rho[0]);
tmrho.stop();
cout << MPIdata::rank() << ": compute_density: CPU/Real: "
<< tmrho.cpu() << " / " << tmrho.real() << endl;
tmap["compute_density"].reset();
tmap["compute_density"].start();
sd.compute_density(ft,weight,&rho[0]);
tmap["compute_density"].stop();
// integral of rho in r space
double sum = 0.0;
......@@ -154,8 +155,8 @@ int main(int argc, char **argv)
double tsum;
MPI_Allreduce(&sum,&tsum,1,MPI_DOUBLE,MPI_SUM,sd.context().comm());
sum = tsum;
cout << ctxt.mype() << ": "
<< " rho: " << sum * cell.volume() / ft.np012() << endl;
if ( ctxt.myproc() == 0 )
cout << " density sum: " << sum * cell.volume() / ft.np012() << endl;
SlaterDet sdp(sd);
//SlaterDet sdp(ctxt,kpoint);
......@@ -168,12 +169,10 @@ int main(int argc, char **argv)
cout << " Gram orthogonality error " << err << endl;
Timer tmv;
tmv.reset();
tmv.start();
tmap["rs_mul_add"].reset();
tmap["rs_mul_add"].start();
sd.rs_mul_add(ft,&rho[0],sdp);
tmv.stop();
cout << " rs_mul_add: CPU/Real: "
<< tmv.cpu() << " / " << tmv.real() << endl;
tmap["rs_mul_add"].stop();
//cout << sd;
#if 0
......@@ -186,12 +185,26 @@ int main(int argc, char **argv)
<< tm.cpu() << " / " << tm.real() << endl;
#endif
for ( TimerMap::iterator i = sd.tmap.begin(); i != sd.tmap.end(); i++ )
cout << ctxt.mype() << ": "
<< setw(15) << (*i).first
<< " : " << setprecision(3) << (*i).second.real() << endl;
ctxt.barrier();
} // Context scope
if ( MPIdata::onpe0() )
{
printf("\n");
printf(" %-22s %12s %12s\n","timer name","min","max");
printf(" %-22s %12s %12s\n","----------","---","---");
}
for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ )
{
double time = i->second.real();
double tmin, tmax;
MPI_Reduce(&time,&tmin,1,MPI_DOUBLE,MPI_MIN,0,MPIdata::comm());
MPI_Reduce(&time,&tmax,1,MPI_DOUBLE,MPI_MAX,0,MPIdata::comm());
if ( MPIdata::onpe0() )
{
string s = (*i).first;
printf(" %-22s %12.6f %12.6f\n",s.c_str(), tmin, tmax);
}
}
MPI_Finalize();
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment