FourierTransform.C 53.9 KB
Newer Older
Francois Gygi committed
1 2
////////////////////////////////////////////////////////////////////////////////
//
Francois Gygi committed
3 4 5 6
// Copyright (c) 2008 The Regents of the University of California
//
// This file is part of Qbox
//
Francois Gygi committed
7 8
// Qbox is distributed under the terms of the GNU General Public License
// as published by the Free Software Foundation, either version 2 of
Francois Gygi committed
9 10 11 12 13 14
// the License, or (at your option) any later version.
// See the file COPYING in the root directory of this distribution
// or <http://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////////////////////
//
Francois Gygi committed
15 16 17 18 19 20
// FourierTransform.C
//
////////////////////////////////////////////////////////////////////////////////

#include "FourierTransform.h"
#include "Basis.h"
21
#include "blas.h"
Francois Gygi committed
22 23 24 25 26 27

#include <complex>
#include <algorithm>
#include <map>
#include <cassert>

28 29
#if _OPENMP
#include <omp.h>
30 31 32 33 34
#else
// _OPENMP is not defined
#if defined(USE_FFTW3_THREADS)
#error "Need OpenMP to use FFTW3 threads"
#endif
35 36
#endif

Francois Gygi committed
37 38 39 40 41 42
#if USE_MPI
#include <mpi.h>
#else
typedef int MPI_Comm;
#endif

43
#if defined(USE_FFTW2) || defined(USE_FFTW3)
Francois Gygi committed
44 45
#ifdef ADD_
#define zdscal zdscal_
46 47 48 49 50 51 52 53 54 55
#define zcopy zcopy_
#endif
#endif

#if defined(USE_FFTW2)
#if defined(FFTWMEASURE)
#define FFTW_ALGO FFTW_MEASURE
#else
#define FFTW_ALGO FFTW_ESTIMATE
#endif
Francois Gygi committed
56
#endif
57

58 59 60 61 62 63 64 65 66
#if defined(USE_FFTW3)
#if defined(FFTWMEASURE)
#define FFTW_ALGO ( FFTW_MEASURE | FFTW_UNALIGNED )
#else
#define FFTW_ALGO ( FFTW_ESTIMATE | FFTW_UNALIGNED )
#endif
#endif

#if defined(USE_FFTW2) || defined(USE_FFTW3)
67
extern "C" void zdscal(int *n,double *alpha,std::complex<double> *x,int *incx);
68
#elif USE_ESSL_FFT
Francois Gygi committed
69
extern "C" {
70 71
  void dcft_(int *initflag, std::complex<double> *x, int *inc2x, int *inc3x,
             std::complex<double> *y, int *inc2y, int *inc3y,
Francois Gygi committed
72 73 74
             int *length, int *ntrans, int *isign,
             double *scale, double *aux1, int *naux1,
             double *aux2, int *naux2);
75 76
  void dcft2_(int *initflag, std::complex<double> *x, int *inc1x, int *inc2x,
             std::complex<double> *y, int *inc1y, int *inc2y,
Francois Gygi committed
77 78 79 80 81
             int *n1, int *n2, int *isign,
             double *scale, double *aux1, int *naux1,
             double *aux2, int *naux2);
#define USE_GATHER_SCATTER 1
}
82
#elif defined(FFT_NOLIB)
83
void cfftm ( std::complex<double> *ain, std::complex<double> *aout,
84
  double scale, int ntrans, int length, int ainc, int ajmp, int idir );
85 86
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
Francois Gygi committed
87 88 89 90 91
#endif

#if USE_GATHER_SCATTER
extern "C" {
  // zgthr: x(i) = y(indx(i))
92
  void zgthr_(int* n, std::complex<double>* y,
93
              std::complex<double>* x, int*indx);
Francois Gygi committed
94
  // zsctr: y(indx(i)) = x(i)
95
  void zsctr_(int* n, std::complex<double>* x, int* indx,
96
              std::complex<double>* y);
Francois Gygi committed
97 98 99
}
#endif

100 101
using namespace std;

Francois Gygi committed
102 103 104
////////////////////////////////////////////////////////////////////////////////
FourierTransform::~FourierTransform()
{
105
#if USE_FFTW2
Francois Gygi committed
106 107 108 109 110 111 112
  fftw_destroy_plan(fwplan0);
  fftw_destroy_plan(fwplan1);
  fftw_destroy_plan(fwplan2);
  fftw_destroy_plan(bwplan0);
  fftw_destroy_plan(bwplan1);
  fftw_destroy_plan(bwplan2);
#endif
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129

#if USE_FFTW3
#if USE_FFTW3_THREADS
  fftw_cleanup_threads();
#endif
#if defined(USE_FFTW3_2D) || defined(USE_FFTW3_THREADS)
  fftw_destroy_plan(fwplan2d);
  fftw_destroy_plan(bwplan2d);
#else
  fftw_destroy_plan(fwplanx);
  fftw_destroy_plan(bwplanx);
  fftw_destroy_plan(fwplany);
  fftw_destroy_plan(bwplany);
#endif
  fftw_destroy_plan(fwplan);
  fftw_destroy_plan(bwplan);
#endif
Francois Gygi committed
130 131 132 133
}

////////////////////////////////////////////////////////////////////////////////
FourierTransform::FourierTransform (const Basis &basis,
134
  int np0, int np1, int np2) : comm_(basis.comm()), basis_(basis),
Francois Gygi committed
135 136
  np0_(np0), np1_(np1), np2_(np2)
{
137 138
  MPI_Comm_size(comm_,&nprocs_);
  MPI_Comm_rank(comm_,&myproc_);
Francois Gygi committed
139 140 141 142 143

  np2_loc_.resize(nprocs_);
  np2_first_.resize(nprocs_);

  // Block-cyclic distribution for np2
144
  // Partition np2 into nprocs_ intervals and
Francois Gygi committed
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
  // store local sizes in np2_loc_[iproc]
  // Use same block distribution as in ScaLAPACK
  // Blocks 0,...,nprocs_-2 have size np2_block_size
  // Block nprocs_-1 may have a smaller size
  if ( np2_ % nprocs_ == 0 )
  {
    // all blocks have equal size
    const int np2_block_size = np2_ / nprocs_;
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
      np2_loc_[iproc] = np2_block_size;
  }
  else
  {
    // first k-1 blocks have same size, k_th block is smaller, others zero
    const int np2_block_size = np2_ / nprocs_ + 1;
    const int k = np2_ / np2_block_size;
    for ( int iproc = 0; iproc < k; iproc++ )
      np2_loc_[iproc] = np2_block_size;
    np2_loc_[k] = np2_ - k * np2_block_size;
    for ( int iproc = k+1; iproc < nprocs_; iproc++ )
      np2_loc_[iproc] = 0;
  }

  np2_first_[0] = 0;
  for ( int iproc = 1; iproc < nprocs_; iproc++ )
  {
    np2_first_[iproc] = np2_first_[iproc-1] + np2_loc_[iproc-1];
  }
173

Francois Gygi committed
174 175 176 177 178 179 180 181 182 183 184 185 186
  // number of local z vectors
  if ( basis_.real() )
  {
    if ( myproc_ == 0 )
      // rod(0,0) is mapped to only one z vector
      nvec_ = 2 * basis_.nrod_loc() - 1;
    else
      nvec_ = 2 * basis_.nrod_loc();
  }
  else
  {
    nvec_ = basis_.nrod_loc();
  }
187

188 189 190 191 192 193
  // compute number of transforms along the x,y and z directions
  // ntrans0_ is the number of transforms along x in one of the two blocks
  // of vectors corresponding to positive and negative y indices
  ntrans0_ = max(abs(basis_.idxmax(1)),abs(basis_.idxmin(1)))+1;
  ntrans1_ = np0_;
  ntrans2_ = nvec_;
194

Francois Gygi committed
195 196
  // resize array zvec holding columns
  zvec_.resize(nvec_ * np2_);
197

198 199 200
#if TIMING
  tm_init.start();
#endif
Francois Gygi committed
201 202
  // Initialize FT library auxiliary arrays
  init_lib();
203 204 205
#if TIMING
  tm_init.stop();
#endif
206

Francois Gygi committed
207 208
  // allocate send buffer
  sbuf.resize(nvec_ * np2_);
209

Francois Gygi committed
210 211 212 213 214
  // allocate receive buffer
  if ( basis_.real() )
    rbuf.resize((2 * basis_.nrods() - 1) * np2_loc_[myproc_]);
  else
    rbuf.resize(basis_.nrods() * np2_loc_[myproc_]);
215

Francois Gygi committed
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
  // compute send/receive counts and displacements in units of sizeof(double)

  scounts.resize(nprocs_);
  sdispl.resize(nprocs_);
  rcounts.resize(nprocs_);
  rdispl.resize(nprocs_);

  if ( basis_.real() )
  {
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
    {
      scounts[iproc] = 2 * nvec_ * np2_loc_[iproc];
      int nvec_iproc = iproc == 0 ? 2*basis_.nrod_loc(iproc)-1 :
                                2 * basis_.nrod_loc(iproc);
      rcounts[iproc] = 2 * nvec_iproc * np2_loc_[myproc_];
    }
  }
  else
  {
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
    {
      scounts[iproc] = 2 * nvec_ * np2_loc_[iproc];
      int nvec_iproc = basis_.nrod_loc(iproc);
      rcounts[iproc] = 2 * nvec_iproc * np2_loc_[myproc_];
    }
  }

  sdispl[0] = 0;
  rdispl[0] = 0;
  for ( int iproc = 1; iproc < nprocs_; iproc++ )
  {
    sdispl[iproc] = sdispl[iproc-1] + scounts[iproc-1];
    rdispl[iproc] = rdispl[iproc-1] + rcounts[iproc-1];
  }
250

Francois Gygi committed
251 252 253 254 255
  if ( basis_.real() )
  {
    // compute index arrays ifftp_ and ifftm_ for mapping vector->zvec
    ifftp_.resize(basis_.localsize());
    ifftm_.resize(basis_.localsize());
256

Francois Gygi committed
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
    if ( myproc_ == 0 )
    {
      // this process holds rod(0,0)
      // nvec_ == 2 * nrod_loc - 1

      // map rod(0,0)
      // the positive segment of rod(0,0) maps onto the first half of
      // the first column of zvec_, and the negative segment maps onto
      // the second half
      int ig = 0;
      ifftp_[0] = 0;
      ifftm_[0] = 0;
      ig++;
      for ( int l = 1; l < basis_.rod_size(0); l++ )
      {
        ifftp_[ig] = l;
        ifftm_[ig] = np2_ - l;
        ig++;
      }

      // map other rods(h,k) on pe 0, h!=0, k!=0
      // rod(h,k) maps onto column (2*irod-1)*np2_ of zvec_, irod=1,..,nrods-1
      // rod(-h,-k) maps onto column (2*irod)*np2_ of zvec_, irod=1,..,nrods-1
      for ( int irod = 1; irod < basis_.nrod_loc(); irod++ )
      {
        const int rodsize = basis_.rod_size(irod);
        for ( int i = 0; i < rodsize; i++ )
        {
          const int l = i + basis_.rod_lmin(irod);
          int izp =  l;
          int izm = -l;
          if ( izp < 0 ) izp += np2_;
          if ( izm < 0 ) izm += np2_;
          ifftp_[ig] = ( 2 * irod - 1 ) * np2_ + izp;
          ifftm_[ig] = ( 2 * irod ) * np2_ + izm;
          ig++;
        }
      }
      assert(ig == basis_.localsize());
    }
    else
    {
      // this process does not hold rod(0,0)
      // map rods(h,k)
      // rod(h,k)   maps onto column (2*irod)*np2_ of zvec_, irod=0,..,nrods-1
      // rod(-h,-k) maps onto column (2*irod+1)*np2_ of zvec_, irod=0,..,nrods-1
      int ig = 0;
      for ( int irod = 0; irod < basis_.nrod_loc(); irod++ )
      {
        const int rodsize = basis_.rod_size(irod);
        for ( int i = 0; i < rodsize; i++ )
        {
          const int l = i + basis_.rod_lmin(irod);
          int izp =  l;
          int izm = -l;
          if ( izp < 0 ) izp += np2_;
          if ( izm < 0 ) izm += np2_;
          ifftp_[ig] = ( 2 * irod ) * np2_ + izp;
          ifftm_[ig] = ( 2 * irod + 1 ) * np2_ + izm;
          ig++;
        }
      }
      assert(ig == basis_.localsize());
    }

    // compute ipack index array
    // used in packing zvec_ into sbuf
    // sbuf[ipack_[i]] = zvec_[i]
    ipack_.resize(nvec_*np2_);
    int idest = 0;
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
    {
      int isource = np2_first_[iproc];
      int sz = np2_loc_[iproc];
      for ( int ivec = 0; ivec < nvec_; ivec++ )
      {
        for ( int i = 0; i < sz; i++ )
        {
          ipack_[isource+i] = idest + i;
        }
        idest += sz;
        isource += np2_;
      }
    }
341

Francois Gygi committed
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
    // compute array iunpack
    // used in unpacking rbuf into val
    // val[iunpack[i]] = rbuf[i]

    // rbuf contains 2*_nrods-1 segments of size np2_loc[myproc]
    // the position of vector ivec in local rbuf[_nrods*np2_loc_] is
    // obtained from rod_h[iproc][irod], rod_k[irod][iproc]
    // compute iunpack[i], i = 0, .. , _nrods * np2_loc_
    iunpack_.resize((2*basis_.nrods()-1)*np2_loc_[myproc_]);

    // map rod(0,0)
    for ( int l = 0; l < np2_loc_[myproc_]; l++ )
    {
      iunpack_[l] = l * np0_ * np1_;
    }
    int isource_p = np2_loc_[myproc_];
    int isource_m = 2 * np2_loc_[myproc_];
359

Francois Gygi committed
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401
    // all rods of pe 0
    for ( int irod = 1; irod < basis_.nrod_loc(0); irod++ )
    {
      // map rod(h,k) and rod(-h,-k) columns of zvec_

      // map rod(h,k)
      // find position of rod(h,k) in the slab
      int hp = basis_.rod_h(0,irod);
      int kp = basis_.rod_k(0,irod);
      if ( hp < 0 ) hp += np0_;
      if ( kp < 0 ) kp += np1_;

      int hm = -hp;
      int km = -kp;
      if ( hm < 0 ) hm += np0_;
      if ( km < 0 ) km += np1_;

      for ( int l = 0; l < np2_loc_[myproc_]; l++ )
      {
        int idest_p = hp + np0_ * ( kp + np1_ * l );
        iunpack_[isource_p+l] = idest_p;

        int idest_m = hm + np0_ * ( km + np1_ * l );
        iunpack_[isource_m+l] = idest_m;
      }
      isource_p += 2 * np2_loc_[myproc_];
      isource_m += 2 * np2_loc_[myproc_];
    }

    // pe's above pe0
    for ( int iproc = 1; iproc < nprocs_; iproc++ )
    {
      for ( int irod = 0; irod < basis_.nrod_loc(iproc); irod++ )
      {
        // map rod(h,k) and rod(-h,-k) columns of zvec_

        // map rod(h,k)
        // find position of rod(h,k) in the slab
        int hp = basis_.rod_h(iproc,irod);
        int kp = basis_.rod_k(iproc,irod);
        if ( hp < 0 ) hp += np0_;
        if ( kp < 0 ) kp += np1_;
402

Francois Gygi committed
403 404 405 406
        int hm = -hp;
        int km = -kp;
        if ( hm < 0 ) hm += np0_;
        if ( km < 0 ) km += np1_;
407

Francois Gygi committed
408 409 410 411
        for ( int l = 0; l < np2_loc_[myproc_]; l++ )
        {
          int idest_p = hp + np0_ * ( kp + np1_ * l );
          iunpack_[isource_p+l] = idest_p;
412

Francois Gygi committed
413 414 415 416 417 418 419 420 421 422 423 424 425 426
          int idest_m = hm + np0_ * ( km + np1_ * l );
          iunpack_[isource_m+l] = idest_m;
        }
        isource_p += 2 * np2_loc_[myproc_];
        isource_m += 2 * np2_loc_[myproc_];
      }
    }
  }
  else
  {
    // basis is complex
    // compute index array ifftp_ for mapping vector->zvec
    // Note: ifftm_ is not used
    ifftp_.resize(basis_.localsize());
427

Francois Gygi committed
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
    // map rods(h,k)
    // rod(h,k)   maps onto column irod*np2_ of zvec_, irod=0,..,nrods-1
    int ig = 0;
    for ( int irod = 0; irod < basis_.nrod_loc(); irod++ )
    {
      const int rodsize = basis_.rod_size(irod);
      for ( int i = 0; i < rodsize; i++ )
      {
        const int l = i + basis_.rod_lmin(irod);
        int iz =  l;
        if ( iz < 0 ) iz += np2_;
        ifftp_[ig] = irod * np2_ + iz;
        ig++;
      }
    }
    assert(ig == basis_.localsize());

    // compute ipack index array
    // used in packing zvec_ into sbuf
    // sbuf[ipack_[i]] = zvec_[i]
    ipack_.resize(nvec_*np2_);
    int idest = 0;
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
    {
      int isource = np2_first_[iproc];
      int sz = np2_loc_[iproc];
      for ( int ivec = 0; ivec < nvec_; ivec++ )
      {
        for ( int i = 0; i < sz; i++ )
        {
          ipack_[isource+i] = idest + i;
        }
        idest += sz;
        isource += np2_;
      }
    }
464

Francois Gygi committed
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
    // compute array iunpack
    // used in unpacking rbuf into val
    // val[iunpack[i]] = rbuf[i]

    // rbuf contains _nrods segments of size np2_loc[mype]
    // the position of vector ivec in local rbuf[_nrods*np2_loc_] is
    // obtained from rod_h[iproc][irod], rod_k[irod][iproc]
    // compute iunpack[i], i = 0, .. , _nrods * np2_loc_
    iunpack_.resize(basis_.nrods()*np2_loc_[myproc_]);

    int isource = 0;
    for ( int iproc = 0; iproc < nprocs_; iproc++ )
    {
      for ( int irod = 0; irod < basis_.nrod_loc(iproc); irod++ )
      {
        // map rod(h,k)
        // find position of rod(h,k) in the slab
        int h = basis_.rod_h(iproc,irod);
        int k = basis_.rod_k(iproc,irod);
        if ( h < 0 ) h += np0_;
        if ( k < 0 ) k += np1_;
486

Francois Gygi committed
487 488 489 490
        for ( int l = 0; l < np2_loc_[myproc_]; l++ )
        {
          int idest = h + np0_ * ( k + np1_ * l );
          iunpack_[isource+l] = idest;
491

Francois Gygi committed
492 493 494 495 496
        }
        isource += np2_loc_[myproc_];
      }
    }
  }
497

Francois Gygi committed
498 499 500 501 502
  // for ( int ig = 0; ig < basis_.localsize(); ig++ )
  // {
  //   assert(ifftp_[ig] >= 0 && ifftp_[ig] < zvec_.size());
  //   assert(ifftm_[ig] >= 0 && ifftm_[ig] < zvec_.size());
  // }
503

Francois Gygi committed
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520
#if USE_GATHER_SCATTER
  // shift index array by one for fortran ZGTHR and ZSCTR calls
  for ( int i = 0; i < iunpack_.size(); i++ )
  {
    iunpack_[i]++;
  }
  for ( int i = 0; i < ipack_.size(); i++ )
  {
    ipack_[i]++;
  }
#endif
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::backward(const complex<double>* c, complex<double>* f)
{
#if TIMING
521
  tm_b_map.start();
Francois Gygi committed
522 523 524
#endif
  vector_to_zvec(c);
#if TIMING
525
  tm_b_map.stop();
Francois Gygi committed
526 527 528 529 530 531 532 533
#endif
  bwd(f);
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::forward(complex<double>* f, complex<double>* c)
{
  fwd(f);
534 535 536
#if TIMING
  tm_f_map.start();
#endif
Francois Gygi committed
537
  zvec_to_vector(c);
538 539 540
#if TIMING
  tm_f_map.stop();
#endif
Francois Gygi committed
541 542 543
}

////////////////////////////////////////////////////////////////////////////////
544
void FourierTransform::backward(const complex<double>* c1,
Francois Gygi committed
545 546 547
                               const complex<double>* c2,
                               complex<double>* f)
{
548 549 550
#if TIMING
  tm_b_map.start();
#endif
Francois Gygi committed
551
  doublevector_to_zvec(c1,c2);
552 553 554
#if TIMING
  tm_b_map.stop();
#endif
Francois Gygi committed
555 556 557 558 559 560 561 562
  bwd(f);
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::forward(complex<double>* f,
  complex<double>* c1, complex<double>* c2)
{
  fwd(f);
563 564 565
#if TIMING
  tm_f_map.start();
#endif
Francois Gygi committed
566
  zvec_to_doublevector(c1,c2);
567 568 569
#if TIMING
  tm_f_map.stop();
#endif
Francois Gygi committed
570
}
571

Francois Gygi committed
572 573 574 575 576 577 578
////////////////////////////////////////////////////////////////////////////////
void FourierTransform::bwd(complex<double>* val)
{
  // transform zvec along z, transpose and transform along x,y, store
  // result in val
  // The columns of zvec[nvec_ * np2_] contain the full vectors
  // to be transformed
579 580
  //
  // If the basis is real: Column (h,k) is followed by column (-h,-k),
Francois Gygi committed
581 582 583
  // except for (0,0)

#if TIMING
584
  tm_b_fft.start();
585
  tm_b_z.start();
Francois Gygi committed
586 587
#endif

588
#if USE_ESSL_FFT
Francois Gygi committed
589 590
  int inc1 = 1, inc2 = np2_, ntrans = nvec_, isign = -1, initflag = 0;
  double scale = 1.0;
591

Francois Gygi committed
592 593
  dcft_(&initflag,&zvec_[0],&inc1,&inc2,&zvec_[0],&inc1,&inc2,&np2_,&ntrans,
       &isign,&scale,&aux1zb[0],&naux1z,&aux2[0],&naux2);
594
#elif USE_FFTW2
595
   /*
Francois Gygi committed
596 597 598 599
    * void fftw(fftw_plan plan, int howmany,
    *    FFTW_COMPLEX *in, int istride, int idist,
    *    FFTW_COMPLEX *out, int ostride, int odist);
    */
600 601 602 603 604 605 606 607
#if _OPENMP
  #pragma omp parallel for
  for ( int i = 0; i < nvec_; i++ )
  {
    fftw_one(bwplan2,(FFTW_COMPLEX*)&zvec_[i*np2_],(FFTW_COMPLEX*)0);
  }
#else
  int ntrans = nvec_, inc1 = 1, inc2 = np2_;
Francois Gygi committed
608 609
  fftw(bwplan2,ntrans,(FFTW_COMPLEX*)&zvec_[0],inc1,inc2,
                      (FFTW_COMPLEX*)0,0,0);
610 611 612
#endif // _OPENMP

#elif USE_FFTW3 // USE_FFTW2
613

614 615 616
#if USE_FFTW3_THREADS
  fftw_execute_dft ( bwplan, (fftw_complex*)&zvec_[0],
                     (fftw_complex*)&zvec_[0]);
Francois Gygi committed
617
#else
618 619 620 621 622 623 624 625 626
  #pragma omp parallel for
  for ( int i = 0; i < nvec_; i++ )
  {
    fftw_execute_dft ( bwplan, (fftw_complex*)&zvec_[i*np2_],
                       (fftw_complex*)&zvec_[i*np2_]);
  }
#endif // USE_FFTW3_THREADS

#elif defined(FFT_NOLIB) // USE_FFTW3
627 628 629 630 631 632 633 634 635
  // No library
  /* Transform along z */
  int ntrans = nvec_;
  int length = np2_;
  int ainc   = 1;
  int ajmp   = np2_;
  double scale = 1.0;
  int idir = -1;
  cfftm ( &zvec_[0], &zvec_[0], scale, ntrans, length, ainc, ajmp, idir );
636 637 638
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
#endif // USE_FFTW3
639

Francois Gygi committed
640
#if TIMING
641 642
  tm_b_z.stop();
  tm_b_com.start();
643 644
  tm_b_fft.stop();
  tm_b_pack.start();
Francois Gygi committed
645
#endif
646

Francois Gygi committed
647 648 649 650 651 652 653 654 655 656 657 658 659 660
  // scatter zvec_ to sbuf for transpose
#if USE_GATHER_SCATTER
  // zsctr: y(indx(i)) = x(i)
  // void zsctr_(int* n, complex<double>* x, int* indx, complex<double>* y);
  {
    complex<double>* y = &sbuf[0];
    complex<double>* x = &zvec_[0];
    int n = zvec_.size();
    zsctr_(&n,x,&ipack_[0],y);
  }
#else
  const int zvec_size = zvec_.size();
  double* const ps = (double*) &sbuf[0];
  const double* const pz = (double*) &zvec_[0];
661
  #pragma omp parallel for
Francois Gygi committed
662 663 664 665 666 667 668 669 670 671
  for ( int i = 0; i < zvec_size; i++ )
  {
    // sbuf[ipack_[i]] = zvec_[i];
    const int ip = ipack_[i];
    const double a = pz[2*i];
    const double b = pz[2*i+1];
    ps[2*ip]   = a;
    ps[2*ip+1] = b;
  }
#endif
672

Francois Gygi committed
673
  // segments of z-vectors are now in sbuf
674

Francois Gygi committed
675
#if TIMING
676 677
  tm_b_pack.stop();
  tm_b_mpi.start();
Francois Gygi committed
678 679 680 681 682 683
#endif

  // transpose
#if USE_MPI
  int status = MPI_Alltoallv((double*)&sbuf[0],&scounts[0],&sdispl[0],
      MPI_DOUBLE,(double*)&rbuf[0],&rcounts[0],&rdispl[0],MPI_DOUBLE,
684
      comm_);
Francois Gygi committed
685 686 687
  if ( status != 0 )
  {
    cout << " FourierTransform: status = " << status << endl;
688
    MPI_Abort(MPI_COMM_WORLD,2);
Francois Gygi committed
689 690 691 692 693
  }
#else
  assert(sbuf.size()==rbuf.size());
  rbuf = sbuf;
#endif
694

Francois Gygi committed
695
#if TIMING
696 697
  tm_b_mpi.stop();
  tm_b_zero.start();
Francois Gygi committed
698 699 700 701 702
#endif

  // copy from rbuf to val
  // scatter index array iunpack
  {
703
    const int len = np012loc() * 2;
Francois Gygi committed
704
    double* const pv = (double*) &val[0];
705
    #pragma omp parallel for
Francois Gygi committed
706 707
    for ( int i = 0; i < len; i++ )
    {
708
      pv[i]   = 0.0;
Francois Gygi committed
709 710
    }
  }
711

Francois Gygi committed
712
#if TIMING
713 714
  tm_b_zero.stop();
  tm_b_unpack.start();
Francois Gygi committed
715 716 717 718 719 720 721 722 723 724 725 726 727 728 729
#endif

#if USE_GATHER_SCATTER
  // zsctr(n,x,indx,y): y(indx(i)) = x(i)
  {
    complex<double>* y = &val[0];
    complex<double>* x = &rbuf[0];
    int n = rbuf.size();
    zsctr_(&n,x,&iunpack_[0],y);
  }
#else
  {
    const int rbuf_size = rbuf.size();
    const double* const pr = (double*) &rbuf[0];
    double* const pv = (double*) &val[0];
730
    #pragma omp parallel for
Francois Gygi committed
731 732 733 734 735 736 737 738 739 740 741
    for ( int i = 0; i < rbuf_size; i++ )
    {
      // val[iunpack_[i]] = rbuf[i];
      const int iu = iunpack_[i];
      const double a = pr[2*i];
      const double b = pr[2*i+1];
      pv[2*iu]   = a;
      pv[2*iu+1] = b;
    }
  }
#endif
742

Francois Gygi committed
743
#if TIMING
744 745
  tm_b_unpack.stop();
  tm_b_fft.start();
746 747
  tm_b_com.stop();
  tm_b_xy.start();
Francois Gygi committed
748 749
#endif

750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811
#if USE_FFTW3
#if USE_FFTW3_THREADS
  fftw_execute_dft ( bwplan2d, (fftw_complex*)&val[0],
                     (fftw_complex*)&val[0] );
#elif USE_FFTW3_2D
  #pragma omp parallel for
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
    fftw_execute_dft ( bwplan2d, (fftw_complex*)&val[k*np0_*np1_],
                       (fftw_complex*)&val[k*np0_*np1_] );
#else // FFTW3_2D
  // fftw3 1d
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    int ibase = k * np0_ * np1_;
#if TIMING
    tm_b_x.start();
#endif
    #pragma omp parallel for
    for ( int i = 0; i < ntrans0_; i++ )
    {
      // Transform first block along x: positive y indices
      fftw_execute_dft ( bwplanx, (fftw_complex*)&val[ibase+i*np0_],
                         (fftw_complex*)&val[ibase+i*np0_]);
      // Transform second block along x: negative y indices
      fftw_execute_dft ( bwplanx,
                         (fftw_complex*)&val[ibase+(np1_-ntrans0_+i)*np0_],
                         (fftw_complex*)&val[ibase+(np1_-ntrans0_+i)*np0_]);
    }
#if TIMING
    tm_b_x.stop();
    tm_b_y.start();
#endif
#if FFTW_TRANSPOSE
    #pragma omp parallel
    {
      vector<complex<double> >t_trans(np1_);
      #pragma omp for
      for ( int i = 0; i < np0_; i++ )
      {
        int length = t_trans.size();
        int inc1 = 1, inc2 = np0_;
        zcopy(&length, &val[ibase+i], &inc2, &t_trans[0], &inc1);
        fftw_execute_dft ( bwplany, (fftw_complex*)&t_trans[0],
                           (fftw_complex*)&t_trans[0]);
        zcopy(&length, &t_trans[0], &inc1, &val[ibase+i], &inc2);
      }
    }
#else // FFTW_TRANSPOSE
    #pragma omp parallel for
    for ( int i = 0; i < np0_; i++ )
    {
      fftw_execute_dft ( bwplany, (fftw_complex*)&val[ibase+i],
                         (fftw_complex*)&val[ibase+i]);
    }
#endif // FFTW_TRANSPOSE
#if TIMING
    tm_b_y.stop();
#endif
  }
#endif // USE_FFTW3_2D

#elif USE_ESSL_FFT
Francois Gygi committed
812 813 814
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
815
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
816
#if USE_ESSL_2DFFT
Francois Gygi committed
817

818 819 820
    // use 2D FFT for x and y transforms
    int inc1, inc2, istart, isign = -1, initflag = 0;
    double scale = 1.0;
821

822 823 824 825 826
    // xy transform
    istart = k * np0_ * np1_;
    inc1 = 1; inc2 = np0_;
    dcft2_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
          &np0_,&np1_,&isign,&scale,&aux1xyb[0],&naux1xy,&aux2[0],&naux2);
Francois Gygi committed
827 828 829

#else

830
    // use multiple 1-d FFTs for x and y transforms
831

832 833 834 835 836 837 838 839 840 841 842
    int inc1, inc2, ntrans, istart, length, isign = -1, initflag = 0;
    double scale = 1.0;
    // transform only non-zero vectors along x
    // First block: positive y indices: [0,ntrans0_]
    ntrans = ntrans0_;
    inc1 = 1;
    inc2 = np0_;
    istart = k * np0_ * np1_;
    length = np0_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1xb[0],&naux1x,&aux2[0],&naux2);
843

844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867
    // Second block: negative y indices: [np1-ntrans0_,np1-1]
    inc1 = 1;
    inc2 = np0_;
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
    length = np0_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1xb[0],&naux1x,&aux2[0],&naux2);

    // transform along y for all values of x
    ntrans = np0_;
    inc1 = np0_;
    inc2 = 1;
    istart = k * np0_ * np1_;
    length = np1_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1yb[0],&naux1y,&aux2[0],&naux2);
#endif // USE_ESSL_2DFFT
  } // k

#elif USE_FFTW2
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
#if _OPENMP
  int ibase = k * np0_ * np1_;
  #pragma omp parallel for
  for ( int i = 0; i < ntrans0_; i++ )
  {
    //#pragma omp task
    {
      // Transform first block along x: positive y indices
      fftw_one(bwplan0,(FFTW_COMPLEX*)&val[ibase+i*np0_],(FFTW_COMPLEX*)0);
      //fftw(bwplan0,1,(FFTW_COMPLEX*)&val[ibase+i*np0_],1,np0_,
      //               (FFTW_COMPLEX*)0,0,0);
      // Transform second block along x: negative y indices
      fftw_one(bwplan0,(FFTW_COMPLEX*)&val[ibase+(np1_-ntrans0_+i)*np0_],
                       (FFTW_COMPLEX*)0);
      //fftw(bwplan0,1,(FFTW_COMPLEX*)&val[ibase+(np1_-ntrans0_+i)*np0_],1,np0_,
      //               (FFTW_COMPLEX*)0,0,0);
    }
  }

  //complex<double> *tmp1 = new complex<double>[np1_];
  #pragma omp parallel for
  for ( int i = 0; i < np0_; i++ )
  {
    {
      // transform along y for all values of x
      // copy data to local array
      int one=1;
      #if 0
      zcopy_(&np1_,&val[ibase+i],&np0_,tmp1,&one);
      fftw_one(bwplan1,(FFTW_COMPLEX*)tmp1,(FFTW_COMPLEX*)0);
      zcopy_(&np1_,tmp1,&one,&val[ibase+i],&np0_);
      #else
      fftw(bwplan1,1,(FFTW_COMPLEX*)&val[ibase+i],np0_,one,
                     (FFTW_COMPLEX*)0,0,0);
      #endif
    }
  }
  //delete [] tmp1;
906
#else // _OPENMP
907
    int inc1, inc2, istart;
Francois Gygi committed
908

909 910
    int ntrans = ntrans0_;
    // Transform first block along x: positive y indices
Francois Gygi committed
911 912
    inc1 = 1;
    inc2 = np0_;
913
    istart = k * np0_ * np1_;
Francois Gygi committed
914 915
    fftw(bwplan0,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
916
    // Transform second block along x: negative y indices
Francois Gygi committed
917 918
    inc1 = 1;
    inc2 = np0_;
919
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
Francois Gygi committed
920 921
    fftw(bwplan0,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
922

Francois Gygi committed
923 924 925 926
    // transform along y for all values of x
    ntrans = np0_;
    inc1 = np0_;
    inc2 = 1;
927
    istart = k * np0_ * np1_;
Francois Gygi committed
928 929
    fftw(bwplan1,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
930 931 932 933 934 935 936 937
#endif // _OPENMP
  } // k
#elif defined(FFT_NOLIB) // USE_FFTW2
  // No library
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
938
    // transform along x for non-zero elements
939 940
    // Transform first block along x: positive y indices
    int ntrans = ntrans0_;
941 942 943 944 945 946 947
    int istart = k * np0_ * np1_;
    int length = np0_;
    int ainc   = 1;
    int ajmp   = np0_;
    double scale = 1.0;
    int idir = -1;
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
948

949 950
    // Transform second block along x: negative y indices
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
951
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
952

953 954 955 956 957 958 959 960
    // transform along y for all values of x
    ntrans = np0_;
    istart = k * np0_ * np1_;
    length = np1_;
    ainc = np0_;
    ajmp = 1;
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
  } // for k
961 962 963
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
#endif
964

Francois Gygi committed
965
#if TIMING
966
  tm_b_xy.stop();
967
  tm_b_fft.stop();
Francois Gygi committed
968 969 970 971 972 973
#endif
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::fwd(complex<double>* val)
{
974 975
#if TIMING
  tm_f_fft.start();
976
  tm_f_xy.start();
977
#endif
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040

//fftw_execute_dft is thread safe
#if USE_FFTW3
#if USE_FFTW3_THREADS
  fftw_execute_dft ( fwplan2d, (fftw_complex*)&val[0],
                     (fftw_complex*)&val[0] );
#elif USE_FFTW3_2D // USE_FFTW3_2D
  #pragma omp parallel for
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
    fftw_execute_dft ( fwplan2d, (fftw_complex*)&val[k*np0_*np1_],
                       (fftw_complex*)&val[k*np0_*np1_] );
#else // USE_FFTW3_2D
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    const int ibase = k * np0_ * np1_;
#if TIMING
    tm_f_y.start();
#endif
#if FFTW_TRANSPOSE
    #pragma omp parallel
    {
      vector<complex<double> >t_trans(np1_);
      #pragma omp for
      for ( int i = 0; i < np0_; i++ )
      {
        int length = t_trans.size();
        int inc1 = 1, inc2 = np0_;
        zcopy(&length, &val[ibase+i], &inc2, &t_trans[0], &inc1);
        fftw_execute_dft ( fwplany, (fftw_complex*)&t_trans[0],
                         (fftw_complex*)&t_trans[0]);
        zcopy(&length, &t_trans[0], &inc1, &val[ibase+i], &inc2);
      }
    }
#else // FFTW_TRANSPOSE
    #pragma omp parallel for
    for ( int i = 0; i < np0_; i++ )
    {
      fftw_execute_dft ( fwplany, (fftw_complex*)&val[ibase+i],
                         (fftw_complex*)&val[ibase+i]);
    }
#endif // FFTW_TRANSPOSE
#if TIMING
    tm_f_y.stop();
    tm_f_x.start();
#endif
    #pragma omp parallel for
    for ( int i = 0; i < ntrans0_; i++ )
    {
      // Transform first block along x: positive y indices
      fftw_execute_dft ( fwplanx,(fftw_complex*)&val[ibase+i*np0_],
                         (fftw_complex*)&val[ibase+i*np0_]);

      // Transform second block along x: negative y indices
      fftw_execute_dft ( fwplanx,
                         (fftw_complex*)&val[ibase+(np1_-ntrans0_+i)*np0_],
                         (fftw_complex*)&val[ibase+(np1_-ntrans0_+i)*np0_]);
    }
#if TIMING
    tm_f_x.stop();
#endif
  }
#endif // USE_FFTW3_2D
#elif USE_ESSL_FFT
Francois Gygi committed
1041 1042 1043
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
1044
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
1045
#if USE_ESSL_2DFFT
Francois Gygi committed
1046 1047 1048 1049

    // use 2D FFT for x and y transforms
    int inc1, inc2, istart, isign = 1, initflag = 0;
    double scale = 1.0;
1050

Francois Gygi committed
1051 1052 1053 1054 1055 1056 1057 1058 1059
    // xy transform
    istart = k * np0_ * np1_;
    inc1 = 1; inc2 = np0_;
    dcft2_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
          &np0_,&np1_,&isign,&scale,&aux1xyf[0],&naux1xy,&aux2[0],&naux2);

#else

    // use multiple 1-d FFTs for x and y transforms
1060

Francois Gygi committed
1061 1062 1063 1064 1065 1066 1067 1068 1069 1070
    int inc1, inc2, ntrans, istart, length, isign = 1, initflag = 0;
    double scale = 1.0;
    // transform along y for all values of x
    ntrans = np0_;
    inc1 = np0_;
    inc2 = 1;
    istart = k * np0_ * np1_;
    length = np1_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1yf[0],&naux1y,&aux2[0],&naux2);
1071

Francois Gygi committed
1072
    // transform only non-zero vectors along x
1073
    ntrans = ntrans0_;
Francois Gygi committed
1074 1075 1076 1077 1078 1079
    inc1 = 1;
    inc2 = np0_;
    istart = k * np0_ * np1_;
    length = np0_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1xf[0],&naux1x,&aux2[0],&naux2);
1080

Francois Gygi committed
1081 1082
    inc1 = 1;
    inc2 = np0_;
1083
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
Francois Gygi committed
1084 1085 1086
    length = np0_;
    dcft_(&initflag,&val[istart],&inc1,&inc2,&val[istart],&inc1,&inc2,
         &length,&ntrans,&isign,&scale,&aux1xf[0],&naux1x,&aux2[0],&naux2);
1087 1088 1089 1090 1091 1092 1093
#endif // USE_ESSL_2DFFT
  } // k
#elif USE_FFTW2
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
1094 1095
#if _OPENMP
  int ibase = k * np0_ * np1_;
1096
  //complex<double> *tmp1 = new complex<double>[np1_];
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114
  #pragma omp parallel for
  for ( int i = 0; i < np0_; i++ )
  {
    //#pragma omp task
    {
      // transform along y for all values of x
      // copy data to local array
      int one=1;
      #if 0
      zcopy_(&np1_,&val[ibase+i],&np0_,tmp1,&one);
      fftw_one(fwplan1,(FFTW_COMPLEX*)tmp1,(FFTW_COMPLEX*)0);
      zcopy_(&np1_,tmp1,&one,&val[ibase+i],&np0_);
      #else
      fftw(fwplan1,1,(FFTW_COMPLEX*)&val[ibase+i],np0_,one,
                     (FFTW_COMPLEX*)0,0,0);
      #endif
    }
  }
1115
  //delete [] tmp1;
1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128

  #pragma omp parallel for
  for ( int i = 0; i < ntrans0_; i++ )
  {
    //#pragma omp task
    {
      // Transform first block along x: positive y indices
      fftw_one(fwplan0,(FFTW_COMPLEX*)&val[ibase+i*np0_],(FFTW_COMPLEX*)0);
      // Transform second block along x: negative y indices
      fftw_one(fwplan0,(FFTW_COMPLEX*)&val[ibase+(np1_-ntrans0_+i)*np0_],
                       (FFTW_COMPLEX*)0);
    }
  }
1129
#else // _OPENMP
1130
    int inc1, inc2, istart;
Francois Gygi committed
1131 1132

    // transform along y for all values of x
1133
    int ntrans = np0_;
Francois Gygi committed
1134 1135
    inc1 = np0_;
    inc2 = 1;
1136
    istart = k * np0_ * np1_;
Francois Gygi committed
1137 1138
    fftw(fwplan1,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
1139

1140 1141
    ntrans = ntrans0_;
    // Transform first block along x: positive y indices
Francois Gygi committed
1142 1143
    inc1 = 1;
    inc2 = np0_;
1144
    istart = k * np0_ * np1_;
Francois Gygi committed
1145 1146
    fftw(fwplan0,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
1147
    // Transform second block along x: negative y indices
Francois Gygi committed
1148 1149
    inc1 = 1;
    inc2 = np0_;
1150
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
Francois Gygi committed
1151 1152
    fftw(fwplan0,ntrans,(FFTW_COMPLEX*)&val[istart],inc1,inc2,
                        (FFTW_COMPLEX*)0,0,0);
1153 1154 1155 1156 1157 1158 1159 1160
#endif // _OPENMP
  } // k
#elif defined(FFT_NOLIB)
  // No library
  for ( int k = 0; k < np2_loc_[myproc_]; k++ )
  {
    // transform along x for non-zero vectors only
    // transform along x for y in [0,ntrans0_] and y in [np1-ntrans0_, np1-1]
1161 1162 1163 1164 1165 1166 1167 1168 1169
    // transform along y for all values of x
    int ntrans = np0_;
    int istart = k * np0_ * np1_;
    int length = np1_;
    int ainc = np0_;
    int ajmp = 1;
    double scale = 1.0;
    int idir = 1;
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
1170

1171
    // transform along x for non-zero elements
1172
    ntrans = ntrans0_;
1173 1174 1175 1176 1177
    istart = k * np0_ * np1_;
    length = np0_;
    ainc   = 1;
    ajmp   = np0_;
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
1178

1179
    istart = np0_ * ( (np1_-ntrans) + k * np1_ );
1180 1181
    cfftm (&val[istart],&val[istart],scale,ntrans,length,ainc,ajmp,idir );
  } // for k
1182 1183 1184
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
#endif
1185

1186
#if TIMING
1187 1188
  tm_f_xy.stop();
  tm_f_com.start();
1189 1190 1191
  tm_f_fft.stop();
  tm_f_pack.start();
#endif
Francois Gygi committed
1192

1193
  // gather val into rbuf
Francois Gygi committed
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206
#if USE_GATHER_SCATTER
  // zgthr: x(i) = y(indx(i))
  // void zgthr_(int* n, complex<double>* y, complex<double>* x, int*indx);
  {
    complex<double>* y = &val[0];
    complex<double>* x = &rbuf[0];
    int n = rbuf.size();
    zgthr_(&n,y,x,&iunpack_[0]);
  }
#else
  const int rbuf_size = rbuf.size();
  double* const pr = (double*) &rbuf[0];
  const double* const pv = (double*) &val[0];
1207
  #pragma omp parallel for
Francois Gygi committed
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
  for ( int i = 0; i < rbuf_size; i++ )
  {
    // rbuf[i] = val[iunpack_[i]];
    const int iu = iunpack_[i];
    const double a = pv[2*iu];
    const double b = pv[2*iu+1];
    pr[2*i]   = a;
    pr[2*i+1] = b;
  }
#endif
1218

Francois Gygi committed
1219
  // transpose
1220 1221 1222 1223
#if TIMING
  tm_f_pack.stop();
  tm_f_mpi.start();
#endif
Francois Gygi committed
1224 1225 1226
#if USE_MPI
  int status = MPI_Alltoallv((double*)&rbuf[0],&rcounts[0],&rdispl[0],
      MPI_DOUBLE,(double*)&sbuf[0],&scounts[0],&sdispl[0],MPI_DOUBLE,
1227
      comm_);
Francois Gygi committed
1228 1229 1230
  assert ( status == 0 );
#else
  assert(sbuf.size()==rbuf.size());
1231
  sbuf = rbuf;
Francois Gygi committed
1232
#endif
1233

Francois Gygi committed
1234 1235
  // segments of z-vectors are now in sbuf
  // gather sbuf into zvec_
1236 1237 1238 1239
#if TIMING
  tm_f_mpi.stop();
  tm_f_unpack.start();
#endif
1240

Francois Gygi committed
1241 1242 1243 1244 1245 1246 1247 1248 1249
#if USE_GATHER_SCATTER
  // zgthr: x(i) = y(indx(i))
  // void zgthr_(int* n, complex<double>* y, complex<double>* x, int*indx);
  {
    complex<double>* y = &sbuf[0];
    complex<double>* x = &zvec_[0];
    int n = zvec_.size();
    zgthr_(&n,y,x,&ipack_[0]);
  }
1250
#else // no gather scatter
Francois Gygi committed
1251 1252 1253
  const int zvec_size = zvec_.size();
  const double* const ps = (double*) &sbuf[0];
  double* const pz = (double*) &zvec_[0];
1254
  #pragma omp parallel for
Francois Gygi committed
1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266
  for ( int i = 0; i < zvec_size; i++ )
  {
    // zvec_[i] = sbuf[ipack_[i]];
    const int ip = ipack_[i];
    const double a = ps[2*ip];
    const double b = ps[2*ip+1];
    pz[2*i]   = a;
    pz[2*i+1] = b;
  }
#endif

  // transform along z
1267 1268 1269
#if TIMING
  tm_f_unpack.stop();
  tm_f_fft.start();
1270 1271
  tm_f_com.stop();
  tm_f_z.start();
1272
#endif
1273

1274
#if USE_ESSL_FFT
Francois Gygi committed
1275
  int inc1 = 1, inc2 = np2_, ntrans = nvec_, isign = 1, initflag = 0;
1276
  double scale = 1.0 / np012();
1277

Francois Gygi committed
1278 1279
  dcft_(&initflag,&zvec_[0],&inc1,&inc2,&zvec_[0],&inc1,&inc2,&np2_,&ntrans,
        &isign,&scale,&aux1zf[0],&naux1z,&aux2[0],&naux2);
1280

1281
#elif USE_FFTW2
1282
#if _OPENMP
1283
  const double fac = 1.0 / np012();
1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295
  #pragma omp parallel for
  for ( int i = 0; i < nvec_; i++ )
  {
    //#pragma omp task
    fftw_one(fwplan2,(FFTW_COMPLEX*)&zvec_[i*np2_],(FFTW_COMPLEX*)0);
    for ( int j = 0; j < np2_; j++ )
      zvec_[i*np2_+j] *= fac;
  }
  // int inc1=1;
  // zdscal(&len,&fac,&zvec_[0],&inc1);
#else

Francois Gygi committed
1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308
 /*
  * void fftw(fftw_plan plan, int howmany,
  *    FFTW_COMPLEX *in, int istride, int idist,
  *    FFTW_COMPLEX *out, int ostride, int odist);
  */
  int ntrans, inc1, inc2;

  ntrans = nvec_;
  inc1 = 1;
  inc2 = np2_;
  fftw(fwplan2,ntrans,(FFTW_COMPLEX*)&zvec_[0],inc1,inc2,
                      (FFTW_COMPLEX*)0,0,0);
  int len = zvec_.size();
1309
  double fac = 1.0 / np012();
Francois Gygi committed
1310
  zdscal(&len,&fac,&zvec_[0],&inc1);
1311
#endif
1312 1313 1314 1315 1316
#elif USE_FFTW3

#if USE_FFTW3_THREADS
  fftw_execute_dft ( fwplan, (fftw_complex*)&zvec_[0],
                    (fftw_complex*)&zvec_[0]);
Francois Gygi committed
1317
#else
1318 1319 1320 1321 1322 1323 1324 1325 1326
  // do np2_ same for D_USE_1D or not
  #pragma omp parallel for
  for ( int i = 0; i < nvec_; i++ )
  {
    fftw_execute_dft ( fwplan, (fftw_complex*)&zvec_[i*np2_],
                      (fftw_complex*)&zvec_[i*np2_]);
  }
#endif
  // scale
1327
  double fac = 1.0 / np012();
1328 1329 1330 1331
  int len = zvec_.size();
  int inc1 = 1;
  zdscal(&len,&fac,&zvec_[0],&inc1);
#elif defined(FFT_NOLIB)
1332 1333 1334 1335 1336 1337
  // No library
  /* Transform along z */
  int ntrans = nvec_;
  int length = np2_;
  int ainc   = 1;
  int ajmp   = np2_;
1338
  double scale = 1.0 / np012();
1339 1340
  int idir = 1;
  cfftm ( &zvec_[0], &zvec_[0], scale, ntrans, length, ainc, ajmp, idir );
1341 1342
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
Francois Gygi committed
1343
#endif
1344 1345

#if TIMING
1346
  tm_f_z.stop();
1347 1348
  tm_f_fft.stop();
#endif
Francois Gygi committed
1349 1350 1351 1352 1353 1354 1355
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::init_lib(void)
{
  // initialization of FFT libs

1356 1357
#if USE_ESSL_FFT
  complex<double> *p = 0;
1358
#if USE_ESSL_2DFFT
Francois Gygi committed
1359 1360 1361 1362 1363 1364 1365
  // use 2D FFT for x and y transforms and 1D FFT for z transforms
  naux1xy = 40000 + 2.28 * (np0_+np1_);
  aux1xyf.resize(naux1xy);
  aux1xyb.resize(naux1xy);
  int r = max(np0_,np1_);
  int s = min(64,min(np0_,np1_));
  naux2 = 20000 + (2*r+256)*(s+2.28);
1366

Francois Gygi committed
1367 1368 1369
  naux1z = 20000 + 2.28 * np2_;
  aux1zf.resize(naux1z);
  aux1zb.resize(naux1z);
1370

Francois Gygi committed
1371 1372 1373 1374
  int ntrans2 = nvec_;
  int naux2z = 20000 + 2.28 * np2_ + (256 + 2*np2_)*min(64,ntrans2);
  naux2 = max( naux2, naux2z );
  aux2.resize(naux2);
1375

Francois Gygi committed
1376
  double scale = 1.0;
1377

Francois Gygi committed
1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388
  // initialize xy transforms
  int initflag = 1, inc1, inc2, isign = -1;
  inc1 = 1; inc2 = np0_;
  dcft2_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np0_,&np1_,
         &isign,&scale,&aux1xyb[0],&naux1xy,&aux2[0],&naux2);
  isign = 1;
  dcft2_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np0_,&np1_,
         &isign,&scale,&aux1xyf[0],&naux1xy,&aux2[0],&naux2);

  // initialize z transforms
  int ntrans = nvec_;
1389
  inc1 = 1; inc2 = np2_;
Francois Gygi committed
1390 1391 1392
  isign = -1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np2_,&ntrans,
       &isign,&scale,&aux1zb[0],&naux1z,&aux2[0],&naux2);
1393
  isign = 1; scale = 1.0 / np012();
Francois Gygi committed
1394 1395
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np2_,&ntrans,
       &isign,&scale,&aux1zf[0],&naux1z,&aux2[0],&naux2);
1396
#else // USE_ESSL_2DFFT
1397

1398 1399 1400
  naux1x = (int) (20000 + 2.28 * np0_);
  naux1y = (int) (20000 + 2.28 * np1_);
  naux1z = (int) (20000 + 2.28 * np2_);
Francois Gygi committed
1401 1402 1403 1404 1405 1406 1407
  aux1xf.resize(naux1x);
  aux1yf.resize(naux1y);
  aux1zf.resize(naux1z);
  aux1xb.resize(naux1x);
  aux1yb.resize(naux1y);
  aux1zb.resize(naux1z);

Francois Gygi committed
1408
  int naux2x = (int) (20000 + 2.28 * np0_ + (256 + 2*np0_)*min(64,ntrans0_));
Francois Gygi committed
1409
  naux2 = naux2x;
Francois Gygi committed
1410
  int naux2y = (int) (20000 + 2.28 * np1_ + (256 + 2*np1_)*min(64,ntrans1_));
Francois Gygi committed
1411
  naux2 = max( naux2, naux2y );
Francois Gygi committed
1412
  int naux2z = (int) (20000 + 2.28 * np2_ + (256 + 2*np2_)*min(64,ntrans2_));
Francois Gygi committed
1413 1414
  naux2 = max( naux2, naux2z );
  aux2.resize(naux2);
1415

Francois Gygi committed
1416 1417 1418 1419
  // initialize x, y and z transforms

  int initflag = 1, inc1, inc2, ntrans, isign;
  double scale = 1.0;
1420

Francois Gygi committed
1421
  // x transforms
1422
  inc1 = 1; inc2 = np0_; ntrans = ntrans0_;
Francois Gygi committed
1423 1424 1425 1426 1427 1428
  isign = -1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np0_,&ntrans,
        &isign,&scale,&aux1xb[0],&naux1x,&aux2[0],&naux2);
  isign = 1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np0_,&ntrans,
        &isign,&scale,&aux1xf[0],&naux1x,&aux2[0],&naux2);
1429

Francois Gygi committed
1430
  // y transforms
1431
  inc1 = np0_; inc2 = 1; ntrans = ntrans1_;
Francois Gygi committed
1432 1433 1434 1435 1436 1437
  isign = -1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np1_,&ntrans,
        &isign,&scale,&aux1yb[0],&naux1y,&aux2[0],&naux2);
  isign = 1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np1_,&ntrans,
        &isign,&scale,&aux1yf[0],&naux1y,&aux2[0],&naux2);
1438

Francois Gygi committed
1439
  // z transforms
1440
  inc1 = 1; inc2 = np2_; ntrans = ntrans2_;
Francois Gygi committed
1441 1442 1443
  isign = -1;
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np2_,&ntrans,
        &isign,&scale,&aux1zb[0],&naux1z,&aux2[0],&naux2);
1444
  isign = 1; scale = 1.0 / np012();
Francois Gygi committed
1445 1446 1447
  dcft_(&initflag,p,&inc1,&inc2,p,&inc1,&inc2,&np2_,&ntrans,
        &isign,&scale,&aux1zf[0],&naux1z,&aux2[0],&naux2);

1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460
#endif // USE_ESSL_2DFFT

#elif USE_FFTW2

  fwplan0 = fftw_create_plan(np0_,FFTW_FORWARD,FFTW_ALGO|FFTW_IN_PLACE);
  fwplan1 = fftw_create_plan(np1_,FFTW_FORWARD,FFTW_ALGO|FFTW_IN_PLACE);
  fwplan2 = fftw_create_plan(np2_,FFTW_FORWARD,FFTW_ALGO|FFTW_IN_PLACE);
  bwplan0 = fftw_create_plan(np0_,FFTW_BACKWARD,FFTW_ALGO|FFTW_IN_PLACE);
  bwplan1 = fftw_create_plan(np1_,FFTW_BACKWARD,FFTW_ALGO|FFTW_IN_PLACE);
  bwplan2 = fftw_create_plan(np2_,FFTW_BACKWARD,FFTW_ALGO|FFTW_IN_PLACE);

#elif USE_FFTW3
  vector<complex<double> > aux(np0_*np1_);
1461 1462
#if defined(USE_FFTW3MKL) && !defined(USE_FFTW3_THREADS) && _OPENMP
  fftw3_mkl.number_of_user_threads = omp_get_max_threads();
Francois Gygi committed
1463 1464
#endif

1465 1466
#if USE_FFTW3_THREADS
  fftw_init_threads();
1467
  fftw_plan_with_nthreads(omp_get_max_threads());
1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560
  vector<complex<double> > aux1(np0_*np1_*np2_loc_[myproc_]);

  // xy
  int rank = 2;
  int n[] = {np1_,np0_};
  int howmany = np2_loc_[myproc_];
  //int howmany = 1;
  int idist = np0_*np1_, odist = np0_*np1_;
  int istride = 1, ostride = 1; /* array is contiguous in memory */
  int *inembed = n, *onembed = n;

  fwplan2d = fftw_plan_many_dft(rank, n, howmany, (fftw_complex*)&aux1[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&aux1[0], onembed,
                                    ostride, odist, -1, FFTW_ALGO);
  bwplan2d = fftw_plan_many_dft(rank, n, howmany, (fftw_complex*)&aux1[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&aux1[0], onembed,
                                    ostride, odist, 1, FFTW_ALGO);

  // z
  rank = 1;
  int nz[] = {np2_};
  howmany = nvec_;
  idist = np2_, odist = np2_;
  istride = 1, ostride = 1; /* array is contiguous in memory */
  inembed = nz, onembed = nz;

  fwplan = fftw_plan_many_dft(rank, nz, howmany, (fftw_complex*)&zvec_[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&zvec_[0], onembed,
                                    ostride, odist, -1, FFTW_ALGO);
  bwplan = fftw_plan_many_dft(rank, nz, howmany, (fftw_complex*)&zvec_[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&zvec_[0], onembed,
                                    ostride, odist, 1, FFTW_ALGO);


#else // USE_FFTW3_THREADS
#if USE_FFTW3_2D
  // row major in FFTW3 2d plans
  fwplan2d = fftw_plan_dft_2d ( np1_, np0_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), -1,
                                FFTW_ALGO );
  bwplan2d = fftw_plan_dft_2d ( np1_, np0_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), 1,
                                FFTW_ALGO );
#else // USE_FFTW3_2D
  // FFTW3 1D
  fwplanx = fftw_plan_dft_1d ( np0_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), -1,
                                FFTW_ALGO );
  bwplanx = fftw_plan_dft_1d ( np0_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), 1,
                                FFTW_ALGO );

#if FFTW_TRANSPOSE
  fwplany = fftw_plan_dft_1d ( np1_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), -1,
                                FFTW_ALGO );
  bwplany = fftw_plan_dft_1d ( np1_, (fftw_complex*)(&aux[0]),
                                (fftw_complex*)(&aux[0]), 1,
                                FFTW_ALGO );

#else // FFTW_TRANSPOSE
  // strided FFT
  int rank = 1;
  int n[] = {np1_};
  int howmany = 1;
  int idist = 1, odist = 1;
  int istride = np0_, ostride = np0_; /* array is contiguous in memory */
  int *inembed = n, *onembed = n;

  fwplany = fftw_plan_many_dft(rank, n, howmany, (fftw_complex*)&aux[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&aux[0], onembed,
                                    ostride, odist, -1, FFTW_ALGO);
  bwplany = fftw_plan_many_dft(rank, n, howmany, (fftw_complex*)&aux[0],
                                    inembed, istride, idist,
                                    (fftw_complex*)&aux[0], onembed,
                                    ostride, odist, 1, FFTW_ALGO);
#endif // FFTW_TRANSPOSE
#endif // USE_FFTW3_2D
  // do z using 1d plans
  fwplan = fftw_plan_dft_1d ( np2_, (fftw_complex*)(&zvec_[0]),
                                (fftw_complex*)(&zvec_[0]), -1,
                                FFTW_ALGO );
  bwplan = fftw_plan_dft_1d ( np2_, (fftw_complex*)(&zvec_[0]),
                                (fftw_complex*)(&zvec_[0]), 1,
                                FFTW_ALGO );
#endif //USE_FFTW3_THREADS

#elif FFT_NOLIB // USE_FFTW3
Francois Gygi committed
1561
  /* no library */
1562 1563
#else
#error "Must define USE_FFTW2, USE_FFTW3, USE_ESSL_FFT or FFT_NOLIB"
Francois Gygi committed
1564 1565 1566 1567 1568 1569 1570 1571 1572 1573
#endif

}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::vector_to_zvec(const complex<double> *c)
{
  const int ng = basis_.localsize();
  const int zvec_size = zvec_.size();
  double* const pz = (double*) &zvec_[0];
1574
  #pragma omp parallel for
Francois Gygi committed
1575 1576 1577 1578 1579 1580 1581
  for ( int i = 0; i < zvec_size; i++ )
  {
    pz[2*i]   = 0.0;
    pz[2*i+1] = 0.0;
  }
  const double* const pc = (double*) &c[0];
  if ( basis_.real() )
1582
  {
1583
    #pragma omp parallel for
Francois Gygi committed
1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598
    for ( int ig = 0; ig < ng; ig++ )
    {
      // zvec_[ifftp_[ig]] = c[ig];
      // zvec_[ifftm_[ig]] = conj(c[ig]);
      const double a = pc[2*ig];
      const double b = pc[2*ig+1];
      const int ip = ifftp_[ig];
      const int im = ifftm_[ig];
      pz[2*ip] = a;
      pz[2*ip+1] = b;
      pz[2*im] = a;
      pz[2*im+1] = -b;
    }
  }
  else
1599
    #pragma omp parallel for
Francois Gygi committed
1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615
    for ( int ig = 0; ig < ng; ig++ )
    {
      // zvec_[ifftp_[ig]] = c[ig];
      const double a = pc[2*ig];
      const double b = pc[2*ig+1];
      const int ip = ifftp_[ig];
      pz[2*ip] = a;
      pz[2*ip+1] = b;
    }
}
////////////////////////////////////////////////////////////////////////////////
void FourierTransform::zvec_to_vector(complex<double> *c)
{
  const int ng = basis_.localsize();
  const double* const pz = (double*) &zvec_[0];
  double* const pc = (double*) &c[0];
1616
  #pragma omp parallel for
Francois Gygi committed
1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635
  for ( int ig = 0; ig < ng; ig++ )
  {
    // c[ig] = zvec_[ifftp_[ig]];
    const int ip = ifftp_[ig];
    const double pz0 = pz[2*ip];
    const double pz1 = pz[2*ip+1];
    pc[2*ig]   = pz0;
    pc[2*ig+1] = pz1;
  }
}

////////////////////////////////////////////////////////////////////////////////
void FourierTransform::doublevector_to_zvec(const complex<double> *c1,
  const complex<double> *c2)
{
  // Mapping of two real functions onto zvec
  assert(basis_.real());
  const int zvec_size = zvec_.size();
  double* const pz = (double*) &zvec_[0];
1636
  #pragma omp parallel for
Francois Gygi committed
1637 1638 1639 1640 1641 1642 1643 1644
  for ( int i = 0; i < zvec_size; i++ )
  {
    pz[2*i] = 0.0;
    pz[2*i+1] = 0.0;
  }
  const int ng = basis_.localsize();
  const double* const pc1 = (double*) &c1[0];
  const double* const pc2 = (double*) &c2[0];
1645
  #pragma omp parallel for
Francois Gygi committed
1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667
  for ( int ig = 0; ig < ng; ig++ )
  {
    // const double a = c1[ig].real();
    // const double b = c1[ig].imag();
    // const double c = c2[ig].real();
    // const double d = c2[ig].imag();
    // zvec_[ip] = complex<double>(a-d, b+c);
    // zvec_[im] = complex<double>(a+d, c-b);
    const double a = pc1[2*ig];
    const double b = pc1[2*ig+1];
    const double c = pc2[2*ig];
    const double d = pc2[2*ig+1];
    const int ip = ifftp_[ig];
    const int im = ifftm_[ig];
    pz[2*ip]   = a - d;
    pz[2*ip+1] = b + c;
    pz[2*im]   = a + d;
    pz[2*im+1] = c - b;
  }
}

////////////////////////////////////////////////////////////////////////////////
1668
void FourierTransform::zvec_to_doublevector(complex<double> *c1,
Francois Gygi committed
1669 1670
  complex<double> *c2 )
{
1671
  // Mapping of zvec onto two real functions
Francois Gygi committed
1672 1673 1674 1675 1676
  assert(basis_.real());
  const int ng = basis_.localsize();
  const double* const pz = (double*) &zvec_[0];
  double* const pc1 = (double*) &c1[0];
  double* const pc2 = (double*) &c2[0];
1677
  #pragma omp parallel for
Francois Gygi committed
1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699
  for ( int ig = 0; ig < ng; ig++ )
  {
    // const double a = 0.5*zvec_[ip].real();
    // const double b = 0.5*zvec_[ip].imag();
    // const double c = 0.5*zvec_[im].real();
    // const double d = 0.5*zvec_[im].imag();
    // c1[ig] = complex<double>(a+c, b-d);
    // c2[ig] = complex<double>(b+d, c-a);