// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Stan Moore (SNL), Julien Tranchida (SNL)
------------------------------------------------------------------------- */

#include "pppm_dipole.h"

#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "fft3d_wrap.h"
#include "force.h"
#include "grid3d.h"
#include "math_const.h"
#include "math_special.h"
#include "memory.h"
#include "neighbor.h"
#include "pair.h"
#include "remap_wrap.h"
#include "update.h"

#include <cmath>
#include <cstring>

using namespace LAMMPS_NS;
using namespace MathConst;
using namespace MathSpecial;

static constexpr int MAXORDER = 7;
static constexpr int OFFSET = 16384;
static constexpr double SMALL = 0.00001;
static constexpr double EPS_HOC = 1.0e-7;
static constexpr FFT_SCALAR ZEROF = 0.0;

enum { REVERSE_MU };
enum { FORWARD_MU, FORWARD_MU_PERATOM };

/* ---------------------------------------------------------------------- */

PPPMDipole::PPPMDipole(LAMMPS *lmp) : PPPM(lmp),
  densityx_brick_dipole(nullptr), densityy_brick_dipole(nullptr),
  densityz_brick_dipole(nullptr),
  vdxx_brick_dipole(nullptr), vdyy_brick_dipole(nullptr), vdzz_brick_dipole(nullptr),
  vdxy_brick_dipole(nullptr), vdxz_brick_dipole(nullptr), vdyz_brick_dipole(nullptr),
  ux_brick_dipole(nullptr), uy_brick_dipole(nullptr), uz_brick_dipole(nullptr),
  v0x_brick_dipole(nullptr), v1x_brick_dipole(nullptr),
  v2x_brick_dipole(nullptr), v3x_brick_dipole(nullptr), v4x_brick_dipole(nullptr),
  v5x_brick_dipole(nullptr), v0y_brick_dipole(nullptr), v1y_brick_dipole(nullptr),
  v2y_brick_dipole(nullptr), v3y_brick_dipole(nullptr), v4y_brick_dipole(nullptr),
  v5y_brick_dipole(nullptr), v0z_brick_dipole(nullptr), v1z_brick_dipole(nullptr),
  v2z_brick_dipole(nullptr), v3z_brick_dipole(nullptr), v4z_brick_dipole(nullptr),
  v5z_brick_dipole(nullptr), work3(nullptr), work4(nullptr),
  densityx_fft_dipole(nullptr), densityy_fft_dipole(nullptr),
  densityz_fft_dipole(nullptr)
{
  dipoleflag = 1;
  group_group_enable = 0;

  gc_dipole = nullptr;
}

/* ----------------------------------------------------------------------
   free all memory
------------------------------------------------------------------------- */

PPPMDipole::~PPPMDipole()
{
  if (copymode) return;

  PPPMDipole::deallocate();
  if (peratom_allocate_flag) PPPMDipole::deallocate_peratom();
}

/* ----------------------------------------------------------------------
   called once before run
------------------------------------------------------------------------- */

void PPPMDipole::init()
{
  if (me == 0) utils::logmesg(lmp,"PPPMDipole initialization ...\n");

  // error check

  dipoleflag = atom->mu?1:0;
  qsum_qsq(0); // q[i] might not be declared ?

  if (dipoleflag && q2)
    error->all(FLERR,"Cannot (yet) use charges with Kspace style PPPMDipole");

  triclinic_check();

  if (triclinic != domain->triclinic)
    error->all(FLERR,"Must redefine kspace_style after changing to triclinic box");

  if (domain->dimension == 2)
    error->all(FLERR,"Cannot use PPPMDipole with 2d simulation");

  if (comm->style != Comm::BRICK)
    error->universe_all(FLERR,"PPPMDipole can only currently be used with comm_style brick");

  if (!atom->mu) error->all(FLERR,"Kspace style requires atom attribute mu");

  if (atom->mu && differentiation_flag == 1)
    error->all(FLERR,"Cannot (yet) use kspace_modify diff ad with dipoles");

  if (dipoleflag && strcmp(update->unit_style,"electron") == 0)
    error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");

  if (slabflag == 0 && domain->nonperiodic > 0)
    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
  if (slabflag) {
    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
  }

  if (order < 2 || order > MAXORDER)
    error->all(FLERR,"PPPMDipole order cannot be < 2 or > {}", MAXORDER);

  // compute two charge force

  two_charge();

  // extract short-range Coulombic cutoff from pair style

  triclinic = domain->triclinic;
  if (triclinic)
    error->all(FLERR,"Cannot yet use triclinic cells with PPPMDipole");

  pair_check();

  int itmp = 0;
  auto p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
  if (p_cutoff == nullptr)
    error->all(FLERR,"KSpace style is incompatible with Pair style");
  cutoff = *p_cutoff;

  // kspace TIP4P not yet supported
  // qdist = offset only for TIP4P fictitious charge

  qdist = 0.0;
  if (tip4pflag)
    error->all(FLERR,"Cannot yet use TIP4P with PPPMDipole");

  // compute musum & musqsum and warn if no dipoles

  scale = 1.0;
  qqrd2e = force->qqrd2e;
  musum_musq();
  natoms_original = atom->natoms;

  // set accuracy (force units) from accuracy_relative or accuracy_absolute

  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
  else accuracy = accuracy_relative * two_charge_force;

  // free all arrays previously allocated

  deallocate();
  if (peratom_allocate_flag) deallocate_peratom();

  // setup FFT grid resolution and g_ewald
  // normally one iteration thru while loop is all that is required
  // if grid stencil does not extend beyond neighbor proc
  //   or overlap is allowed, then done
  // else reduce order and try again

  gc_dipole = nullptr;
  int iteration = 0;

  while (order >= minorder) {
    if (iteration && me == 0)
      error->warning(FLERR,"Reducing PPPMDipole order b/c stencil extends "
                     "beyond nearest neighbor processor");

    compute_gf_denom();
    set_grid_global();
    set_grid_local();
    if (overlap_allowed) break;

    gc_dipole = new Grid3d(lmp,world,nx_pppm,ny_pppm,nz_pppm);
    gc_dipole->set_distance(0.5*neighbor->skin + qdist);
    gc_dipole->set_stencil_atom(-nlower,nupper);
    gc_dipole->set_shift_atom(shiftatom_lo,shiftatom_hi);
    gc_dipole->set_zfactor(slab_volfactor);

    gc_dipole->setup_grid(nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                          nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

    int tmp1,tmp2;
    gc_dipole->setup_comm(tmp1,tmp2);
    if (gc_dipole->ghost_adjacent()) break;
    delete gc_dipole;

    order--;
    iteration++;
  }

  if (order < minorder) error->all(FLERR,"PPPMDipole order < minimum allowed order");
  if (!overlap_allowed && !gc_dipole->ghost_adjacent())
    error->all(FLERR,"PPPMDipole grid stencil extends beyond nearest neighbor processor");
  if (gc_dipole) delete gc_dipole;

  // adjust g_ewald

  if (!gewaldflag) adjust_gewald();

  // calculate the final accuracy

  double estimated_accuracy = final_accuracy_dipole();

  // allocate K-space dependent memory
  // don't invoke allocate peratom(), will be allocated when needed

  allocate();

  // pre-compute Green's function denomiator expansion
  // pre-compute 1d charge distribution coefficients

  compute_gf_denom();
  compute_rho_coeff();

  // print stats

  int ngrid_max,nfft_both_max;
  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);

  if (me == 0) {
    std::string mesg = fmt::format("  G vector (1/distance) = {:.8g}\n",g_ewald);
    mesg += fmt::format("  grid = {} {} {}\n",nx_pppm,ny_pppm,nz_pppm);
    mesg += fmt::format("  stencil order = {}\n",order);
    mesg += fmt::format("  estimated absolute RMS force accuracy = {:.8g}\n",
                       estimated_accuracy);
    mesg += fmt::format("  estimated relative force accuracy = {:.8g}\n",
                       estimated_accuracy/two_charge_force);
    mesg += "  using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n";
    mesg += fmt::format("  3d grid and FFT values/proc = {} {}\n",
                       ngrid_max,nfft_both_max);
    utils::logmesg(lmp,mesg);
  }
}

/* ----------------------------------------------------------------------
   adjust PPPMDipole coeffs, called initially and whenever volume has changed
------------------------------------------------------------------------- */

void PPPMDipole::setup()
{
  // perform some checks to avoid illegal boundaries with read_data

  if (slabflag == 0 && domain->nonperiodic > 0)
    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
  if (slabflag) {
    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
  }

  int i,j,k,n;
  double *prd;

  // volume-dependent factors
  // adjust z dimension for 2d slab PPPMDipole
  // z dimension for 3d PPPMDipole is zprd since slab_volfactor = 1.0

  prd = domain->prd;
  double xprd = prd[0];
  double yprd = prd[1];
  double zprd = prd[2];
  double zprd_slab = zprd*slab_volfactor;
  volume = xprd * yprd * zprd_slab;

  delxinv = nx_pppm/xprd;
  delyinv = ny_pppm/yprd;
  delzinv = nz_pppm/zprd_slab;

  delvolinv = delxinv*delyinv*delzinv;

  double unitkx = (MY_2PI/xprd);
  double unitky = (MY_2PI/yprd);
  double unitkz = (MY_2PI/zprd_slab);

  // fkx,fky,fkz for my FFT grid pts

  double per;

  for (i = nxlo_fft; i <= nxhi_fft; i++) {
    per = i - nx_pppm*(2*i/nx_pppm);
    fkx[i] = unitkx*per;
  }

  for (i = nylo_fft; i <= nyhi_fft; i++) {
    per = i - ny_pppm*(2*i/ny_pppm);
    fky[i] = unitky*per;
  }

  for (i = nzlo_fft; i <= nzhi_fft; i++) {
    per = i - nz_pppm*(2*i/nz_pppm);
    fkz[i] = unitkz*per;
  }

  // virial coefficients

  double sqk,vterm;

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++) {
    for (j = nylo_fft; j <= nyhi_fft; j++) {
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
        if (sqk == 0.0) {
          vg[n][0] = 0.0;
          vg[n][1] = 0.0;
          vg[n][2] = 0.0;
          vg[n][3] = 0.0;
          vg[n][4] = 0.0;
          vg[n][5] = 0.0;
        } else {
          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
          vg[n][3] = vterm*fkx[i]*fky[j];
          vg[n][4] = vterm*fkx[i]*fkz[k];
          vg[n][5] = vterm*fky[j]*fkz[k];
        }
        n++;
      }
    }
  }

  compute_gf_dipole();
}

/* ----------------------------------------------------------------------
   reset local grid arrays and communication stencils
   called by fix balance b/c it changed sizes of processor sub-domains
------------------------------------------------------------------------- */

void PPPMDipole::reset_grid()
{
  // free all arrays previously allocated

  deallocate();
  if (peratom_allocate_flag) deallocate_peratom();

  // reset portion of global grid that each proc owns

  set_grid_local();

  // reallocate K-space dependent memory
  // check if grid communication is now overlapping if not allowed
  // don't invoke allocate peratom(), will be allocated when needed

  allocate();

  if (!overlap_allowed && !gc_dipole->ghost_adjacent())
    error->all(FLERR,"PPPMDipole grid stencil extends beyond nearest neighbor processor");

  // pre-compute Green's function denomiator expansion
  // pre-compute 1d charge distribution coefficients

  compute_gf_denom();
  compute_rho_coeff();

  // pre-compute volume-dependent coeffs

  setup();
}

/* ----------------------------------------------------------------------
   compute the PPPMDipole long-range force, energy, virial
------------------------------------------------------------------------- */

void PPPMDipole::compute(int eflag, int vflag)
{
  int i,j;

  // set energy/virial flags
  // invoke allocate_peratom() if needed for first time

  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = evflag_atom = eflag_global = vflag_global =
         eflag_atom = vflag_atom = 0;

  if (vflag_atom)
    error->all(FLERR,"Cannot (yet) compute per-atom virial "
                       "with kspace style pppm/dipole");

  if (evflag_atom && !peratom_allocate_flag) allocate_peratom();

  // if atom count has changed, update qsum and qsqsum

  if (atom->natoms != natoms_original) {
    musum_musq();
    natoms_original = atom->natoms;
  }

  // return if there are no dipoles

  if (musqsum == 0.0) return;

  // convert atoms from box to lamda coords

  boxlo = domain->boxlo;

  // extend size of per-atom arrays if necessary

  if (atom->nmax > nmax) {
    memory->destroy(part2grid);
    nmax = atom->nmax;
    memory->create(part2grid,nmax,3,"pppm_dipole:part2grid");
  }

  // find grid points for all my particles
  // map my particle charge onto my local 3d density grid

  particle_map();
  make_rho_dipole();

  // all procs communicate density values from their ghost cells
  //   to fully sum contribution in their 3d bricks
  // remap from 3d decomposition to FFT decomposition

  gc_dipole->reverse_comm(Grid3d::KSPACE,this,REVERSE_MU,3,sizeof(FFT_SCALAR),
                          gc_buf1,gc_buf2,MPI_FFT_SCALAR);
  brick2fft_dipole();

  // compute potential gradient on my FFT grid and
  //   portion of e_long on this proc's FFT grid
  // return gradients (electric fields) in 3d brick decomposition
  // also performs per-atom calculations via poisson_peratom()

  poisson_ik_dipole();

  // all procs communicate E-field values
  // to fill ghost cells surrounding their 3d bricks

  gc_dipole->forward_comm(Grid3d::KSPACE,this,FORWARD_MU,9,sizeof(FFT_SCALAR),
                          gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  // extra per-atom energy/virial communication

  if (evflag_atom)
    gc_dipole->forward_comm(Grid3d::KSPACE,this,FORWARD_MU_PERATOM,18,sizeof(FFT_SCALAR),
                            gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  // calculate the force on my particles

  fieldforce_ik_dipole();

  // extra per-atom energy/virial communication

  if (evflag_atom) fieldforce_peratom_dipole();

  // sum global energy across procs and add in volume-dependent term

  const double qscale = qqrd2e * scale;
  const double g3 = g_ewald*g_ewald*g_ewald;

  if (eflag_global) {
    double energy_all;
    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
    energy = energy_all;

    energy *= 0.5*volume;
    energy -= musqsum*2.0*g3/3.0/MY_PIS;
    energy *= qscale;
  }

  // sum global virial across procs

  if (vflag_global) {
    double virial_all[6];
    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
  }

  // per-atom energy/virial
  // energy includes self-energy correction

  if (evflag_atom) {
    double **mu = atom->mu;
    int nlocal = atom->nlocal;

    if (eflag_atom) {
      for (i = 0; i < nlocal; i++) {
        eatom[i] *= 0.5;
        eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] +
                     mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS;
        eatom[i] *= qscale;
      }
    }

    if (vflag_atom) {
      for (i = 0; i < nlocal; i++)
        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
    }
  }

  // 2d slab correction

  if (slabflag == 1) slabcorr();
}

/* ----------------------------------------------------------------------
   allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMDipole::allocate()
{
  // create ghost grid object for rho and electric field communication
  // returns local owned and ghost grid bounds
  // setup communication patterns and buffers

  gc_dipole = new Grid3d(lmp,world,nx_pppm,ny_pppm,nz_pppm);
  gc_dipole->set_distance(0.5*neighbor->skin + qdist);
  gc_dipole->set_stencil_atom(-nlower,nupper);
  gc_dipole->set_shift_atom(shiftatom_lo,shiftatom_hi);
  gc_dipole->set_zfactor(slab_volfactor);

  gc_dipole->setup_grid(nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                        nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

  gc_dipole->setup_comm(ngc_buf1,ngc_buf2);

  npergrid = 9;

  memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
  memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");

  // tally local grid sizes
  // ngrid = count of owned+ghost grid cells on this proc
  // nfft_brick = FFT points in 3d brick-decomposition on this proc
  //              same as count of owned grid cells
  // nfft = FFT points in x-pencil FFT decomposition on this proc
  // nfft_both = greater of nfft and nfft_brick

  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
    (nzhi_out-nzlo_out+1);

  nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
    (nzhi_in-nzlo_in+1);

  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
    (nzhi_fft-nzlo_fft+1);

  nfft_both = MAX(nfft,nfft_brick);

  // allocate distributed grid data

  memory->create3d_offset(densityx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:densityx_brick_dipole");
  memory->create3d_offset(densityy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:densityy_brick_dipole");
  memory->create3d_offset(densityz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:densityz_brick_dipole");

  memory->create(densityx_fft_dipole,nfft_both,"pppm_dipole:densityy_fft_dipole");
  memory->create(densityy_fft_dipole,nfft_both,"pppm_dipole:densityy_fft_dipole");
  memory->create(densityz_fft_dipole,nfft_both,"pppm_dipole:densityz_fft_dipole");

  memory->create(greensfn,nfft_both,"pppm_dipole:greensfn");
  memory->create(work1,2*nfft_both,"pppm_dipole:work1");
  memory->create(work2,2*nfft_both,"pppm_dipole:work2");
  memory->create(work3,2*nfft_both,"pppm_dipole:work3");
  memory->create(work4,2*nfft_both,"pppm_dipole:work4");
  memory->create(vg,nfft_both,6,"pppm_dipole:vg");

  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm_dipole:fkx");
  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm_dipole:fky");
  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm_dipole:fkz");

  memory->create3d_offset(ux_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:ux_brick_dipole");
  memory->create3d_offset(uy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:uy_brick_dipole");
  memory->create3d_offset(uz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:uz_brick_dipole");

  memory->create3d_offset(vdxx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdxx_brick_dipole");
  memory->create3d_offset(vdxy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdxy_brick_dipole");
  memory->create3d_offset(vdyy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdyy_brick_dipole");
  memory->create3d_offset(vdxz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdxz_brick_dipole");
  memory->create3d_offset(vdyz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdyz_brick_dipole");
  memory->create3d_offset(vdzz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:vdzz_brick_dipole");

  // summation coeffs

  order_allocated = order;
  if (!gf_b) memory->create(gf_b,order,"pppm_dipole:gf_b");
  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm_dipole:rho1d");
  memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm_dipole:drho1d");
  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm_dipole:rho_coeff");
  memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
                          "pppm_dipole:drho_coeff");

  // create 2 FFTs and a Remap
  // 1st FFT keeps data in FFT decomposition
  // 2nd FFT returns data in 3d brick decomposition
  // remap takes data from 3d brick to FFT decomposition

  int tmp;

  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   0,0,&tmp,collective_flag);

  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                   0,0,&tmp,collective_flag);

  remap = new Remap(lmp,world,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    1,0,0,FFT_PRECISION,collective_flag);
}

/* ----------------------------------------------------------------------
   deallocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMDipole::deallocate()
{
  delete gc_dipole;
  memory->destroy(gc_buf1);
  memory->destroy(gc_buf2);
  gc_buf1 = gc_buf2 = nullptr;

  memory->destroy3d_offset(densityx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(densityy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(densityz_brick_dipole,nzlo_out,nylo_out,nxlo_out);

  memory->destroy3d_offset(ux_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(uy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(uz_brick_dipole,nzlo_out,nylo_out,nxlo_out);

  memory->destroy3d_offset(vdxx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdxy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdyy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdxz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdyz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdzz_brick_dipole,nzlo_out,nylo_out,nxlo_out);

  memory->destroy(densityx_fft_dipole);
  memory->destroy(densityy_fft_dipole);
  memory->destroy(densityz_fft_dipole);

  memory->destroy(work3);
  memory->destroy(work4);

  delete fft1;
  delete fft2;
  delete remap;

  fft1 = fft2 = nullptr;
  remap = nullptr;
}

/* ----------------------------------------------------------------------
   allocate per-atom memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMDipole::allocate_peratom()
{
  peratom_allocate_flag = 1;

  memory->create3d_offset(v0x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v0x_brick_dipole");
  memory->create3d_offset(v1x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v1x_brick_dipole");
  memory->create3d_offset(v2x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v2x_brick_dipole");
  memory->create3d_offset(v3x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v3x_brick_dipole");
  memory->create3d_offset(v4x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v4x_brick_dipole");
  memory->create3d_offset(v5x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v5x_brick_dipole");

  memory->create3d_offset(v0y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v0y_brick_dipole");
  memory->create3d_offset(v1y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v1y_brick_dipole");
  memory->create3d_offset(v2y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v2y_brick_dipole");
  memory->create3d_offset(v3y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v3y_brick_dipole");
  memory->create3d_offset(v4y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v4y_brick_dipole");
  memory->create3d_offset(v5y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v5y_brick_dipole");

  memory->create3d_offset(v0z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v0z_brick_dipole");
  memory->create3d_offset(v1z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v1z_brick_dipole");
  memory->create3d_offset(v2z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v2z_brick_dipole");
  memory->create3d_offset(v3z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v3z_brick_dipole");
  memory->create3d_offset(v4z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v4z_brick_dipole");
  memory->create3d_offset(v5z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm_dipole:v5z_brick_dipole");

  // use same GC ghost grid object for peratom grid communication
  // but need to reallocate a larger gc_buf1 and gc_buf2

  npergrid = 18;

  memory->destroy(gc_buf1);
  memory->destroy(gc_buf2);
  memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1");
  memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2");
}

/* ----------------------------------------------------------------------
   deallocate per-atom memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMDipole::deallocate_peratom()
{
  peratom_allocate_flag = 0;

  memory->destroy3d_offset(v0x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v1x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v2x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v3x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v4x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v5x_brick_dipole,nzlo_out,nylo_out,nxlo_out);

  memory->destroy3d_offset(v0y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v1y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v2y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v3y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v4y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v5y_brick_dipole,nzlo_out,nylo_out,nxlo_out);

  memory->destroy3d_offset(v0z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v1z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v2z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v3z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v4z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(v5z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
}

/* ----------------------------------------------------------------------
   set global size of PPPMDipole grid = nx,ny,nz_pppm
   used for charge accumulation, FFTs, and electric field interpolation
------------------------------------------------------------------------- */

void PPPMDipole::set_grid_global()
{
  // use xprd,yprd,zprd
  // adjust z dimension for 2d slab PPPMDipole
  // 3d PPPMDipole just uses zprd since slab_volfactor = 1.0

  double xprd = domain->xprd;
  double yprd = domain->yprd;
  double zprd = domain->zprd;
  double zprd_slab = zprd*slab_volfactor;

  // make initial g_ewald estimate
  // based on desired accuracy and real space cutoff
  // fluid-occupied volume used to estimate real-space error
  // zprd used rather than zprd_slab

  double h;
  bigint natoms = atom->natoms;

  if (!gewaldflag) {
    if (accuracy <= 0.0)
      error->all(FLERR,"KSpace accuracy must be > 0");
    if (mu2 == 0.0)
     error->all(FLERR,"Must use kspace_modify gewald for systems with no dipoles");
    g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
    double g_ewald_new =
      find_gewald_dipole(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
    else error->warning(FLERR,"PPPMDipole dipole Newton solver failed, "
                        "using old method to estimate g_ewald");
  }

  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy
  // nz_pppm uses extended zprd_slab instead of zprd
  // reduce it until accuracy target is met

  if (!gridflag) {

    h = h_x = h_y = h_z = 4.0/g_ewald;
    int count = 0;
    while (true) {

      // set grid dimension

      nx_pppm = static_cast<int> (xprd/h_x);
      ny_pppm = static_cast<int> (yprd/h_y);
      nz_pppm = static_cast<int> (zprd_slab/h_z);

      if (nx_pppm <= 1) nx_pppm = 2;
      if (ny_pppm <= 1) ny_pppm = 2;
      if (nz_pppm <= 1) nz_pppm = 2;

      // set local grid dimension

      int npey_fft,npez_fft;
      if (nz_pppm >= nprocs) {
        npey_fft = 1;
        npez_fft = nprocs;
      } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);

      int me_y = me % npey_fft;
      int me_z = me / npey_fft;

      nxlo_fft = 0;
      nxhi_fft = nx_pppm - 1;
      nylo_fft = me_y*ny_pppm/npey_fft;
      nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
      nzlo_fft = me_z*nz_pppm/npez_fft;
      nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;

      double df_kspace = compute_df_kspace_dipole();

      count++;

      // break loop if the accuracy has been reached or
      // too many loops have been performed

      if (df_kspace <= accuracy) break;
      if (count > 500) error->all(FLERR, "Could not compute grid size");
      h *= 0.95;
      h_x = h_y = h_z = h;
    }
  }

  // boost grid size until it is factorable

  while (!factorable(nx_pppm)) nx_pppm++;
  while (!factorable(ny_pppm)) ny_pppm++;
  while (!factorable(nz_pppm)) nz_pppm++;

  h_x = xprd/nx_pppm;
  h_y = yprd/ny_pppm;
  h_z = zprd_slab/nz_pppm;

  if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
    error->all(FLERR,"PPPMDipole grid is too large");
}


/* ----------------------------------------------------------------------
   compute estimated kspace force error for dipoles
------------------------------------------------------------------------- */

double PPPMDipole::compute_df_kspace_dipole()
{
  double xprd = domain->xprd;
  double yprd = domain->yprd;
  double zprd = domain->zprd;
  double zprd_slab = zprd*slab_volfactor;
  bigint natoms = atom->natoms;
  double qopt = compute_qopt_dipole();
  double df_kspace = sqrt(qopt/natoms)*mu2/(3.0*xprd*yprd*zprd_slab);
  return df_kspace;
}

/* ----------------------------------------------------------------------
   compute qopt for dipoles with ik differentiation
------------------------------------------------------------------------- */

double PPPMDipole::compute_qopt_dipole()
{
  double qopt = 0.0;
  const double * const prd = domain->prd;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  double snx,sny,snz;
  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
  double sum1,sum2,dot1,dot2;
  double denominator;
  double u1,sqk;

  int k,l,m,nx,ny,nz,kper,lper,mper;

  const int nbx = 2;
  const int nby = 2;
  const int nbz = 2;

  const int twoorder = 2*order;

  for (m = nzlo_fft; m <= nzhi_fft; m++) {
    mper = m - nz_pppm*(2*m/nz_pppm);
    snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));

    for (l = nylo_fft; l <= nyhi_fft; l++) {
      lper = l - ny_pppm*(2*l/ny_pppm);
      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));

      for (k = nxlo_fft; k <= nxhi_fft; k++) {
        kper = k - nx_pppm*(2*k/nx_pppm);
        snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));

        sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);

        if (sqk != 0.0) {
          denominator = gf_denom(snx,sny,snz);
          sum1 = 0.0;
          sum2 = 0.0;

          for (nx = -nbx; nx <= nbx; nx++) {
            qx = unitkx*(kper+nx_pppm*nx);
            sx = exp(-0.25*square(qx/g_ewald));
            argx = 0.5*qx*xprd/nx_pppm;
            wx = powsinxx(argx,twoorder);

            for (ny = -nby; ny <= nby; ny++) {
              qy = unitky*(lper+ny_pppm*ny);
              sy = exp(-0.25*square(qy/g_ewald));
              argy = 0.5*qy*yprd/ny_pppm;
              wy = powsinxx(argy,twoorder);

              for (nz = -nbz; nz <= nbz; nz++) {
                qz = unitkz*(mper+nz_pppm*nz);
                sz = exp(-0.25*square(qz/g_ewald));
                argz = 0.5*qz*zprd_slab/nz_pppm;
                wz = powsinxx(argz,twoorder);

                dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
                dot2 = qx*qx + qy*qy + qz*qz;
                //dot1 = dot1*dot1*dot1; // power of 3 for dipole forces
                //dot2 = dot2*dot2*dot2;
                u1 = sx*sy*sz;
                const double w2 = wx*wy*wz;
                const double phi = u1*MY_4PI/dot2;
                const double top = dot1*dot1*dot1*w2*phi;
                sum1 += phi*phi*dot2*dot2*dot2;
                sum2 += top*top/sqk/sqk/sqk;
              }
            }
          }
          qopt += sum1 - sum2/denominator;
        }
      }
    }
  }
  double qopt_all;
  MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
  return qopt_all;
}

/* ----------------------------------------------------------------------
   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
------------------------------------------------------------------------- */

void PPPMDipole::compute_gf_dipole()
{
  const double * const prd = domain->prd;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  double snx,sny,snz;
  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
  double sum1,dot1,dot2;
  double denominator;
  double sqk;

  int k,l,m,n,nx,ny,nz,kper,lper,mper;

  int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                              pow(-log(EPS_HOC),0.25));
  int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                              pow(-log(EPS_HOC),0.25));
  int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                              pow(-log(EPS_HOC),0.25));
  nbx = MAX(nbx,2);
  nby = MAX(nby,2);
  nbz = MAX(nbz,2);
  const int twoorder = 2*order;

  n = 0;
  for (m = nzlo_fft; m <= nzhi_fft; m++) {
    mper = m - nz_pppm*(2*m/nz_pppm);
    snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));

    for (l = nylo_fft; l <= nyhi_fft; l++) {
      lper = l - ny_pppm*(2*l/ny_pppm);
      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));

      for (k = nxlo_fft; k <= nxhi_fft; k++) {
        kper = k - nx_pppm*(2*k/nx_pppm);
        snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));

        sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);

        if (sqk != 0.0) {
          denominator = gf_denom(snx,sny,snz);
          sum1 = 0.0;

          for (nx = -nbx; nx <= nbx; nx++) {
            qx = unitkx*(kper+nx_pppm*nx);
            sx = exp(-0.25*square(qx/g_ewald));
            argx = 0.5*qx*xprd/nx_pppm;
            wx = powsinxx(argx,twoorder);

            for (ny = -nby; ny <= nby; ny++) {
              qy = unitky*(lper+ny_pppm*ny);
              sy = exp(-0.25*square(qy/g_ewald));
              argy = 0.5*qy*yprd/ny_pppm;
              wy = powsinxx(argy,twoorder);

              for (nz = -nbz; nz <= nbz; nz++) {
                qz = unitkz*(mper+nz_pppm*nz);
                sz = exp(-0.25*square(qz/g_ewald));
                argz = 0.5*qz*zprd_slab/nz_pppm;
                wz = powsinxx(argz,twoorder);

                dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
                dot2 = qx*qx + qy*qy + qz*qz;
                const double u1 = sx*sy*sz;
                const double w2 = wx*wy*wz;
                const double phi = u1*MY_4PI/dot2;
                sum1 += dot1*dot1*dot1*w2*phi/sqk/sqk/sqk;
              }
            }
          }
          greensfn[n++] = sum1/denominator;
        } else greensfn[n++] = 0.0;
      }
    }
  }
}

/* ----------------------------------------------------------------------
   calculate f(x) for use in Newton-Raphson solver
------------------------------------------------------------------------- */

double PPPMDipole::newton_raphson_f()
{
  double xprd = domain->xprd;
  double yprd = domain->yprd;
  double zprd = domain->zprd;
  bigint natoms = atom->natoms;

  double df_rspace,df_kspace;
  double vol = xprd*yprd*zprd;
  double a = cutoff*g_ewald;
  double rg2 = a*a;
  double rg4 = rg2*rg2;
  double rg6 = rg4*rg2;
  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
  df_rspace = (mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
      sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) * exp(-rg2));
  df_kspace = compute_df_kspace_dipole();

  return df_rspace - df_kspace;
}

/* ----------------------------------------------------------------------
   find g_ewald parameter for dipoles based on desired accuracy
   using a Newton-Raphson solver
------------------------------------------------------------------------- */

double PPPMDipole::find_gewald_dipole(double x, double Rc,
                              bigint natoms, double vol, double b2)
{
  double dx,tol;
  int maxit;

  maxit = 10000; //Maximum number of iterations
  tol = 0.00001; //Convergence tolerance

  //Begin algorithm

  for (int i = 0; i < maxit; i++) {
    dx = newton_raphson_f_dipole(x,Rc,natoms,vol,b2) / derivf_dipole(x,Rc,natoms,vol,b2);
    x = x - dx; //Update x
    if (fabs(dx) < tol) return x;
    if (x < 0 || x != x) // solver failed
      return -1;
  }
  return -1;
}

/* ----------------------------------------------------------------------
   calculate f(x) objective function for dipoles
 ------------------------------------------------------------------------- */

double PPPMDipole::newton_raphson_f_dipole(double x, double Rc, bigint
natoms, double vol, double b2)
{
  double a = Rc*x;
  double rg2 = a*a;
  double rg4 = rg2*rg2;
  double rg6 = rg4*rg2;
  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
  double f = (b2/(sqrt(vol*powint(x,4)*powint(Rc,9)*natoms)) *
    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
    exp(-rg2)) - accuracy;

  return f;
}

/* ----------------------------------------------------------------------
   calculate numerical derivative f'(x) of objective function for dipoles
 ------------------------------------------------------------------------- */

double PPPMDipole::derivf_dipole(double x, double Rc,
                         bigint natoms, double vol, double b2)
{
  double h = 0.000001;  //Derivative step-size
  return (newton_raphson_f_dipole(x + h,Rc,natoms,vol,b2) - newton_raphson_f_dipole(x,Rc,natoms,vol,b2)) / h;
}

/* ----------------------------------------------------------------------
   calculate the final estimate of the accuracy
------------------------------------------------------------------------- */

double PPPMDipole::final_accuracy_dipole()
{
  double xprd = domain->xprd;
  double yprd = domain->yprd;
  double zprd = domain->zprd;
  double vol = xprd*yprd*zprd;
  bigint natoms = atom->natoms;
  if (natoms == 0) natoms = 1; // avoid division by zero

  double df_kspace = compute_df_kspace_dipole();

  double a = cutoff*g_ewald;
  double rg2 = a*a;
  double rg4 = rg2*rg2;
  double rg6 = rg4*rg2;
  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
  double df_rspace = (mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
    exp(-rg2));

  double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace);

  return estimated_accuracy;
}

/* ----------------------------------------------------------------------
   pre-compute Green's function denominator expansion coeffs, Gamma(2n)
------------------------------------------------------------------------- */

void PPPMDipole::compute_gf_denom()
{
  if (gf_b) memory->destroy(gf_b);
  memory->create(gf_b,order,"pppm_dipole:gf_b");

  int k,l,m;

  for (l = 1; l < order; l++) gf_b[l] = 0.0;
  gf_b[0] = 1.0;

  for (m = 1; m < order; m++) {
    for (l = m; l > 0; l--)
      gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1));
    gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5));
  }

  bigint ifact = 1;
  for (k = 1; k < 2*order; k++) ifact *= k;
  double gaminv = 1.0/ifact;
  for (l = 0; l < order; l++) gf_b[l] *= gaminv;
}

/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

void PPPMDipole::make_rho_dipole()
{
  int l,m,n,nx,ny,nz,mx,my,mz;
  FFT_SCALAR dx,dy,dz;
  FFT_SCALAR x0,y0,z0;
  FFT_SCALAR x1,y1,z1;
  FFT_SCALAR x2,y2,z2;

  // clear 3d density array

  memset(&(densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
         ngrid*sizeof(FFT_SCALAR));
  memset(&(densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
         ngrid*sizeof(FFT_SCALAR));
  memset(&(densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
         ngrid*sizeof(FFT_SCALAR));

  // loop over my charges, add their contribution to nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  double **mu = atom->mu;
  double **x = atom->x;
  int nlocal = atom->nlocal;

  for (int i = 0; i < nlocal; i++) {

    nx = part2grid[i][0];
    ny = part2grid[i][1];
    nz = part2grid[i][2];
    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

    compute_rho1d(dx,dy,dz);

    z0 = delvolinv * mu[i][0];
    z1 = delvolinv * mu[i][1];
    z2 = delvolinv * mu[i][2];
    for (n = nlower; n <= nupper; n++) {
      mz = n+nz;
      y0 = z0*rho1d[2][n];
      y1 = z1*rho1d[2][n];
      y2 = z2*rho1d[2][n];
      for (m = nlower; m <= nupper; m++) {
        my = m+ny;
        x0 = y0*rho1d[1][m];
        x1 = y1*rho1d[1][m];
        x2 = y2*rho1d[1][m];
        for (l = nlower; l <= nupper; l++) {
          mx = l+nx;
          densityx_brick_dipole[mz][my][mx] += x0*rho1d[0][l];
          densityy_brick_dipole[mz][my][mx] += x1*rho1d[0][l];
          densityz_brick_dipole[mz][my][mx] += x2*rho1d[0][l];
        }
      }
    }
  }
}

/* ----------------------------------------------------------------------
   remap density from 3d brick decomposition to FFT decomposition
------------------------------------------------------------------------- */

void PPPMDipole::brick2fft_dipole()
{
  int n,ix,iy,iz;

  // copy grabs inner portion of density from 3d brick
  // remap could be done as pre-stage of FFT,
  //   but this works optimally on only double values, not complex values

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        densityx_fft_dipole[n] = densityx_brick_dipole[iz][iy][ix];
        densityy_fft_dipole[n] = densityy_brick_dipole[iz][iy][ix];
        densityz_fft_dipole[n] = densityz_brick_dipole[iz][iy][ix];
        n++;
      }

  remap->perform(densityx_fft_dipole,densityx_fft_dipole,work1);
  remap->perform(densityy_fft_dipole,densityy_fft_dipole,work1);
  remap->perform(densityz_fft_dipole,densityz_fft_dipole,work1);
}

/* ----------------------------------------------------------------------
   FFT-based Poisson solver for ik
------------------------------------------------------------------------- */

void PPPMDipole::poisson_ik_dipole()
{
  int i,j,k,n,ii;
  double eng;
  double wreal,wimg;

  // transform dipole density (r -> k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n] = densityx_fft_dipole[i];
    work1[n+1] = ZEROF;
    work2[n] = densityy_fft_dipole[i];
    work2[n+1] = ZEROF;
    work3[n] = densityz_fft_dipole[i];
    work3[n+1] = ZEROF;
    n += 2;
  }

  fft1->compute(work1,work1,FFT3d::FORWARD);
  fft1->compute(work2,work2,FFT3d::FORWARD);
  fft1->compute(work3,work3,FFT3d::FORWARD);

  // global energy and virial contribution

  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
  double scaleinv = 1.0/ngridtotal;
  double s2 = scaleinv*scaleinv;

  if (eflag_global || vflag_global) {
    if (vflag_global) {
      n = 0;
      ii = 0;
      for (k = nzlo_fft; k <= nzhi_fft; k++)
        for (j = nylo_fft; j <= nyhi_fft; j++)
          for (i = nxlo_fft; i <= nxhi_fft; i++) {
            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
            eng = s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
            for (int jj = 0; jj < 6; jj++) virial[jj] += eng*vg[ii][jj];
            virial[0] += 2.0*s2*greensfn[ii]*fkx[i]*(work1[n]*wreal + work1[n+1]*wimg);
            virial[1] += 2.0*s2*greensfn[ii]*fky[j]*(work2[n]*wreal + work2[n+1]*wimg);
            virial[2] += 2.0*s2*greensfn[ii]*fkz[k]*(work3[n]*wreal + work3[n+1]*wimg);
            virial[3] += 2.0*s2*greensfn[ii]*fky[j]*(work1[n]*wreal + work1[n+1]*wimg);
            virial[4] += 2.0*s2*greensfn[ii]*fkz[k]*(work1[n]*wreal + work1[n+1]*wimg);
            virial[5] += 2.0*s2*greensfn[ii]*fkz[k]*(work2[n]*wreal + work2[n+1]*wimg);
            if (eflag_global) energy += eng;
            ii++;
            n += 2;
          }
    } else {
      n = 0;
      ii = 0;
      for (k = nzlo_fft; k <= nzhi_fft; k++)
        for (j = nylo_fft; j <= nyhi_fft; j++)
          for (i = nxlo_fft; i <= nxhi_fft; i++) {
            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
            energy +=
            s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
            ii++;
            n += 2;
          }
    }
  }

  // scale by 1/total-grid-pts to get rho(k)
  // multiply by Green's function to get V(k)

  n = 0;
  for (i = 0; i < nfft; i++) {
    work1[n]   *= scaleinv * greensfn[i];
    work1[n+1] *= scaleinv * greensfn[i];
    work2[n]   *= scaleinv * greensfn[i];
    work2[n+1] *= scaleinv * greensfn[i];
    work3[n]   *= scaleinv * greensfn[i];
    work3[n+1] *= scaleinv * greensfn[i];
    n += 2;
  }

  // extra FFTs for per-atom energy/virial

  if (vflag_atom) poisson_peratom_dipole();

  // compute electric potential
  // FFT leaves data in 3d brick decomposition

  // Ex

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
        work4[n+1] = fkx[i]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        ux_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Ey

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
        work4[n+1] = fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        uy_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Ez

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
        work4[n+1] = fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        uz_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vxx

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fkx[i]*fkx[i]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fkx[i]*fkx[i]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdxx_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vyy

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fky[j]*fky[j]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fky[j]*fky[j]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdyy_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vzz

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fkz[k]*fkz[k]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fkz[k]*fkz[k]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdzz_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vxy

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fkx[i]*fky[j]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fkx[i]*fky[j]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdxy_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vxz

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fkx[i]*fkz[k]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fkx[i]*fkz[k]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdxz_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // Vyz

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = -fky[j]*fkz[k]*(work1[n+1]*fkx[i] +
                                   work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
        work4[n+1] = fky[j]*fkz[k]*(work1[n]*fkx[i] +
                                    work2[n]*fky[j] + work3[n]*fkz[k]);
        n += 2;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        vdyz_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }
}

/* ----------------------------------------------------------------------
   FFT-based Poisson solver for per-atom energy/virial
------------------------------------------------------------------------- */

void PPPMDipole::poisson_peratom_dipole()
{
  int i,ii,j,k,n;

  // 18 components of virial in v0 thru v5

  if (!vflag_atom) return;

  // V0x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
        work4[n+1] = fkx[i]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v0x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V0y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
        work4[n+1] = fky[j]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v0y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V0z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
        work4[n+1] = fkz[k]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v0z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V1x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
        work4[n+1] = fkx[i]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v1x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V1y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
        work4[n+1] = fky[j]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v1y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V1z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
        work4[n+1] = fkz[k]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v1z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V2x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
        work4[n+1] = fkx[i]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v2x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V2y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
        work4[n+1] = fky[j]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v2y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V2z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
        work4[n+1] = fkz[k]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v2z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V3x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
        work4[n+1] = fkx[i]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v3x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V3y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
        work4[n+1] = fky[j]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v3y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V3z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
        work4[n+1] = fkz[k]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v3z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V4x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
        work4[n+1] = fkx[i]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v4x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V4y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
        work4[n+1] = fky[j]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v4y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V4z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
        work4[n+1] = fkz[k]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v4z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V5x

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkx[i]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
        work4[n+1] = fkx[i]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v5x_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V5y

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fky[j]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
        work4[n+1] = fky[j]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v5y_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }

  // V5z

  n = 0;
  ii = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++)
    for (j = nylo_fft; j <= nyhi_fft; j++)
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        work4[n] = fkz[k]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] +
                                      work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
        work4[n+1] = fkz[k]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] +
                                        work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
        n += 2;
        ii++;
      }

  fft2->compute(work4,work4,FFT3d::BACKWARD);

  n = 0;
  for (k = nzlo_in; k <= nzhi_in; k++)
    for (j = nylo_in; j <= nyhi_in; j++)
      for (i = nxlo_in; i <= nxhi_in; i++) {
        v5z_brick_dipole[k][j][i] = work4[n];
        n += 2;
      }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */

void PPPMDipole::fieldforce_ik_dipole()
{
  int i,l,m,n,nx,ny,nz,mx,my,mz;
  FFT_SCALAR dx,dy,dz;
  FFT_SCALAR x0,y0,z0;
  FFT_SCALAR ex,ey,ez;
  FFT_SCALAR vxx,vyy,vzz,vxy,vxz,vyz;

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  double **mu = atom->mu;
  double **x = atom->x;
  double **f = atom->f;
  double **t = atom->torque;

  int nlocal = atom->nlocal;

  for (i = 0; i < nlocal; i++) {
    nx = part2grid[i][0];
    ny = part2grid[i][1];
    nz = part2grid[i][2];
    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

    compute_rho1d(dx,dy,dz);

    ex = ey = ez = ZEROF;
    vxx = vyy = vzz = vxy = vxz = vyz = ZEROF;
    for (n = nlower; n <= nupper; n++) {
      mz = n+nz;
      z0 = rho1d[2][n];
      for (m = nlower; m <= nupper; m++) {
        my = m+ny;
        y0 = z0*rho1d[1][m];
        for (l = nlower; l <= nupper; l++) {
          mx = l+nx;
          x0 = y0*rho1d[0][l];
          ex -= x0*ux_brick_dipole[mz][my][mx];
          ey -= x0*uy_brick_dipole[mz][my][mx];
          ez -= x0*uz_brick_dipole[mz][my][mx];
          vxx -= x0*vdxx_brick_dipole[mz][my][mx];
          vyy -= x0*vdyy_brick_dipole[mz][my][mx];
          vzz -= x0*vdzz_brick_dipole[mz][my][mx];
          vxy -= x0*vdxy_brick_dipole[mz][my][mx];
          vxz -= x0*vdxz_brick_dipole[mz][my][mx];
          vyz -= x0*vdyz_brick_dipole[mz][my][mx];
        }
      }
    }

    // convert E-field to torque

    const double mufactor = qqrd2e * scale;
    f[i][0] += mufactor*(vxx*mu[i][0] + vxy*mu[i][1] + vxz*mu[i][2]);
    f[i][1] += mufactor*(vxy*mu[i][0] + vyy*mu[i][1] + vyz*mu[i][2]);
    f[i][2] += mufactor*(vxz*mu[i][0] + vyz*mu[i][1] + vzz*mu[i][2]);

    t[i][0] += mufactor*(mu[i][1]*ez - mu[i][2]*ey);
    t[i][1] += mufactor*(mu[i][2]*ex - mu[i][0]*ez);
    t[i][2] += mufactor*(mu[i][0]*ey - mu[i][1]*ex);
  }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get per-atom energy/virial
------------------------------------------------------------------------- */

void PPPMDipole::fieldforce_peratom_dipole()
{
  int i,l,m,n,nx,ny,nz,mx,my,mz;
  FFT_SCALAR dx,dy,dz,x0,y0,z0;
  FFT_SCALAR ux,uy,uz;
  FFT_SCALAR v0x,v1x,v2x,v3x,v4x,v5x;
  FFT_SCALAR v0y,v1y,v2y,v3y,v4y,v5y;
  FFT_SCALAR v0z,v1z,v2z,v3z,v4z,v5z;

  // loop over my charges, interpolate from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  double **mu = atom->mu;
  double **x = atom->x;

  int nlocal = atom->nlocal;

  for (i = 0; i < nlocal; i++) {
    nx = part2grid[i][0];
    ny = part2grid[i][1];
    nz = part2grid[i][2];
    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

    compute_rho1d(dx,dy,dz);

    ux = uy = uz = ZEROF;
    v0x = v1x = v2x = v3x = v4x = v5x = ZEROF;
    v0y = v1y = v2y = v3y = v4y = v5y = ZEROF;
    v0z = v1z = v2z = v3z = v4z = v5z = ZEROF;
    for (n = nlower; n <= nupper; n++) {
      mz = n+nz;
      z0 = rho1d[2][n];
      for (m = nlower; m <= nupper; m++) {
        my = m+ny;
        y0 = z0*rho1d[1][m];
        for (l = nlower; l <= nupper; l++) {
          mx = l+nx;
          x0 = y0*rho1d[0][l];
          if (eflag_atom) {
            ux += x0*ux_brick_dipole[mz][my][mx];
            uy += x0*uy_brick_dipole[mz][my][mx];
            uz += x0*uz_brick_dipole[mz][my][mx];
          }
          if (vflag_atom) {
            v0x += x0*v0x_brick_dipole[mz][my][mx];
            v1x += x0*v1x_brick_dipole[mz][my][mx];
            v2x += x0*v2x_brick_dipole[mz][my][mx];
            v3x += x0*v3x_brick_dipole[mz][my][mx];
            v4x += x0*v4x_brick_dipole[mz][my][mx];
            v5x += x0*v5x_brick_dipole[mz][my][mx];
            v0y += x0*v0y_brick_dipole[mz][my][mx];
            v1y += x0*v1y_brick_dipole[mz][my][mx];
            v2y += x0*v2y_brick_dipole[mz][my][mx];
            v3y += x0*v3y_brick_dipole[mz][my][mx];
            v4y += x0*v4y_brick_dipole[mz][my][mx];
            v5y += x0*v5y_brick_dipole[mz][my][mx];
            v0z += x0*v0z_brick_dipole[mz][my][mx];
            v1z += x0*v1z_brick_dipole[mz][my][mx];
            v2z += x0*v2z_brick_dipole[mz][my][mx];
            v3z += x0*v3z_brick_dipole[mz][my][mx];
            v4z += x0*v4z_brick_dipole[mz][my][mx];
            v5z += x0*v5z_brick_dipole[mz][my][mx];
          }
        }
      }
    }

    if (eflag_atom) eatom[i] += mu[i][0]*ux + mu[i][1]*uy + mu[i][2]*uz;
    if (vflag_atom) {
      vatom[i][0] += mu[i][0]*v0x + mu[i][1]*v0y + mu[i][2]*v0z;
      vatom[i][1] += mu[i][0]*v1x + mu[i][1]*v1y + mu[i][2]*v1z;
      vatom[i][2] += mu[i][0]*v2x + mu[i][1]*v2y + mu[i][2]*v2z;
      vatom[i][3] += mu[i][0]*v3x + mu[i][1]*v3y + mu[i][2]*v3z;
      vatom[i][4] += mu[i][0]*v4x + mu[i][1]*v4y + mu[i][2]*v4z;
      vatom[i][5] += mu[i][0]*v5x + mu[i][1]*v5y + mu[i][2]*v5z;
    }
  }
}

/* ----------------------------------------------------------------------
   pack own values to buf to send to another proc
------------------------------------------------------------------------- */

void PPPMDipole::pack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
  auto buf = (FFT_SCALAR *) vbuf;

  int n = 0;

  if (flag == FORWARD_MU) {
    FFT_SCALAR *src_ux = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_uy = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_uz = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vxx = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vyy = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vzz = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vxy = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vxz = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_vyz = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[n++] = src_ux[list[i]];
      buf[n++] = src_uy[list[i]];
      buf[n++] = src_uz[list[i]];
      buf[n++] = src_vxx[list[i]];
      buf[n++] = src_vyy[list[i]];
      buf[n++] = src_vzz[list[i]];
      buf[n++] = src_vxy[list[i]];
      buf[n++] = src_vxz[list[i]];
      buf[n++] = src_vyz[list[i]];
    }
  } else if (flag == FORWARD_MU_PERATOM) {
    FFT_SCALAR *v0xsrc = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1xsrc = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2xsrc = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3xsrc = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4xsrc = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5xsrc = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0ysrc = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1ysrc = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2ysrc = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3ysrc = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4ysrc = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5ysrc = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0zsrc = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1zsrc = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2zsrc = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3zsrc = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4zsrc = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5zsrc = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[n++] = v0xsrc[list[i]];
      buf[n++] = v1xsrc[list[i]];
      buf[n++] = v2xsrc[list[i]];
      buf[n++] = v3xsrc[list[i]];
      buf[n++] = v4xsrc[list[i]];
      buf[n++] = v5xsrc[list[i]];
      buf[n++] = v0ysrc[list[i]];
      buf[n++] = v1ysrc[list[i]];
      buf[n++] = v2ysrc[list[i]];
      buf[n++] = v3ysrc[list[i]];
      buf[n++] = v4ysrc[list[i]];
      buf[n++] = v5ysrc[list[i]];
      buf[n++] = v0zsrc[list[i]];
      buf[n++] = v1zsrc[list[i]];
      buf[n++] = v2zsrc[list[i]];
      buf[n++] = v3zsrc[list[i]];
      buf[n++] = v4zsrc[list[i]];
      buf[n++] = v5zsrc[list[i]];
    }
  }
}

/* ----------------------------------------------------------------------
   unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */

void PPPMDipole::unpack_forward_grid(int flag, void *vbuf, int nlist, int *list)
{
  auto buf = (FFT_SCALAR *) vbuf;

  int n = 0;

  if (flag == FORWARD_MU) {
    FFT_SCALAR *dest_ux = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_uy = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_uz = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vxx = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vyy = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vzz = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vxy = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vxz = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_vyz = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      dest_ux[list[i]] = buf[n++];
      dest_uy[list[i]] = buf[n++];
      dest_uz[list[i]] = buf[n++];
      dest_vxx[list[i]] = buf[n++];
      dest_vyy[list[i]] = buf[n++];
      dest_vzz[list[i]] = buf[n++];
      dest_vxy[list[i]] = buf[n++];
      dest_vxz[list[i]] = buf[n++];
      dest_vyz[list[i]] = buf[n++];
    }
  } else if (flag == FORWARD_MU_PERATOM) {
    FFT_SCALAR *v0xsrc = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1xsrc = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2xsrc = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3xsrc = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4xsrc = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5xsrc = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0ysrc = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1ysrc = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2ysrc = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3ysrc = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4ysrc = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5ysrc = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v0zsrc = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v1zsrc = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v2zsrc = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v3zsrc = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v4zsrc = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *v5zsrc = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      v0xsrc[list[i]] = buf[n++];
      v1xsrc[list[i]] = buf[n++];
      v2xsrc[list[i]] = buf[n++];
      v3xsrc[list[i]] = buf[n++];
      v4xsrc[list[i]] = buf[n++];
      v5xsrc[list[i]] = buf[n++];
      v0ysrc[list[i]] = buf[n++];
      v1ysrc[list[i]] = buf[n++];
      v2ysrc[list[i]] = buf[n++];
      v3ysrc[list[i]] = buf[n++];
      v4ysrc[list[i]] = buf[n++];
      v5ysrc[list[i]] = buf[n++];
      v0zsrc[list[i]] = buf[n++];
      v1zsrc[list[i]] = buf[n++];
      v2zsrc[list[i]] = buf[n++];
      v3zsrc[list[i]] = buf[n++];
      v4zsrc[list[i]] = buf[n++];
      v5zsrc[list[i]] = buf[n++];
    }
  }
}

/* ----------------------------------------------------------------------
   pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */

void PPPMDipole::pack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
  auto buf = (FFT_SCALAR *) vbuf;

  int n = 0;
  if (flag == REVERSE_MU) {
    FFT_SCALAR *src_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_dipole1 = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *src_dipole2 = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      buf[n++] = src_dipole0[list[i]];
      buf[n++] = src_dipole1[list[i]];
      buf[n++] = src_dipole2[list[i]];
    }
  }
}

/* ----------------------------------------------------------------------
   unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */

void PPPMDipole::unpack_reverse_grid(int flag, void *vbuf, int nlist, int *list)
{
  auto buf = (FFT_SCALAR *) vbuf;

  int n = 0;
  if (flag == REVERSE_MU) {
    FFT_SCALAR *dest_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_dipole1 = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    FFT_SCALAR *dest_dipole2 = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
    for (int i = 0; i < nlist; i++) {
      dest_dipole0[list[i]] += buf[n++];
      dest_dipole1[list[i]] += buf[n++];
      dest_dipole2[list[i]] += buf[n++];
    }
  }
}

/* ----------------------------------------------------------------------
   Slab-geometry correction term to dampen inter-slab interactions between
   periodically repeating slabs.  Yields good approximation to 2D Ewald if
   adequate empty space is left between repeating slabs (J. Chem. Phys.
   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
------------------------------------------------------------------------- */

void PPPMDipole::slabcorr()
{
  // compute local contribution to global dipole moment

  double dipole = 0.0;
  double **mu = atom->mu;
  int nlocal = atom->nlocal;

  for (int i = 0; i < nlocal; i++) dipole += mu[i][2];

  // sum local contributions to get global dipole moment

  double dipole_all;
  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);

  // need to make non-neutral systems and/or
  //  per-atom energy translationally invariant

  if (eflag_atom || fabs(qsum) > SMALL) {

    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
      "long-range dipoles and non-neutral systems or per-atom energy");
  }

  // compute corrections

  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all/12.0)/volume;
  const double qscale = qqrd2e * scale;

  if (eflag_global) energy += qscale * e_slabcorr;

  // per-atom energy

  if (eflag_atom) {
    double efact = qscale * MY_2PI/volume/12.0;
    for (int i = 0; i < nlocal; i++)
      eatom[i] += efact * mu[i][2]*dipole_all;
  }

  // add on torque corrections

  if (atom->torque) {
    double ffact = qscale * (-4.0*MY_PI/volume);
    double **mu = atom->mu;
    double **torque = atom->torque;
    for (int i = 0; i < nlocal; i++) {
      torque[i][0] += ffact * dipole_all * mu[i][1];
      torque[i][1] += -ffact * dipole_all * mu[i][0];
    }
  }
}

/* ----------------------------------------------------------------------
   perform and time the 1d FFTs required for N timesteps
------------------------------------------------------------------------- */

int PPPMDipole::timing_1d(int n, double &time1d)
{
  double time1,time2;

  for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF;

  MPI_Barrier(world);
  time1 = platform::walltime();

  for (int i = 0; i < n; i++) {
    fft1->timing1d(work1,nfft_both,FFT3d::FORWARD);
    fft1->timing1d(work1,nfft_both,FFT3d::FORWARD);
    fft1->timing1d(work1,nfft_both,FFT3d::FORWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
    fft2->timing1d(work1,nfft_both,FFT3d::BACKWARD);
  }

  MPI_Barrier(world);
  time2 = platform::walltime();
  time1d = time2 - time1;

  return 12;
}

/* ----------------------------------------------------------------------
   perform and time the 3d FFTs required for N timesteps
------------------------------------------------------------------------- */

int PPPMDipole::timing_3d(int n, double &time3d)
{
  double time1,time2;

  for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF;

  MPI_Barrier(world);
  time1 = platform::walltime();

  for (int i = 0; i < n; i++) {
    fft1->compute(work1,work1,FFT3d::FFT3d::FORWARD);
    fft1->compute(work1,work1,FFT3d::FFT3d::FORWARD);
    fft1->compute(work1,work1,FFT3d::FFT3d::FORWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
    fft2->compute(work1,work1,FFT3d::FFT3d::BACKWARD);
  }

  MPI_Barrier(world);
  time2 = platform::walltime();
  time3d = time2 - time1;

  return 12;
}

/* ----------------------------------------------------------------------
   memory usage of local arrays
------------------------------------------------------------------------- */

double PPPMDipole::memory_usage()
{
  double bytes = (double)nmax*3 * sizeof(double);

  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
    (nzhi_out-nzlo_out+1);
  bytes += (double)6 * nfft_both * sizeof(double);   // vg
  bytes += (double)nfft_both * sizeof(double);       // greensfn
  bytes += (double)nfft_both*5 * sizeof(FFT_SCALAR); // work*2*2
  bytes += (double)9 * nbrick * sizeof(FFT_SCALAR);  // ubrick*3 + vdbrick*6
  bytes += (double)nfft_both*7 * sizeof(FFT_SCALAR); // density_ffx*3 + work*2*2

  if (peratom_allocate_flag)
    bytes += (double)21 * nbrick * sizeof(FFT_SCALAR);

  // two Grid3d bufs

  bytes += (double)(ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR);

  return bytes;
}

/* ----------------------------------------------------------------------
   compute musum,musqsum,mu2
   called initially, when particle count changes, when dipoles are changed
------------------------------------------------------------------------- */

void PPPMDipole::musum_musq()
{
  const int nlocal = atom->nlocal;

  musum = musqsum = mu2 = 0.0;
  if (atom->mu_flag) {
    double** mu = atom->mu;
    double musum_local(0.0), musqsum_local(0.0);

    for (int i = 0; i < nlocal; i++) {
      musum_local += mu[i][0] + mu[i][1] + mu[i][2];
      musqsum_local += mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2];
    }

    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);

    mu2 = musqsum * force->qqrd2e;
  }

  if (mu2 == 0 && comm->me == 0)
    error->all(FLERR,"Using kspace solver PPPMDipole on system with no dipoles");
}
