QMcBeaver: QMCSlater.cpp Source File

00001 //            QMcBeaver
00002 //
00003 //         Constructed by
00004 //
00005 //     Michael Todd Feldmann
00006 //              and
00007 //   David Randall "Chip" Kent IV
00008 //
00009 // Copyright 2000.  All rights reserved.
00010 //
00011 // drkent@users.sourceforge.net mtfeldmann@users.sourceforge.net
00012 
00013 /**************************************************************************
00014 This SOFTWARE has been authored or contributed to by an employee or 
00015 employees of the University of California, operator of the Los Alamos 
00016 National Laboratory under Contract No. W-7405-ENG-36 with the U.S. 
00017 Department of Energy.  The U.S. Government has rights to use, reproduce, 
00018 and distribute this SOFTWARE.  Neither the Government nor the University 
00019 makes any warranty, express or implied, or assumes any liability or 
00020 responsibility for the use of this SOFTWARE.  If SOFTWARE is modified 
00021 to produce derivative works, such modified SOFTWARE should be clearly 
00022 marked, so as not to confuse it with the version available from LANL.   
00023  
00024 Additionally, this program is free software; you can distribute it and/or 
00025 modify it under the terms of the GNU General Public License. Accordingly, 
00026 this program is  distributed in the hope that it will be useful, but WITHOUT 
00027 ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
00028 FITNESS FOR A  PARTICULAR PURPOSE.  See the GNU General Public License 
00029 for more details. 
00030 **************************************************************************/
00031 
00032 
00033 #include "QMCSlater.h"
00034 
00035 static const bool showTimings = false;
00036 static const bool printPsi    = false;
00037 
00038 QMCSlater::QMCSlater()
00039 {
00040   Start = 0;
00041   Stop = 0;
00042 }
00043 
00044 void QMCSlater::operator=(const QMCSlater & rhs )
00045 {
00046   Input              = rhs.Input;
00047   Dwc                = rhs.Dwc;
00048   rDwc_x             = rhs.rDwc_x;
00049   rDwc_xx            = rhs.rDwc_xx;
00050 
00051   if (Input->flags.calculate_bf_density == 1)
00052     Xw_Density      = rhs.Xw_Density;
00053 
00054   BF         = rhs.BF;
00055   WF         = rhs.WF;
00056   Start      = rhs.Start;
00057   Stop       = rhs.Stop;
00058   if (Input->flags.replace_electron_nucleus_cusps == 1)
00059     ElectronNucleusCusp = rhs.ElectronNucleusCusp;
00060 
00061   Singular = rhs.Singular;
00062 
00063   // This was done in this round about way to avoid some weird compiler errors
00064   // dealing with when const was used.  In the simplest format this would be
00065   // allocate( rhs.D.dim1() );
00066   Dw = rhs.Dw;
00067   allocate();
00068 }
00069 
00070 void QMCSlater::initialize(QMCInput *INPUT,
00071                            int startEl, int stopEl,
00072                            bool selectAlpha)
00073 {
00074   Start = startEl;
00075   Stop = stopEl;
00076   isAlpha = selectAlpha;
00077   Input = INPUT;
00078   BF = &Input->BF;
00079   WF = &Input->WF;
00080 
00081   if(getNumberElectrons() == 0)
00082     return;
00083 
00084   allocate();
00085 
00086   for(int i=0; i<Singular.dim1(); i++)
00087     Singular(i) = false;
00088 
00089   if (Input->flags.replace_electron_nucleus_cusps == 1)
00090     {
00091       Array2D<qmcfloat> * coeffs = WF->getCoeff(isAlpha);
00092       ElectronNucleusCusp.initialize(Input, * coeffs);
00093       ElectronNucleusCusp.fitReplacementOrbitals();
00094     }
00095 
00096 #ifdef QMC_GPU
00097   // I should put some more effort into this question of copy
00098   // constructors/assignment
00099   GPUQMCBasisFunction temp(*BF, getNumberElectrons(), Input->flags.getNumGPUWalkers());
00100   gpuBF = temp;
00101   // I replaced WF_coeffs with getCoeff(ci,isAlpha), so GPU compiles will fail right here
00102   gpuMatMult = GPUQMCMatrix(Input, WF_coeffs,Input->flags.getNumGPUWalkers());
00103 #endif
00104 }
00105 
00106 void QMCSlater::allocate()
00107 {
00108   int N = getNumberElectrons();
00109   int ndet = WF->getNumberDeterminants();
00110   int nbasisfunc = WF->getNumberBasisFunctions();
00111   int wpp = Input->flags.walkers_per_pass;
00112   int gpp = Input->flags.getNumGPUWalkers();
00113 
00114   if(wpp - gpp > 0)
00115     {
00116       if(WF->getNumberDeterminants() > 1)
00117         ciDet = new Array2D<double>();
00118       Dw.allocate(wpp-gpp);
00119       Dwc_inv.allocate(wpp-gpp,ndet);
00120       Dw_xx.allocate(wpp-gpp);
00121       Dw_x.allocate(wpp-gpp,3);
00122     }
00123 
00124 #ifdef QMC_GPU
00125   gpu_Dw.allocate(gpp);
00126   gpu_Dwc_inv.allocate(gpp);
00127   gpu_Dw_xx.allocate(gpp);
00128   gpu_Dw_x.allocate(gpp,3);
00129   
00130   //this code is now out of sync when we move
00131   //one e at a time.
00132   assert(0);
00133   for(int j=0; j<gpu_Dw.dim1(); j++)
00134     {
00135       for (int i=0; i<gpu_Dw.dim2(); i++)
00136         gpu_Dwc_inv(j,i).allocate(N,N);
00137       
00138       gpu_D(j).allocate(N,N);
00139       
00140       gpu_Dw_xx(j).allocate(N,N);
00141       for(int k=0; k<3; k++)
00142         gpu_Dw_x(j,k).allocate(N,N);
00143     }
00144 #endif
00145 
00146   Singular.allocate(wpp);
00147   Dwc.allocate(wpp);
00148   rDwc_xx.allocate(wpp);
00149   rDwc_x.allocate(wpp);
00150 
00151   pointer_Dwc.allocate(wpp);
00152   pointer_rDwc_xx.allocate(wpp);
00153   pointer_rDwc_x.allocate(wpp);
00154 
00155   if (Input->flags.calculate_bf_density == 1)
00156     Xw_Density.allocate(wpp);
00157 
00158   for(int w=0; w<wpp; w++)
00159     {
00160       Singular(w).allocate(ndet);
00161       Dwc(w).allocate(ndet);
00162       rDwc_xx(w).allocate(ndet);
00163       rDwc_x(w).allocate(ndet,N,3);
00164       if (Input->flags.calculate_bf_density == 1)
00165         Xw_Density(w).allocate(nbasisfunc);
00166     }
00167 
00168   if (Input->flags.optimize_Orbitals == 1)
00169     {
00170       p_a.allocate(wpp);
00171       p2_xa.allocate(wpp);
00172       p3_xxa.allocate(wpp);
00173 
00174       for(int w=0; w<wpp; w++)
00175         {
00176           p_a(w).allocate(ndet);
00177           p2_xa(w).allocate(ndet,N,3);
00178           p3_xxa(w).allocate(ndet);
00179 
00180           for(int pi=0; pi<ndet; pi++)
00181             {
00182               (p_a(w))(pi).allocate(N,nbasisfunc);
00183               (p3_xxa(w))(pi).allocate(N,nbasisfunc);
00184 
00185               for(int pj=0; pj<N; pj++)
00186                 for(int pk=0; pk<3; pk++)
00187                   (p2_xa(w))(pi,pj,pk).allocate(N,nbasisfunc);
00188             }
00189         }
00190     }
00191 
00192 #ifdef QMC_GPU
00193   /* The dimensions of these data are numWalkers x numDeterminants then numElec
00194      x numOrb.  What I'm doing here is filling the resultsCollector database 
00195      with addresses of our actual data storage. This creates an alias which I
00196      can then send to the GPUQMC code without duplicating data storage or 
00197      ruining the previously existing data design.
00198 
00199      resultsCollector will be assigned the first gpu_walkers_per_pass elements.
00200   */
00201   resultsCollector.allocate(ndet, gpp*5);
00202   for(int iDet=0; iDet<ndet; iDet++)
00203     {
00204       for(int iWalker=0; iWalker<gpp; iWalker++)
00205         {
00206           resultsCollector(iDet,iWalker*5    ) = & gpu_Dw(iWalker, iDet);
00207           resultsCollector(iDet,iWalker*5 + 1) = & gpu_Dw_x(iWalker, iDet, 0);
00208           resultsCollector(iDet,iWalker*5 + 2) = & gpu_Dw_x(iWalker, iDet, 1);
00209           resultsCollector(iDet,iWalker*5 + 3) = & gpu_Dw_x(iWalker, iDet, 2);
00210           resultsCollector(iDet,iWalker*5 + 4) = & gpu_Dw_xx(iWalker, iDet);
00211         }
00212     }
00213 #endif
00214 }
00215 
00216 void QMCSlater::allocateIteration(int whichE, int & start, int & stop)
00217 {
00218   int nOR = WF->getNumberOrbitals(isAlpha);
00219   int nEL = getNumberElectrons();
00220   int nBF = WF->getNumberBasisFunctions();
00221   int wpp = Input->flags.walkers_per_pass;
00222 
00223   //nUp is the number of electrons updated per iteration.
00224   int nUP = whichE == -1 ? nEL : 1;
00225 
00226   if(whichE == -1)
00227     {
00228       start = Start;
00229       stop  = Stop;
00230     }
00231   else
00232     {
00233       start = whichE;
00234       stop  = whichE;
00235     }
00236 
00237   bool needToKeepChi = false;
00238   if(globalInput.flags.optimize_Orbitals)
00239     needToKeepChi = true;
00240 
00241   if(needToKeepChi)
00242     {
00243       /*
00244         If we're optimizing the orbitals, then we will
00245         need to keep Chi data a little bit longer than
00246         otherwise.
00247       */
00248       Xw.allocate(wpp);
00249       Xw_x.allocate(wpp);
00250       Xw_xx.allocate(wpp);
00251     }
00252   else
00253     {
00254       Xw.allocate(1);
00255       Xw_x.allocate(1);
00256       Xw_xx.allocate(1);
00257     }
00258 
00259   for(int w=0; w<Xw.dim1(); w++)
00260     {
00261       Xw(w).allocate(nUP,nBF);
00262       Xw_xx(w).allocate(nUP,nBF);
00263       Xw_x(w).allocate(3);
00264       for(int xyz=0; xyz<3; xyz++)
00265         (Xw_x(w))(xyz).allocate(nUP,nBF);
00266     }
00267 
00268   for(int w=0; w<Dw.dim1(); w++)
00269     {
00270       for (int ci=0; ci<WF->getNumberDeterminants(); ci++)
00271         Dwc_inv(w,ci).allocate(nEL,nEL);
00272       
00273       Dw(w).allocate(nUP,nOR);
00274       
00275       Dw_xx(w).allocate(nUP,nOR);
00276       for(int xyz=0; xyz<3; xyz++)
00277         Dw_x(w,xyz).allocate(nUP,nOR);
00278     }
00279 }
00280 
00281 QMCSlater::~QMCSlater()
00282 {
00283   if (Start != Stop)
00284     {
00285       for(int j=0; j<Dw.dim1(); j++)
00286         {
00287           for (int ci=0; ci<WF->getNumberDeterminants(); ci++)
00288             Dwc_inv(j,ci).deallocate();
00289 
00290           Dw(j).deallocate();
00291           Dw_xx(j).deallocate();
00292           for(int k=0; k<3; k++)
00293             Dw_x(j,k).deallocate();
00294         }
00295       if(ciDet != 0)
00296         delete ciDet;
00297       Dw.deallocate();
00298       Dwc_inv.deallocate();
00299       Dw_xx.deallocate();
00300       Dw_x.deallocate();
00301 
00302 #ifdef QMC_GPU
00303       for(int j=0; j<gpu_Dw.dim1(); j++)
00304         {
00305           for (int i=0; i<WF->getNumberDeterminants(); i++)
00306             gpu_Dwc_inv(j,i).deallocate();
00307           
00308           
00309           gpu_Dw(j).deallocate();
00310           
00311           gpu_Dw_xx(j).deallocate();
00312           for(int k=0; k<3; k++)
00313             gpu_Dw_x(j,k).deallocate();
00314         }
00315       gpu_Dw.deallocate();
00316       gpu_Dwc_inv.deallocate();
00317       gpu_Dw_xx.deallocate();
00318       gpu_Dw_x.deallocate();
00319 #endif
00320 
00321       for(int j=0; j < Input->flags.walkers_per_pass; j++)
00322         {
00323           Singular(j).deallocate();
00324           Dwc(j).deallocate();
00325           rDwc_xx(j).deallocate();
00326           rDwc_x(j).deallocate();
00327           if (Input->flags.calculate_bf_density == 1)
00328             Xw_Density(j).deallocate();
00329         }
00330 
00331       if (Input->flags.optimize_Orbitals == 1)
00332         {
00333           for(int j=0; j<p_a.dim1(); j++)
00334             {
00335               for(int pi=0; pi<p_a(j).dim1(); pi++)
00336                 {
00337                   (p_a(j))(pi).deallocate();
00338                   (p3_xxa(j))(pi).deallocate();
00339 
00340                   for(int pj=0; pj<p2_xa(j).dim2(); pj++)
00341                     for(int pk=0; pk<3; pk++)
00342                       (p2_xa(j))(pi,pj,pk).deallocate();
00343                 }
00344 
00345               p_a(j).deallocate();
00346               p2_xa(j).deallocate();
00347               p3_xxa(j).deallocate();
00348             }
00349           p_a.deallocate();
00350           p2_xa.deallocate();
00351           p3_xxa.deallocate();
00352         }
00353 
00354       Singular.deallocate();
00355       Dwc.deallocate();
00356       rDwc_xx.deallocate();
00357       rDwc_x.deallocate();
00358 
00359       if (Input->flags.calculate_bf_density == 1)
00360         Xw_Density.deallocate();
00361 
00362 #ifdef QMC_GPU
00363       //We do not need to deallocate the contents of
00364       //resultsCollector because the contents were
00365       //references to gpu_D, gpu_Laplacian, etc
00366       //and those were already deallocated.
00367       resultsCollector.deallocate();
00368 
00369       gpuMatMult.destroy();
00370 #endif
00371 
00372       for(int k=0; k<Xw.dim1(); k++)
00373         {
00374           Xw(k).deallocate();
00375           Xw_xx(k).deallocate();
00376           for (int j=0; j<3; j++)
00377             (Xw_x(k))(j).deallocate();
00378           Xw_x(k).deallocate();
00379         }
00380       Xw.deallocate();
00381       Xw_xx.deallocate();
00382       Xw_x.deallocate();
00383     }
00384 }
00385 
00386 void QMCSlater::update_Ds(Array1D< QMCWalkerData * > & walkerData)
00387 {
00388   int whichE = walkerData(0)->whichE;
00389   int gpp = Input->flags.getNumGPUWalkers();
00390   int wpp = walkerData.dim1();
00391 
00392   if(getNumberElectrons() == 0)
00393     return;
00394 
00395   for(int w=0; w<wpp; w++)
00396     {
00397       /*
00398         Even if we didn't move any of our electrons, we will
00399         still be asked for our data, so we need to assign the
00400         pointers.
00401       */
00402 
00403       if(globalInput.flags.one_e_per_iter ||
00404          globalInput.flags.use_psuedopotential == 1)
00405         {
00406           if(isAlpha)
00407             {
00408               pointer_Dwc(w)     = & walkerData(w+gpp)->DcA;
00409             } else {
00410               pointer_Dwc(w)     = & walkerData(w+gpp)->DcB;
00411             }
00412         } else {
00413           pointer_Dwc(w)     = & Dwc(w);
00414         }
00415       
00416       if(globalInput.flags.one_e_per_iter)
00417         {
00418           if(isAlpha)
00419             {
00420               pointer_rDwc_x(w)  = & walkerData(w+gpp)->rDc_xA;
00421               pointer_rDwc_xx(w) = & walkerData(w+gpp)->rDc_xxA;
00422             } else {
00423               pointer_rDwc_x(w)  = & walkerData(w+gpp)->rDc_xB;
00424               pointer_rDwc_xx(w) = & walkerData(w+gpp)->rDc_xxB;
00425             }
00426         } else {
00427           pointer_rDwc_x(w)  = & rDwc_x(w);
00428           pointer_rDwc_xx(w) = & rDwc_xx(w);
00429         }
00430     }
00431       
00432   if(whichE != -1)
00433     if(whichE < Start || whichE > Stop)
00434       return;
00435 
00436   for(int w=0; w<wpp; w++)
00437     {
00438       //walker offset index
00439       int off;    
00440       Array2D<double> * inv;
00441       Array2D<qmcfloat> * gradx, * grady, * gradz, * lap;
00442 
00443 #ifdef QMC_GPU
00444       //i changed a lot in qmcslater to move one e per iter
00445       //this bit of code will need to be updated when
00446       //i have the GPU stuff in front of me again.
00447       assert(0);
00448       
00449       /* This will fill the resultsCollector with
00450          the results from the GPU. However, all of 
00451          resultsCollector's elements are really
00452          just pointers to the D object.*/
00453       off = w;
00454       gpuMatMult.getResults(resultsCollector);
00455       calculate_DerivativeRatios(0,ci,gpu_D,gpu_Dwc_inv,
00456                                  gpu_Dw_xx,gpu_Dw_x);
00457       
00458 #endif
00459       off = w + gpp;
00460       
00461       if(globalInput.flags.one_e_per_iter)
00462         {
00463           //it might have been an all electron update
00464           int numE = Dw(w).dim1();
00465           //index of first updated electron
00466           int iEl = whichE == -1 ? 0 : whichE - Start;
00467           
00468           //reload the data from the other electrons
00469           if(isAlpha)
00470             {
00471               lap =   &  walkerData(off)->D_xxA;
00472               gradx = & (walkerData(off)->D_xA)(0);
00473               grady = & (walkerData(off)->D_xA)(1);
00474               gradz = & (walkerData(off)->D_xA)(2);
00475             } else {
00476               lap =   &  walkerData(off)->D_xxB;
00477               gradx = & (walkerData(off)->D_xB)(0);
00478               grady = & (walkerData(off)->D_xB)(1);
00479               gradz = & (walkerData(off)->D_xB)(2);
00480             }
00481 
00482           //and update the data for our electron
00483           lap->setRows(iEl,0,numE,Dw_xx(w));
00484           gradx->setRows(iEl,0,numE,Dw_x(w,0));
00485           grady->setRows(iEl,0,numE,Dw_x(w,1));
00486           gradz->setRows(iEl,0,numE,Dw_x(w,2));
00487           
00488         } else {
00489           lap =   & Dw_xx(w);
00490           gradx = & Dw_x(w,0);
00491           grady = & Dw_x(w,1);
00492           gradz = & Dw_x(w,2);
00493         }
00494       
00495       for(int ci=0; ci<WF->getNumberDeterminants(); ci++)
00496         {
00497           if(globalInput.flags.one_e_per_iter ||
00498              globalInput.flags.use_psuedopotential == 1)
00499             {
00500               if(isAlpha) inv =   &  walkerData(off)->Dc_invA(ci);
00501               else        inv =   &  walkerData(off)->Dc_invB(ci);
00502 
00503               if(ci > 0 && whichE < 0)
00504                 {
00505                   //We need to copy the previous inverse during all electron
00506                   //updates
00507                   if(isAlpha) (*inv) = walkerData(off)->Dc_invA(ci-1);
00508                   else        (*inv) = walkerData(off)->Dc_invB(ci-1);
00509                 }
00510             } else {
00511               //this might only be used by orbital derivatives...
00512               inv =   & Dwc_inv(w,ci);
00513               
00514               //this part is used when updating the inverse,
00515               //as opposed to calculating all of it
00516               if(ci > 0)
00517                 (*inv) = Dwc_inv(w,ci-1);
00518             }
00519 
00520           Array2D<double> * psi;
00521 #if defined SINGLEPRECISION || defined QMC_GPU
00522           static Array2D<double> temp;
00523           temp = Dw(w);
00524           psi = & temp;
00525 #else
00526           psi = & Dw(w);
00527 #endif
00528 
00529           (Singular(off))(ci) = calculate_DerivativeRatios(ci,whichE - Start,
00530                                                            *psi,*inv,*lap,
00531                                                            *gradx, *grady, *gradz,
00532                                                            *pointer_Dwc(w),
00533                                                            *pointer_rDwc_x(w),
00534                                                            *pointer_rDwc_xx(w));
00535 
00536         }
00537     }
00538 
00539   if(Input->flags.optimize_Orbitals)
00540     {
00541 #if defined SINGLEPRECISION || defined QMC_GPU
00542       /*
00543         It's tedious to get qmcfloat variables to cooperate here,
00544         so i'm not going to program it. it's non essential.
00545         Just don't compile in single precision.
00546       */
00547       cout << "Error: Don't use single precision to optimize the orbitals.\n";
00548       //It's sure to crash eventually...
00549       exit(0);
00550 #else
00551       /*
00552         Let d |D| be shorthand for  \frac{ d |D| }{ d c_{jk} }
00553         Let nabla |D| refer to derivatives with respect to position
00554 
00555         Looking at the code probably doesn't make sense.
00556 
00557         Jacobi's formula:
00558         d|D| = |D| tr( D^{-1} . d D )
00559 
00560         After figuring out how to take these derivatives, I took
00561         shortcuts to calculate derivatives wrt all the c_{jk} at
00562         once. Specifically,
00563 
00564         Single derivatives:
00565         d |D| = |D| D^{-1} . Chi
00566 
00567         Mixed derivatives:
00568         d \frac{ \nabla |D| }{ |D| }
00569         = d tr( D^{-1} . \nabla D )
00570         = tr( D^{-1} . d \nabla D ) + tr( d D^{-1} . \nabla D )
00571         = tr( D^{-1} . d \nabla D ) - tr( D^{-1} . d D . D^{-1} . \nabla D )
00572         = tr( D^{-1} . d \nabla D ) - tr( \nabla D . D^{-1} . d D . D^{-1} )
00573       */
00574 
00575       // \nabla D . D^{-1} . d D
00576       Array1D< Array2D<double> > nabDinvDdD;
00577       nabDinvDdD.allocate(3);
00578 
00579       // \nabla^2 D . D^{-1} . d D
00580       Array2D<double> nab2DinvDdD;
00581 
00582       Array2D<double> temp;
00583       Array2D<double> ciData(getNumberElectrons(),getNumberElectrons());
00584 
00585       for(int w=0; w<Dw.dim1(); w++)
00586         {
00587           for(int ci=0; ci<WF->getNumberDeterminants(); ci++)
00588             {
00589               Array2D<double> * pa = & (p_a(w+gpp))(ci);
00590               Array2D<double> * p3xxa = & (p3_xxa(w+gpp))(ci);
00591               * pa    = 0.0;
00592               * p3xxa = 0.0;
00593 
00594               //calculate  d |D|
00595               Dwc_inv(w,ci).gemm(Xw(w),*pa,false);
00596               (*pa) *= (Dwc(w+gpp))(ci);
00597 
00598               for(int xyz=0; xyz<3; xyz++)
00599                 {
00600                   WF->getDataForCI(ci,isAlpha,Dw_x(w,xyz),ciData);
00601                   ciData.gemm(Dwc_inv(w,ci),temp,false);
00602                   temp.gemm(Xw(w),nabDinvDdD(xyz),false);
00603                   //temp = 0.0;
00604                 }
00605               WF->getDataForCI(ci,isAlpha,Dw_xx(w),ciData);
00606               ciData.gemm(Dwc_inv(w,ci),temp,false);
00607               temp.gemm(Xw(w),nab2DinvDdD,false);
00608 
00609               int nor = pa->dim1();
00610               int nbf = pa->dim2();
00611               for(int e=0; e<nor; e++)
00612                 {
00613                   for(int o=0; o<nor; o++)
00614                     {
00615                       for(int bf=0; bf<nbf; bf++)
00616                         {
00617                           for(int xyz=0; xyz<3; xyz++)
00618                             {
00619                               double d2xa = 0.0;
00620                               d2xa += (Dwc_inv(w,ci))(o,e) * ((Xw_x(w))(xyz))(e,bf);
00621                               d2xa -= (Dwc_inv(w,ci))(o,e) * (nabDinvDdD(xyz))(e,bf);
00622                               ((p2_xa(w+gpp))(ci,e,xyz))(o,bf) = d2xa;
00623                             }
00624                           double d3xxa = 0.0;
00625                           d3xxa += (Dwc_inv(w,ci))(o,e) * (Xw_xx(w))(e,bf);
00626                           d3xxa -= (Dwc_inv(w,ci))(o,e) * nab2DinvDdD(e,bf);
00627                           (*p3xxa)(o,bf) += d3xxa;
00628                         }
00629                     }
00630                 }
00631             }
00632         }
00633 
00634       for(int dim=0; dim<nabDinvDdD.dim1(); dim++)
00635         nabDinvDdD(dim).deallocate();
00636       nabDinvDdD.deallocate();
00637       nab2DinvDdD.deallocate();
00638       temp.deallocate();
00639 #endif
00640     }
00641 }
00642 
00643 template <class T>
00644 bool QMCSlater::calculate_DerivativeRatios(int ci, int row,
00645                                            Array2D<double> & psi,
00646                                            Array2D<double> & inv,
00647                                            Array2D<T> & lap,
00648                                            Array2D<T> & gradx,
00649                                            Array2D<T> & grady,
00650                                            Array2D<T> & gradz,
00651                                            Array1D<double> & det,
00652                                            Array3D<double> & gradPR,
00653                                            Array1D<double> & lapPR)
00654 {
00655   Array2D<int> * occ = WF->getOccupations(isAlpha);
00656   Array2D<int> * swap = WF->getDeterminantSwaps(isAlpha);
00657 
00658   bool calcOK = true;
00659   double ratio = 0;
00660 
00661   if(WF->getNumberDeterminants() == 1)
00662     {
00663       //ciDet is either used as a pointer to the ci matrix
00664       ciDet = & psi;
00665     } else {
00666       //or as storage for each ci matrix
00667       ciDet->allocate(psi.dim1(),inv.dim1());
00668       WF->getDataForCI(ci,isAlpha,psi,*ciDet);
00669     }
00670   if(ciDet->dim1() > 1 || row < 0)
00671     {
00672       if(ci == 0)
00673         {
00674           /* The LU Decomposition inverse used here is O(1*N^3)
00675              Notice: after this, psi is lost!
00676           */
00677           ciDet->determinant_and_inverse(inv,det(ci),&calcOK);
00678         } else {
00679           /* Each determinant is different from the previous determinant
00680              by probably only a couple of orbitals. So instead of performing
00681              the entire inverse, we use the Sherman-Morrison formula to update
00682              the previous inverse.
00683              Note: In order for this to work, inv needs to already be set to the previous inv
00684           */
00685           det(ci) = det(ci-1);
00686           for(int o=0; o<swap->dim2(); o++)       
00687             {
00688               int so = swap->get(ci,o);
00689               if(so != -1)
00690                 {
00691                   ratio = inv.inverseUpdateOneColumn(so,*ciDet,so);
00692                   
00693                   det[ci] *= ratio;
00694                   if(ratio == 0) break;
00695                 } else {
00696                   break;
00697                 }
00698             }
00699           
00700           //The update might have failed, so we could try inverting from scratch...
00701           if(det(ci) == 0)
00702             {
00703               calcOK = false;
00704               ciDet->determinant_and_inverse(inv,det(ci),&calcOK);
00705             }
00706         }
00707     }
00708   else
00709     {
00710       // Updating one electron at a time is O(2*N^3-N^2)
00711       ratio = inv.inverseUpdateOneRow(row,*ciDet,0);
00712       /* If ratio is 0, then we don't want to update the determinant, since
00713          we'll just try another move
00714       */
00715       if(ratio != 0.0)
00716         det[ci] *= ratio;
00717       else
00718         calcOK = false;
00719     }
00720 
00721   if(WF->getNumberDeterminants() == 1)    
00722     ciDet = 0;
00723 
00724   if(!calcOK) return true;
00725 
00726   int numE = getNumberElectrons();
00727   double &   lap_PR = lapPR[ci];
00728   double ** grad_PR = gradPR.array()[ci];
00729   lap_PR = 0.0;
00730 
00731   double * p_lap = lap.array();
00732   double * p_grx = gradx.array();
00733   double * p_gry = grady.array();
00734   double * p_grz = gradz.array();
00735 
00736   for(int e=0; e<numE; e++)
00737     {
00738       grad_PR[e][0] = 0.0;
00739       grad_PR[e][1] = 0.0;
00740       grad_PR[e][2] = 0.0;
00741     }
00742 
00743   for(int o=0; o<numE; o++)
00744     {
00745       int orb = occ->get(ci,o);
00746 
00747       for(int e=0; e<numE; e++)
00748         {
00749           int ind = lap.map(e,orb);
00750           double inv_v = inv(o,e);
00751 
00752           lap_PR        += p_lap[ind] * inv_v;
00753           grad_PR[e][0] += p_grx[ind] * inv_v;
00754           grad_PR[e][1] += p_gry[ind] * inv_v;
00755           grad_PR[e][2] += p_grz[ind] * inv_v;
00756         }
00757     }
00758   return false;
00759 }
00760 
00761 #if defined SINGLEPRECISION || defined QMC_GPU
00762 template bool QMCSlater::calculate_DerivativeRatios<float>
00763 (int ci, int row,
00764  Array2D<double> & psi,
00765  Array2D<double> & inv,
00766  Array2D<float> & lap,
00767  Array2D<float> & gradx,
00768  Array2D<float> & grady,
00769  Array2D<float> & gradz,
00770  Array1D<double> & det,
00771  Array3D<double> & gradPR,
00772  Array1D<double> & lapPR);
00773 #endif
00774 
00775 template bool QMCSlater::calculate_DerivativeRatios<double>
00776 (int ci, int row,
00777  Array2D<double> & psi,
00778  Array2D<double> & inv,
00779  Array2D<double> & lap,
00780  Array2D<double> & gradx,
00781  Array2D<double> & grady,
00782  Array2D<double> & gradz,
00783  Array1D<double> & det,
00784  Array3D<double> & gradPR,
00785  Array1D<double> & lapPR);
00786   
00800 #ifdef QMC_GPU
00801 void QMCSlater::gpuEvaluate(Array1D<Array2D<double>*> &X, int num)
00802 {
00803   if(getNumberElectrons == 0)
00804     return;
00805 
00806   static double averageM = 0, timeM = 0;
00807   static double averageB = 0, timeB = 0;
00808   static double numT = -5;
00809   static int nBF = WF->getNumberBasisFunctions();
00810   static int nOE = getNumberElectrons();
00811   static int mat_multiplier = gpuMatMult.getNumIterations() * 1000;
00812   static int bas_multiplier = gpuBF.getNumIterations();
00813   // 1 add + 1 mul per nBF * nOE * nOE
00814   double numOps = 5*num*(nBF * nOE * nOE * 2.0 - nOE * nOE) / 1000.0;
00815 
00816   GLuint texture;
00817   Stopwatch sw = Stopwatch();
00818   sw.reset();
00819 
00820   if(showTimings)
00821     {
00822       sw.reset(); sw.start();
00823     }
00824 
00825   texture = gpuBF.runCalculation(X,num, Start, Stop);
00826 
00827   if(showTimings)
00828     {
00829       glFinish();
00830       sw.stop();
00831       timeB = sw.timeUS();
00832       if(numT >= 0) averageB += sw.timeUS();
00833     }
00834 
00835   if(showTimings)
00836     {
00837       sw.reset(); sw.start();
00838     }
00839 
00840   gpuMatMult.runCalculation(num,texture);
00841 
00842   if(showTimings)
00843     {
00844       //the more standard timing test does not include the download time
00845       //gpuMatMult.getResults(resultsCollector);
00846       glFinish();
00847       sw.stop();
00848       timeM = sw.timeUS();
00849       if(numT >= 0) averageM += sw.timeUS();
00850 
00851       if( num > 1) numT++;
00852       cout << "\ngpu bf: " << (int)(timeB/bas_multiplier+0.5) << " ( " << (int)(averageB/(numT*bas_multiplier)+0.5) << ") ";
00853       cout << "mm: " << (int)(timeM/mat_multiplier+0.5) << " (" << (int)(averageM/(numT*mat_multiplier)+0.5) << ") ";
00854       cout << "; mflops: " << (int)(mat_multiplier*numOps/timeM+0.5) <<
00855       " (" << (int)(mat_multiplier*numOps/(averageM/numT)+0.5) << ")\n";
00856     }
00857 }
00858 #endif
00859 
00860 void QMCSlater::evaluate(Array1D<Array2D<double>*> &R,
00861                          int num, int start, int whichE)
00862 {
00863   if(getNumberElectrons() == 0)
00864     return;
00865 
00866   //if this iteration doesn't involve updating
00867   //one of our electrons, we can return
00868   if(whichE != -1)
00869     if(whichE < Start || whichE > Stop)
00870       return;
00871 
00872   static double averageM = 0, timeM = 0;
00873   static double averageB = 0, timeB = 0;
00874   static double numT = -5;
00875   static int nBF = WF->getNumberBasisFunctions();
00876   static int nOE = getNumberElectrons();
00877   static int mat_multiplier = 1000;
00878   static int bas_multiplier = 1000;
00879   // 1 add + 1 mul per nBF * nOE * nOE
00880   double numOps = 5*num*(nBF * nOE * nOE * 2.0 - nOE * nOE) / 1000.0;
00881   int aveB=0, aveM=0, aveF=0;
00882 
00883   Stopwatch sw = Stopwatch();
00884   sw.reset();
00885 
00886   timeB = 0; timeM = 0;
00887   for(int walker = 0; walker < num; walker++)
00888     {
00889       if(showTimings)
00890         {
00891           sw.reset(); sw.start();
00892         }
00893 
00894       int Chi_i = Xw.dim1() == 1 ? 0 : walker;
00895       int startE, stopE;
00896 
00897       allocateIteration(whichE,startE,stopE);
00898       
00899       BF->evaluateBasisFunctions(*R(walker+start),
00900                                  startE,stopE,
00901                                  Xw(Chi_i),
00902                                  (Xw_x(Chi_i))(0),
00903                                  (Xw_x(Chi_i))(1),
00904                                  (Xw_x(Chi_i))(2),
00905                                  Xw_xx(Chi_i));
00906 
00907       if(showTimings)
00908         {
00909           sw.stop();
00910           timeB += sw.timeUS();
00911           if(numT >= 0) averageB += sw.timeUS();
00912           sw.reset(); sw.start();
00913         }
00914 
00915       Array2D<qmcfloat> * coeffs = WF->getCoeff(isAlpha);
00916 
00917       Xw(Chi_i).gemm( *coeffs, Dw(walker),true);
00918       Xw_xx(Chi_i).gemm( *coeffs, Dw_xx(walker),true);
00919       (Xw_x(Chi_i))(0).gemm( *coeffs, Dw_x(walker,0),true);
00920       (Xw_x(Chi_i))(1).gemm( *coeffs, Dw_x(walker,1),true);
00921       (Xw_x(Chi_i))(2).gemm( *coeffs, Dw_x(walker,2),true);
00922 
00923       if (Input->flags.replace_electron_nucleus_cusps == 1)
00924         ElectronNucleusCusp.replaceCusps(*R(walker+start),
00925                                          startE,stopE,
00926                                          Dw(walker),
00927                                          Dw_x(walker,0),
00928                                          Dw_x(walker,1),
00929                                          Dw_x(walker,2),
00930                                          Dw_xx(walker));
00931 
00932       if(showTimings)
00933         {
00934           sw.stop();
00935           timeM += sw.timeUS();
00936           if(numT >= 0) averageM += sw.timeUS();
00937         }
00938     }
00939 
00940   for(int walker = 0; walker < num; walker++)
00941     {
00942       int Chi_i = Xw.dim1() == 1 ? 0 : walker;
00943 
00944       if (Input->flags.calculate_bf_density == 1)
00945         {
00946           Xw_Density(walker+start) = 0.0;
00947           for (int i=0; i<WF->getNumberBasisFunctions(); i++)
00948             for (int j=0; j<getNumberElectrons(); j++)
00949               {
00950                 (Xw_Density(walker))(i) += (Xw(Chi_i))(j,i);
00951               }
00952         }
00953     }
00954 
00955   if(showTimings)
00956     {
00957       //if you don't check this, your averages might be off
00958       //when switching from equilibration calls to production calls
00959       if(num>1) numT++;
00960       if(numT > 0)
00961         {
00962           aveB = (int)(averageB/(numT*bas_multiplier)+0.5);
00963           aveM = (int)(averageM/(numT*mat_multiplier)+0.5);
00964           aveF = (int)(numOps/(averageM/(numT*mat_multiplier))+0.5);
00965         }
00966       else
00967         {
00968           aveB = 0;
00969           aveM = 0;
00970           aveF = 0;
00971         }
00972 
00973       cout << "cpu bf: " << (int)(timeB/bas_multiplier+0.5) << " (" << aveB << ") ";
00974       cout << "mm: "     << (int)(timeM/mat_multiplier+0.5) << " (" << aveM << ") ";
00975       cout << "; mflops: " << (int)(mat_multiplier*numOps/timeM+0.5) <<
00976       " (" << aveF << ")\n";
00977     }
00978 }
00979 
00980 Array1D<double>* QMCSlater::getPsi(int i)
00981 {
00982   if(getNumberElectrons() != 0)
00983     return pointer_Dwc(i);
00984 
00985   if(Dwc.dim1() == 0)
00986     {
00987       Dwc.allocate(1);
00988       Dwc(0).allocate(WF->getNumberDeterminants());
00989       Dwc(0) = 1.0;
00990     }
00991   return &Dwc(0);
00992 }
00993 
00994 Array1D<double>* QMCSlater::getLaplacianPsiRatio(int i)
00995 {
00996   if(getNumberElectrons() != 0)
00997     return pointer_rDwc_xx(i);
00998   return 0;
00999 }
01000 
01001 Array3D<double>* QMCSlater::getGradPsiRatio(int i)
01002 {
01003   if(getNumberElectrons() != 0)
01004     return pointer_rDwc_x(i);
01005 
01006   return 0;
01007 }
01008 
01009 Array1D<double>* QMCSlater::getChiDensity(int i)
01010 {
01011   if(getNumberElectrons() != 0)
01012     return &Xw_Density(i);
01013 
01014   return 0;
01015 }
01016 
01017 bool QMCSlater::isSingular(int j)
01018 {
01019   if(getNumberElectrons() == 0)
01020     return false;
01021 
01022   for (int i=0; i<WF->getNumberDeterminants(); i++)
01023     {
01024       if (Singular(j)(i)) return true;
01025     }
01026   return false;
01027 }