00001
00002
00003
00004
00005
00006
00007
00008 #include "GPUQMCMatrix.h"
00009
00010 #ifdef QMC_GPU
00011
00012
00013
00014 static const bool PRINT_SHADER = false;
00015 static const int TIMING_REPS = 10;
00016 static const bool INT_FINISHES = false;
00017
00018
00033 static int LOOPS_PER_PASS = 150;
00034 static const bool UNROLL_LOOP = true;
00035 static const bool USE_CHEAPER = true;
00036 static const bool USE_TRIANGLES = true;
00037 static const bool CHEAPER_FIRST = false;
00038 static const bool ALL_HALF = false;
00039
00040 static const int SEPARATE_MAD = 1;
00041 static const int TOGETHER_MAD = 2;
00042 static const int KAHAN_SUMS = 3;
00043 static const int EXPERIMENTAL = 4;
00044 static const int HOW_TO_ADD = SEPARATE_MAD;
00045
00052 static const int MRT_W = 2;
00053 static const int MRT_H = 2;
00054
00055 static const bool REUSE_SHADERS = false;
00056 static bool shadersCreated = false;
00057
00058
00059
00061 #if UNROLL_LOOP
00062 static const CGprofile matrixProfile = CG_PROFILE_FP30;
00063 #else
00064 static const CGprofile matrixProfile = CG_PROFILE_FP40;
00065 #endif
00066
00074 #if ALL_HALF
00075 #define TEXTURE_INTERNAL_FORMAT GL_FLOAT_RGBA32_NV
00076 #define TEXTURE_TARGET GL_TEXTURE_RECTANGLE_NV
00077 #else
00078 #define TEXTURE_INTERNAL_FORMAT GL_FLOAT_RGBA32_NV
00079 #define TEXTURE_TARGET GL_TEXTURE_RECTANGLE_NV
00080 #endif
00081
00082
00083
00084
00085 vector<CGprogram> GPUQMCMatrix::fp;
00086 vector<CGparameter> GPUQMCMatrix::tELxBF;
00087 vector<CGparameter> GPUQMCMatrix::tORxBF;
00088 vector< vector<CGparameter> > GPUQMCMatrix::tELxOR;
00089
00090 GPUQMCMatrix::GPUQMCMatrix()
00091 {
00092 nOrbitals = 0;
00093 nBasisF = 0;
00094 nRows = 0;
00095 nCols = 0;
00096 resultsIn = -1;
00097 }
00098
00099 GPUQMCMatrix::GPUQMCMatrix(QMCInput *INPUT, Array1D< Array2D<qmcfloat> > & Coeffs, int numCalcs)
00100 {
00101 Input = INPUT;
00102
00103
00104
00105 getFactors(numCalcs,nRows,nCols);
00106 nDets = Coeffs.dim1();
00107 nOrbitals = Coeffs(0).dim1();
00108 nBasisF = Coeffs(0).dim2();
00109
00110 deltaBF = (int)(nBasisF/2.0);
00111 deltaOE = (int)(nOrbitals/2.0);
00112 if(nBasisF%2 != 0) deltaBF += 1;
00113 if(nOrbitals%2 != 0) deltaOE += 1;
00114
00115 numLoops = deltaBF;
00116 numPasses = (int)(ceil((double)numLoops/LOOPS_PER_PASS));
00117
00118 deltaW = (int)(deltaOE/MRT_W);
00119 deltaH = (int)(deltaOE/MRT_H);
00120
00121 if(MRT_W > 1 || MRT_H > 1)
00122 {
00123
00124 if(nOrbitals%(MRT_W*2) != 0 || nOrbitals%(MRT_H*2) != 0)
00125 cerr << "Error: MRT fringe cases not implemented\n";
00126 }
00127 if(MRT_H*MRT_W < 1 || MRT_H*MRT_W > 4)
00128 cerr << "Error: 1 <= MRT_W*MRT_H <= 4 (max 4 render textures)\n";
00129
00130
00131
00132
00133 cpuData = new GLfloat[ nCols*deltaOE * nRows*nMats*deltaOE * 4 ];
00134
00135 int rebigulator = deltaBF*deltaOE*4;
00136 pixelData = (GLfloat *) calloc( nCols*deltaOE * nRows*nMats*deltaOE * 4 + rebigulator, sizeof(GLfloat) );
00137
00138
00139 gpuDataFB = new GPUQMCFramebuffer[nDets];
00140 for(int i=0; i<nDets; i++)
00141 {
00142 gpuDataFB[i].initialize(nCols*deltaW, nRows*nMats*deltaH, 2, MRT_H*MRT_W);
00143 }
00144 getOpenGLError("Error creating framebuffers");
00145
00146
00147 coTexID = new GLuint[nDets];
00148 glGenTextures(nDets, coTexID);
00149
00150 ArrayGPU coeffA = ArrayGPU(1);
00151 for(int i=0; i<nDets; i++)
00152 {
00153 coeffA(0) = Coeffs(i);
00154 loadData(coeffA, coTexID[i], 1, 1);
00155 }
00156 for(int i=0; i<coeffA.dim1(); i++)
00157 coeffA(i).deallocate();
00158 coeffA.deallocate();
00159 getOpenGLError("Error loading Coeffs");
00160
00161
00162 if(fp.empty())
00163 loadShaders();
00164
00165
00166 resultsIn = -1;
00167 }
00168
00169 int GPUQMCMatrix::runCalculation(int numCalcs, GLuint bfInput)
00170 {
00171 if(numCalcs == 0) cerr << "ERROR: numCalcs can not be zero\n";
00172 resultsIn = -1;
00173 getFactors(numCalcs,nRows,nCols);
00174
00175 #ifdef PRINT_TIMINGS
00176 Stopwatch sw = Stopwatch();
00177 double temp;
00178 sw.reset(); sw.start();
00179 for(int numReps=0; numReps<TIMING_REPS; numReps++)
00180 {
00181 #endif
00182
00183 for(int iDet=0; iDet<nDets; iDet++)
00184 {
00185
00186 gpuDataFB[iDet].cleanAllBuffers();
00187 gpuDataFB[iDet].drawTo(0);
00188
00189 cgGLEnableProfile(matrixProfile);
00190 for(int passIndex=0; passIndex<numPasses; passIndex++)
00191 {
00192 cgGLSetTextureParameter(tELxBF[passIndex], bfInput);
00193 cgGLSetTextureParameter(tORxBF[passIndex], coTexID[iDet]);
00194
00195 cgGLEnableTextureParameter(tELxBF[passIndex]);
00196 cgGLEnableTextureParameter(tORxBF[passIndex]);
00197 }
00198
00199 int maxs = gpuDataFB[iDet].getWidth();
00200 int maxt = gpuDataFB[iDet].getHeight();
00201
00202 glEnable(GL_SCISSOR_TEST);
00203 glScissor( 0, 0, maxs, maxt);
00204
00205 for(int i=0; i<numPasses; i++)
00206 {
00207 gpuDataFB[iDet].drawTo(i%2);
00208
00209 int whichRT;
00210 for(int mrth=0; mrth<MRT_H; mrth++)
00211 {
00212 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00213 {
00214 whichRT = mrth*MRT_W + mrtw;
00215 cgGLSetTextureParameter(tELxOR[i][whichRT], gpuDataFB[iDet].getTextureID((i+1)%2,whichRT));
00216 cgGLEnableTextureParameter(tELxOR[i][whichRT]);
00217 }
00218 }
00219
00220 cgGLBindProgram(fp[i]);
00221
00222 if(USE_TRIANGLES)
00223 {
00224 glBegin(GL_TRIANGLES);
00225 glTexCoord2f(0.0f, 0.0f); glVertex2f(-1.0f, -1.0f);
00226 glTexCoord2f(0.0f, maxt*2); glVertex2f(-1.0f, 3.0f);
00227 glTexCoord2f(maxs*2, 0.0f); glVertex2f(3.0f, -1.0f);
00228 glEnd();
00229 }
00230 else
00231 {
00232 glBegin(GL_QUADS);
00233 glTexCoord2f(0.0, 0.0 ); glVertex2f(-1.0, -1.0);
00234 glTexCoord2f(0.0, maxt); glVertex2f(-1.0, 1.0);
00235 glTexCoord2f(maxs, maxt); glVertex2f( 1.0, 1.0);
00236 glTexCoord2f(maxs, 0.0 ); glVertex2f( 1.0, -1.0);
00237 glEnd();
00238 }
00239 }
00240
00241 gpuDataFB[iDet].drawTo(0);
00242
00243 cgGLDisableProfile(matrixProfile);
00244 for(int passIndex=0; passIndex<numPasses; passIndex++)
00245 {
00246 cgGLDisableTextureParameter(tELxBF[passIndex]);
00247 cgGLDisableTextureParameter(tORxBF[passIndex]);
00248 for(int mrt=0; mrt<MRT_H*MRT_W; mrt++)
00249 {
00250 cgGLDisableTextureParameter(tELxOR[passIndex][mrt]);
00251 }
00252 }
00253 }
00254
00255 glDisable(GL_SCISSOR_TEST);
00256 glFlush();
00257
00258 if(INT_FINISHES)
00259 {
00260 glFinish();
00261 }
00262
00263
00264 #ifdef PRINT_TIMINGS
00265
00266 }
00267 sw.stop();
00268 temp = (double)sw.timeMS()/TIMING_REPS;
00269 printf(" mm_cg: %7.2f", temp );
00270 #endif
00271
00272 resultsIn = (numPasses+1)%2;
00273
00274 GET_GLERROR("Error in QMC matrix multiply");
00275 return 0;
00276 }
00277
00278 void GPUQMCMatrix::loadData(ArrayGPU & data, GLuint & textureID, int numRows, int numCols)
00279 {
00280 glBindTexture(TEXTURE_TARGET, textureID);
00281 mapData(data,numRows,numCols);
00282 glTexImage2D(TEXTURE_TARGET, 0, TEXTURE_INTERNAL_FORMAT,
00283 numCols*deltaBF, numRows*deltaOE, 0, GL_RGBA, GL_FLOAT,
00284 pixelData);
00285 glTexParameterf(TEXTURE_TARGET, GL_TEXTURE_WRAP_S, GL_CLAMP);
00286 glTexParameterf(TEXTURE_TARGET, GL_TEXTURE_WRAP_T, GL_CLAMP);
00287 glTexParameterf(TEXTURE_TARGET, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
00288 glTexParameterf(TEXTURE_TARGET, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
00289 }
00290
00291 inline int GPUQMCMatrix::operandMapping(int r, int c, int i, int j, int numCols)
00292 {
00298 return 4*( (r*deltaOE + i)*numCols*deltaBF + (c*deltaBF + j) );
00299 }
00300
00301 void GPUQMCMatrix::mapData(ArrayGPU & data, int numRows, int numCols)
00302 {
00303 int h = deltaOE;
00304 int w = deltaBF;
00305
00306 GLfloat fringe = 0.0;
00307 Array2D<qmcfloat> * matrix;
00308
00309
00310 for(int c = 0; c < numCols; c++)
00311 {
00312 for(int r = 0; r < numRows; r++)
00313 {
00314 matrix = & data( c*numRows + r);
00315
00316 int i, j, index;
00317 for(i=0; i<h-1; i++)
00318 {
00319 for(j=0; j<w-1; j++)
00320 {
00321 index = operandMapping(r,c,i,j,numCols);
00322 pixelData[index ] = matrix->get( i*2 , j*2 );
00323 pixelData[index +1] = matrix->get( i*2 , j*2+1 );
00324 pixelData[index +2] = matrix->get( i*2+1 , j*2 );
00325 pixelData[index +3] = matrix->get( i*2+1 , j*2+1 );
00326 }
00327 }
00328
00329 j = w-1;
00330 if(nBasisF%2 != 0)
00331 {
00332 for(int i=0; i<h; i++)
00333 {
00334 index = operandMapping(r,c,i,j,numCols);
00335 pixelData[index ] = matrix->get( i*2 , j*2 );
00336 pixelData[index +1] = fringe;
00337 if(nOrbitals%2 == 0 || i<h-1)
00338 pixelData[index +2] = matrix->get( i*2+1 , j*2 );
00339 else
00340 pixelData[index +2] = fringe;
00341 pixelData[index +3] = fringe;
00342 }
00343 }
00344 else
00345 {
00346 for(int i=0; i<h; i++)
00347 {
00348 index = operandMapping(r,c,i,j,numCols);
00349 pixelData[index ] = matrix->get( i*2 , j*2 );
00350 pixelData[index +1] = matrix->get( i*2 , j*2+1 );
00351 if(nOrbitals%2 == 0 || i<h-1)
00352 {
00353 pixelData[index +2] = matrix->get( i*2+1 , j*2 );
00354 pixelData[index +3] = matrix->get( i*2+1 , j*2+1 );
00355 }
00356 else
00357 {
00358 pixelData[index +2] = fringe;
00359 pixelData[index +3] = fringe;
00360 }
00361 }
00362 }
00363
00364 i = h-1;
00365 if(nOrbitals%2 != 0)
00366 {
00367 for(int j=0; j<w; j++)
00368 {
00369 index = operandMapping(r,c,i,j,numCols);
00370 pixelData[index ] = matrix->get( i*2 , j*2 );
00371 if(nBasisF%2 == 0 || j<w-1)
00372 pixelData[index +1] = matrix->get( i*2 , j*2+1 );
00373 else
00374 pixelData[index +1] = fringe;
00375 pixelData[index +2] = fringe;
00376 pixelData[index +3] = fringe;
00377 }
00378 }
00379 else
00380 {
00381 for(int j=0; j<w; j++)
00382 {
00383 index = operandMapping(r,c,i,j,numCols);
00384 pixelData[index ] = matrix->get( i*2 , j*2 );
00385 pixelData[index +2] = matrix->get( i*2+1 , j*2 );
00386 if(nBasisF%2 == 0 || j<w-1)
00387 {
00388 pixelData[index +1] = matrix->get( i*2 , j*2+1 );
00389 pixelData[index +3] = matrix->get( i*2+1 , j*2+1 );
00390 }
00391 else
00392 {
00393 pixelData[index +1] = fringe;
00394 pixelData[index +3] = fringe;
00395 }
00396 }
00397 }
00398 }
00399 }
00400
00401 if(numCols*numRows > 0 && false)
00402 {
00403 cout << "loaded by matrix multiply\n";
00404 PrintRGBAPixelsBoxE(pixelData,numCols*deltaBF,numRows*deltaOE,20,20,-1,-1,true);
00405 }
00406
00407
00408
00409
00410
00411 }
00412
00413 void GPUQMCMatrix::getResults(Array2D< Array2D<float>* > & results)
00414 {
00415 if(resultsIn < 0)
00416 cerr << "Error: call runCalculation first!\n";
00417
00418 Array2D<float> * fetch;
00419 int subSize = 4 * nCols*deltaW * nRows*nMats*deltaH;
00420
00421 if(results.dim1() != nDets)
00422 {
00423 cout << "results data structure has incorrect dimensions";
00424 return;
00425 }
00426
00427 for(int iDet=0; iDet<nDets; iDet++)
00428 {
00429
00430 #ifdef PRINT_TIMINGS
00431 Stopwatch sw = Stopwatch();
00432 sw.reset(); sw.start();
00433 for(int numReps=0; numReps<TIMING_REPS; numReps++)
00434 {
00435 #endif
00436
00437 for(int mrth=0; mrth<MRT_H; mrth++)
00438 {
00439 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00440 {
00441 int whichRT = mrth*MRT_W + mrtw;
00442 gpuDataFB[iDet].readFrom(resultsIn, whichRT );
00443 glReadPixels(0, 0, nCols*deltaW, nRows*nMats*deltaH, GL_RGBA, GL_FLOAT,
00444 (GLfloat *)(cpuData + whichRT*subSize));
00445 }
00446 }
00447
00448 #ifdef PRINT_TIMINGS
00449
00450 }
00451 sw.stop();
00452 double temp = (double)sw.timeMS()/TIMING_REPS;
00453 printf(" mm_reading: %7.2f", temp );
00454
00455 sw.reset(); sw.start();
00456 for(int numReps=0; numReps<TIMING_REPS; numReps++)
00457 {
00458 #endif
00459
00460 int rstart, cstart, index;
00461 for(int r=0; r<nRows*nMats; r++)
00462 {
00463 rstart = r*deltaH;
00464 for(int c=0; c<nCols; c++)
00465 {
00466 cstart = c*deltaW;
00467
00468
00469 fetch = results(iDet,c*nRows*nMats + r);
00470
00471
00472
00473
00474
00475
00476
00477 int i, j;
00478 for(i=0; i<deltaOE-1; i++)
00479 {
00480 for(j=0; j<deltaOE-1; j++)
00481 {
00482 index = 4*( (rstart + i%deltaH)*nCols*deltaW + (cstart + j%deltaW) );
00483 index += subSize*( (int)(i/deltaH)*MRT_W + (int)(j/deltaW) );
00484 (*fetch)(i*2,j*2) = cpuData[index ];
00485 (*fetch)(i*2,j*2+1) = cpuData[index +1];
00486 (*fetch)(i*2+1,j*2) = cpuData[index +2];
00487 (*fetch)(i*2+1,j*2+1) = cpuData[index +3];
00488 }
00489 }
00490
00491
00492 j = deltaOE-1;
00493 if(nOrbitals%2 != 0)
00494 {
00495 for(i=0; i<deltaOE; i++)
00496 {
00497 index = 4*( (rstart + i%deltaH)*nCols*deltaW + (cstart + j%deltaW) );
00498 index += subSize*( (int)(i/deltaH)*MRT_W + (int)(j/deltaW) );
00499 (*fetch)(i*2,j*2) = cpuData[index ];
00500 if(i*2+1 < nOrbitals)
00501 (*fetch)(i*2+1,j*2) = cpuData[index +2];
00502 }
00503 }
00504 else
00505 {
00506 for(i=0; i<deltaOE; i++)
00507 {
00508 index = 4*( (rstart + i%deltaH)*nCols*deltaW + (cstart + j%deltaW) );
00509 index += subSize*( (int)(i/deltaH)*MRT_W + (int)(j/deltaW) );
00510 (*fetch)(i*2,j*2) = cpuData[index ];
00511 (*fetch)(i*2,j*2+1) = cpuData[index +1];
00512 if(i*2+1 < nOrbitals)
00513 {
00514 (*fetch)(i*2+1,j*2) = cpuData[index +2];
00515 (*fetch)(i*2+1,j*2+1) = cpuData[index +3];
00516 }
00517 }
00518 }
00519
00520
00521 i = deltaOE-1;
00522 if(nOrbitals%2 != 0)
00523 {
00524 for(j=0; j<deltaOE; j++)
00525 {
00526 index = 4*( (rstart + i%deltaH)*nCols*deltaW + (cstart + j%deltaW) );
00527 index += subSize*( (int)(i/deltaH)*MRT_W + (int)(j/deltaW) );
00528 (*fetch)(i*2,j*2) = cpuData[index ];
00529 if(j*2+1 < nOrbitals)
00530 (*fetch)(i*2,j*2+1) = cpuData[index +1];
00531 }
00532 }
00533 else
00534 {
00535 for(j=0; j<deltaOE; j++)
00536 {
00537 index = 4*( (rstart + i%deltaH)*nCols*deltaW + (cstart + j%deltaW) );
00538 index += subSize*( (int)(i/deltaH)*MRT_W + (int)(j/deltaW) );
00539 (*fetch)(i*2,j*2) = cpuData[index ];
00540 (*fetch)(i*2+1,j*2) = cpuData[index +2];
00541 if(j*2+1 < nOrbitals)
00542 {
00543 (*fetch)(i*2,j*2+1) = cpuData[index +1];
00544 (*fetch)(i*2+1,j*2+1) = cpuData[index +3];
00545 }
00546 }
00547 }
00548
00549
00550
00551
00552
00553
00554
00555 }
00556 }
00557
00558 #ifdef PRINT_TIMINGS
00559
00560 }
00561 sw.stop();
00562 temp = (double)sw.timeMS()/TIMING_REPS;
00563 printf(" mm_copying: %7.2f\n", temp );
00564 #endif
00565
00566 }
00567
00568
00569
00570
00571
00572
00573
00574
00575 }
00576
00577 string GPUQMCMatrix::generateShader(int start, int stop, bool isLastPass)
00578 {
00595
00596 string outputAccumulator =
00597 "struct outputType { \n";
00598
00599 string inputAccumulator;
00600
00601 string loadAccumulator;
00602
00603 string abAccumulator;
00604
00605 string coordAccumulator;
00606
00607
00608 string outputSingle =
00609 " float4 oIIJJ : COLORKK; \n";
00610 string inputSingle =
00611 " uniform samplerRECT accumIIJJ, \n";
00612 string loadSingle =
00613 " output.oIIJJ = texRECT(accumIIJJ, position); \n";
00614 string abSingle =
00615 " float4 LETTERII; \n";
00616 string coordSingle =
00617 " temptype2 coordKK = temptype2(LL*DELTAW,LL*DELTAH); \n";
00618
00619 for(int mrth=0; mrth<MRT_H; mrth++)
00620 {
00621 abAccumulator += abSingle;
00622 findandreplace(abAccumulator,"LETTER", "a");
00623 findandreplace(abAccumulator,"II", mrth+1);
00624 }
00625 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00626 {
00627 abAccumulator += abSingle;
00628 findandreplace(abAccumulator,"LETTER", "b");
00629 findandreplace(abAccumulator,"II", mrtw+1);
00630 }
00631
00632 for(int mrth=0; mrth<MRT_H; mrth++)
00633 {
00634 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00635 {
00636 loadAccumulator += loadSingle;
00637 inputAccumulator += inputSingle;
00638 outputAccumulator += outputSingle;
00639 coordAccumulator += coordSingle;
00640 findandreplace(loadAccumulator,"II", mrth+1);
00641 findandreplace(inputAccumulator,"II", mrth+1);
00642 findandreplace(outputAccumulator,"II", mrth+1);
00643 findandreplace(outputAccumulator,"JJ", mrtw+1);
00644 findandreplace(inputAccumulator,"JJ", mrtw+1);
00645 findandreplace(loadAccumulator,"JJ", mrtw+1);
00646 findandreplace(outputAccumulator,"KK", mrth*MRT_W + mrtw);
00647 findandreplace(coordAccumulator,"KK", mrth*MRT_W + mrtw + 1);
00648 findandreplace(coordAccumulator,"LL", mrth*MRT_W + mrtw);
00649 }
00650 }
00651 outputAccumulator +=
00652 "}; \n\n";
00653
00654
00655 string shader;
00656 shader += outputAccumulator;
00657 shader +=
00658 "outputType main( \n"
00659 " uniform samplerRECT elbf, \n"
00660 " uniform samplerRECT oebf, \n";
00661 shader += inputAccumulator;
00662 shader +=
00663 " in temptype2 position : TEX0) \n"
00664 "{ \n"
00665 " int c = position.x/DELTAW; \n"
00666 " ROW_SHIFTER1 \n"
00667 " temptype4 coord = \n"
00668 " {index + c*deltaBF, position.y ROW_SHIFTER2, \n"
00669 " index, fmod(position.x, DELTAW)}; \n"
00670 " outputType output; \n";
00671 shader += coordAccumulator;
00672 shader += abAccumulator;
00673 shader += loadAccumulator;
00674
00675
00676
00677 if(MRT_H != 1)
00678 {
00679 findandreplace(shader,"ROW_SHIFTER1", "int r = position.y/DELTAH;");
00680 findandreplace(shader,"ROW_SHIFTER2", "+ r*DELTAH");
00681 }
00682 else
00683 {
00684 findandreplace(shader,"ROW_SHIFTER1", "");
00685 findandreplace(shader,"ROW_SHIFTER2", "");
00686 }
00687
00688
00689 if(HOW_TO_ADD == KAHAN_SUMS)
00690 {
00691 shader +=
00692 " float4 T=0, C=0, Y=0; \n";
00693 }
00694
00695 string mad;
00696 switch(HOW_TO_ADD)
00697 {
00698
00699 case SEPARATE_MAD:
00700 {
00701 mad =
00702 " output.oIIJJ += aII.xxzz*bJJ.xzxz; \n"
00703 " output.oIIJJ += aII.yyww*bJJ.ywyw; \n";
00704 break;
00705 }
00706
00707 case TOGETHER_MAD:
00708 {
00709 mad =
00710 " output.oIIJJ += aII.xxzz*bJJ.xzxz + aII.yyww*bJJ.ywyw; \n";
00711 break;
00712 }
00713
00714 case KAHAN_SUMS:
00715 {
00716 mad =
00717 " Y = aII.xxzz*bJJ.xzxz - C; \n"
00718 " T = output.oIIJJ + Y; \n"
00719 " C = (T - output.oIIJJ) - Y; \n"
00720 " output.oIIJJ = T; \n"
00721 " Y = aII.yyww*bJJ.ywyw - C; \n"
00722 " T = output.oIIJJ + Y; \n"
00723 " C = (T - output.oIIJJ) - Y; \n"
00724 " output.oIIJJ = T; \n";
00725 break;
00726 }
00727
00728 case EXPERIMENTAL:
00729 {
00730 mad =
00731 " output.oIIJJ += aII.xxzz*bJJ.xzxz; \n"
00732 " output.oIIJJ += aII.yyww*bJJ.ywyw; \n";
00733 break;
00734 }
00735 }
00736
00737 string aLoader =
00738 " aII = texRECT(elbf, coord.xy + float2(0,coordII.y)); \n";
00739 string bLoader =
00740 " bJJ = texRECT(oebf, coord.zw + float2(0,coordJJ.x)); \n";
00741 string innerLooper =
00742 " coord += temptype4(1,0,1,0); \n";
00743
00744
00745
00746
00747 bool bNeeded = true;
00748 if(UNROLL_LOOP)
00749 {
00750 for(int i=start; i<stop; i++)
00751 {
00752 bNeeded = true;
00753 for(int mrth=0; mrth<MRT_H; mrth++)
00754 {
00755 shader += aLoader;
00756 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00757 {
00758 if(bNeeded)
00759 {
00760 shader += bLoader;
00761 }
00762 shader += innerLooper;
00763 shader += mad;
00764 findandreplace(shader,"II", mrth+1);
00765 findandreplace(shader,"JJ", mrtw+1);
00766 }
00767 bNeeded = false;
00768 }
00769 }
00770 }
00771 else
00772 {
00773 shader +=
00774 " while(coord.z < stopOps) { \n";
00775 for(int mrth=0; mrth<MRT_H; mrth++)
00776 {
00777 shader += aLoader;
00778 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00779 {
00780 if(bNeeded)
00781 {
00782 shader += bLoader;
00783 }
00784 shader += mad;
00785 findandreplace(shader,"II", mrth+1);
00786 findandreplace(shader,"JJ", mrtw+1);
00787 }
00788 bNeeded = false;
00789 }
00790 shader += innerLooper;
00791 shader +=
00792 " } \n";
00793 findandreplace(shader,"startOps", start);
00794 findandreplace(shader,"stopOps", stop);
00795 }
00796
00797 if(!isLastPass && HOW_TO_ADD == KAHAN_SUMS){
00798 shader +=
00799 " output.o11 -= C; \n";
00800 }
00801
00802
00803 shader +=
00804 " return output; \n"
00805 "} \n";
00806
00807 findandreplace(shader,"index", start);
00808 findandreplace(shader,"deltaOE", deltaOE);
00809 findandreplace(shader,"deltaBF", deltaBF);
00810 findandreplace(shader,"DELTAW", deltaW);
00811 findandreplace(shader,"DELTAH", deltaH);
00812 findandreplace(shader,"MRTH", MRT_H);
00813 findandreplace(shader,"MRTW", MRT_W);
00814
00815
00816
00817 if(USE_CHEAPER)
00818 {
00819 findandreplace(shader,"temptype", "half");
00820 }
00821 else
00822 {
00823 findandreplace(shader,"temptype", "float");
00824 }
00825
00826
00827 if(ALL_HALF)
00828 {
00829 findandreplace(shader,"float", "half");
00830 }
00831
00832 if(PRINT_SHADER)
00833 {
00834 cout << "shader is:\n" << shader << endl;
00835 getchar();
00836 }
00837 return shader;
00838 }
00839
00840
00841 int GPUQMCMatrix::getNumIterations()
00842 {
00843 #ifdef PRINT_TIMINGS
00844 return TIMING_REPS;
00845 #else
00846 return 1;
00847 #endif
00848 }
00849
00850 void GPUQMCMatrix::loadShaders()
00851 {
00852 tELxOR.resize(numPasses);
00853
00854 for(int i=0; i<numPasses; i++)
00855 {
00856 char shaderName[256];
00857 sprintf(shaderName,"shader_matmul.%d.%d.%d_pass%d.cg-asm",LOOPS_PER_PASS,deltaOE,deltaBF,i);
00858
00859 if(!shadersCreated && !REUSE_SHADERS)
00860 {
00861
00862 string generatedShader = generateShader(i*LOOPS_PER_PASS,
00863 i<numPasses-1?(i+1)*LOOPS_PER_PASS:numLoops,
00864 i==numPasses-1?true:false);
00865
00866
00867 char cgName[256];
00868 sprintf(cgName,"shader_matmul.%d.%d.%d_pass%d.cg",LOOPS_PER_PASS,deltaOE,deltaBF,i);
00869 writeShader(generatedShader.c_str(),cgName);
00870
00871
00872 fp.push_back(cgCreateProgram(g_cgContext, CG_SOURCE,
00873 generatedShader.c_str(),
00874 matrixProfile, "main", NULL));
00875
00876
00877 if(fp[i])
00878 writeShader(cgGetProgramString(fp[i],CG_COMPILED_PROGRAM),shaderName);
00879
00880 }
00881 else
00882 {
00883
00884
00885 fp.push_back(cgCreateProgramFromFile(g_cgContext, CG_OBJECT,
00886 shaderName,
00887 matrixProfile, "main", NULL));
00888
00889 }
00890
00891 if(!fp[i])
00892 {
00893 cerr << "ERROR: Matrix multiply shader did not compile.\n";
00894 cerr << " : LOOPS_PER_PASS is set to " << LOOPS_PER_PASS << endl;
00895 exit(1);
00896 }
00897
00898
00899
00900
00901 string accumIIJJ = "accumIIJJ";
00902 string temp;
00903 for(int mrth=0; mrth<MRT_H; mrth++)
00904 {
00905 for(int mrtw=0; mrtw<MRT_W; mrtw++)
00906 {
00907 temp = accumIIJJ;
00908 findandreplace(temp,"II", mrth+1);
00909 findandreplace(temp,"JJ", mrtw+1);
00910 tELxOR[i].push_back(cgGetNamedParameter(fp[i], temp.c_str()));
00911 }
00912 }
00913 tELxBF.push_back(cgGetNamedParameter(fp[i], "elbf"));
00914 tORxBF.push_back(cgGetNamedParameter(fp[i], "oebf"));
00915
00916 cgGLLoadProgram(fp[i]);
00917 }
00918
00919 shadersCreated = true;
00920 }
00921
00922 void GPUQMCMatrix::destroy()
00923 {
00924 for(int i=0; i<nDets; i++)
00925 {
00926
00927 }
00928
00929 glDeleteTextures(1, coTexID);
00930 delete [] gpuDataFB;
00931 delete [] cpuData;
00932 delete [] pixelData;
00933 }
00934 #endif