diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index eb8c6d386c..2e03817a09 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -572,6 +572,32 @@ void MemoryManager::Audit(std::string s) } +// void MemoryManager::PrintStateNew(void* _CpuPtr) +// { +// uint64_t CpuPtr = (uint64_t)_CpuPtr; + +// if ( EntryPresent(CpuPtr) ){ +// auto AccCacheIterator = EntryLookup(CpuPtr); +// auto & AccCache = AccCacheIterator->second; +// std::string str; +// if ( AccCache.state==Empty ) str = std::string("Empty"); +// if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); +// if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); +// if ( AccCache.state==Consistent)str = std::string("Consistent"); +// if ( AccCache.state==EvictNext) str = std::string("EvictNext"); + +// std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<::value,void>::type * = nullptr> auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { + GRID_TRACE("Cshift"); return Cshift(closure(expr),dim,shift); } NAMESPACE_END(Grid); diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index fdb98cd4e3..ffd83acad0 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -31,21 +31,73 @@ NAMESPACE_BEGIN(Grid); extern std::vector > Cshift_table; extern deviceVector > Cshift_table_device; +extern std::vector Cshift_vector; +extern deviceVector Cshift_vector_device; -inline std::pair *MapCshiftTable(void) +// Copy Cshift map object (table or vector) to device +template +inline void MapCshiftCopy(std::vector &Cshift_obj, deviceVector &Cshift_obj_device) { - // GPU version - uint64_t sz=Cshift_table.size(); - if (Cshift_table_device.size()!=sz ) { - Cshift_table_device.resize(sz); + // GPU version only + uint64_t sz=Cshift_obj.size(); + if (Cshift_obj_device.size()!=sz ) { + Cshift_obj_device.resize(sz); } - acceleratorCopyToDevice((void *)&Cshift_table[0], - (void *)&Cshift_table_device[0], - sizeof(Cshift_table[0])*sz); + acceleratorCopyToDevice((void *)&Cshift_obj[0], + (void *)&Cshift_obj_device[0], + sizeof(Cshift_obj[0])*sz); - return &Cshift_table_device[0]; - // CPU version use identify map } + +// Copy Cshift map object (table or vector) to device and return pointer to device copy +template +inline vobj *MapCshift(std::vector &Cshift_obj, deviceVector &Cshift_obj_device) +{ + MapCshiftCopy(Cshift_obj, Cshift_obj_device); + + return &Cshift_obj_device[0]; +} + +// Calculate Cshift_vector +template +void CalculateCshiftVector(Lattice &ret, const Lattice &rhs, int dimension, int cbmask) +{ + GridBase *grid = rhs.Grid(); + + if ( !grid->CheckerBoarded(dimension) ) { + cbmask=0x3; + } + + int e1=grid->_slice_nblock[dimension]; // clearly loop invariant for icpc + int e2=grid->_slice_block[dimension]; + int stride = grid->_slice_stride[dimension]; + + if (Cshift_vector.size() < e1*e2) Cshift_vector.resize(e1*e2); // Let it grow to biggest + + int ent = 0; + if(cbmask == 0x3 ){ + for(int n=0;nCheckerBoardFromOindex(o); + if ( ocb&cbmask ) { + Cshift_vector[ent++] = o; + } + } + } + } + + if (ent < Cshift_vector.size()) Cshift_vector.resize(ent); // trim vector to actual size (relevant for checkerboarded dimensions) +} + + /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -89,7 +141,7 @@ Gather_plane_simple (const Lattice &rhs,deviceVector &buffer,int dim } { auto buffer_p = & buffer[0]; - auto table = MapCshiftTable(); + auto table = MapCshift >(Cshift_table, Cshift_table_device); autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); @@ -201,7 +253,7 @@ template void Scatter_plane_simple (Lattice &rhs,deviceVector< { auto buffer_p = & buffer[0]; - auto table = MapCshiftTable(); + auto table = MapCshift >(Cshift_table, Cshift_table_device); autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); @@ -263,94 +315,36 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA template void Copy_plane(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask) { - int rd = rhs.Grid()->_rdimensions[dimension]; - if ( !rhs.Grid()->CheckerBoarded(dimension) ) { - cbmask=0x3; - } + GRID_TRACE("Copy_plane"); int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane - int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc - int e2=rhs.Grid()->_slice_block[dimension]; - int stride = rhs.Grid()->_slice_stride[dimension]; - - if(Cshift_table.size()(lo+o,ro+o); - } - } - } else { - for(int n=0;nCheckerBoardFromOindex(o); - if ( ocb&cbmask ) { - Cshift_table[ent++] = std::pair(lo+o,ro+o); - } - } - } - } - - { - auto table = MapCshiftTable(); - autoView(rhs_v , rhs, AcceleratorRead); - autoView(lhs_v , lhs, AcceleratorWrite); - accelerator_for(i,ent,vobj::Nsimd(),{ - coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); - }); - } + auto table = &Cshift_vector_device[0]; + tracePush("copy_plane-av"); + autoView(rhs_v , rhs, AcceleratorRead); + autoView(lhs_v , lhs, AcceleratorWrite); + tracePush("copy_plane-acc_for"); + accelerator_for(i,Cshift_vector.size(),vobj::Nsimd(),{ + coalescedWrite(lhs_v[table[i]+lo],coalescedRead(rhs_v[table[i]+ro])); + }); + tracePop("copy_plane-acc_for"); + tracePop("copy_plane-av"); } template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { - int rd = rhs.Grid()->_rdimensions[dimension]; - - if ( !rhs.Grid()->CheckerBoarded(dimension) ) { - cbmask=0x3; - } int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane - int e1=rhs.Grid()->_slice_nblock[dimension]; - int e2=rhs.Grid()->_slice_block [dimension]; - int stride = rhs.Grid()->_slice_stride[dimension]; - - if(Cshift_table.size()(lo+o+b,ro+o+b); - }} - } else { - for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); - }} - } - - { - auto table = MapCshiftTable(); - autoView( rhs_v, rhs, AcceleratorRead); - autoView( lhs_v, lhs, AcceleratorWrite); - accelerator_for(i,ent,1,{ - permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); + auto table = &Cshift_vector_device[0]; + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + accelerator_for(i,Cshift_vector.size(),1,{ + permute(lhs_v[table[i]+lo],rhs_v[table[i]+ro],permute_type); }); - } } ////////////////////////////////////////////////////// @@ -373,6 +367,7 @@ template void Cshift_local(Lattice& ret,const Lattice &r template void Cshift_local(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { + GRID_TRACE("Cshift_local"); GridBase *grid = rhs.Grid(); int fd = grid->_fdimensions[dimension]; int rd = grid->_rdimensions[dimension]; @@ -383,51 +378,58 @@ template void Cshift_local(Lattice &ret,const Lattice &r // Map to always positive shift modulo global full dimension. shift = (shift+fd)%fd; + int cb= (cbmask==0x2)? Odd : Even; + int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + // the permute type ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); int permute_dim =grid->PermuteDim(dimension); int permute_type=grid->PermuteType(dimension); int permute_type_dist; - for(int x=0;x rd. + // num is sshift mod rd. + // + // shift 7 + // + // XoXo YcYc + // oXoX cYcY + // XoXo YcYc + // oXoX cYcY + // + // sshift -- + // + // XX YY ; 3 + // XX YY ; 0 + // XX YY ; 3 + // XX YY ; 0 + // + int wrap = sshift/rd; wrap=wrap % ly; + int num = sshift%rd; + + // Calculate Cshift_vector - it's the same for all slices + tracePush("CalcCshiftTable"); + CalculateCshiftVector(ret, rhs, dimension, cbmask); + tracePop("CalcCshiftTable"); + // Copy it to the device + tracePush("MapCshiftTable"); + MapCshiftCopy(Cshift_vector, Cshift_vector_device); + tracePop("MapCshiftTable"); - // int o = 0; - int bo = x * grid->_ostride[dimension]; - int cb= (cbmask==0x2)? Odd : Even; + for(int x=0;xCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sx = (x+sshift)%rd; - // wrap is whether sshift > rd. - // num is sshift mod rd. - // - // shift 7 - // - // XoXo YcYc - // oXoX cYcY - // XoXo YcYc - // oXoX cYcY - // - // sshift -- - // - // XX YY ; 3 - // XX YY ; 0 - // XX YY ; 3 - // XX YY ; 0 - // int permute_slice=0; if(permute_dim){ - int wrap = sshift/rd; wrap=wrap % ly; - int num = sshift%rd; - if ( x< rd-num ) permute_slice=wrap; else permute_slice = (wrap+1)%ly; if ( (ly>2) && (permute_slice) ) { - assert(permute_type & RotateBit); - permute_type_dist = permute_type|permute_slice; + assert(permute_type & RotateBit); + permute_type_dist = permute_type|permute_slice; } else { - permute_type_dist = permute_type; + permute_type_dist = permute_type; } } diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 710792ee30..4a22a7d992 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -34,6 +34,7 @@ NAMESPACE_BEGIN(Grid); const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift_MPI"); typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; @@ -56,7 +57,9 @@ template Lattice Cshift(const Lattice &rhs,int dimension t0=usecond(); if ( !comm_dim ) { // std::cout << "CSHIFT: Cshift_local" < void Cshift_comms(Lattice &ret,const Lattice &r int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // Calculate Cshift_vector - it's the same for all slices + tracePush("CalcCshiftTableMPI"); + CalculateCshiftVector(ret, rhs, dimension, cbmask); + tracePop("CalcCshiftTableMPI"); + // Copy it to the device + tracePush("MapCshiftTableMPI"); + MapCshiftCopy(Cshift_vector, Cshift_vector_device); + tracePop("MapCshiftTableMPI"); + RealD tcopy=0.0; RealD tgather=0.0; RealD tscatter=0.0; @@ -203,6 +217,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { + GRID_TRACE("Cshift_comms_simd_MPI"); GridBase *grid=rhs.Grid(); const int Nsimd = grid->Nsimd(); typedef typename vobj::vector_type vector_type; diff --git a/Grid/cshift/Cshift_none.h b/Grid/cshift/Cshift_none.h index d3c0bfa088..6bf8fc839f 100644 --- a/Grid/cshift/Cshift_none.h +++ b/Grid/cshift/Cshift_none.h @@ -30,6 +30,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift_local"); Lattice ret(rhs.Grid()); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); Cshift_local(ret,rhs,dimension,shift); diff --git a/Grid/cshift/Cshift_table.cc b/Grid/cshift/Cshift_table.cc index 7e0cbe120c..359d94b8f4 100644 --- a/Grid/cshift/Cshift_table.cc +++ b/Grid/cshift/Cshift_table.cc @@ -2,4 +2,6 @@ NAMESPACE_BEGIN(Grid); std::vector > Cshift_table; deviceVector > Cshift_table_device; +std::vector Cshift_vector; +deviceVector Cshift_vector_device; NAMESPACE_END(Grid); diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index 166aea0ac7..a8bac6ff69 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -72,6 +72,7 @@ GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); +GridLogger GridLogEd (1, "Ed" , GridLogColours, "GREEN"); GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE"); void GridLogConfigure(std::vector &logstreams) { @@ -87,6 +88,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogIntegrator.Active(1); GridLogColours.Active(0); GridLogHMC.Active(1); + GridLogEd.Active(1); for (int i = 0; i < logstreams.size(); i++) { if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 370b042809..3cb34b04de 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -187,6 +187,7 @@ extern GridLogger GridLogIntegrator; extern GridLogger GridLogHMC; extern GridLogger GridLogMemory; extern GridLogger GridLogTracing; +extern GridLogger GridLogEd; extern Colours GridLogColours; std::string demangle(const char* name) ; diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h index 10b638dc73..a81746c8dc 100644 --- a/Grid/perfmon/Tracing.h +++ b/Grid/perfmon/Tracing.h @@ -25,11 +25,11 @@ class GridTracer { public: GridTracer(const char* name) { roctxRangePushA(name); - std::cout << "roctxRangePush "< } static inline void update_field(Field& P, Field& U, double ep){ - //static std::chrono::duration diff; + tracePush("GaugeImplTypes_update_field"); + // static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); autoView(U_v,U,AcceleratorWrite); autoView(P_v,P,AcceleratorRead); - accelerator_for(ss, P.Grid()->oSites(),1,{ - for (int mu = 0; mu < Nd; mu++) { - U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); - U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); - } - }); - //auto end = std::chrono::high_resolution_clock::now(); - // diff += end - start; - // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; + + // accelerator_for(ss, P.Grid()->oSites(), 1, { + // for (int mu = 0; mu < Nd; mu++) { + // U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); + // U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); + // } + // }); + + // for (int mu = 0; mu < Nd; mu++) { + // accelerator_for(ss, P.Grid()->oSites(), 1, { + // U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); + // U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); + // }); + // } + + for (int mu = 0; mu < Nd; mu++) { + tracePush("Exponentiate"); + accelerator_for(ss, P.Grid()->oSites(), 1, { + U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); + }); + tracePop("Exponentiate"); + tracePush("ProjectOnSpGroup"); + accelerator_for(ss, P.Grid()->oSites(), 1, { + U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); + }); + tracePop("ProjectOnSpGroup"); + } + + // auto end = std::chrono::high_resolution_clock::now(); + // diff += end - start; + // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; + tracePop("GaugeImplTypes_update_field"); } static inline RealD FieldSquareNorm(Field& U){ diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 9116e78f6f..e0ad5c0557 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -99,7 +99,6 @@ class PlaqPlusRectangleAction : public Action { PokeIndex(dSdU, dSdU_mu, mu); } - }; }; diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h index b3c3041658..b568f85bdb 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -69,6 +69,7 @@ class WilsonGaugeAction : public Action { }; virtual void deriv(const GaugeField &U, GaugeField &dSdU) { + tracePush("WilsonGaugeAction_deriv"); // not optimal implementation FIXME // extend Ta to include Lorentz indexes @@ -76,18 +77,33 @@ class WilsonGaugeAction : public Action { GridBase *grid = U.Grid(); GaugeLinkField dSdU_mu(grid); + // Create a writeable GPU GaugeLinkField for dSdU_mu (Staple). + // autoView(dSdU_mu_v , dSdU_mu, AcceleratorWrite); std::vector Umu(Nd, grid); for (int mu = 0; mu < Nd; mu++) { Umu[mu] = PeekIndex(U, mu); } + // Is Umu now on the GPU after the PeekIndex? for (int mu = 0; mu < Nd; mu++) { // Staple in direction mu + tracePush("Staple"); WilsonLoops::Staple(dSdU_mu, Umu, mu); + // WilsonLoops::Staple(dSdU_mu_v, Umu, mu); + tracePop("Staple"); + + tracePush("TA_v"); dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor; + // dSdU_mu = Ta(Umu[mu] * dSdU_mu_v) * factor; + tracePop("TA_v"); + // Think: need to use accelerator_for with pokeIndex instead of PokeIndex. + tracePush("Poke_dSdU"); PokeIndex(dSdU, dSdU_mu, mu); + tracePop("Poke_dSdU"); } + // Best place to do the copy + tracePop("WilsonGaugeAction_deriv"); } private: diff --git a/Grid/qcd/action/pseudofermion/TwoFlavour.h b/Grid/qcd/action/pseudofermion/TwoFlavour.h index 2ac97dddfe..18033a64ac 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavour.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h @@ -129,6 +129,8 @@ class TwoFlavourPseudoFermionAction : public Action { // ////////////////////////////////////////////////////// virtual void deriv(const GaugeField &U, GaugeField &dSdU) { + + tracePush("TwoFlavour_deriv"); FermOp.ImportGauge(U); FermionField X(FermOp.FermionGrid()); @@ -150,6 +152,7 @@ class TwoFlavourPseudoFermionAction : public Action { dSdU = dSdU + tmp; // not taking here the traceless antihermitian component + tracePop("TwoFlavour_deriv"); }; }; diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 583c0f2b9d..044b3df57f 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -42,6 +42,7 @@ namespace PeriodicBC { int mu, const Lattice &field) { + GRID_TRACE("PeriodicBC CovShiftForward"); return Link*Cshift(field,mu,1);// moves towards negative mu } //Out(x) = Link^dag(x-mu)*field(x-mu) @@ -49,6 +50,7 @@ namespace PeriodicBC { int mu, const Lattice &field) { + GRID_TRACE("PeriodicBC CovShiftBackward"); Lattice tmp(field.Grid()); tmp = adj(Link)*field; return Cshift(tmp,mu,-1);// moves towards positive mu @@ -57,6 +59,7 @@ namespace PeriodicBC { template Lattice CovShiftIdentityBackward(const Lattice &Link, int mu) { + GRID_TRACE("PeriodicBC CovShiftIdentityBackward"); return Cshift(adj(Link), mu, -1); } //Out(x) = Link(x) @@ -69,6 +72,7 @@ namespace PeriodicBC { template Lattice ShiftStaple(const Lattice &Link, int mu) { + GRID_TRACE("PeriodicBC ShiftStaple"); return Cshift(Link, mu, 1); } @@ -131,6 +135,7 @@ namespace ConjugateBC { int mu, const Lattice &field) { + GRID_TRACE("ConjugateBC CovShiftForward"); GridBase * grid = Link.Grid(); int Lmu = grid->GlobalDimensions()[mu]-1; @@ -150,6 +155,7 @@ namespace ConjugateBC { int mu, const Lattice &field) { + GRID_TRACE("ConjugateBC CovShiftBackward"); GridBase * grid = field.Grid(); int Lmu = grid->GlobalDimensions()[mu]-1; @@ -192,6 +198,7 @@ namespace ConjugateBC { template Lattice ShiftStaple(const Lattice &Link, int mu) { + GRID_TRACE("ConjugateBC ShiftStaple"); GridBase *grid = Link.Grid(); int Lmu = grid->GlobalDimensions()[mu] - 1; diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 655255d65b..97708fadd8 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -317,12 +317,16 @@ template class WilsonLoops : public Gimpl { // __| // + // Cshift(adj(Link), mu, -1) -> adj followed by lots of small copy_plane (from Cshift) (CovShiftIdentityBackward) + // tmp = adj(Link)*field -> adj followed by binary mul; Cshift(tmp,mu,-1) -> lots of small copy_plane (from Cshift) (CovShiftBackward) + // Link*Cshift(field,mu,1) -> lots of small copy_plane (from Cshift) followed by binary mul (CovShiftForward) + // Cshift(Link, mu, 1) lots of small copy_plane (from Cshift) (ShiftStaple) + // Large HtoD copy followed by bindary add (staple is on the host, is this copying data to the device to do the add because the shifted GaugeLinkField is on the GPU?) + tracePush("ShiftStapleForward"); staple += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - Umu[nu], nu, - Gimpl::CovShiftBackward( - Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))), - mu); + Gimpl::CovShiftForward(Umu[nu], nu, + Gimpl::CovShiftBackward(Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))), mu); + tracePop("ShiftStapleForward"); // __ // | @@ -330,9 +334,11 @@ template class WilsonLoops : public Gimpl { // // + tracePush("ShiftStapleBack"); staple += Gimpl::ShiftStaple( - Gimpl::CovShiftBackward(Umu[nu], nu, - Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu); + Gimpl::CovShiftBackward(Umu[nu], nu, + Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu); + tracePop("ShiftStapleBack"); } } } diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 59b25f36e4..0fd4ad4105 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -29,7 +29,6 @@ Author: neo #ifndef GRID_MATH_TA_H #define GRID_MATH_TA_H - NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////// @@ -207,30 +206,29 @@ template accelerator_inline iVector ProjectOnSpGroup // int N is 2n in Sp(2n) -template::TensorLevel == 0 >::type * =nullptr> +template::TensorLevel == 0 >::type * =nullptr> accelerator_inline iMatrix ProjectOnSpGroup(const iMatrix &arg) { - // need a check for the group type? iMatrix ret(arg); vtype nrm; vtype inner; - + for(int c1=0;c1 prn += conjugate(ret._internal[c1][c])*ret._internal[b+N/2][c]; // } - + for(int c=0; c accelerator_inline iScalar Exponentiate(const iScalar&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) { + // Goes through here! iScalar ret; ret._internal = Exponentiate(r._internal, alpha, Nexp); return ret; @@ -118,22 +119,21 @@ accelerator_inline iMatrix Exponentiate(const iMatrix &arg, Re // General exponential -template::TensorLevel == 0 >::type * =nullptr> -accelerator_inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) +template::TensorLevel == 0 >::type * =nullptr> +accelerator_inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP) { // notice that it actually computes // exp ( input matrix ) // the i sign is coming from outside // input matrix is anti-hermitian NOT hermitian - typedef iMatrix mat; + typedef iMatrix mat; mat unit(1.0); - mat temp(unit); - for(int i=Nexp; i>=1;--i){ - temp *= alpha/RealD(i); - temp = unit + temp*arg; + mat ret(unit); + for (int i = Nexp; i >= 1; --i) { + ret *= alpha / RealD(i); + ret = unit + ret * arg; } - return temp; - + return ret; } NAMESPACE_END(Grid); diff --git a/tests/hmc/Test_hmc_IwasakiGauge_expected.txt b/tests/hmc/Test_hmc_IwasakiGauge_expected.txt new file mode 100644 index 0000000000..9391cc8c3d --- /dev/null +++ b/tests/hmc/Test_hmc_IwasakiGauge_expected.txt @@ -0,0 +1 @@ +8.8.8.8 1.1.1.1 0.269298793 633bf471 3a22ad20 diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge.cc b/tests/hmc/Test_hmc_WilsonFermionGauge.cc index a0c43c515b..a0ee096997 100644 --- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc @@ -129,8 +129,8 @@ int main(int argc, char **argv) { */ // HMC parameters are serialisable - TheHMC.Parameters.MD.MDsteps = 20; - TheHMC.Parameters.MD.trajL = 1.0; + TheHMC.Parameters.MD.MDsteps = 2; + TheHMC.Parameters.MD.trajL = 0.1; TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file TheHMC.Run(); // no smearing diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge_expected.txt b/tests/hmc/Test_hmc_WilsonFermionGauge_expected.txt new file mode 100644 index 0000000000..6b53130bfa --- /dev/null +++ b/tests/hmc/Test_hmc_WilsonFermionGauge_expected.txt @@ -0,0 +1 @@ +8.8.8.8 1.1.1.1 0.254950222 7f25d41 9d40279f diff --git a/tests/run_regression_test.py b/tests/run_regression_test.py new file mode 100644 index 0000000000..c7ad036744 --- /dev/null +++ b/tests/run_regression_test.py @@ -0,0 +1,134 @@ +def read_expected(test_name="Test_hmc_Sp_WilsonFundFermionGauge", grid="8.8.8.8", mpi="1.1.1.1"): + """ + Read expected values from file. + + The file contains one or more entries of the following format: + + Eg. + 8.8.8.8 1.1.1.1 0.0256253844 922c392f d1e4cc1c + """ + + with open(f"{test_name}_expected.txt") as file: + for line in file: + line_split = line.split() + if line_split[0] == grid and line_split[1] == mpi: + return float(line_split[2]), line_split[3], line_split[4] + + +def read_output(): + """ + Read test output and fish out values of interest. + """ + + checksum_rng = None + checksum_lat = None + plaquette = None + with open("output.txt", 'r') as file: + for line in file: + if "Written NERSC" in line: + subline = line.split('checksum ')[1] + if len(subline.split()) == 1: # this is the rng checksum line + checksum_rng = subline.strip() + elif len(subline.split()) == 3: # this is the lat checksum and plaquette value line + checksum_lat = subline.split()[0] + plaquette = float(subline.split()[2]) + else: + print("Picked wrong line...") + + if (checksum_rng is None) or (checksum_lat is None) or (plaquette is None): + print("Error reading values from output file. Make sure you compile the test with CPparams.saveInterval=1 in order to produce the required output.") + exit(1) + + return plaquette, checksum_rng, checksum_lat + + +def compare(actual, expected, what, stop=False): + """ + Compare actual with expected output, and output message if failed. + """ + + if actual != expected: + print(f"{what} comparison failed: actual={actual} , expected={expected}") + if stop: + exit(1) + else: + return False + return True + + + +if __name__ == '__main__': + import argparse + import subprocess + import os + + parser = argparse.ArgumentParser(description='Run end-to-end tests and compare results with expectations.') + parser.add_argument("test_name", help="File name of the test") + parser.add_argument("grid", help="Grid configuration") + parser.add_argument("mpi", help="MPI configuration") + parser.add_argument("-s", "--stop", action='store_true', help="Flag to stop testing when a test fails.") + args = parser.parse_args() + + expected_plaquette, expected_checksum_rng, expected_checksum_lat = read_expected(args.test_name, args.grid, args.mpi) + + result = subprocess.run([f"./{args.test_name} --grid {args.grid} --mpi {args.mpi} --Thermalizations 0 --Trajectories 1 > output.txt"], shell=True, encoding="text") + plaquette, checksum_rng, checksum_lat = read_output() + + print(f"Running {args.test_name}") + result = compare(plaquette, expected_plaquette, "plaquette", args.stop) + result = result and compare(checksum_rng, expected_checksum_rng, "Checksum RNG file ", args.stop) + result = result and compare(checksum_lat, expected_checksum_lat, "Checksum LAT file ", args.stop) + if result: + print("All tests passed!") + else: + print("Some tests failed...") + + os.remove("output.txt") + os.remove("ckpoint_rng.1") + os.remove("ckpoint_lat.1") + +#result = subprocess.run(["./Test_hmc_Sp_WilsonFundFermionGauge --grid 8.8.8.8 --mpi 1.1.1.1 --Thermalizations 0 --Trajectories 1 > output1.txt"], shell=True, encoding="text") + +# expected_value = 0.0256253844 +# checksum_rng = "922c392f" +# checksum_lat = "d1e4cc1c" + +# with open("output1.txt", 'r') as file: +# for line in file: +# # if "Plaquette" in line: +# # #print(line) +# # plaquette_value = float(line.split('] ')[1]) +# # #print(plaquette_value) +# # if plaquette_value == expected_value: +# # print("Success!") +# if "Written NERSC" in line: +# print(line) +# subline = line.split('checksum ')[1] +# if len(subline.split()) == 1: # this is the rng checksum line +# print(subline) +# if subline.strip() == checksum_rng: +# print("RNG file checksum success!") +# else: +# print("RNG file checksum failed!") +# elif len(subline.split()) == 3: # this is the lat checksum and plaquette value line +# print(subline) +# checksum_value = subline.split()[0] +# plaquette_value = float(subline.split()[2]) +# print(checksum_value, plaquette_value) +# if checksum_value == checksum_lat: +# print("LAT file checksum success!") +# else: +# print("LAT file checksum failed!") +# if plaquette_value == expected_value: +# print("Plaquette value success!") +# else: +# print("Plaquette value failed!") +# else: +# print("Picked wrong line...") + + +#loc1 = result.find("Plaquette") +#print(loc1) +#loc2 = result.find("Smeared") +#print(loc2) +#print(result[loc1,loc2]) diff --git a/tests/sp2n/Test_hmc_Sp_WilsonFundFermionGauge_expected.txt b/tests/sp2n/Test_hmc_Sp_WilsonFundFermionGauge_expected.txt new file mode 100644 index 0000000000..7edc6fa0a9 --- /dev/null +++ b/tests/sp2n/Test_hmc_Sp_WilsonFundFermionGauge_expected.txt @@ -0,0 +1 @@ +8.8.8.8 1.1.1.1 0.0256253844 922c392f d1e4cc1c