From 369f4ba6f2d6b112f0240c4dee6230d81f33349c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 14:17:25 +0200 Subject: [PATCH 01/11] Restart with different domain decomposition --- .../picongpu/plugins/openPMD/NDScalars.hpp | 8 +- .../plugins/openPMD/openPMDWriter.x.cpp | 6 + .../plugins/openPMD/restart/LoadSpecies.hpp | 494 +++++++++++++++--- .../openPMD/restart/RestartFieldLoader.hpp | 7 + 4 files changed, 435 insertions(+), 80 deletions(-) diff --git a/include/picongpu/plugins/openPMD/NDScalars.hpp b/include/picongpu/plugins/openPMD/NDScalars.hpp index 7ef07f62352..d027417ed17 100644 --- a/include/picongpu/plugins/openPMD/NDScalars.hpp +++ b/include/picongpu/plugins/openPMD/NDScalars.hpp @@ -174,11 +174,17 @@ namespace picongpu DataSpace gridPos = Environment::get().GridController().getPosition(); ::openPMD::Offset start; ::openPMD::Extent count; + ::openPMD::Extent extent = mrc.getExtent(); start.reserve(ndim); count.reserve(ndim); for(int d = 0; d < ndim; ++d) { - start.push_back(gridPos.revert()[d]); + /* + * When restarting with more parallel processes than the checkpoint had originally been written + * with, we must take care not to index past the dataset boundaries. Just loop around to the start + * in that case. Not the finest way, but it does the job for now.. + */ + start.push_back(gridPos.revert()[d] % extent[d]); count.push_back(1); } diff --git a/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp b/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp index b835de9627c..5f5921f4e31 100644 --- a/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp +++ b/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp @@ -1024,6 +1024,12 @@ make sure that environment variable OPENPMD_BP_BACKEND is not set to ADIOS1. { loadRngStatesImpl(&mThreadParams, restartStep); } + catch(std::exception const& e) + { + log("openPMD: loading RNG states failed, they will be re-initialized " + "instead. Original error:\n\t%1%") + % e.what(); + } catch(...) { log( diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index d7487bbda40..a9ff39fce98 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -35,8 +35,10 @@ # include +# include # include # include +# include # include @@ -46,6 +48,138 @@ namespace picongpu { using namespace pmacc; + struct RedistributeFilteredParticlesKernel + { + template + HDINLINE void operator()( + T_Worker const& worker, + ValueType* dataPtr, + FilterType&& filter, + RemapType&& remap, + MemIdxType const size, + char const filterRemove) const + { + constexpr uint32_t blockDomSize = T_Worker::blockDomSize(); + auto numDataBlocks = (size + blockDomSize - 1u) / blockDomSize; + + ValueType* s_mem = ::alpaka::getDynSharedMem(worker.getAcc()); + + // grid-strided loop over the chunked data + for(int dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; dataBlock += worker.gridDomSize()) + { + auto dataBlockOffset = dataBlock * blockDomSize; + auto forEach = pmacc::lockstep::makeForEach(worker); + // read + forEach( + [&](uint32_t const inBlockIdx) + { + auto idx = dataBlockOffset + inBlockIdx; + if(idx < size) + { + s_mem[inBlockIdx] = dataPtr[idx]; + } + }); + worker.sync(); + + // write + forEach( + [&](uint32_t const inBlockIdx) + { + auto idx = dataBlockOffset + inBlockIdx; + if(idx < size && filter[idx] != filterRemove) + { + dataPtr[remap[idx]] = s_mem[inBlockIdx]; + } + }); + + // The next Iteration of the outer for loop does not depend on the data overwritten until now. + // Filtering moves data only to lower indexes and each of the outer loop's iterations deals with a + // contiguous chunk of data. + } + } + }; + + template + struct RedistributeFilteredParticles + + { + template + HINLINE void operator()( + FrameType& frame, + FilterType const& filter, + RemapType const& remap, + uint64_t const numParticlesCurrentBatch, + char const filterRemove) + { + using Identifier = T_Identifier; + using ValueType = typename pmacc::traits::Resolve::type::type; + using ComponentType = typename GetComponentsType::type; + + + constexpr uint32_t blockDomSize = decltype(lockstep::makeBlockCfg())::blockDomSize(); + constexpr size_t requiredSharedMemBytes = blockDomSize * sizeof(ValueType); + + ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer(); + + PMACC_LOCKSTEP_KERNEL(RedistributeFilteredParticlesKernel{}) + .configSMem(pmacc::math::Vector{numParticlesCurrentBatch}, requiredSharedMemBytes)( + dataPtr, + alpaka::getPtrNative(filter), + alpaka::getPtrNative(remap), + numParticlesCurrentBatch, + filterRemove); + } + }; + + struct KernelFilterParticles + { + template + DINLINE void operator()( + T_Worker const& worker, + FrameType&& loadedData, + MemIdxType size, + DataSpace patchTotalOffset, + DataSpace patchUpperCorner, + char const filterKeep, + char const filterRemove, + FilterType&& filterOut) const + { + // DataSpace<1> particleIndex = worker.blockDomIdxND(); + constexpr uint32_t blockDomSize = T_Worker::blockDomSize(); + auto numDataBlocks = (size + blockDomSize - 1u) / blockDomSize; + auto position_ = loadedData.getIdentifier(position()).getPointer(); + auto positionOffset = loadedData.getIdentifier(totalCellIdx()).getPointer(); + + // grid-strided loop over the chunked data + for(int dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; dataBlock += worker.gridDomSize()) + { + auto dataBlockOffset = dataBlock * blockDomSize; + auto forEach = pmacc::lockstep::makeForEach(worker); + forEach( + [&](uint32_t const inBlockIdx) + { + auto idx = dataBlockOffset + inBlockIdx; + if(idx < size) + { + auto& positionVec = position_[idx]; + auto& positionOffsetVec = positionOffset[idx]; + char filterCurrent = filterKeep; + for(size_t d = 0; d < simDim; ++d) + { + auto positionInD = positionVec[d] + positionOffsetVec[d]; + if(positionInD < patchTotalOffset[d] || positionInD > patchUpperCorner[d]) + { + filterCurrent = filterRemove; + break; + } + } + filterOut[idx] = filterCurrent; + } + }); + } + } + }; + /** Load species from openPMD checkpoint storage * * @tparam T_Species type of species @@ -101,91 +235,262 @@ namespace picongpu auto numRanks = gc.getGlobalSize(); - size_t patchIdx = getPatchIdx(params, particleSpecies, numRanks); + auto [fullMatches, partialMatches] = getPatchIdx(params, particleSpecies); - std::shared_ptr fullParticlesInfoShared - = particleSpecies.particlePatches["numParticles"][::openPMD::RecordComponent::SCALAR] - .load(); + std::shared_ptr numParticlesShared + = particleSpecies.particlePatches["numParticles"].load(); + std::shared_ptr numParticlesOffsetShared + = particleSpecies.particlePatches["numParticlesOffset"].load(); particles.seriesFlush(); - uint64_t* fullParticlesInfo = fullParticlesInfoShared.get(); + uint64_t* patchNumParticles = numParticlesShared.get(); + uint64_t* patchNumParticlesOffset = numParticlesOffsetShared.get(); - /* Run a prefix sum over the numParticles[0] element in - * particlesInfo to retreive the offset of particles - */ - uint64_t particleOffset = 0u; - /* count total number of particles on the device */ - uint64_t totalNumParticles = 0u; - - assert(patchIdx < numRanks); - - for(size_t i = 0u; i <= patchIdx; ++i) { - if(i < patchIdx) - particleOffset += fullParticlesInfo[i]; - if(i == patchIdx) - totalNumParticles = fullParticlesInfo[i]; - } - - log("openPMD: Loading %1% particles from offset %2%") - % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; + uint64_t totalNumParticles = std::transform_reduce( + fullMatches.begin(), + fullMatches.end(), + 0, + /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, + /* transform = */ [patchNumParticles](size_t patchIdx) + { return patchNumParticles[patchIdx]; }); - log("openPMD: malloc mapped memory: %1%") % speciesName; + log("openPMD: malloc mapped memory: %1%") % speciesName; - using FrameType = Frame; - using BufferType = Frame; + using FrameType = Frame; + using BufferType = Frame; - BufferType buffers; - FrameType mappedFrame; + BufferType buffers; + FrameType mappedFrame; - uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); + uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); - /*malloc mapped memory*/ - meta::ForEach> - mallocMem; - mallocMem(buffers, mappedFrame, maxChunkSize); + /*malloc mapped memory*/ + meta::ForEach> + mallocMem; + mallocMem(buffers, mappedFrame, maxChunkSize); - uint32_t const numLoadIterations - = maxChunkSize == 0u ? 0u : alpaka::core::divCeil(totalNumParticles, maxChunkSize); + for(size_t const patchIdx : fullMatches) + { + uint64_t particleOffset = patchNumParticlesOffset[patchIdx]; + uint64_t numParticles = patchNumParticles[patchIdx]; + + log("openPMD: Loading %1% particles from offset %2%") + % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; + + + uint32_t const numLoadIterations + = maxChunkSize == 0u ? 0u : alpaka::core::divCeil(numParticles, maxChunkSize); + + for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) + { + auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; + auto numLeftParticles = numParticles - loadRound * maxChunkSize; + + auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); + + log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + if(numParticlesCurrentBatch != 0) + { + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + params, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + numParticlesCurrentBatch, + cellOffsetToTotalDomain, + totalCellIdx_, + *(params->cellDescription), + picLog::INPUT_OUTPUT()); + } + log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + } + } + } - for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) { - auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; - auto numLeftParticles = totalNumParticles - loadRound * maxChunkSize; - - auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); - - log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; - if(numParticlesCurrentBatch != 0) + SubGrid const& subGrid = Environment::get().SubGrid(); + pmacc::Selection const localDomain = subGrid.getLocalDomain(); + pmacc::Selection const globalDomain = subGrid.getGlobalDomain(); + /* Offset to transform local particle offsets into total offsets for all particles within the + * current local domain. + * @attention A window can be the full simulation domain or the moving window. + */ + DataSpace localToTotalDomainOffset(localDomain.offset + globalDomain.offset); + + /* params->localWindowToDomainOffset is in PIConGPU for a restart zero but to stay generic we take + * the variable into account. + */ + DataSpace const patchTotalOffset + = localToTotalDomainOffset + params->localWindowToDomainOffset; + DataSpace const patchExtent = params->window.localDimensions.size; + DataSpace const patchUpperCorner = patchTotalOffset + patchExtent; + + uint64_t totalNumParticles = std::transform_reduce( + partialMatches.begin(), + partialMatches.end(), + 0, + /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, + /* transform = */ [patchNumParticles](size_t patchIdx) + { return patchNumParticles[patchIdx]; }); + + log("openPMD: malloc mapped memory for partial patches: %1%") % speciesName; + + using FrameType = Frame; + using BufferType = Frame; + + BufferType buffers; + FrameType mappedFrame; + + uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); + + /*malloc mapped memory*/ + meta::ForEach> + mallocMem; + mallocMem(buffers, mappedFrame, maxChunkSize); + constexpr bool isMappedMemorySupported + = alpaka::hasMappedBufSupport<::alpaka::Platform>; + PMACC_VERIFY_MSG(isMappedMemorySupported, "Device must support mapped memory!"); + // alpaka::Buf> filter; + auto filter = alpaka::allocMappedBuf( + manager::Device::get().current(), + manager::Device::get().getPlatform(), + MemSpace(maxChunkSize).toAlpakaMemVec()); + auto remap = alpaka::allocMappedBuf( + manager::Device::get().current(), + manager::Device::get().getPlatform(), + MemSpace(maxChunkSize).toAlpakaMemVec()); + for(size_t const patchIdx : partialMatches) { - meta::ForEach< - typename NewParticleDescription::ValueTypeSeq, - LoadParticleAttributesFromOpenPMD> - loadAttributes; - loadAttributes( - params, - mappedFrame, - particleSpecies, - particleLoadOffset, - numParticlesCurrentBatch); - - - pmacc::particles::operations::splitIntoListOfFrames( - *speciesTmp, - mappedFrame, - numParticlesCurrentBatch, - cellOffsetToTotalDomain, - totalCellIdx_, - *(params->cellDescription), - picLog::INPUT_OUTPUT()); + uint64_t particleOffset = patchNumParticlesOffset[patchIdx]; + uint64_t numParticles = patchNumParticles[patchIdx]; + + log("openPMD: Loading up to %1% particles from offset %2%") + % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; + + + uint32_t const numLoadIterations + = maxChunkSize == 0u ? 0u : alpaka::core::divCeil(numParticles, maxChunkSize); + + for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) + { + auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; + auto numLeftParticles = numParticles - loadRound * maxChunkSize; + + auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); + + log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + if(numParticlesCurrentBatch != 0) + { + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + params, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + // now filter + + constexpr char filterKeep{1}, filterRemove{0}; + PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) + .config(pmacc::math::Vector{numParticlesCurrentBatch})( + mappedFrame, + numParticlesCurrentBatch, + patchTotalOffset, + patchUpperCorner, + filterKeep, + filterRemove, + alpaka::getPtrNative(filter)); + eventSystem::getTransactionEvent().waitForFinished(); + + // This part is inherently sequential, keep it on CPU + // (maybe run also this on GPU to avoid multiple data copies) + // std::cout << "REMAP: "; + MemIdxType remapCurrent = 0; + for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; + ++particleIndex) + { + if(filter[particleIndex] == filterKeep) + { + remap[particleIndex] = remapCurrent++; + // std::cout << '1'; + } + else + { + remap[particleIndex] = std::numeric_limits::max(); + // std::cout << '0'; + } + } + // std::cout << std::endl; + + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + RedistributeFilteredParticles> + redistributeFilteredParticles; + redistributeFilteredParticles( + mappedFrame, + filter, + remap, + numParticlesCurrentBatch, + filterRemove); + + std::cout << "Filtered " << remapCurrent << " out of " << numParticlesCurrentBatch + << " particles" << std::endl; + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + remapCurrent, // !! not numParticlesCurrentBatch, filtered vs. unfiltered number + cellOffsetToTotalDomain, + totalCellIdx_, + *(params->cellDescription), + picLog::INPUT_OUTPUT()); + } + log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + } } - log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; } log("openPMD: ( end ) load species: %1%") % speciesName; } private: + // o: offset, e: extent, u: upper corner (= o+e) + static std::pair, DataSpace> intersect( + DataSpace const& o1, + DataSpace const& e1, + DataSpace const& o2, + DataSpace const& e2) + { + // Convert extents into upper coordinates + auto u1 = o1 + e1; + auto u2 = o2 + e2; + + DataSpace intersect_o, intersect_u, intersect_e; + for(unsigned d = 0; d < simDim; ++d) + { + intersect_o[d] = std::max(o1[d], o2[d]); + intersect_u[d] = std::min(u1[d], u2[d]); + intersect_e[d] = intersect_u[d] > intersect_o[d] ? intersect_u[d] - intersect_o[d] : 0; + } + return {intersect_o, intersect_e}; + } + /** get index for particle data within the openPMD patch data * * It is not possible to assume that we can use the MPI rank to load the particle data. @@ -197,13 +502,16 @@ namespace picongpu * * @return index of the particle patch within the openPMD data */ - HINLINE size_t - getPatchIdx(ThreadParams* params, ::openPMD::ParticleSpecies particleSpecies, size_t numRanks) + HINLINE std::pair, std::deque> getPatchIdx( + ThreadParams* params, + ::openPMD::ParticleSpecies particleSpecies) { std::string const name_lookup[] = {"x", "y", "z"}; - std::vector> offsets(numRanks); - std::vector> extents(numRanks); + size_t patches = particleSpecies.particlePatches["numParticles"].getExtent()[0]; + + std::vector> offsets(patches); + std::vector> extents(patches); // transform openPMD particle patch data into PIConGPU data objects for(uint32_t d = 0; d < simDim; ++d) @@ -213,7 +521,7 @@ namespace picongpu std::shared_ptr patchExtentsInfoShared = particleSpecies.particlePatches["extent"][name_lookup[d]].load(); particleSpecies.seriesFlush(); - for(size_t i = 0; i < numRanks; ++i) + for(size_t i = 0; i < patches; ++i) { offsets[i][d] = patchOffsetsInfoShared.get()[i]; extents[i][d] = patchExtentsInfoShared.get()[i]; @@ -235,18 +543,46 @@ namespace picongpu DataSpace const patchTotalOffset = localToTotalDomainOffset + params->localWindowToDomainOffset; DataSpace const patchExtent = params->window.localDimensions.size; + math::Vector true_; + for(unsigned d = 0; d < simDim; ++d) + { + true_[d] = true; + } // search the patch index based on the offset and extents of local domain size - for(size_t i = 0; i < numRanks; ++i) + std::deque fullMatches; + std::deque partialMatches; + size_t noMatches = 0; + for(size_t i = 0; i < patches; ++i) { - if(patchTotalOffset == offsets[i] && patchExtent == extents[i]) - return i; + // std::cout << "Comp.: " << patchTotalOffset << " - " << (patchTotalOffset + patchExtent) + // << "\tAGAINST " << offsets[i] << " - " << (offsets[i] + extents[i]) + // << "\toffsets: " << (patchTotalOffset <= offsets[i]) + // << ", extents: " << ((offsets[i] + extents[i]) <= (patchTotalOffset + patchExtent)) << + // '\n'; + if((patchTotalOffset <= offsets[i]) == true_ + && ((offsets[i] + extents[i]) <= (patchTotalOffset + patchExtent)) == true_) + { + fullMatches.emplace_back(i); + } + else if( + intersect(offsets[i], extents[i], patchTotalOffset, patchExtent).second.productOfComponents() + != 0) + { + partialMatches.emplace_back(i); + } + else + { + ++noMatches; + } } - // If no patch fits the conditions, something went wrong before - throw std::runtime_error( - "Error while restarting: no particle patch matches the required offset and extent"); - // Fake return still needed to avoid warnings - return 0; + + // std::cout << "\n\n" + // << fullMatches.size() << " full matches, " << partialMatches.size() << " partial matches, + // " + // << noMatches << " unmatched." << std ::endl; + + return std::make_pair(std::move(fullMatches), std::move(partialMatches)); } }; diff --git a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp index 939d65b3e0f..a04379cb267 100644 --- a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp +++ b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp @@ -219,6 +219,13 @@ namespace picongpu /* load from openPMD */ bool const isDomainBound = traits::IsFieldDomainBound::value; + + // Skip PML fields for load balancing purposes + if(!isDomainBound) + { + return; + } + RestartFieldLoader::loadField( field->getGridBuffer(), (uint32_t) T_Field::numComponents, From ececd23408f0afcd6ba20e96e0f9fbbb1cafbe9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 14:26:25 +0200 Subject: [PATCH 02/11] Revert "Remap also on GPU" This reverts commit 17070daff52cd0d1ccc790ee0738e7ff9d99ef2c. --- .../plugins/openPMD/restart/LoadSpecies.hpp | 59 +++++++------------ 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index a9ff39fce98..dc64f59ca03 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -48,56 +48,41 @@ namespace picongpu { using namespace pmacc; +# if false struct RedistributeFilteredParticlesKernel { - template - HDINLINE void operator()( - T_Worker const& worker, - ValueType* dataPtr, - FilterType&& filter, - RemapType&& remap, - MemIdxType const size, - char const filterRemove) const + template + HDINLINE void operator()(T_Worker const& worker, T_DataBox data, uint32_t size) const { constexpr uint32_t blockDomSize = T_Worker::blockDomSize(); auto numDataBlocks = (size + blockDomSize - 1u) / blockDomSize; - ValueType* s_mem = ::alpaka::getDynSharedMem(worker.getAcc()); + uint32_t* s_mem = ::alpaka::getDynSharedMem(worker.getAcc()); // grid-strided loop over the chunked data for(int dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; dataBlock += worker.gridDomSize()) { auto dataBlockOffset = dataBlock * blockDomSize; auto forEach = pmacc::lockstep::makeForEach(worker); - // read forEach( [&](uint32_t const inBlockIdx) { auto idx = dataBlockOffset + inBlockIdx; + s_mem[inBlockIdx] = idx; if(idx < size) { - s_mem[inBlockIdx] = dataPtr[idx]; + // ensure that each block is not overwriting data from other blocks + PMACC_DEVICE_VERIFY_MSG( + data[idx] == 0u, + "%s\n", + "Result buffer not valid initialized!"); + data[idx] = s_mem[inBlockIdx]; } }); - worker.sync(); - - // write - forEach( - [&](uint32_t const inBlockIdx) - { - auto idx = dataBlockOffset + inBlockIdx; - if(idx < size && filter[idx] != filterRemove) - { - dataPtr[remap[idx]] = s_mem[inBlockIdx]; - } - }); - - // The next Iteration of the outer for loop does not depend on the data overwritten until now. - // Filtering moves data only to lower indexes and each of the outer loop's iterations deals with a - // contiguous chunk of data. } } }; +# endif template struct RedistributeFilteredParticles @@ -116,18 +101,19 @@ namespace picongpu using ComponentType = typename GetComponentsType::type; - constexpr uint32_t blockDomSize = decltype(lockstep::makeBlockCfg())::blockDomSize(); - constexpr size_t requiredSharedMemBytes = blockDomSize * sizeof(ValueType); + constexpr uint32_t = decltype(lockstep::makeBlockCfg())::blockDomSize(); + ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer(); - PMACC_LOCKSTEP_KERNEL(RedistributeFilteredParticlesKernel{}) - .configSMem(pmacc::math::Vector{numParticlesCurrentBatch}, requiredSharedMemBytes)( - dataPtr, - alpaka::getPtrNative(filter), - alpaka::getPtrNative(remap), - numParticlesCurrentBatch, - filterRemove); + for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; ++particleIndex) + { + if(filter[particleIndex] == filterRemove) + { + continue; + } + dataPtr[remap[particleIndex]] = dataPtr[particleIndex]; + } } }; @@ -419,7 +405,6 @@ namespace picongpu eventSystem::getTransactionEvent().waitForFinished(); // This part is inherently sequential, keep it on CPU - // (maybe run also this on GPU to avoid multiple data copies) // std::cout << "REMAP: "; MemIdxType remapCurrent = 0; for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; From 4ab985326bb629539d4efce7a123529b0becee69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 14:27:09 +0200 Subject: [PATCH 03/11] Revert "wip: redistribute on gpu" This reverts commit 2a67584769ab5d818d5fc0a1b205997e730931c7. --- .../plugins/openPMD/restart/LoadSpecies.hpp | 43 +------------------ 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index dc64f59ca03..c0bffb709bd 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -48,45 +48,8 @@ namespace picongpu { using namespace pmacc; -# if false - struct RedistributeFilteredParticlesKernel - { - template - HDINLINE void operator()(T_Worker const& worker, T_DataBox data, uint32_t size) const - { - constexpr uint32_t blockDomSize = T_Worker::blockDomSize(); - auto numDataBlocks = (size + blockDomSize - 1u) / blockDomSize; - - uint32_t* s_mem = ::alpaka::getDynSharedMem(worker.getAcc()); - - // grid-strided loop over the chunked data - for(int dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; dataBlock += worker.gridDomSize()) - { - auto dataBlockOffset = dataBlock * blockDomSize; - auto forEach = pmacc::lockstep::makeForEach(worker); - forEach( - [&](uint32_t const inBlockIdx) - { - auto idx = dataBlockOffset + inBlockIdx; - s_mem[inBlockIdx] = idx; - if(idx < size) - { - // ensure that each block is not overwriting data from other blocks - PMACC_DEVICE_VERIFY_MSG( - data[idx] == 0u, - "%s\n", - "Result buffer not valid initialized!"); - data[idx] = s_mem[inBlockIdx]; - } - }); - } - } - }; -# endif - template struct RedistributeFilteredParticles - { template HINLINE void operator()( @@ -100,10 +63,6 @@ namespace picongpu using ValueType = typename pmacc::traits::Resolve::type::type; using ComponentType = typename GetComponentsType::type; - - constexpr uint32_t = decltype(lockstep::makeBlockCfg())::blockDomSize(); - - ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer(); for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; ++particleIndex) @@ -391,6 +350,8 @@ namespace picongpu numParticlesCurrentBatch); // now filter + auto position_ = mappedFrame.getIdentifier(position()).getPointer(); + auto positionOffset = mappedFrame.getIdentifier(totalCellIdx()).getPointer(); constexpr char filterKeep{1}, filterRemove{0}; PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) From 05d2127684661424e0db6b8a2dee73fb065ee686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 15:16:00 +0200 Subject: [PATCH 04/11] Factor out loafFull/PartialMatches --- .../plugins/openPMD/restart/LoadSpecies.hpp | 123 +++++++++++------- 1 file changed, 73 insertions(+), 50 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index c0bffb709bd..bc6110abc84 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -149,55 +149,24 @@ namespace picongpu using NewParticleDescription = typename ReplaceValueTypeSeq::type; - /** Load species from openPMD checkpoint storage - * - * @param params thread params - * @param restartChunkSize number of particles processed in one kernel - * call - */ - HINLINE void operator()(ThreadParams* params, uint32_t const currentStep, uint32_t const restartChunkSize) + struct LoadParams { - std::string const speciesName = FrameType::getName(); - log("openPMD: (begin) load species: %1%") % speciesName; - DataConnector& dc = Environment<>::get().DataConnector(); - GridController& gc = Environment::get().GridController(); - - ::openPMD::Series& series = *params->openPMDSeries; - ::openPMD::Container<::openPMD::ParticleSpecies>& particles - = series.iterations[currentStep].open().particles; - ::openPMD::ParticleSpecies particleSpecies = particles[speciesName]; - - SubGrid const& subGrid = Environment::get().SubGrid(); - DataSpace cellOffsetToTotalDomain - = subGrid.getLocalDomain().offset + subGrid.getGlobalDomain().offset; - - /* load particle without copying particle data to host */ - auto speciesTmp = dc.get(FrameType::getName()); - - // avoid deadlock between not finished pmacc tasks and mpi calls in - // openPMD - eventSystem::getTransactionEvent().waitForFinished(); - - auto numRanks = gc.getGlobalSize(); - - auto [fullMatches, partialMatches] = getPatchIdx(params, particleSpecies); - - std::shared_ptr numParticlesShared - = particleSpecies.particlePatches["numParticles"].load(); - std::shared_ptr numParticlesOffsetShared - = particleSpecies.particlePatches["numParticlesOffset"].load(); - particles.seriesFlush(); - uint64_t* patchNumParticles = numParticlesShared.get(); - uint64_t* patchNumParticlesOffset = numParticlesOffsetShared.get(); - + std::string const& speciesName; + ::openPMD::ParticleSpecies const& particleSpecies; + std::shared_ptr const& speciesTmp; + uint64_t* patchNumParticles; + uint64_t* patchNumParticlesOffset; + DataSpace const& cellOffsetToTotalDomain; + uint32_t restartChunkSize; + + void loadFullMatches(std::deque const& fullMatches, ThreadParams* threadParams) const { uint64_t totalNumParticles = std::transform_reduce( fullMatches.begin(), fullMatches.end(), 0, /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, - /* transform = */ [patchNumParticles](size_t patchIdx) - { return patchNumParticles[patchIdx]; }); + /* transform = */ [this](size_t patchIdx) { return patchNumParticles[patchIdx]; }); log("openPMD: malloc mapped memory: %1%") % speciesName; @@ -242,7 +211,7 @@ namespace picongpu LoadParticleAttributesFromOpenPMD> loadAttributes; loadAttributes( - params, + threadParams, mappedFrame, particleSpecies, particleLoadOffset, @@ -254,7 +223,7 @@ namespace picongpu numParticlesCurrentBatch, cellOffsetToTotalDomain, totalCellIdx_, - *(params->cellDescription), + *threadParams->cellDescription, picLog::INPUT_OUTPUT()); } log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName @@ -263,6 +232,7 @@ namespace picongpu } } + void loadPartialMatches(std::deque const& partialMatches, ThreadParams* threadParams) const { SubGrid const& subGrid = Environment::get().SubGrid(); pmacc::Selection const localDomain = subGrid.getLocalDomain(); @@ -277,8 +247,8 @@ namespace picongpu * the variable into account. */ DataSpace const patchTotalOffset - = localToTotalDomainOffset + params->localWindowToDomainOffset; - DataSpace const patchExtent = params->window.localDimensions.size; + = localToTotalDomainOffset + threadParams->localWindowToDomainOffset; + DataSpace const patchExtent = threadParams->window.localDimensions.size; DataSpace const patchUpperCorner = patchTotalOffset + patchExtent; uint64_t totalNumParticles = std::transform_reduce( @@ -286,8 +256,7 @@ namespace picongpu partialMatches.end(), 0, /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, - /* transform = */ [patchNumParticles](size_t patchIdx) - { return patchNumParticles[patchIdx]; }); + /* transform = */ [this](size_t patchIdx) { return patchNumParticles[patchIdx]; }); log("openPMD: malloc mapped memory for partial patches: %1%") % speciesName; @@ -343,7 +312,7 @@ namespace picongpu LoadParticleAttributesFromOpenPMD> loadAttributes; loadAttributes( - params, + threadParams, mappedFrame, particleSpecies, particleLoadOffset, @@ -404,7 +373,7 @@ namespace picongpu remapCurrent, // !! not numParticlesCurrentBatch, filtered vs. unfiltered number cellOffsetToTotalDomain, totalCellIdx_, - *(params->cellDescription), + *threadParams->cellDescription, picLog::INPUT_OUTPUT()); } log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName @@ -412,6 +381,60 @@ namespace picongpu } } } + }; + + /** Load species from openPMD checkpoint storage + * + * @param params thread params + * @param restartChunkSize number of particles processed in one kernel + * call + */ + HINLINE void operator()(ThreadParams* params, uint32_t const currentStep, uint32_t const restartChunkSize) + { + std::string const speciesName = FrameType::getName(); + log("openPMD: (begin) load species: %1%") % speciesName; + DataConnector& dc = Environment<>::get().DataConnector(); + GridController& gc = Environment::get().GridController(); + + ::openPMD::Series& series = *params->openPMDSeries; + ::openPMD::Container<::openPMD::ParticleSpecies>& particles + = series.iterations[currentStep].open().particles; + ::openPMD::ParticleSpecies particleSpecies = particles[speciesName]; + + SubGrid const& subGrid = Environment::get().SubGrid(); + DataSpace cellOffsetToTotalDomain + = subGrid.getLocalDomain().offset + subGrid.getGlobalDomain().offset; + + /* load particle without copying particle data to host */ + auto speciesTmp = dc.get(FrameType::getName()); + + // avoid deadlock between not finished pmacc tasks and mpi calls in + // openPMD + eventSystem::getTransactionEvent().waitForFinished(); + + auto numRanks = gc.getGlobalSize(); + + auto [fullMatches, partialMatches] = getPatchIdx(params, particleSpecies); + + std::shared_ptr numParticlesShared + = particleSpecies.particlePatches["numParticles"].load(); + std::shared_ptr numParticlesOffsetShared + = particleSpecies.particlePatches["numParticlesOffset"].load(); + particles.seriesFlush(); + uint64_t* patchNumParticles = numParticlesShared.get(); + uint64_t* patchNumParticlesOffset = numParticlesOffsetShared.get(); + + LoadParams lp{ + speciesName, + particleSpecies, + speciesTmp, + patchNumParticles, + patchNumParticlesOffset, + cellOffsetToTotalDomain, + restartChunkSize}; + lp.loadFullMatches(fullMatches, params); + lp.loadPartialMatches(partialMatches, params); + log("openPMD: ( end ) load species: %1%") % speciesName; } From 71a14d22d5ff61370f53311f11c9800a43c3021d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 15:40:32 +0200 Subject: [PATCH 05/11] Continue factoring out common code --- .../plugins/openPMD/restart/LoadSpecies.hpp | 333 ++++++++++-------- 1 file changed, 189 insertions(+), 144 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index bc6110abc84..b7939aee303 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -149,8 +149,44 @@ namespace picongpu using NewParticleDescription = typename ReplaceValueTypeSeq::type; + struct ChunkedBuffer + { + using FrameType = Frame; + using BufferType = Frame; + + uint64_t totalNumParticles; + uint64_t maxChunkSize; + BufferType buffers; + FrameType mappedFrame; + + ChunkedBuffer( + std::deque matches, + uint64_t const* patchNumParticles, + uint32_t restartChunkSize, + std::string const& speciesName) + { + totalNumParticles = std::transform_reduce( + matches.begin(), + matches.end(), + 0, + /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, + /* transform = */ [patchNumParticles](size_t patchIdx) + { return patchNumParticles[patchIdx]; }); + maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); + + log("openPMD: malloc mapped memory: %1%") % speciesName; + /*malloc mapped memory*/ + meta::ForEach> + mallocMem; + mallocMem(buffers, mappedFrame, maxChunkSize); + } + }; + struct LoadParams { + using FrameType = Frame; + using BufferType = Frame; + std::string const& speciesName; ::openPMD::ParticleSpecies const& particleSpecies; std::shared_ptr const& speciesTmp; @@ -159,31 +195,55 @@ namespace picongpu DataSpace const& cellOffsetToTotalDomain; uint32_t restartChunkSize; - void loadFullMatches(std::deque const& fullMatches, ThreadParams* threadParams) const +# if 0 + LoadParams( + std::string const& speciesName_in, + ::openPMD::ParticleSpecies const& particleSpecies_in, + std::shared_ptr const& speciesTmp_in, + uint64_t* patchNumParticles_in, + uint64_t* patchNumParticlesOffset_in, + DataSpace const& cellOffsetToTotalDomain_in, + uint32_t restartChunkSize_in) + : speciesName(speciesName_in) + , particleSpecies(particleSpecies_in) + , speciesTmp(speciesTmp_in) + , patchNumParticles(patchNumParticles_in) + , patchNumParticlesOffset(patchNumParticlesOffset_in) + , cellOffsetToTotalDomain(cellOffsetToTotalDomain_in) + , restartChunkSize(restartChunkSize_in) + { + } +# endif + + auto numParticlesAndChunkSize(std::deque const& matches) const -> std::pair { uint64_t totalNumParticles = std::transform_reduce( - fullMatches.begin(), - fullMatches.end(), + matches.begin(), + matches.end(), 0, /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, /* transform = */ [this](size_t patchIdx) { return patchNumParticles[patchIdx]; }); + uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); + return std::make_pair(totalNumParticles, maxChunkSize); + } - log("openPMD: malloc mapped memory: %1%") % speciesName; - - using FrameType = Frame; - using BufferType = Frame; - + template + void loadMatchesGeneric( + std::deque const& matches, + ThreadParams* threadParams, + uint64_t totalNumParticles, + uint64_t maxChunkSize, + Functor&& forEachPatch) const + { BufferType buffers; FrameType mappedFrame; - - uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); - + log("openPMD: malloc mapped memory: %1%") % speciesName; /*malloc mapped memory*/ meta::ForEach> mallocMem; mallocMem(buffers, mappedFrame, maxChunkSize); - for(size_t const patchIdx : fullMatches) + for(size_t const patchIdx : matches) { uint64_t particleOffset = patchNumParticlesOffset[patchIdx]; uint64_t numParticles = patchNumParticles[patchIdx]; @@ -206,25 +266,11 @@ namespace picongpu % (loadRound + 1) % numLoadIterations; if(numParticlesCurrentBatch != 0) { - meta::ForEach< - typename NewParticleDescription::ValueTypeSeq, - LoadParticleAttributesFromOpenPMD> - loadAttributes; - loadAttributes( - threadParams, - mappedFrame, - particleSpecies, - particleLoadOffset, - numParticlesCurrentBatch); - - pmacc::particles::operations::splitIntoListOfFrames( - *speciesTmp, - mappedFrame, + std::forward(forEachPatch)( + loadRound, numParticlesCurrentBatch, - cellOffsetToTotalDomain, - totalCellIdx_, - *threadParams->cellDescription, - picLog::INPUT_OUTPUT()); + mappedFrame, + particleLoadOffset); } log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName % (loadRound + 1) % numLoadIterations; @@ -232,8 +278,47 @@ namespace picongpu } } + void loadFullMatches(std::deque const& fullMatches, ThreadParams* threadParams) const + { + auto [totalNumParticles, maxChunkSize] = numParticlesAndChunkSize(fullMatches); + loadMatchesGeneric( + fullMatches, + threadParams, + totalNumParticles, + maxChunkSize, + /* forEachPatch = */ + [this, threadParams]( + uint64_t loadRound, + uint64_t numParticlesCurrentBatch, + FrameType& mappedFrame, + uint64_t particleLoadOffset) + { + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + threadParams, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + numParticlesCurrentBatch, + cellOffsetToTotalDomain, + totalCellIdx_, + *threadParams->cellDescription, + picLog::INPUT_OUTPUT()); + }); + } + void loadPartialMatches(std::deque const& partialMatches, ThreadParams* threadParams) const { + auto [totalNumParticles, maxChunkSize] = numParticlesAndChunkSize(partialMatches); + SubGrid const& subGrid = Environment::get().SubGrid(); pmacc::Selection const localDomain = subGrid.getLocalDomain(); pmacc::Selection const globalDomain = subGrid.getGlobalDomain(); @@ -251,31 +336,9 @@ namespace picongpu DataSpace const patchExtent = threadParams->window.localDimensions.size; DataSpace const patchUpperCorner = patchTotalOffset + patchExtent; - uint64_t totalNumParticles = std::transform_reduce( - partialMatches.begin(), - partialMatches.end(), - 0, - /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, - /* transform = */ [this](size_t patchIdx) { return patchNumParticles[patchIdx]; }); - - log("openPMD: malloc mapped memory for partial patches: %1%") % speciesName; - - using FrameType = Frame; - using BufferType = Frame; - - BufferType buffers; - FrameType mappedFrame; - - uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); - - /*malloc mapped memory*/ - meta::ForEach> - mallocMem; - mallocMem(buffers, mappedFrame, maxChunkSize); constexpr bool isMappedMemorySupported = alpaka::hasMappedBufSupport<::alpaka::Platform>; PMACC_VERIFY_MSG(isMappedMemorySupported, "Device must support mapped memory!"); - // alpaka::Buf> filter; auto filter = alpaka::allocMappedBuf( manager::Device::get().current(), manager::Device::get().getPlatform(), @@ -284,102 +347,82 @@ namespace picongpu manager::Device::get().current(), manager::Device::get().getPlatform(), MemSpace(maxChunkSize).toAlpakaMemVec()); - for(size_t const patchIdx : partialMatches) - { - uint64_t particleOffset = patchNumParticlesOffset[patchIdx]; - uint64_t numParticles = patchNumParticles[patchIdx]; - - log("openPMD: Loading up to %1% particles from offset %2%") - % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; - - - uint32_t const numLoadIterations - = maxChunkSize == 0u ? 0u : alpaka::core::divCeil(numParticles, maxChunkSize); - for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) + loadMatchesGeneric( + partialMatches, + threadParams, + totalNumParticles, + maxChunkSize, /* forEachPatch = */ + [this, threadParams, &filter, &remap, &patchTotalOffset, &patchExtent, &patchUpperCorner]( + uint64_t loadRound, + uint64_t numParticlesCurrentBatch, + FrameType& mappedFrame, + uint64_t particleLoadOffset) { - auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; - auto numLeftParticles = numParticles - loadRound * maxChunkSize; - - auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); - - log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; - if(numParticlesCurrentBatch != 0) - { - meta::ForEach< - typename NewParticleDescription::ValueTypeSeq, - LoadParticleAttributesFromOpenPMD> - loadAttributes; - loadAttributes( - threadParams, + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + threadParams, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + // now filter + auto position_ = mappedFrame.getIdentifier(position()).getPointer(); + auto positionOffset = mappedFrame.getIdentifier(totalCellIdx()).getPointer(); + + constexpr char filterKeep{1}, filterRemove{0}; + PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) + .config(pmacc::math::Vector{numParticlesCurrentBatch})( mappedFrame, - particleSpecies, - particleLoadOffset, - numParticlesCurrentBatch); - - // now filter - auto position_ = mappedFrame.getIdentifier(position()).getPointer(); - auto positionOffset = mappedFrame.getIdentifier(totalCellIdx()).getPointer(); - - constexpr char filterKeep{1}, filterRemove{0}; - PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) - .config(pmacc::math::Vector{numParticlesCurrentBatch})( - mappedFrame, - numParticlesCurrentBatch, - patchTotalOffset, - patchUpperCorner, - filterKeep, - filterRemove, - alpaka::getPtrNative(filter)); - eventSystem::getTransactionEvent().waitForFinished(); - - // This part is inherently sequential, keep it on CPU - // std::cout << "REMAP: "; - MemIdxType remapCurrent = 0; - for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; - ++particleIndex) + numParticlesCurrentBatch, + patchTotalOffset, + patchUpperCorner, + filterKeep, + filterRemove, + alpaka::getPtrNative(filter)); + eventSystem::getTransactionEvent().waitForFinished(); + + // For simplicity, do the remapping on the CPU again + MemIdxType remapCurrent = 0; + for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; ++particleIndex) + { + if(filter[particleIndex] == filterKeep) { - if(filter[particleIndex] == filterKeep) - { - remap[particleIndex] = remapCurrent++; - // std::cout << '1'; - } - else - { - remap[particleIndex] = std::numeric_limits::max(); - // std::cout << '0'; - } + remap[particleIndex] = remapCurrent++; + } + else + { + remap[particleIndex] = std::numeric_limits::max(); } - // std::cout << std::endl; - - meta::ForEach< - typename NewParticleDescription::ValueTypeSeq, - RedistributeFilteredParticles> - redistributeFilteredParticles; - redistributeFilteredParticles( - mappedFrame, - filter, - remap, - numParticlesCurrentBatch, - filterRemove); - - std::cout << "Filtered " << remapCurrent << " out of " << numParticlesCurrentBatch - << " particles" << std::endl; - - pmacc::particles::operations::splitIntoListOfFrames( - *speciesTmp, - mappedFrame, - remapCurrent, // !! not numParticlesCurrentBatch, filtered vs. unfiltered number - cellOffsetToTotalDomain, - totalCellIdx_, - *threadParams->cellDescription, - picLog::INPUT_OUTPUT()); } - log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; - } - } + + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + RedistributeFilteredParticles> + redistributeFilteredParticles; + redistributeFilteredParticles( + mappedFrame, + filter, + remap, + numParticlesCurrentBatch, + filterRemove); + + std::cout << "Filtered " << remapCurrent << " out of " << numParticlesCurrentBatch + << " particles" << std::endl; + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + remapCurrent, // !! not numParticlesCurrentBatch, filtered vs. unfiltered number + cellOffsetToTotalDomain, + totalCellIdx_, + *threadParams->cellDescription, + picLog::INPUT_OUTPUT()); + }); } }; @@ -527,7 +570,8 @@ namespace picongpu // std::cout << "Comp.: " << patchTotalOffset << " - " << (patchTotalOffset + patchExtent) // << "\tAGAINST " << offsets[i] << " - " << (offsets[i] + extents[i]) // << "\toffsets: " << (patchTotalOffset <= offsets[i]) - // << ", extents: " << ((offsets[i] + extents[i]) <= (patchTotalOffset + patchExtent)) << + // << ", extents: " << ((offsets[i] + extents[i]) <= (patchTotalOffset + + // patchExtent)) << // '\n'; if((patchTotalOffset <= offsets[i]) == true_ && ((offsets[i] + extents[i]) <= (patchTotalOffset + patchExtent)) == true_) @@ -547,7 +591,8 @@ namespace picongpu } // std::cout << "\n\n" - // << fullMatches.size() << " full matches, " << partialMatches.size() << " partial matches, + // << fullMatches.size() << " full matches, " << partialMatches.size() << " partial + // matches, // " // << noMatches << " unmatched." << std ::endl; From 03b33cdad5f7ba2c4e73e660fb43269b653e773e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 17:02:27 +0200 Subject: [PATCH 06/11] Do not double-load boundary particles --- include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index b7939aee303..aa86a5bb3dd 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -112,7 +112,7 @@ namespace picongpu for(size_t d = 0; d < simDim; ++d) { auto positionInD = positionVec[d] + positionOffsetVec[d]; - if(positionInD < patchTotalOffset[d] || positionInD > patchUpperCorner[d]) + if(positionInD < patchTotalOffset[d] || positionInD >= patchUpperCorner[d]) { filterCurrent = filterRemove; break; From 4fdf55bc036e401f2739b05b913d0ee38fd4138d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 17:23:22 +0200 Subject: [PATCH 07/11] Cleanup --- .../plugins/openPMD/restart/LoadSpecies.hpp | 53 ------------------- 1 file changed, 53 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index aa86a5bb3dd..7d5cd4cd1ff 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -149,39 +149,6 @@ namespace picongpu using NewParticleDescription = typename ReplaceValueTypeSeq::type; - struct ChunkedBuffer - { - using FrameType = Frame; - using BufferType = Frame; - - uint64_t totalNumParticles; - uint64_t maxChunkSize; - BufferType buffers; - FrameType mappedFrame; - - ChunkedBuffer( - std::deque matches, - uint64_t const* patchNumParticles, - uint32_t restartChunkSize, - std::string const& speciesName) - { - totalNumParticles = std::transform_reduce( - matches.begin(), - matches.end(), - 0, - /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, - /* transform = */ [patchNumParticles](size_t patchIdx) - { return patchNumParticles[patchIdx]; }); - maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); - - log("openPMD: malloc mapped memory: %1%") % speciesName; - /*malloc mapped memory*/ - meta::ForEach> - mallocMem; - mallocMem(buffers, mappedFrame, maxChunkSize); - } - }; - struct LoadParams { using FrameType = Frame; @@ -195,26 +162,6 @@ namespace picongpu DataSpace const& cellOffsetToTotalDomain; uint32_t restartChunkSize; -# if 0 - LoadParams( - std::string const& speciesName_in, - ::openPMD::ParticleSpecies const& particleSpecies_in, - std::shared_ptr const& speciesTmp_in, - uint64_t* patchNumParticles_in, - uint64_t* patchNumParticlesOffset_in, - DataSpace const& cellOffsetToTotalDomain_in, - uint32_t restartChunkSize_in) - : speciesName(speciesName_in) - , particleSpecies(particleSpecies_in) - , speciesTmp(speciesTmp_in) - , patchNumParticles(patchNumParticles_in) - , patchNumParticlesOffset(patchNumParticlesOffset_in) - , cellOffsetToTotalDomain(cellOffsetToTotalDomain_in) - , restartChunkSize(restartChunkSize_in) - { - } -# endif - auto numParticlesAndChunkSize(std::deque const& matches) const -> std::pair { uint64_t totalNumParticles = std::transform_reduce( From 6cdc175c20dd7293190a13cd2589551c52e71ee7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 16 Oct 2025 20:19:28 +0200 Subject: [PATCH 08/11] Fix -Werror=sign-compare problem --- include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index 7d5cd4cd1ff..1f76c1739c9 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -96,7 +96,8 @@ namespace picongpu auto positionOffset = loadedData.getIdentifier(totalCellIdx()).getPointer(); // grid-strided loop over the chunked data - for(int dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; dataBlock += worker.gridDomSize()) + for(uint32_t dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; + dataBlock += worker.gridDomSize()) { auto dataBlockOffset = dataBlock * blockDomSize; auto forEach = pmacc::lockstep::makeForEach(worker); From 0488d99b504931361eb5ad53f696502c5a8821ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 17 Oct 2025 13:14:42 +0200 Subject: [PATCH 09/11] Erase unused variables --- include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index 1f76c1739c9..6216664a829 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -318,10 +318,6 @@ namespace picongpu particleLoadOffset, numParticlesCurrentBatch); - // now filter - auto position_ = mappedFrame.getIdentifier(position()).getPointer(); - auto positionOffset = mappedFrame.getIdentifier(totalCellIdx()).getPointer(); - constexpr char filterKeep{1}, filterRemove{0}; PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) .config(pmacc::math::Vector{numParticlesCurrentBatch})( @@ -385,7 +381,6 @@ namespace picongpu std::string const speciesName = FrameType::getName(); log("openPMD: (begin) load species: %1%") % speciesName; DataConnector& dc = Environment<>::get().DataConnector(); - GridController& gc = Environment::get().GridController(); ::openPMD::Series& series = *params->openPMDSeries; ::openPMD::Container<::openPMD::ParticleSpecies>& particles @@ -403,8 +398,6 @@ namespace picongpu // openPMD eventSystem::getTransactionEvent().waitForFinished(); - auto numRanks = gc.getGlobalSize(); - auto [fullMatches, partialMatches] = getPatchIdx(params, particleSpecies); std::shared_ptr numParticlesShared From dc6f09be0afdda9763c42a9b09e03474214207b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 17 Oct 2025 13:17:30 +0200 Subject: [PATCH 10/11] Replace stdout with logging --- .../plugins/openPMD/restart/LoadSpecies.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index 6216664a829..d1fb263f0ff 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -355,8 +355,9 @@ namespace picongpu numParticlesCurrentBatch, filterRemove); - std::cout << "Filtered " << remapCurrent << " out of " << numParticlesCurrentBatch - << " particles" << std::endl; + log( + "openPMD: Keeping %1% of the current batch's %2% particles after filtering.") + % remapCurrent % numParticlesCurrentBatch; pmacc::particles::operations::splitIntoListOfFrames( *speciesTmp, @@ -531,11 +532,10 @@ namespace picongpu } } - // std::cout << "\n\n" - // << fullMatches.size() << " full matches, " << partialMatches.size() << " partial - // matches, - // " - // << noMatches << " unmatched." << std ::endl; + log( + "openPMD: Found %1% fully and %2% partially matching particle patch(es). %3% " + "patch was / patches were not matched.") + % fullMatches.size() % partialMatches.size() % noMatches; return std::make_pair(std::move(fullMatches), std::move(partialMatches)); } From 1d02d93fb5f639e56901dd53c6177e528fe1173a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 17 Oct 2025 14:17:47 +0200 Subject: [PATCH 11/11] Auto-detect if PML loads should be skipped --- .../openPMD/restart/RestartFieldLoader.hpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp index a04379cb267..ec66a6ee0ba 100644 --- a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp +++ b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp @@ -77,6 +77,9 @@ namespace picongpu DataSpace local_domain_size = params->window.localDimensions.size; bool useLinearIdxAsDestination = false; + ::openPMD::Series& series = *params->openPMDSeries; + ::openPMD::Mesh& mesh = series.iterations[currentStep].open().meshes[objectName]; + /* Patch for non-domain-bound fields * This is an ugly fix to allow output of reduced 1d PML buffers */ @@ -85,6 +88,7 @@ namespace picongpu auto const field_layout = field.getGridLayout(); auto const field_no_guard = field_layout.sizeWithoutGuardND(); auto const elementCount = field_no_guard.productOfComponents(); + uint64_t pmlTotalSize = 0; /* Scan the PML buffer local size along all local domains * This code is symmetric to one in Field::writeField() @@ -112,9 +116,20 @@ namespace picongpu { if(localSizes.at(2u * r + 1u) < rank) domainOffset += localSizes.at(2u * r); + pmlTotalSize += localSizes.at(2u * r); } log("openPMD: (end) collect PML sizes for %1%") % objectName; + if(auto const& extentOnDisk = mesh.begin()->second.getExtent(); + extentOnDisk != ::openPMD::Extent{1, 1, pmlTotalSize}) + { + log( + "openPMD: Skip loading for PML fields. Expecting extent %1%, found extent %2% on disk. " + "This may happen when restarting with a different domain decomposition.") + % pmlTotalSize % extentOnDisk.at(2); + return; + } + domain_offset = DataSpace::create(0); domain_offset[0] = domainOffset; local_domain_size = DataSpace::create(1); @@ -122,9 +137,6 @@ namespace picongpu useLinearIdxAsDestination = true; } - ::openPMD::Series& series = *params->openPMDSeries; - ::openPMD::Container<::openPMD::Mesh>& meshes = series.iterations[currentStep].open().meshes; - auto destBox = field.getHostBuffer().getDataBox(); for(uint32_t n = 0; n < numComponents; ++n) { @@ -133,9 +145,8 @@ namespace picongpu // data. log("openPMD: Read from domain: offset=%1% size=%2%") % domain_offset % local_domain_size; - ::openPMD::RecordComponent rc = numComponents > 1 - ? meshes[objectName][name_lookup_tpl[n]] - : meshes[objectName][::openPMD::RecordComponent::SCALAR]; + ::openPMD::RecordComponent rc + = numComponents > 1 ? mesh[name_lookup_tpl[n]] : mesh[::openPMD::RecordComponent::SCALAR]; log("openPMD: Read from field '%1%'") % objectName; @@ -158,7 +169,7 @@ namespace picongpu std::shared_ptr field_container = rc.loadChunk(start, count); /* start a blocking read of all scheduled variables */ - meshes.seriesFlush(); + mesh.seriesFlush(); int const elementCount = local_domain_size.productOfComponents(); @@ -220,12 +231,6 @@ namespace picongpu /* load from openPMD */ bool const isDomainBound = traits::IsFieldDomainBound::value; - // Skip PML fields for load balancing purposes - if(!isDomainBound) - { - return; - } - RestartFieldLoader::loadField( field->getGridBuffer(), (uint32_t) T_Field::numComponents,