Diffstat (limited to 'libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h')
-rw-r--r-- | libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h | 809 |
1 file changed, 809 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h b/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h
new file mode 100644
index 0000000..c791508
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h
@@ -0,0 +1,809 @@
+/*************************************************************************
+ *                                                                       *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
+ * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
+ *                                                                       *
+ * This library is free software; you can redistribute it and/or         *
+ * modify it under the terms of EITHER:                                  *
+ *   (1) The GNU Lesser General Public License as published by the Free  *
+ *       Software Foundation; either version 2.1 of the License, or (at  *
+ *       your option) any later version. The text of the GNU Lesser      *
+ *       General Public License is included with this library in the     *
+ *       file LICENSE.TXT.                                               *
+ *   (2) The BSD-style license that is included with this library in     *
+ *       the file LICENSE-BSD.TXT.                                       *
+ *                                                                       *
+ * This library is distributed in the hope that it will be useful,       *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
+ *                                                                       *
+ *************************************************************************/
+
+/*
+ * Equation System Threaded Solver
+ * Copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
+ */
+
+
+
+#ifndef _ODE_THREADED_SOLVER_LDLT_H_
+#define _ODE_THREADED_SOLVER_LDLT_H_
+
+
+#include "coop_matrix_types.h"
+#include <ode/threading.h>
+
+
+class dxThreadingBase;
+class dxResourceRequirementDescriptor;
+class dxRequiredResourceContainer;
+
+
+class ThreadedEquationSolverLDLT
+{
+public:
+    static void estimateCooperativeFactoringLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void cooperativelyFactorLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
+
+    static void estimateCooperativeSolvingL1StraightResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void cooperativelySolveL1Straight(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+    static void estimateCooperativeSolvingL1TransposedResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void cooperativelySolveL1Transposed(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+    static void estimateCooperativeScalingVectorResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned elementCount);
+    static void cooperativelyScaleVector(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        dReal *vectorData, const dReal *scaleData, unsigned elementCount);
+
+    static void estimateCooperativeSolvingLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void cooperativelySolveLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
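[Editor's note: for orientation, the factoring entry point above parallelizes the classic LDLT decomposition. Below is a minimal serial sketch of the same computation, using the header's dReal/sizeint types and assuming the usual ODE convention that d receives the reciprocals of D's diagonal; the helper name is hypothetical and not part of this header.]

    // Serial reference for what cooperativelyFactorLDLT computes: A (rowCount x rowCount,
    // row-major, leading dimension rowSkip) has its lower triangle overwritten with the
    // unit-lower-triangular factor L, and d[i] receives 1 / D[i].
    static void exampleFactorLDLT(dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip)
    {
        for (unsigned i = 0; i != rowCount; ++i)
        {
            dReal *rowI = A + (sizeint)i * rowSkip;
            for (unsigned j = 0; j != i; ++j)
            {
                const dReal *rowJ = A + (sizeint)j * rowSkip;
                dReal sum = rowI[j];
                for (unsigned k = 0; k != j; ++k)
                {
                    sum -= rowI[k] * rowJ[k] * (REAL(1.0) / d[k]); // D[k] is 1 / d[k]
                }
                rowI[j] = sum * d[j]; // dividing by D[j] is multiplying by its reciprocal
            }
            dReal diagonal = rowI[i];
            for (unsigned k = 0; k != i; ++k)
            {
                diagonal -= rowI[k] * rowI[k] * (REAL(1.0) / d[k]);
            }
            d[i] = REAL(1.0) / diagonal;
        }
    }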
+
+public:
+    enum
+    {
+        ALLOCATION_DEFAULT_ALIGNMENT = COOP_THREAD_DATA_ALIGNMENT_SIZE,
+    };
+
+private:
+    struct FactorizationSolveL1StripeCellContext;
+    struct FactorizationFactorizeL1StripeThreadContext;
+
+    enum
+    {
+        FLDLT_D_STRIDE = 1,
+        FLDLT_COOPERATIVE_BLOCK_COUNT_MINIMUM = 5,
+
+        FSL1S_BLOCK_SIZE = 2,
+
+        FSL1S_REGULAR_B_ROWS = FSL1S_BLOCK_SIZE,
+        FSL1S_FINAL_B_ROWS = 1,
+
+        FFL1S_REGULAR_A_ROWS = FSL1S_BLOCK_SIZE,
+        FFL1S_FINAL_A_ROWS = 1,
+        FFL1S_REGULAR_BLOCK_SIZE = 16, // A suitably sized number that is a power of 2 and (naturally) not divisible by 6
+        FFL1S_FINAL_BLOCK_SIZE = 32, // A suitably sized number that is a power of 2 and (naturally) not divisible by 6
+    };
+
+    static unsigned restrictFactoringLDLTAllowedThreadCount(
+        dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+    static void doEstimateCooperativeFactoringLDLTResourceRequirementsValidated(
+        dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void doCooperativelyFactorLDLTValidated(
+        dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
+
+
+    static unsigned deriveSolvingL1StripeBlockCount(unsigned rowCount, unsigned blockStep)
+    {
+        return (rowCount + (blockStep - 1)) / blockStep;
+    }
+
+    struct FactorizationSolvingL1StripeMemoryEstimates
+    {
+        void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+        {
+            m_descriptorSizeRequired = descriptorSizeRequired;
+            m_contextSizeRequired = contextSizeRequired;
+        }
+
+        sizeint m_descriptorSizeRequired;
+        sizeint m_contextSizeRequired;
+    };
+
+    static unsigned deriveSolvingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+    {
+        dIASSERT(allowedThreadCount >= 1);
+
+        unsigned maximumCount = blockCount / 2;
+        return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+    }
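[Editor's note: the two helpers above are plain ceiling division plus a cap of one worker per two blocks. A hypothetical restatement with a concrete size plugged in, names illustrative only:]

    static unsigned exampleDeriveBlockCount(unsigned rowCount, unsigned blockStep)
    {
        return (rowCount + (blockStep - 1)) / blockStep; // ceiling division: 37 rows, step 2 -> 19 blocks
    }

    static unsigned exampleDeriveThreadCount(unsigned blockCount, unsigned allowedThreadCount)
    {
        unsigned maximumCount = blockCount / 2; // at most one worker per two blocks
        return maximumCount >= allowedThreadCount ? allowedThreadCount : (maximumCount > 1U ? maximumCount : 1U);
    }
    // With 19 blocks: 8 allowed threads -> 8 workers are used; 32 allowed threads -> only 9 workers are used.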
+
+    static sizeint estimateCooperativelySolvingL1Stripe_XMemoryRequirement(unsigned blockCount,
+        FactorizationSolvingL1StripeMemoryEstimates &ref_memoryEstimates)
+    {
+        sizeint descriptorSizeRequired = dOVERALIGNED_SIZE(sizeof(cellindexint) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
+        sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationSolveL1StripeCellContext) * (CCI__MAX + 1) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
+        ref_memoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
+
+        sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
+        return totalSizeRequired;
+    }
+
+    static void *markCooperativelySolvingL1Stripe_XMemoryStructuresOut(void *buffer,
+        const FactorizationSolvingL1StripeMemoryEstimates &memoryEstimates,
+        cellindexint *&out_blockProgressDescriptors, FactorizationSolveL1StripeCellContext *&out_cellContexts)
+    {
+        void *currentLocation = buffer;
+
+        out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_descriptorSizeRequired;
+        out_cellContexts = (FactorizationSolveL1StripeCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
+
+        return currentLocation;
+    }
+
+    static void initializeCooperativelySolvingL1Stripe_XMemoryStructures(unsigned blockCount,
+        atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, FactorizationSolveL1StripeCellContext *dUNUSED(cellContexts))
+    {
+        out_blockCompletionProgress = 0;
+        memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
+    }
+
+    template<unsigned int block_step, unsigned int b_rows>
+    static void participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
+        volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+        FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+    static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromSolvingBlockIndex(unsigned solvingBlockIndex, unsigned solvingBlockStep, unsigned blockARows)
+    {
+        unsigned factorizingBlockSize = deriveScalingAndFactorizingL1StripeBlockSize(blockARows);
+        return deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(solvingBlockIndex * solvingBlockStep, factorizingBlockSize);
+    }
+
+    static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(unsigned factorizationRowIndex, unsigned factorizationBlockSize)
+    {
+        return (factorizationRowIndex + (factorizationBlockSize - 1)) / factorizationBlockSize;
+    }
+
+    static unsigned deriveScalingAndFactorizingL1StripeBlockSize(unsigned blockARows)
+    {
+        unsigned result = blockARows != 1 ? FFL1S_REGULAR_BLOCK_SIZE : FFL1S_FINAL_BLOCK_SIZE;
+        dIASSERT(blockARows >= 1 && blockARows <= 2);
+
+        return result;
+    }
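[Editor's note: the estimate/mark-out/initialize trio above carves one pre-sized, aligned buffer into the per-block descriptor and context arrays. A simplified, self-contained sketch of that discipline, using plain round-up arithmetic instead of ODE's dOVERALIGNED_SIZE and illustrative names throughout:]

    #include <cstddef>
    #include <cstring>

    enum { EXAMPLE_ALIGNMENT = 64 };

    struct ExampleEstimates { size_t descriptorsSize, contextsSize; };

    static size_t exampleAlignUp(size_t size)
    {
        return (size + (EXAMPLE_ALIGNMENT - 1)) & ~(size_t)(EXAMPLE_ALIGNMENT - 1);
    }

    // Step 1: size each sub-array, rounded up so the array that follows it stays aligned.
    static size_t exampleEstimate(unsigned blockCount, ExampleEstimates &out_sizes)
    {
        out_sizes.descriptorsSize = exampleAlignUp(sizeof(int) * blockCount);
        out_sizes.contextsSize = exampleAlignUp(sizeof(double) * 4 * blockCount);
        return out_sizes.descriptorsSize + out_sizes.contextsSize;
    }

    // Step 2: carve one pre-allocated buffer into the arrays, in the same order and with the same sizes.
    static void *exampleMarkOut(void *buffer, const ExampleEstimates &sizes,
        int *&out_descriptors, double *&out_contexts)
    {
        char *current = (char *)buffer;
        out_descriptors = (int *)current; current += sizes.descriptorsSize;
        out_contexts = (double *)current; current += sizes.contextsSize;
        return current; // first byte past the carved region
    }

    // Step 3: initialize only what the algorithm reads before it writes.
    static void exampleInitialize(unsigned blockCount, int *descriptors)
    {
        memset(descriptors, 0, blockCount * sizeof(*descriptors));
    }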
+
+
+    static unsigned deriveScalingAndFactorizingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+    {
+        dIASSERT(blockCount != 0);
+        dIASSERT(allowedThreadCount >= 1);
+
+        return dMACRO_MIN(blockCount, allowedThreadCount);
+    }
+
+    struct FactorizationFactorizeL1StripeContext;
+
+    struct FactorizationScalingAndFactorizingL1StripeMemoryEstimates
+    {
+        void assignData(sizeint contextSizeRequired)
+        {
+            m_contextSizeRequired = contextSizeRequired;
+        }
+
+        sizeint m_contextSizeRequired;
+    };
+
+    static sizeint estimateCooperativelyScalingAndFactorizingL1Stripe_XMemoryRequirement(unsigned factorizingMaximumThreads,
+        FactorizationScalingAndFactorizingL1StripeMemoryEstimates &ref_memoryEstimates)
+    {
+        dIASSERT(factorizingMaximumThreads != 0);
+
+        sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationFactorizeL1StripeContext) + sizeof(FactorizationFactorizeL1StripeThreadContext) * (factorizingMaximumThreads - 1), COOP_THREAD_DATA_ALIGNMENT_SIZE);
+        ref_memoryEstimates.assignData(contextSizeRequired);
+
+        sizeint totalSizeRequired = contextSizeRequired;
+        return totalSizeRequired;
+    }
+
+    static void *markCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructuresOut(void *buffer,
+        const FactorizationScalingAndFactorizingL1StripeMemoryEstimates &memoryEstimates, FactorizationFactorizeL1StripeContext *&out_factorizationContext)
+    {
+        void *currentLocation = buffer;
+
+        out_factorizationContext = (FactorizationFactorizeL1StripeContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
+
+        return currentLocation;
+    }
+
+    static void initializeCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructures(
+        FactorizationFactorizeL1StripeContext *factorizationContext, unsigned threadCount)
+    {
+        factorizationContext->initialize(threadCount);
+    }
+
+
+    template<unsigned int a_rows, unsigned int d_stride>
+    static void participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
+        FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex);
+
+private:
+    struct FactorLDLTWorkerContext
+    {
+        FactorLDLTWorkerContext(dxThreadingBase *threading, unsigned allowedThreadCount,
+            dReal *A, dReal *d, unsigned totalBlockCount, unsigned rowCount, unsigned rowSkip,
+            atomicord32 &ref_solvingBlockCompletionProgress, cellindexint *solvingBlockProgressDescriptors,
+            FactorizationSolveL1StripeCellContext *solvingCellContexts,
+            FactorizationFactorizeL1StripeContext *factorizingFactorizationContext,
+            dCallReleaseeID calculationFinishReleasee):
+            m_threading(threading),
+            m_allowedThreadCount(allowedThreadCount),
+            m_A(A),
+            m_ARow(A),
+            m_d(d),
+            m_solvingBlockIndex(0),
+            m_totalBlockCount(totalBlockCount),
+            m_rowCount(rowCount),
+            m_rowSkip(rowSkip),
+            m_refSolvingBlockCompletionProgress(ref_solvingBlockCompletionProgress),
+            m_solvingBlockProgressDescriptors(solvingBlockProgressDescriptors),
+            m_solvingCellContexts(solvingCellContexts),
+            m_factorizingFactorizationContext(factorizingFactorizationContext),
+            m_calculationFinishReleasee(calculationFinishReleasee)
+        {
+        }
+
+        void incrementForNextBlock()
+        {
+            const unsigned blockStep = FSL1S_BLOCK_SIZE;
+
+            m_ARow += blockStep * m_rowSkip;
+            m_solvingBlockIndex += 1;
+        }
+
+        dxThreadingBase *m_threading;
+        unsigned m_allowedThreadCount;
+        dReal *m_A;
+        dReal *m_ARow;
+        dReal *m_d;
+        unsigned m_solvingBlockIndex;
+        unsigned m_totalBlockCount;
+        unsigned m_rowCount;
+        unsigned m_rowSkip;
+        atomicord32 &m_refSolvingBlockCompletionProgress;
+        cellindexint *m_solvingBlockProgressDescriptors;
+        FactorizationSolveL1StripeCellContext *m_solvingCellContexts;
+        FactorizationFactorizeL1StripeContext *m_factorizingFactorizationContext;
+        dCallReleaseeID m_calculationFinishReleasee;
+    };
+
+    static int factotLDLT_solvingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_solvingComplete(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
+
+    static int factotLDLT_solvingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_solvingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
+
+    static int factotLDLT_scalingAndFactorizingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_scalingAndFactorizingComplete(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
+
+    static int factotLDLT_scalingAndFactorizingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_scalingAndFactorizingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
+
+    static int factotLDLT_solvingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_solvingFinal(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
+
+    static int factotLDLT_solvingFinalSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_solvingFinalSync(FactorLDLTWorkerContext &ref_workerContext);
+
+    static int factotLDLT_scalingAndFactorizingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void factotLDLT_scalingAndFactorizingFinal(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
+
+    static int factotLDLT_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+    struct FactorizationSolveL1StripeCellContext
+    {
+        template<unsigned int block_step, unsigned int b_rows>
+        static void initializePrecalculatedZs(dReal (&Z)[block_step][b_rows])
+        {
+            Z[0][0] = 0;
+            if (b_rows >= 2)
+            {
+                Z[0][1] = 0;
+            }
+            Z[1][0] = 0;
+            if (b_rows >= 2)
+            {
+                Z[1][1] = 0;
+            }
+            dSASSERT(block_step == 2);
+            dSASSERT(b_rows >= 1 && b_rows <= 2);
+        }
+
+        template<unsigned int block_step, unsigned int b_rows>
+        void loadPrecalculatedZs(dReal (&Z)[block_step][b_rows]) const
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+            dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
+
+            Z[0][0] = m_c[0][0];
+            if (b_rows >= 2)
+            {
+                Z[0][1] = m_c[0][1];
+            }
+            Z[1][0] = m_c[1][0];
+            if (b_rows >= 2)
+            {
+                Z[1][1] = m_c[1][1];
+            }
+            dSASSERT(block_step == 2);
+            dSASSERT(b_rows >= 1 && b_rows <= 2);
+        }
+
+        template<unsigned int block_step, unsigned int b_rows>
+        void storePrecalculatedZs(const dReal (&Z)[block_step][b_rows])
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+            dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
+
+            m_c[0][0] = Z[0][0];
+            if (b_rows >= 2)
+            {
+                m_c[0][1] = Z[0][1];
+            }
+            m_c[1][0] = Z[1][0];
+            if (b_rows >= 2)
+            {
+                m_c[1][1] = Z[1][1];
+            }
+            dSASSERT(block_step == 2);
+            dSASSERT(b_rows >= 1 && b_rows <= 2);
+        }
+
+        dReal m_c[FSL1S_BLOCK_SIZE][FSL1S_REGULAR_B_ROWS];
+        // dReal m_reserved[4];
+    };
+
+    static FactorizationSolveL1StripeCellContext &buildBlockContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+    {
+        return cellContexts[blockIndex * CCI__MAX + contextInstance];
+    }
+
+    static FactorizationSolveL1StripeCellContext &buildResultContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+    {
+        return cellContexts[blockCount * CCI__MAX + blockIndex];
+    }
+
+private:
+    struct FactorizationFactorizeL1StripeThreadContext
+    {
+        template<unsigned int a_rows>
+        void assignDataSum(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)],
+            const FactorizationFactorizeL1StripeThreadContext &partialSumContext)
+        {
+            m_sameZ[0] = sameZ[0] + partialSumContext.m_sameZ[0];
+            if (a_rows >= 2)
+            {
+                m_sameZ[1] = sameZ[1] + partialSumContext.m_sameZ[1];
+                m_mixedZ[0] = mixedZ[0] + partialSumContext.m_mixedZ[0];
+            }
+        }
+
+        template<unsigned int a_rows>
+        void assignDataAlone(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)])
+        {
+            m_sameZ[0] = sameZ[0];
+            if (a_rows >= 2)
+            {
+                m_sameZ[1] = sameZ[1];
+                m_mixedZ[0] = mixedZ[0];
+            }
+        }
+
+        template<unsigned int a_rows>
+        void retrieveData(dReal (&out_sameZ)[a_rows], dReal (&out_mixedZ)[dMACRO_MAX(a_rows - 1, 1)]) const
+        {
+            out_sameZ[0] = m_sameZ[0];
+            if (a_rows >= 2)
+            {
+                out_sameZ[1] = m_sameZ[1];
+                out_mixedZ[0] = m_mixedZ[0];
+            }
+            dAASSERT(a_rows >= 1 && a_rows <= 2);
+        }
+
+        dReal m_sameZ[FFL1S_REGULAR_A_ROWS];
+        dReal m_mixedZ[dMACRO_MAX(FFL1S_REGULAR_A_ROWS - 1, 1)];
+        dReal m_reserved[1]; // [5]; // for alignment
+    };
+
+    struct FactorizationFactorizeL1StripeContext
+    {
+        void initialize(unsigned threadCount)
+        {
+            m_threadsRunning = threadCount;
+            m_nextColumnIndex = 0;
+            m_sumThreadIndex = 0;
+        }
+
+        atomicord32 m_threadsRunning;
+        atomicord32 m_nextColumnIndex;
+        volatile atomicord32 m_sumThreadIndex;
+        atomicord32 m_reserved[1]; // [13]; // for alignment
+        FactorizationFactorizeL1StripeThreadContext m_threadContexts[1]; // =[threadCount]
+    };
+
+private:
+    struct SolveL1StraightCellContext;
+
+    enum
+    {
+        SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM = 8,
+
+        SL1S_B_STRIDE = 1,
+        SL1S_BLOCK_SIZE = 4,
+    };
+
+    static unsigned restrictSolvingL1StraightAllowedThreadCount(
+        dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+    static void doEstimateCooperativeSolvingL1StraightResourceRequirementsValidated(
+        dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void doCooperativelySolveL1StraightValidated(
+        dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+    static unsigned deriveSolvingL1StraightBlockCount(unsigned rowCount, unsigned blockStep)
+    {
+        return (rowCount + (blockStep - 1)) / blockStep;
+    }
+
+    struct SolvingL1StraightMemoryEstimates
+    {
+        void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+        {
+            m_descriptorSizeRequired = descriptorSizeRequired;
+            m_contextSizeRequired = contextSizeRequired;
+        }
+
+        sizeint m_descriptorSizeRequired;
+        sizeint m_contextSizeRequired;
+    };
+
+    static unsigned deriveSolvingL1StraightThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+    {
+        dIASSERT(allowedThreadCount >= 1);
+
+        unsigned maximumCount = 1 + blockCount / SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM;
+        return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+    }
+
+    template<unsigned int block_step>
+    static sizeint estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount, SolvingL1StraightMemoryEstimates &ref_solvingMemoryEstimates);
+
+    static void *markCooperativelySolvingL1StraightMemoryStructuresOut(void *buffer,
+        const SolvingL1StraightMemoryEstimates &solvingMemoryEstimates,
+        cellindexint *&out_blockProgressDescriptors, SolveL1StraightCellContext *&out_cellContexts)
+    {
+        void *currentLocation = buffer;
+
+        out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
+        out_cellContexts = (SolveL1StraightCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
+        return currentLocation;
+    }
+
+    template<unsigned int block_step>
+    static void initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount,
+        atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts);
+    template<unsigned int block_step, unsigned int b_stride>
+    static void participateSolvingL1Straight(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
+        volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+        SolveL1StraightCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+private:
+    struct SolveL1StraightWorkerContext
+    {
+        void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
+            atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts)
+        {
+            m_L = L;
+            m_b = b;
+            m_rowCount = rowCount;
+            m_rowSkip = rowSkip;
+            m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+            m_blockProgressDescriptors = blockProgressDescriptors;
+            m_cellContexts = cellContexts;
+        }
+
+        const dReal *m_L;
+        dReal *m_b;
+        unsigned m_rowCount;
+        unsigned m_rowSkip;
+        atomicord32 *m_ptrBlockCompletionProgress;
+        cellindexint *m_blockProgressDescriptors;
+        SolveL1StraightCellContext *m_cellContexts;
+    };
+
+    static int solveL1Straight_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void solveL1Straight_worker(SolveL1StraightWorkerContext &ref_context, unsigned ownThreadIndex);
+
+    static int solveL1Straight_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+    struct SolveL1StraightCellContext
+    {
+        template<unsigned int block_step>
+        static void initializePrecalculatedZs(dReal (&Z)[block_step])
+        {
+            std::fill(Z, Z + block_step, REAL(0.0));
+        }
+
+        template<unsigned int block_step>
+        void loadPrecalculatedZs(dReal (&Z)[block_step]) const
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+            std::copy(m_c, m_c + block_step, Z);
+        }
+
+        template<unsigned int block_step>
+        void storePrecalculatedZs(const dReal (&Z)[block_step])
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+            std::copy(Z, Z + block_step, m_c);
+        }
+
+        dReal m_c[SL1S_BLOCK_SIZE];
+    };
+
+
+    static SolveL1StraightCellContext &buildBlockContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+    {
+        return cellContexts[blockIndex * CCI__MAX + contextInstance];
+    }
+
+    static SolveL1StraightCellContext &buildResultContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+    {
+        return cellContexts[blockCount * CCI__MAX + blockIndex];
+    }
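[Editor's note: the two accessors above imply one flat allocation holding CCI__MAX context instances per block followed by a one-per-block results tail, which matches the (CCI__MAX + 1) * blockCount sizing used in the estimate helpers. A hypothetical restatement of that layout (CCI__MAX and CellContextInstance come from coop_matrix_types.h):]

    // cellContexts layout for blockCount = N:
    //   indices 0 .. N*CCI__MAX-1              : per-block instances, block i / instance inst at  i * CCI__MAX + inst
    //   indices N*CCI__MAX .. N*(CCI__MAX+1)-1 : results tail, result of block i at               N * CCI__MAX + i
    static sizeint exampleCellContextArrayLength(unsigned blockCount)
    {
        return (sizeint)(CCI__MAX + 1) * blockCount; // one slot per block per instance, plus the results tail
    }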
+
+
+private:
+    struct SolveL1TransposedCellContext;
+
+    enum
+    {
+        SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM = SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM,
+
+        SL1T_B_STRIDE = SL1S_B_STRIDE,
+        SL1T_BLOCK_SIZE = 4,
+    };
+
+    static unsigned restrictSolvingL1TransposedAllowedThreadCount(
+        dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+    static void doEstimateCooperativeSolvingL1TransposedResourceRequirementsValidated(
+        dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned rowCount);
+    static void doCooperativelySolveL1TransposedValidated(
+        dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+    static unsigned deriveSolvingL1TransposedBlockCount(unsigned rowCount, unsigned blockStep)
+    {
+        return (rowCount + (blockStep - 1)) / blockStep;
+    }
+
+    struct SolvingL1TransposedMemoryEstimates
+    {
+        void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+        {
+            m_descriptorSizeRequired = descriptorSizeRequired;
+            m_contextSizeRequired = contextSizeRequired;
+        }
+
+        sizeint m_descriptorSizeRequired;
+        sizeint m_contextSizeRequired;
+    };
+
+    static unsigned deriveSolvingL1TransposedThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+    {
+        dSASSERT(SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM + 0 == SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM);
+
+        return deriveSolvingL1StraightThreadCount(blockCount, allowedThreadCount);
+    }
+
+    template<unsigned int block_step>
+    static sizeint estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount, SolvingL1TransposedMemoryEstimates &ref_solvingMemoryEstimates);
+
+    static void *markCooperativelySolvingL1TransposedMemoryStructuresOut(void *buffer,
+        const SolvingL1TransposedMemoryEstimates &solvingMemoryEstimates,
+        cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts)
+    {
+        void *currentLocation = buffer;
+
+        out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
+        out_cellContexts = (SolveL1TransposedCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
+        return currentLocation;
+    }
+
+    template<unsigned int block_step>
+    static void *allocateCooperativelySolveL1TransposedMemoryStructures(sizeint &out_sizeAllocated, unsigned rowCount,
+        cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts);
+    template<unsigned int block_step>
+    static void initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount,
+        atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts);
+    template<unsigned int block_step, unsigned int b_stride>
+    static void participateSolvingL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
+        volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+        SolveL1TransposedCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+private:
+    struct SolveL1TransposedWorkerContext
+    {
+        void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
+            atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts)
+        {
+            m_L = L;
+            m_b = b;
+            m_rowCount = rowCount;
+            m_rowSkip = rowSkip;
+            m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+            m_blockProgressDescriptors = blockProgressDescriptors;
+            m_cellContexts = cellContexts;
+        }
+
+        const dReal *m_L;
+        dReal *m_b;
+        unsigned m_rowCount;
+        unsigned m_rowSkip;
+        atomicord32 *m_ptrBlockCompletionProgress;
+        cellindexint *m_blockProgressDescriptors;
+        SolveL1TransposedCellContext *m_cellContexts;
+    };
+
+    static int solveL1Transposed_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void solveL1Transposed_worker(SolveL1TransposedWorkerContext &ref_context, unsigned ownThreadIndex);
+
+    static int solveL1Transposed_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+    struct SolveL1TransposedCellContext
+    {
+        template<unsigned int block_step>
+        static void initializePrecalculatedZs(dReal (&Z)[block_step])
+        {
+            std::fill(Z, Z + block_step, REAL(0.0));
+        }
+
+        template<unsigned int block_step>
+        void loadPrecalculatedZs(dReal (&Z)[block_step]) const
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+            std::copy(m_c, m_c + block_step, Z);
+        }
+
+        template<unsigned int block_step>
+        void storePrecalculatedZs(const dReal (&Z)[block_step])
+        {
+            dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+            std::copy(Z, Z + block_step, m_c);
+        }
+
+        dReal m_c[SL1T_BLOCK_SIZE];
+    };
+
+    static SolveL1TransposedCellContext &buildBlockContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+    {
+        return cellContexts[blockIndex * CCI__MAX + contextInstance];
+    }
+
+    static SolveL1TransposedCellContext &buildResultContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+    {
+        return cellContexts[blockCount * CCI__MAX + blockIndex];
+    }
+
+private:
+    enum
+    {
+        SV_A_STRIDE = 1,
+        SV_D_STRIDE = 1,
+
+        SV_BLOCK_SIZE = 128,
+        SV_COOPERATIVE_BLOCK_COUNT_MINIMUM = 3,
+    };
+
+    static unsigned restrictScalingVectorAllowedThreadCount(
+        dxThreadingBase *threading, unsigned allowedThreadCount, unsigned elementCount);
+    static void doEstimateCooperativeScalingVectorResourceRequirementsValidated(
+        dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+        unsigned allowedThreadCount, unsigned elementCount);
+    static void doCooperativelyScaleVectorValidated(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+        dReal *vectorData, const dReal *scaleData, unsigned elementCount);
+
+    static unsigned deriveScalingVectorBlockCount(unsigned elementCount, unsigned blockStep)
+    {
+        return (elementCount + (blockStep - 1)) / blockStep;
+    }
+
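[Editor's note: participateScalingVector below is declared against a shared atomicord32 completion counter; the general shape such a signature suggests is that each participating thread atomically claims the next SV_BLOCK_SIZE-element block until the vector is exhausted. A self-contained sketch of that pattern using std::atomic, illustrative only; the real block-claiming logic lives in the accompanying .cpp and may differ.]

    #include <atomic>

    enum { EXAMPLE_BLOCK_SIZE = 128 }; // mirrors SV_BLOCK_SIZE

    // Each participating thread runs this; blocks are handed out by fetch_add,
    // so no two threads ever scale the same element range.
    static void exampleParticipateScaling(double *vectorData, const double *scaleData,
        unsigned elementCount, std::atomic<unsigned> &blockProgress)
    {
        for (;;)
        {
            unsigned blockIndex = blockProgress.fetch_add(1, std::memory_order_relaxed);
            unsigned begin = blockIndex * EXAMPLE_BLOCK_SIZE;
            if (begin >= elementCount)
            {
                break; // all blocks have been claimed
            }
            unsigned end = begin + EXAMPLE_BLOCK_SIZE < elementCount ? begin + EXAMPLE_BLOCK_SIZE : elementCount;
            for (unsigned i = begin; i != end; ++i)
            {
                vectorData[i] *= scaleData[i]; // element-wise scaling stage of the LDLT solve
            }
        }
    }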
+    static unsigned deriveScalingVectorThreadCount(unsigned lastBlockIndex, unsigned allowedThreadCount)
+    {
+        dIASSERT(allowedThreadCount >= 1);
+
+        unsigned maximumCount = lastBlockIndex;
+        return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+    }
+
+    static void initializeCooperativelyScaleVectorMemoryStructures(atomicord32 &out_blockCompletionProgress)
+    {
+        out_blockCompletionProgress = 0;
+    }
+    template<unsigned int block_step, unsigned int a_stride, unsigned int d_stride>
+    static void participateScalingVector(dReal *ptrAStart, const dReal *ptrDStart, const unsigned elementCount,
+        volatile atomicord32 &refBlockCompletionProgress/*=0*/);
+
+private:
+    struct ScaleVectorWorkerContext
+    {
+        void init(dReal *vectorData, const dReal *scaleData, unsigned elementCount,
+            atomicord32 &ref_blockCompletionProgress)
+        {
+            m_vectorData = vectorData;
+            m_scaleData = scaleData;
+            m_elementCount = elementCount;
+            m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+        }
+
+        dReal *m_vectorData;
+        const dReal *m_scaleData;
+        unsigned m_elementCount;
+        atomicord32 *m_ptrBlockCompletionProgress;
+    };
+
+    static int scaleVector_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+    static void scaleVector_worker(ScaleVectorWorkerContext &ref_context);
+
+    static int scaleVector_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+
+private:
+    enum SolvingLDLTStage
+    {
+        SLDLTS__MIN,
+
+        SLDLTS_SOLVING_STRAIGHT = SLDLTS__MIN,
+        SLDLTS_SCALING_VECTOR,
+        SLDLTS_SOLVING_TRANSPOSED,
+
+        SLDLTS__MAX,
+    };
+
+    enum
+    {
+        SLDLT_B_STRIDE = SL1S_B_STRIDE,
+        SLDLT_D_STRIDE = FLDLT_D_STRIDE,
+    };
+
+    static unsigned restrictSolvingLDLTAllowedThreadCount(
+        dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount, unsigned &out_stageBlockCountSifficiencyMask);
+
+    static void doCooperativelySolveLDLTValidated(
+        dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount, unsigned stageBlockCountSifficiencyMask,
+        const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
+};
+
+
+#endif
+
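[Editor's note: the SLDLTS_* stages in the header mirror the standard LDLT back-substitution order. Below is a compact serial equivalent, again assuming d holds the reciprocals of D's diagonal so the middle stage is an element-wise multiply; the helper is hypothetical and not part of the header.]

    // Stage 1 (SLDLTS_SOLVING_STRAIGHT):   forward-substitute L z = b (unit diagonal), in place in b.
    // Stage 2 (SLDLTS_SCALING_VECTOR):     b[i] *= d[i], where d[i] = 1 / D[i].
    // Stage 3 (SLDLTS_SOLVING_TRANSPOSED): back-substitute L^T x = z, in place in b.
    static void exampleSolveLDLT(const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip)
    {
        for (unsigned i = 0; i != rowCount; ++i)
        {
            const dReal *rowI = L + (sizeint)i * rowSkip;
            dReal sum = b[i];
            for (unsigned k = 0; k != i; ++k)
            {
                sum -= rowI[k] * b[k];
            }
            b[i] = sum;
        }
        for (unsigned i = 0; i != rowCount; ++i)
        {
            b[i] *= d[i];
        }
        for (unsigned i = rowCount; i != 0; )
        {
            --i;
            dReal sum = b[i];
            for (unsigned k = i + 1; k != rowCount; ++k)
            {
                sum -= L[(sizeint)k * rowSkip + i] * b[k];
            }
            b[i] = sum;
        }
    }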