summaryrefslogtreecommitdiff
path: root/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h')
-rw-r--r--libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h809
1 files changed, 809 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h b/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h
new file mode 100644
index 0000000..c791508
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/threaded_solver_ldlt.h
@@ -0,0 +1,809 @@
+/*************************************************************************
+ * *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
+ * All rights reserved. Email: russ@q12.org Web: www.q12.org *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of EITHER: *
+ * (1) The GNU Lesser General Public License as published by the Free *
+ * Software Foundation; either version 2.1 of the License, or (at *
+ * your option) any later version. The text of the GNU Lesser *
+ * General Public License is included with this library in the *
+ * file LICENSE.TXT. *
+ * (2) The BSD-style license that is included with this library in *
+ * the file LICENSE-BSD.TXT. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
+ * *
+ *************************************************************************/
+
+/*
+ * Equation System Threaded Solver
+ * Copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
+ */
+
+
+
+#ifndef _ODE_THREADED_SOLVER_LDLT_H_
+#define _ODE_THREADED_SOLVER_LDLT_H_
+
+
+#include "coop_matrix_types.h"
+#include <ode/threading.h>
+
+
+class dxThreadingBase;
+class dxResourceRequirementDescriptor;
+class dxRequiredResourceContainer;
+
+
+class ThreadedEquationSolverLDLT
+{
+public:
+ static void estimateCooperativeFactoringLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void cooperativelyFactorLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
+
+ static void estimateCooperativeSolvingL1StraightResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void cooperativelySolveL1Straight(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+ static void estimateCooperativeSolvingL1TransposedResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void cooperativelySolveL1Transposed(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+ static void estimateCooperativeScalingVectorResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned elementCount);
+ static void cooperativelyScaleVector(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ dReal *vectorData, const dReal *scaleData, unsigned elementCount);
+
+ static void estimateCooperativeSolvingLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void cooperativelySolveLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+public:
+ enum
+ {
+ ALLOCATION_DEFAULT_ALIGNMENT = COOP_THREAD_DATA_ALIGNMENT_SIZE,
+ };
+
+private:
+ struct FactorizationSolveL1StripeCellContext;
+ struct FactorizationFactorizeL1StripeThreadContext;
+
+ enum
+ {
+ FLDLT_D_STRIDE = 1,
+ FLDLT_COOPERATIVE_BLOCK_COUNT_MINIMUM = 5,
+
+ FSL1S_BLOCK_SIZE = 2,
+
+ FSL1S_REGULAR_B_ROWS = FSL1S_BLOCK_SIZE,
+ FSL1S_FINAL_B_ROWS = 1,
+
+ FFL1S_REGULAR_A_ROWS = FSL1S_BLOCK_SIZE,
+ FFL1S_FINAL_A_ROWS = 1,
+ FFL1S_REGULAR_BLOCK_SIZE = 16, // A suitable by magnitude number being a power of 2 and (naturally) not being divisible by 6
+ FFL1S_FINAL_BLOCK_SIZE = 32, // A suitable by magnitude number being a power of 2 and (naturally) not being divisible by 6
+ };
+
+ static unsigned restrictFactoringLDLTAllowedThreadCount(
+ dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+ static void doEstimateCooperativeFactoringLDLTResourceRequirementsValidated(
+ dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void doCooperativelyFactorLDLTValidated(
+ dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
+
+
+ static unsigned deriveSolvingL1StripeBlockCount(unsigned rowCount, unsigned blockStep)
+ {
+ return (rowCount + (blockStep - 1)) / blockStep;
+ }
+
+ struct FactorizationSolvingL1StripeMemoryEstimates
+ {
+ void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+ {
+ m_descriptorSizeRequired = descriptorSizeRequired;
+ m_contextSizeRequired = contextSizeRequired;
+ }
+
+ sizeint m_descriptorSizeRequired;
+ sizeint m_contextSizeRequired;
+ };
+
+ static unsigned deriveSolvingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+ {
+ dIASSERT(allowedThreadCount >= 1);
+
+ unsigned maximumCount = blockCount / 2;
+ return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+ }
+
+ static sizeint estimateCooperativelySolvingL1Stripe_XMemoryRequirement(unsigned blockCount,
+ FactorizationSolvingL1StripeMemoryEstimates &ref_memoryEstimates)
+ {
+ sizeint descriptorSizeRequired = dOVERALIGNED_SIZE(sizeof(cellindexint) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
+ sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationSolveL1StripeCellContext) * (CCI__MAX + 1) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
+ ref_memoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
+
+ sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
+ return totalSizeRequired;
+ }
+
+ static void *markCooperativelySolvingL1Stripe_XMemoryStructuresOut(void *buffer,
+ const FactorizationSolvingL1StripeMemoryEstimates &memoryEstimates,
+ cellindexint *&out_blockProgressDescriptors, FactorizationSolveL1StripeCellContext *&out_cellContexts)
+ {
+ void *currentLocation = buffer;
+
+ out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_descriptorSizeRequired;
+ out_cellContexts = (FactorizationSolveL1StripeCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
+
+ return currentLocation;
+ }
+
+ static void initializeCooperativelySolvingL1Stripe_XMemoryStructures(unsigned blockCount,
+ atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, FactorizationSolveL1StripeCellContext *dUNUSED(cellContexts))
+ {
+ out_blockCompletionProgress = 0;
+ memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
+ }
+
+ template<unsigned int block_step, unsigned int b_rows>
+ static void participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
+ volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+ FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+ static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromSolvingBlockIndex(unsigned solvingBlockIndex, unsigned solvingBlockStep, unsigned blockARows)
+ {
+ unsigned factorizingBlockSize = deriveScalingAndFactorizingL1StripeBlockSize(blockARows);
+ return deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(solvingBlockIndex * solvingBlockStep, factorizingBlockSize);
+ }
+
+ static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(unsigned factorizationRowIndex, unsigned factorizationBlockSize)
+ {
+ return (factorizationRowIndex + (factorizationBlockSize - 1)) / factorizationBlockSize;
+ }
+
+ static unsigned deriveScalingAndFactorizingL1StripeBlockSize(unsigned blockARows)
+ {
+ unsigned result = blockARows != 1 ? FFL1S_REGULAR_BLOCK_SIZE : FFL1S_FINAL_BLOCK_SIZE;
+ dIASSERT(blockARows >= 1 && blockARows <= 2);
+
+ return result;
+ }
+
+
+ static unsigned deriveScalingAndFactorizingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+ {
+ dIASSERT(blockCount != 0);
+ dIASSERT(allowedThreadCount >= 1);
+
+ return dMACRO_MIN(blockCount, allowedThreadCount);
+ }
+
+ struct FactorizationFactorizeL1StripeContext;
+
+ struct FactorizationScalingAndFactorizingL1StripeMemoryEstimates
+ {
+ void assignData(sizeint contextSizeRequired)
+ {
+ m_contextSizeRequired = contextSizeRequired;
+ }
+
+ sizeint m_contextSizeRequired;
+ };
+
+ static sizeint estimateCooperativelyScalingAndFactorizingL1Stripe_XMemoryRequirement(unsigned factorizingMaximumThreads,
+ FactorizationScalingAndFactorizingL1StripeMemoryEstimates &ref_memoryEstimates)
+ {
+ dIASSERT(factorizingMaximumThreads != 0);
+
+ sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationFactorizeL1StripeContext) + sizeof(FactorizationFactorizeL1StripeThreadContext) * (factorizingMaximumThreads - 1), COOP_THREAD_DATA_ALIGNMENT_SIZE);
+ ref_memoryEstimates.assignData(contextSizeRequired);
+
+ sizeint totalSizeRequired = contextSizeRequired;
+ return totalSizeRequired;
+ }
+
+ static void *markCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructuresOut(void *buffer,
+ const FactorizationScalingAndFactorizingL1StripeMemoryEstimates &memoryEstimates, FactorizationFactorizeL1StripeContext *&out_factorizationContext)
+ {
+ void *currentLocation = buffer;
+
+ out_factorizationContext = (FactorizationFactorizeL1StripeContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
+
+ return currentLocation;
+ }
+
+ static void initializeCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructures(
+ FactorizationFactorizeL1StripeContext *factorizationContext, unsigned threadCount)
+ {
+ factorizationContext->initialize(threadCount);
+ }
+
+
+ template<unsigned int a_rows, unsigned int d_stride>
+ static void participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
+ FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex);
+
+private:
+ struct FactorLDLTWorkerContext
+ {
+ FactorLDLTWorkerContext(dxThreadingBase *threading, unsigned allowedThreadCount,
+ dReal *A, dReal *d, unsigned totalBlockCount, unsigned rowCount, unsigned rowSkip,
+ atomicord32 &ref_solvingBlockCompletionProgress, cellindexint *solvingBlockProgressDescriptors,
+ FactorizationSolveL1StripeCellContext *solvingCellContexts,
+ FactorizationFactorizeL1StripeContext *factorizingFactorizationContext,
+ dCallReleaseeID calculationFinishReleasee):
+ m_threading(threading),
+ m_allowedThreadCount(allowedThreadCount),
+ m_A(A),
+ m_ARow(A),
+ m_d(d),
+ m_solvingBlockIndex(0),
+ m_totalBlockCount(totalBlockCount),
+ m_rowCount(rowCount),
+ m_rowSkip(rowSkip),
+ m_refSolvingBlockCompletionProgress(ref_solvingBlockCompletionProgress),
+ m_solvingBlockProgressDescriptors(solvingBlockProgressDescriptors),
+ m_solvingCellContexts(solvingCellContexts),
+ m_factorizingFactorizationContext(factorizingFactorizationContext),
+ m_calculationFinishReleasee(calculationFinishReleasee)
+ {
+ }
+
+ void incrementForNextBlock()
+ {
+ const unsigned blockStep = FSL1S_BLOCK_SIZE;
+
+ m_ARow += blockStep * m_rowSkip;
+ m_solvingBlockIndex += 1;
+ }
+
+ dxThreadingBase *m_threading;
+ unsigned m_allowedThreadCount;
+ dReal *m_A;
+ dReal *m_ARow;
+ dReal *m_d;
+ unsigned m_solvingBlockIndex;
+ unsigned m_totalBlockCount;
+ unsigned m_rowCount;
+ unsigned m_rowSkip;
+ atomicord32 &m_refSolvingBlockCompletionProgress;
+ cellindexint *m_solvingBlockProgressDescriptors;
+ FactorizationSolveL1StripeCellContext *m_solvingCellContexts;
+ FactorizationFactorizeL1StripeContext *m_factorizingFactorizationContext;
+ dCallReleaseeID m_calculationFinishReleasee;
+ };
+
+ static int factotLDLT_solvingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_solvingComplete(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
+
+ static int factotLDLT_solvingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_solvingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
+
+ static int factotLDLT_scalingAndFactorizingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_scalingAndFactorizingComplete(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
+
+ static int factotLDLT_scalingAndFactorizingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_scalingAndFactorizingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
+
+ static int factotLDLT_solvingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_solvingFinal(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
+
+ static int factotLDLT_solvingFinalSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_solvingFinalSync(FactorLDLTWorkerContext &ref_workerContext);
+
+ static int factotLDLT_scalingAndFactorizingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void factotLDLT_scalingAndFactorizingFinal(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
+
+ static int factotLDLT_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+ struct FactorizationSolveL1StripeCellContext
+ {
+ template<unsigned int block_step, unsigned int b_rows>
+ static void initializePrecalculatedZs(dReal (&Z)[block_step][b_rows])
+ {
+ Z[0][0] = 0;
+ if (b_rows >= 2)
+ {
+ Z[0][1] = 0;
+ }
+ Z[1][0] = 0;
+ if (b_rows >= 2)
+ {
+ Z[1][1] = 0;
+ }
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+ }
+
+ template<unsigned int block_step, unsigned int b_rows>
+ void loadPrecalculatedZs(dReal (&Z)[block_step][b_rows]) const
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+ dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
+
+ Z[0][0] = m_c[0][0];
+ if (b_rows >= 2)
+ {
+ Z[0][1] = m_c[0][1];
+ }
+ Z[1][0] = m_c[1][0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] = m_c[1][1];
+ }
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+ }
+
+ template<unsigned int block_step, unsigned int b_rows>
+ void storePrecalculatedZs(const dReal (&Z)[block_step][b_rows])
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+ dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
+
+ m_c[0][0] = Z[0][0];
+ if (b_rows >= 2)
+ {
+ m_c[0][1] = Z[0][1];
+ }
+ m_c[1][0] = Z[1][0];
+ if (b_rows >= 2)
+ {
+ m_c[1][1] = Z[1][1];
+ }
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+ }
+
+ dReal m_c[FSL1S_BLOCK_SIZE][FSL1S_REGULAR_B_ROWS];
+ // dReal m_reserved[4];
+ };
+
+ static FactorizationSolveL1StripeCellContext &buildBlockContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+ {
+ return cellContexts[blockIndex * CCI__MAX + contextInstance];
+ }
+
+ static FactorizationSolveL1StripeCellContext &buildResultContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+ {
+ return cellContexts[blockCount * CCI__MAX + blockIndex];
+ }
+
+private:
+ struct FactorizationFactorizeL1StripeThreadContext
+ {
+ template<unsigned int a_rows>
+ void assignDataSum(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)],
+ const FactorizationFactorizeL1StripeThreadContext &partialSumContext)
+ {
+ m_sameZ[0] = sameZ[0] + partialSumContext.m_sameZ[0];
+ if (a_rows >= 2)
+ {
+ m_sameZ[1] = sameZ[1] + partialSumContext.m_sameZ[1];
+ m_mixedZ[0] = mixedZ[0] + partialSumContext.m_mixedZ[0];
+ }
+ }
+
+ template<unsigned int a_rows>
+ void assignDataAlone(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)])
+ {
+ m_sameZ[0] = sameZ[0];
+ if (a_rows >= 2)
+ {
+ m_sameZ[1] = sameZ[1];
+ m_mixedZ[0] = mixedZ[0];
+ }
+ }
+
+ template<unsigned int a_rows>
+ void retrieveData(dReal (&out_sameZ)[a_rows], dReal (&out_mixedZ)[dMACRO_MAX(a_rows - 1, 1)]) const
+ {
+ out_sameZ[0] = m_sameZ[0];
+ if (a_rows >= 2)
+ {
+ out_sameZ[1] = m_sameZ[1];
+ out_mixedZ[0] = m_mixedZ[0];
+ }
+ dAASSERT(a_rows >= 1 && a_rows <= 2);
+ }
+
+ dReal m_sameZ[FFL1S_REGULAR_A_ROWS];
+ dReal m_mixedZ[dMACRO_MAX(FFL1S_REGULAR_A_ROWS - 1, 1)];
+ dReal m_reserved[1]; // [5]; // for alignment
+ };
+
+ struct FactorizationFactorizeL1StripeContext
+ {
+ void initialize(unsigned threadCount)
+ {
+ m_threadsRunning = threadCount;
+ m_nextColumnIndex = 0;
+ m_sumThreadIndex = 0;
+ }
+
+ atomicord32 m_threadsRunning;
+ atomicord32 m_nextColumnIndex;
+ volatile atomicord32 m_sumThreadIndex;
+ atomicord32 m_reserved[1]; // [13]; // for alignment
+ FactorizationFactorizeL1StripeThreadContext m_threadContexts[1]; // =[threadCount]
+ };
+
+private:
+ struct SolveL1StraightCellContext;
+
+ enum
+ {
+ SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM = 8,
+
+ SL1S_B_STRIDE = 1,
+ SL1S_BLOCK_SIZE = 4,
+ };
+
+ static unsigned restrictSolvingL1StraightAllowedThreadCount(
+ dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+ static void doEstimateCooperativeSolvingL1StraightResourceRequirementsValidated(
+ dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void doCooperativelySolveL1StraightValidated(
+ dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+ static unsigned deriveSolvingL1StraightBlockCount(unsigned rowCount, unsigned blockStep)
+ {
+ return (rowCount + (blockStep - 1)) / blockStep;
+ }
+
+ struct SolvingL1StraightMemoryEstimates
+ {
+ void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+ {
+ m_descriptorSizeRequired = descriptorSizeRequired;
+ m_contextSizeRequired = contextSizeRequired;
+ }
+
+ sizeint m_descriptorSizeRequired;
+ sizeint m_contextSizeRequired;
+ };
+
+ static unsigned deriveSolvingL1StraightThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+ {
+ dIASSERT(allowedThreadCount >= 1);
+
+ unsigned maximumCount = 1 + blockCount / SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM;
+ return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+ }
+
+ template<unsigned int block_step>
+ static sizeint estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount, SolvingL1StraightMemoryEstimates &ref_solvingMemoryEstimates);
+
+ static void *markCooperativelySolvingL1StraightMemoryStructuresOut(void *buffer,
+ const SolvingL1StraightMemoryEstimates &solvingMemoryEstimates,
+ cellindexint *&out_blockProgressDescriptors, SolveL1StraightCellContext *&out_cellContexts)
+ {
+ void *currentLocation = buffer;
+
+ out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
+ out_cellContexts = (SolveL1StraightCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
+ return currentLocation;
+ }
+
+ template<unsigned int block_step>
+ static void initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount,
+ atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts);
+ template<unsigned int block_step, unsigned int b_stride>
+ static void participateSolvingL1Straight(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
+ volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+ SolveL1StraightCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+private:
+ struct SolveL1StraightWorkerContext
+ {
+ void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
+ atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts)
+ {
+ m_L = L;
+ m_b = b;
+ m_rowCount = rowCount;
+ m_rowSkip = rowSkip;
+ m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+ m_blockProgressDescriptors = blockProgressDescriptors;
+ m_cellContexts = cellContexts;
+ }
+
+ const dReal *m_L;
+ dReal *m_b;
+ unsigned m_rowCount;
+ unsigned m_rowSkip;
+ atomicord32 *m_ptrBlockCompletionProgress;
+ cellindexint *m_blockProgressDescriptors;
+ SolveL1StraightCellContext *m_cellContexts;
+ };
+
+ static int solveL1Straight_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void solveL1Straight_worker(SolveL1StraightWorkerContext &ref_context, unsigned ownThreadIndex);
+
+ static int solveL1Straight_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+ struct SolveL1StraightCellContext
+ {
+ template<unsigned int block_step>
+ static void initializePrecalculatedZs(dReal (&Z)[block_step])
+ {
+ std::fill(Z, Z + block_step, REAL(0.0));
+ }
+
+ template<unsigned int block_step>
+ void loadPrecalculatedZs(dReal (&Z)[block_step]) const
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+ std::copy(m_c, m_c + block_step, Z);
+ }
+
+ template<unsigned int block_step>
+ void storePrecalculatedZs(const dReal (&Z)[block_step])
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+ std::copy(Z, Z + block_step, m_c);
+ }
+
+ dReal m_c[SL1S_BLOCK_SIZE];
+ };
+
+
+ static SolveL1StraightCellContext &buildBlockContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+ {
+ return cellContexts[blockIndex * CCI__MAX + contextInstance];
+ }
+
+ static SolveL1StraightCellContext &buildResultContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+ {
+ return cellContexts[blockCount * CCI__MAX + blockIndex];
+ }
+
+
+private:
+ struct SolveL1TransposedCellContext;
+
+ enum
+ {
+ SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM = SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM,
+
+ SL1T_B_STRIDE = SL1S_B_STRIDE,
+ SL1T_BLOCK_SIZE = 4,
+ };
+
+ static unsigned restrictSolvingL1TransposedAllowedThreadCount(
+ dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
+ static void doEstimateCooperativeSolvingL1TransposedResourceRequirementsValidated(
+ dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned rowCount);
+ static void doCooperativelySolveL1TransposedValidated(
+ dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
+
+ static unsigned deriveSolvingL1TransposedBlockCount(unsigned rowCount, unsigned blockStep)
+ {
+ return (rowCount + (blockStep - 1)) / blockStep;
+ }
+
+ struct SolvingL1TransposedMemoryEstimates
+ {
+ void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
+ {
+ m_descriptorSizeRequired = descriptorSizeRequired;
+ m_contextSizeRequired = contextSizeRequired;
+ }
+
+ sizeint m_descriptorSizeRequired;
+ sizeint m_contextSizeRequired;
+ };
+
+ static unsigned deriveSolvingL1TransposedThreadCount(unsigned blockCount, unsigned allowedThreadCount)
+ {
+ dSASSERT(SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM + 0 == SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM);
+
+ return deriveSolvingL1StraightThreadCount(blockCount, allowedThreadCount);
+ }
+
+ template<unsigned int block_step>
+ static sizeint estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount, SolvingL1TransposedMemoryEstimates &ref_solvingMemoryEstimates);
+
+ static void *markCooperativelySolvingL1TransposedMemoryStructuresOut(void *buffer,
+ const SolvingL1TransposedMemoryEstimates &solvingMemoryEstimates,
+ cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts)
+ {
+ void *currentLocation = buffer;
+
+ out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
+ out_cellContexts = (SolveL1TransposedCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
+ return currentLocation;
+ }
+
+ template<unsigned int block_step>
+ static void *allocateCooperativelySolveL1TransposedMemoryStructures(sizeint &out_sizeAllocated, unsigned rowCount,
+ cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts);
+ template<unsigned int block_step>
+ static void initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount,
+ atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts);
+ template<unsigned int block_step, unsigned int b_stride>
+ static void participateSolvingL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
+ volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+ SolveL1TransposedCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
+
+private:
+ struct SolveL1TransposedWorkerContext
+ {
+ void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
+ atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts)
+ {
+ m_L = L;
+ m_b = b;
+ m_rowCount = rowCount;
+ m_rowSkip = rowSkip;
+ m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+ m_blockProgressDescriptors = blockProgressDescriptors;
+ m_cellContexts = cellContexts;
+ }
+
+ const dReal *m_L;
+ dReal *m_b;
+ unsigned m_rowCount;
+ unsigned m_rowSkip;
+ atomicord32 *m_ptrBlockCompletionProgress;
+ cellindexint *m_blockProgressDescriptors;
+ SolveL1TransposedCellContext *m_cellContexts;
+ };
+
+ static int solveL1Transposed_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void solveL1Transposed_worker(SolveL1TransposedWorkerContext &ref_context, unsigned ownThreadIndex);
+
+ static int solveL1Transposed_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+private:
+ struct SolveL1TransposedCellContext
+ {
+ template<unsigned int block_step>
+ static void initializePrecalculatedZs(dReal (&Z)[block_step])
+ {
+ std::fill(Z, Z + block_step, REAL(0.0));
+ }
+
+ template<unsigned int block_step>
+ void loadPrecalculatedZs(dReal (&Z)[block_step]) const
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+ std::copy(m_c, m_c + block_step, Z);
+ }
+
+ template<unsigned int block_step>
+ void storePrecalculatedZs(const dReal (&Z)[block_step])
+ {
+ dSASSERT(block_step <= dARRAY_SIZE(m_c));
+
+ std::copy(Z, Z + block_step, m_c);
+ }
+
+ dReal m_c[SL1T_BLOCK_SIZE];
+ };
+
+ static SolveL1TransposedCellContext &buildBlockContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
+ {
+ return cellContexts[blockIndex * CCI__MAX + contextInstance];
+ }
+
+ static SolveL1TransposedCellContext &buildResultContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
+ {
+ return cellContexts[blockCount * CCI__MAX + blockIndex];
+ }
+
+private:
+ enum
+ {
+ SV_A_STRIDE = 1,
+ SV_D_STRIDE = 1,
+
+ SV_BLOCK_SIZE = 128,
+ SV_COOPERATIVE_BLOCK_COUNT_MINIMUM = 3,
+ };
+
+ static unsigned restrictScalingVectorAllowedThreadCount(
+ dxThreadingBase *threading, unsigned allowedThreadCount, unsigned elementCount);
+ static void doEstimateCooperativeScalingVectorResourceRequirementsValidated(
+ dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
+ unsigned allowedThreadCount, unsigned elementCount);
+ static void doCooperativelyScaleVectorValidated(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
+ dReal *vectorData, const dReal *scaleData, unsigned elementCount);
+
+ static unsigned deriveScalingVectorBlockCount(unsigned elementCount, unsigned blockStep)
+ {
+ return (elementCount + (blockStep - 1)) / blockStep;
+ }
+
+ static unsigned deriveScalingVectorThreadCount(unsigned lastBlockIndex, unsigned allowedThreadCount)
+ {
+ dIASSERT(allowedThreadCount >= 1);
+
+ unsigned maximumCount = lastBlockIndex;
+ return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
+ }
+
+ static void initializeCooperativelyScaleVectorMemoryStructures(atomicord32 &out_blockCompletionProgress)
+ {
+ out_blockCompletionProgress = 0;
+ }
+ template<unsigned int block_step, unsigned int a_stride, unsigned int d_stride>
+ static void participateScalingVector(dReal *ptrAStart, const dReal *ptrDStart, const unsigned elementCount,
+ volatile atomicord32 &refBlockCompletionProgress/*=0*/);
+
+private:
+ struct ScaleVectorWorkerContext
+ {
+ void init(dReal *vectorData, const dReal *scaleData, unsigned elementCount,
+ atomicord32 &ref_blockCompletionProgress)
+ {
+ m_vectorData = vectorData;
+ m_scaleData = scaleData;
+ m_elementCount = elementCount;
+ m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
+ }
+
+ dReal *m_vectorData;
+ const dReal *m_scaleData;
+ unsigned m_elementCount;
+ atomicord32 *m_ptrBlockCompletionProgress;
+ };
+
+ static int scaleVector_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+ static void scaleVector_worker(ScaleVectorWorkerContext &ref_context);
+
+ static int scaleVector_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+
+private:
+ enum SolvingLDLTStage
+ {
+ SLDLTS__MIN,
+
+ SLDLTS_SOLVING_STRAIGHT = SLDLTS__MIN,
+ SLDLTS_SCALING_VECTOR,
+ SLDLTS_SOLVING_TRANSPOSED,
+
+ SLDLTS__MAX,
+ };
+
+ enum
+ {
+ SLDLT_B_STRIDE = SL1S_B_STRIDE,
+ SLDLT_D_STRIDE = FLDLT_D_STRIDE,
+ };
+
+ static unsigned restrictSolvingLDLTAllowedThreadCount(
+ dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount, unsigned &out_stageBlockCountSifficiencyMask);
+
+ static void doCooperativelySolveLDLTValidated(
+ dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount, unsigned stageBlockCountSifficiencyMask,
+ const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
+};
+
+
+#endif
+