summaryrefslogtreecommitdiff
path: root/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ode-0.16.1/ode/src/fastldltfactor_impl.h')
-rw-r--r--libs/ode-0.16.1/ode/src/fastldltfactor_impl.h1530
1 files changed, 1530 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h b/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
new file mode 100644
index 0000000..8f633d3
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
@@ -0,0 +1,1530 @@
+
+
+/*************************************************************************
+ * *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
+ * All rights reserved. Email: russ@q12.org Web: www.q12.org *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of EITHER: *
+ * (1) The GNU Lesser General Public License as published by the Free *
+ * Software Foundation; either version 2.1 of the License, or (at *
+ * your option) any later version. The text of the GNU Lesser *
+ * General Public License is included with this library in the *
+ * file LICENSE.TXT. *
+ * (2) The BSD-style license that is included with this library in *
+ * the file LICENSE-BSD.TXT. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
+ * *
+ *************************************************************************/
+
+/*
+ * Code style improvements and optimizations by Oleh Derevenko ????-2019
+ * LDLT cooperative factorization code of ThreadedEquationSolverLDLT copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
+ */
+
+#ifndef _ODE_FASTLDLT_IMPL_H_
+#define _ODE_FASTLDLT_IMPL_H_
+
+
+#include "error.h"
+#include "common.h"
+
+
+static void solveL1Stripe_2 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned rowIndex, unsigned rowSkip);
+template<unsigned int d_stride>
+inline void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip);
+
+static void solveStripeL1_1 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned rowIndex);
+template<unsigned int d_stride>
+inline void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d);
+
+
+template<unsigned int d_stride>
+void factorMatrixAsLDLT(dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip)
+{
+ if (rowCount < 1) return;
+
+ dReal *ARow = A;
+ unsigned blockStartRow = 0;
+
+ const unsigned blockStep = 2;
+ const unsigned lastRowIndex = rowCount >= blockStep ? rowCount - blockStep + 1 : 0;
+
+ /* compute blocks of 2 rows */
+ bool subsequentPass = false;
+ for (; blockStartRow < lastRowIndex; subsequentPass = true, ARow += blockStep * rowSkip, blockStartRow += blockStep)
+ {
+ if (subsequentPass)
+ {
+ /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
+ solveL1Stripe_2(A, ARow, blockStartRow, rowSkip);
+ scaleAndFactorizeL1Stripe_2<d_stride>(ARow, d, blockStartRow, rowSkip);
+ }
+ else
+ {
+ scaleAndFactorizeL1FirstRowStripe_2<d_stride>(ARow, d, rowSkip);
+ }
+ dSASSERT(blockStep == 2);
+ /* done factorizing 2 x 2 block */
+ }
+
+ /* compute the (less than 2) rows at the bottom */
+ if (!subsequentPass || blockStartRow == lastRowIndex)
+ {
+ dSASSERT(blockStep == 2); // for the blockStartRow == lastRowIndex comparison above
+
+ if (subsequentPass)
+ {
+ solveStripeL1_1(A, ARow, blockStartRow, rowSkip);
+ scaleAndFactorizeL1Stripe_1<d_stride>(ARow, d, blockStartRow);
+ }
+ else
+ {
+ scaleAndFactorizeL1FirstRowStripe_1<d_stride>(ARow, d);
+ }
+ dSASSERT(blockStep == 2);
+ /* done factorizing 1 x 1 block */
+ }
+}
+
+/* solve L*X=B, with B containing 2 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is rowSkip.
+ * B is an n*2 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also rowSkip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ */
+static
+void solveL1Stripe_2(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
+{
+ dIASSERT(rowCount != 0);
+ dIASSERT(rowCount % 2 == 0);
+
+ /* compute all 2 x 2 blocks of X */
+ unsigned blockStartRow = 0;
+ for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
+ {
+ const dReal *ptrLElement;
+ dReal *ptrBElement;
+
+ /* declare variables - Z matrix */
+ dReal Z11, Z12, Z21, Z22;
+
+ /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+ if (subsequentPass)
+ {
+ ptrLElement = L + blockStartRow * rowSkip;
+ ptrBElement = B;
+
+ /* set Z matrix to 0 */
+ Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
+
+ /* the inner loop that computes outer products and adds them to Z */
+ // The iteration starts with even number and decreases it by 2. So, it must end in zero
+ for (unsigned columnCounter = blockStartRow; ;)
+ {
+ /* declare p and q vectors, etc */
+ dReal p1, q1, p2, q2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[0];
+ q1 = ptrBElement[0];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[1];
+ q1 = ptrBElement[1];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[1 + rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[1 + rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ if (columnCounter > 6)
+ {
+ columnCounter -= 6;
+
+ /* advance pointers */
+ ptrLElement += 6;
+ ptrBElement += 6;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-4];
+ q1 = ptrBElement[-4];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[-4 + rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[-4 + rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-3];
+ q1 = ptrBElement[-3];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[-3 + rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[-3 + rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-2];
+ q1 = ptrBElement[-2];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[-2 + rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[-2 + rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-1];
+ q1 = ptrBElement[-1];
+ Z11 += p1 * q1;
+ q2 = ptrBElement[-1 + rowSkip];
+ Z12 += p1 * q2;
+ p2 = ptrLElement[-1 + rowSkip];
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+ }
+ else
+ {
+ /* advance pointers */
+ ptrLElement += 2;
+ ptrBElement += 2;
+
+ if ((columnCounter -= 2) == 0)
+ {
+ break;
+ }
+ }
+ /* end of inner loop */
+ }
+ }
+ else
+ {
+ ptrLElement = L/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
+ ptrBElement = B;
+
+ /* set Z matrix to 0 */
+ Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
+ }
+
+ /* finish computing the X(i) block */
+
+ dReal Y11 = ptrBElement[0] - Z11;
+ dReal Y12 = ptrBElement[rowSkip] - Z12;
+
+ dReal p2 = ptrLElement[rowSkip];
+
+ ptrBElement[0] = Y11;
+ ptrBElement[rowSkip] = Y12;
+
+ dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
+ dReal Y22 = ptrBElement[1 + rowSkip] - Z22 - p2 * Y12;
+
+ ptrBElement[1] = Y21;
+ ptrBElement[1 + rowSkip] = Y22;
+ /* end of outer loop */
+ }
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip)
+{
+ dIASSERT(factorizationRow != 0);
+ dIASSERT(factorizationRow % 2 == 0);
+
+ dReal *ptrAElement = ARow;
+ dReal *ptrDElement = d;
+
+ /* scale the elements in a 2 x i block at A(i,0), and also */
+ /* compute Z = the outer product matrix that we'll need. */
+ dReal Z11 = 0, Z21 = 0, Z22 = 0;
+
+ for (unsigned columnCounter = factorizationRow; ; )
+ {
+ dReal p1, q1, p2, q2, dd;
+
+ p1 = ptrAElement[0];
+ p2 = ptrAElement[rowSkip];
+ dd = ptrDElement[0 * d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[0] = q1;
+ ptrAElement[rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ p1 = ptrAElement[1];
+ p2 = ptrAElement[1 + rowSkip];
+ dd = ptrDElement[1 * d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[1] = q1;
+ ptrAElement[1 + rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ if (columnCounter > 6)
+ {
+ columnCounter -= 6;
+
+ ptrAElement += 6;
+ ptrDElement += 6 * d_stride;
+
+ p1 = ptrAElement[-4];
+ p2 = ptrAElement[-4 + rowSkip];
+ dd = ptrDElement[-4 * (int)d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[-4] = q1;
+ ptrAElement[-4 + rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ p1 = ptrAElement[-3];
+ p2 = ptrAElement[-3 + rowSkip];
+ dd = ptrDElement[-3 * (int)d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[-3] = q1;
+ ptrAElement[-3 + rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ p1 = ptrAElement[-2];
+ p2 = ptrAElement[-2 + rowSkip];
+ dd = ptrDElement[-2 * (int)d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[-2] = q1;
+ ptrAElement[-2 + rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+
+ p1 = ptrAElement[-1];
+ p2 = ptrAElement[-1 + rowSkip];
+ dd = ptrDElement[-1 * (int)d_stride];
+ q1 = p1 * dd;
+ q2 = p2 * dd;
+ ptrAElement[-1] = q1;
+ ptrAElement[-1 + rowSkip] = q2;
+ Z11 += p1 * q1;
+ Z21 += p2 * q1;
+ Z22 += p2 * q2;
+ }
+ else
+ {
+ ptrAElement += 2;
+ ptrDElement += 2 * d_stride;
+
+ if ((columnCounter -= 2) == 0)
+ {
+ break;
+ }
+ }
+ }
+
+ /* solve for diagonal 2 x 2 block at A(i,i) */
+ dReal Y11 = ptrAElement[0] - Z11;
+ dReal Y21 = ptrAElement[rowSkip] - Z21;
+ dReal Y22 = ptrAElement[1 + rowSkip] - Z22;
+
+ /* factorize 2 x 2 block Y, ptrDElement */
+ /* factorize row 1 */
+ dReal dd = dRecip(Y11);
+
+ ptrDElement[0 * d_stride] = dd;
+ dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+
+ /* factorize row 2 */
+ dReal q2 = Y21 * dd;
+ ptrAElement[rowSkip] = q2;
+
+ dReal sum = Y21 * q2;
+ ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip)
+{
+ dReal *ptrAElement = ARow;
+ dReal *ptrDElement = d;
+
+ /* solve for diagonal 2 x 2 block at A(0,0) */
+ dReal Y11 = ptrAElement[0]/* - Z11*/;
+ dReal Y21 = ptrAElement[rowSkip]/* - Z21*/;
+ dReal Y22 = ptrAElement[1 + rowSkip]/* - Z22*/;
+
+ /* factorize 2 x 2 block Y, ptrDElement */
+ /* factorize row 1 */
+ dReal dd = dRecip(Y11);
+
+ ptrDElement[0 * d_stride] = dd;
+ dIASSERT(ptrDElement == d/* + (sizeint)factorizationRow * d_stride*/);
+
+ /* factorize row 2 */
+ dReal q2 = Y21 * dd;
+ ptrAElement[rowSkip] = q2;
+
+ dReal sum = Y21 * q2;
+ ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+}
+
+
+/* solve L*X=B, with B containing 1 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * B is an n*1 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also lskip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ */
+static
+void solveStripeL1_1(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
+{
+ dIASSERT(rowCount != 0);
+ dIASSERT(rowCount % 2 == 0);
+
+ /* compute all 2 x 1 blocks of X */
+ unsigned blockStartRow = 0;
+ for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
+ {
+ const dReal *ptrLElement;
+ dReal *ptrBElement;
+
+ /* declare variables - Z matrix */
+ dReal Z11, Z21;
+
+ if (subsequentPass)
+ {
+ ptrLElement = L + (sizeint)blockStartRow * rowSkip;
+ ptrBElement = B;
+
+ /* set the Z matrix to 0 */
+ Z11 = 0; Z21 = 0;
+
+ /* compute all 2 x 1 block of X, from rows i..i+2-1 */
+
+ /* the inner loop that computes outer products and adds them to Z */
+ // The iteration starts with even number and decreases it by 2. So, it must end in zero
+ for (unsigned columnCounter = blockStartRow; ; )
+ {
+ /* declare p and q vectors, etc */
+ dReal p1, q1, p2;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[0];
+ q1 = ptrBElement[0];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[rowSkip];
+ Z21 += p2 * q1;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[1];
+ q1 = ptrBElement[1];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[1 + rowSkip];
+ Z21 += p2 * q1;
+
+ if (columnCounter > 6)
+ {
+ columnCounter -= 6;
+
+ /* advance pointers */
+ ptrLElement += 6;
+ ptrBElement += 6;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-4];
+ q1 = ptrBElement[-4];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[-4 + rowSkip];
+ Z21 += p2 * q1;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-3];
+ q1 = ptrBElement[-3];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[-3 + rowSkip];
+ Z21 += p2 * q1;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-2];
+ q1 = ptrBElement[-2];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[-2 + rowSkip];
+ Z21 += p2 * q1;
+
+ /* compute outer product and add it to the Z matrix */
+ p1 = ptrLElement[-1];
+ q1 = ptrBElement[-1];
+ Z11 += p1 * q1;
+ p2 = ptrLElement[-1 + rowSkip];
+ Z21 += p2 * q1;
+ }
+ else
+ {
+ /* advance pointers */
+ ptrLElement += 2;
+ ptrBElement += 2;
+
+ if ((columnCounter -= 2) == 0)
+ {
+ break;
+ }
+ }
+ /* end of inner loop */
+ }
+ }
+ else
+ {
+ ptrLElement = L/* + (sizeint)blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
+ ptrBElement = B;
+
+ /* set the Z matrix to 0 */
+ Z11 = 0; Z21 = 0;
+ }
+
+ /* finish computing the X(i) block */
+ dReal p2 = ptrLElement[rowSkip];
+
+ dReal Y11 = ptrBElement[0] - Z11;
+ dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
+
+ ptrBElement[0] = Y11;
+ ptrBElement[1] = Y21;
+ /* end of outer loop */
+ }
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned factorizationRow)
+{
+ dReal *ptrAElement = ARow;
+ dReal *ptrDElement = d;
+
+ /* scale the elements in a 1 x i block at A(i,0), and also */
+ /* compute Z = the outer product matrix that we'll need. */
+ dReal Z11 = 0, Z22 = 0;
+
+ for (unsigned columnCounter = factorizationRow; ; )
+ {
+ dReal p1, p2, q1, q2, dd1, dd2;
+
+ p1 = ptrAElement[0];
+ p2 = ptrAElement[1];
+ dd1 = ptrDElement[0 * d_stride];
+ dd2 = ptrDElement[1 * d_stride];
+ q1 = p1 * dd1;
+ q2 = p2 * dd2;
+ ptrAElement[0] = q1;
+ ptrAElement[1] = q2;
+ Z11 += p1 * q1;
+ Z22 += p2 * q2;
+
+ if (columnCounter > 6)
+ {
+ columnCounter -= 6;
+
+ ptrAElement += 6;
+ ptrDElement += 6 * d_stride;
+
+ p1 = ptrAElement[-4];
+ p2 = ptrAElement[-3];
+ dd1 = ptrDElement[-4 * (int)d_stride];
+ dd2 = ptrDElement[-3 * (int)d_stride];
+ q1 = p1 * dd1;
+ q2 = p2 * dd2;
+ ptrAElement[-4] = q1;
+ ptrAElement[-3] = q2;
+ Z11 += p1 * q1;
+ Z22 += p2 * q2;
+
+ p1 = ptrAElement[-2];
+ p2 = ptrAElement[-1];
+ dd1 = ptrDElement[-2 * (int)d_stride];
+ dd2 = ptrDElement[-1 * (int)d_stride];
+ q1 = p1 * dd1;
+ q2 = p2 * dd2;
+ ptrAElement[-2] = q1;
+ ptrAElement[-1] = q2;
+ Z11 += p1 * q1;
+ Z22 += p2 * q2;
+ }
+ else
+ {
+ ptrAElement += 2;
+ ptrDElement += 2 * d_stride;
+
+ if ((columnCounter -= 2) == 0)
+ {
+ break;
+ }
+ }
+ }
+
+ dReal Y11 = ptrAElement[0] - (Z11 + Z22);
+
+ /* solve for diagonal 1 x 1 block at A(i,i) */
+ dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+ /* factorize 1 x 1 block Y, ptrDElement */
+ /* factorize row 1 */
+ ptrDElement[0 * d_stride] = dRecip(Y11);
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d)
+{
+ dReal *ptrAElement = ARow;
+ dReal *ptrDElement = d;
+
+ dReal Y11 = ptrAElement[0];
+
+ /* solve for diagonal 1 x 1 block at A(0,0) */
+ /* factorize 1 x 1 block Y, ptrDElement */
+ /* factorize row 1 */
+ ptrDElement[0 * d_stride] = dRecip(Y11);
+}
+
+
+
+
+template<unsigned int block_step, unsigned int b_rows>
+/*static */
+void ThreadedEquationSolverLDLT::participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
+ volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
+ FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
+{
+ const unsigned lookaheadRange = 64;
+ BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
+
+ unsigned completedBlocks = refBlockCompletionProgress;
+ unsigned currentBlock = completedBlocks;
+ dIASSERT(completedBlocks <= blockCount);
+
+ for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
+ {
+ bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
+ bool goAssigningTheResult = false, stayWithinTheBlock = false;
+
+ dReal Z[block_step][b_rows];
+ dReal Y[block_step][b_rows];
+
+ dReal *ptrBElement;
+
+ CellContextInstance previousContextInstance;
+ unsigned completedColumnBlock;
+
+ for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
+ {
+ if (testDescriptor == INVALID_CELLDESCRIPTOR)
+ {
+ // Invalid descriptor is the indication that the row has been fully calculated
+ // Test if this was the last row and break out if so.
+ if (currentBlock + 1 == blockCount)
+ {
+ exitLoop = true;
+ break;
+ }
+
+ // Treat detected row advancement as a row processed
+ // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
+ break;
+ }
+
+ CooperativeAtomics::AtomicReadReorderBarrier();
+ // It is necessary to read up to date completedBblocks value after the descriptor retrieval
+ // as otherwise the logic below breaks
+ completedBlocks = refBlockCompletionProgress;
+
+ if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
+ {
+ completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
+ dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
+ dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked
+
+ if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
+ {
+ dIASSERT(completedBlocks < currentBlock);
+ break;
+ }
+
+ if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
+ {
+ if (completedColumnBlock != 0)
+ {
+ CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
+ previousContextInstance = contextInstance;
+
+ const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
+ sourceContext.loadPrecalculatedZs(Z);
+ }
+ else
+ {
+ previousContextInstance = CCI__MIN;
+ FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
+ }
+
+ goForLockedBlockPrimaryCalculation = true;
+ break;
+ }
+
+ if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
+ {
+ break;
+ }
+
+ testDescriptor = blockProgressDescriptors[currentBlock];
+ }
+ else
+ {
+ if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
+ {
+ break;
+ }
+
+ cellindexint verificativeDescriptor;
+ bool verificationFailure = false;
+
+ completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
+ dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
+
+ if (completedColumnBlock != 0)
+ {
+ CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
+ const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
+ sourceContext.loadPrecalculatedZs(Z);
+ }
+ else
+ {
+ FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
+ }
+
+ if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
+ {
+ // Make sure the descriptor is re-read after the precalculates
+ CooperativeAtomics::AtomicReadReorderBarrier();
+ }
+
+ if (completedColumnBlock <= currentBlock)
+ {
+ verificativeDescriptor = blockProgressDescriptors[currentBlock];
+ verificationFailure = verificativeDescriptor != testDescriptor;
+ }
+
+ if (!verificationFailure)
+ {
+ dIASSERT(completedColumnBlock <= currentBlock + 1);
+
+ goForLockedBlockDuplicateCalculation = true;
+ break;
+ }
+
+ testDescriptor = verificativeDescriptor;
+ }
+ }
+
+ if (exitLoop)
+ {
+ break;
+ }
+
+ if (goForLockedBlockPrimaryCalculation)
+ {
+ blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
+
+ // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
+ bool handleComputationTakenOver = false, rowEndReached = false;
+
+ const dReal *ptrLElement;
+ unsigned finalColumnBlock;
+
+ if (currentBlock != 0)
+ {
+ /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+ ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
+ ptrBElement = B + completedColumnBlock * block_step;
+
+ /* the inner loop that computes outer products and adds them to Z */
+ finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
+ dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
+
+ // The iteration starts with even number and decreases it by 2. So, it must end in zero
+ for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
+ {
+ /* declare p and q vectors, etc */
+ dReal p[block_step], q[b_rows];
+
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[0];
+ q[0] = ptrBElement[0];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[1];
+ q[0] = ptrBElement[1];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[1 + rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[1 + rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ if (columnCounter > 2)
+ {
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[2];
+ q[0] = ptrBElement[2];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[2 + rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[2 + rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[3];
+ q[0] = ptrBElement[3];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[3 + rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[3 + rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ /* advance pointers */
+ ptrLElement += 2 * block_step;
+ ptrBElement += 2 * block_step;
+ columnCounter -= 2;
+ }
+ else
+ {
+ /* advance pointers */
+ ptrLElement += block_step;
+ ptrBElement += block_step;
+ /* end of inner loop */
+
+ if (--columnCounter == 0)
+ {
+ if (finalColumnBlock == currentBlock)
+ {
+ rowEndReached = true;
+ break;
+ }
+
+ // Take a look if any more rows have been completed...
+ completedBlocks = refBlockCompletionProgress;
+ dIASSERT(completedBlocks >= finalColumnBlock);
+
+ if (completedBlocks == finalColumnBlock)
+ {
+ break;
+ }
+
+ // ...continue if so.
+ unsigned columnCompletedSoFar = finalColumnBlock;
+ finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
+ columnCounter = finalColumnBlock - columnCompletedSoFar;
+ }
+ }
+ }
+ }
+ else
+ {
+ ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
+ ptrBElement = B/* + completedColumnBlock * block_step*/;
+
+ rowEndReached = true;
+ }
+
+ if (rowEndReached)
+ {
+ // Check whether there is still a need to proceed or if the computation has been taken over by another thread
+ cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
+
+ if (blockProgressDescriptors[currentBlock] == oldDescriptor)
+ {
+ /* finish computing the X(i) block */
+ Y[0][0] = ptrBElement[0] - Z[0][0];
+ if (b_rows >= 2)
+ {
+ Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
+ }
+
+ dReal p2 = ptrLElement[rowSkip];
+
+ Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
+ if (b_rows >= 2)
+ {
+ Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
+ CooperativeAtomics::AtomicReadReorderBarrier();
+
+ // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
+ // and the computation result is valid.
+ if (blockProgressDescriptors[currentBlock] == oldDescriptor)
+ {
+ // Assign the results to the result context (possibly in parallel with other threads
+ // that could and ought to be assigning exactly the same values)
+ FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+ resultContext.storePrecalculatedZs(Y);
+
+ // Assign the result assignment progress descriptor
+ cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
+ CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
+
+ // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
+ goAssigningTheResult = true;
+ }
+ else
+ {
+ // Otherwise, go on competing for copying the results
+ handleComputationTakenOver = true;
+ }
+ }
+ else
+ {
+ handleComputationTakenOver = true;
+ }
+ }
+ else
+ {
+ // If the final column has not been reached yet, store current values to the context.
+ // Select the other context instance as the previous one might be read by other threads.
+ CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
+ FactorizationSolveL1StripeCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
+ destinationContext.storePrecalculatedZs(Z);
+
+ // Unlock the row until more columns can be used
+ cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
+ cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
+ // The descriptor might have been updated by a competing thread
+ if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
+ {
+ // Adjust the ptrBElement to point to the result area...
+ ptrBElement = B + currentBlock * block_step;
+ // ...and go on handling the case
+ handleComputationTakenOver = true;
+ }
+ }
+
+ if (handleComputationTakenOver)
+ {
+ cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+ // This can only happen if the row was (has become) the uppermost not fully completed one
+ // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
+ if (existingDescriptor != INVALID_CELLDESCRIPTOR)
+ {
+ // If not fully completed this must be the final stage of the result assignment into the matrix
+ dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
+
+ // Go on competing copying the result as anyway the block is the topmost not completed one
+ // and since there was competition for it, there is no other work that can be done right now.
+ const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+ resultContext.loadPrecalculatedZs(Y);
+
+ goAssigningTheResult = true;
+ }
+ else
+ {
+ // everything is over -- just go handling next blocks
+ }
+ }
+ }
+ else if (goForLockedBlockDuplicateCalculation)
+ {
+ blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
+
+ bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
+
+ /* declare variables */
+ const dReal *ptrLElement;
+
+ if (completedColumnBlock < currentBlock)
+ {
+ /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+ ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
+ ptrBElement = B + completedColumnBlock * block_step;
+
+ /* the inner loop that computes outer products and adds them to Z */
+ // The iteration starts with even number and decreases it by 2. So, it must end in zero
+ const unsigned finalColumnBlock = currentBlock;
+ dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
+
+ unsigned lastCompletedColumn = completedColumnBlock;
+ unsigned columnCounter = finalColumnBlock - completedColumnBlock;
+ for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
+ {
+ /* declare p and q vectors, etc */
+ dReal p[block_step], q[b_rows];
+
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[0];
+ q[0] = ptrBElement[0];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ /* compute outer product and add it to the Z matrix */
+ p[0] = ptrLElement[1];
+ q[0] = ptrBElement[1];
+ Z[0][0] += p[0] * q[0];
+ if (b_rows >= 2)
+ {
+ q[1] = ptrBElement[1 + rowSkip];
+ Z[0][1] += p[0] * q[1];
+ }
+ p[1] = ptrLElement[1 + rowSkip];
+ Z[1][0] += p[1] * q[0];
+ if (b_rows >= 2)
+ {
+ Z[1][1] += p[1] * q[1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ // Check if the primary solver thread has not made any progress
+ cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
+ unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
+
+ if (newCompletedColumn != lastCompletedColumn)
+ {
+ // Check, this is the first change the current thread detects.
+ // There is absolutely no reason in code for the computation to stop/resume twice
+ // while the current thread is competing.
+ dIASSERT(lastCompletedColumn == completedColumnBlock);
+
+ if (descriptorVerification == INVALID_CELLDESCRIPTOR)
+ {
+ skipToHandlingSubsequentRows = true;
+ break;
+ }
+
+ if (newCompletedColumn == currentBlock + 1)
+ {
+ skiptoCopyingResult = true;
+ break;
+ }
+
+ // Check if the current thread is behind
+ if (newCompletedColumn > finalColumnBlock - columnCounter)
+ {
+ // If so, go starting over one more time
+ blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
+ stayWithinTheBlock = true;
+ skipToHandlingSubsequentRows = true;
+ break;
+ }
+
+ // If current thread is ahead, just save new completed column for further comparisons and go on calculating
+ lastCompletedColumn = newCompletedColumn;
+ }
+
+ /* advance pointers */
+ ptrLElement += block_step;
+ ptrBElement += block_step;
+ /* end of inner loop */
+ }
+ }
+ else if (completedColumnBlock > currentBlock)
+ {
+ dIASSERT(completedColumnBlock == currentBlock + 1);
+
+ skiptoCopyingResult = true;
+ }
+ else
+ {
+ dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
+
+ /* assign the pointers appropriately and go on computing the results */
+ ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
+ ptrBElement = B/* + completedColumnBlock * block_step*/;
+ }
+
+ if (!skipToHandlingSubsequentRows)
+ {
+ if (!skiptoCopyingResult)
+ {
+ /* finish computing the X(i) block */
+ Y[0][0] = ptrBElement[0] - Z[0][0];
+ if (b_rows >= 2)
+ {
+ Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
+ }
+
+ dReal p2 = ptrLElement[rowSkip];
+
+ Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
+ if (b_rows >= 2)
+ {
+ Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
+ CooperativeAtomics::AtomicReadReorderBarrier();
+
+ cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+
+ if (existingDescriptor == INVALID_CELLDESCRIPTOR)
+ {
+ // Everything is over -- proceed to subsequent rows
+ skipToHandlingSubsequentRows = true;
+ }
+ else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
+ {
+ // The values computed above may not be valid. Copy the values already in the result context.
+ skiptoCopyingResult = true;
+ }
+ else
+ {
+ // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
+ // and the computation result is valid.
+ cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
+
+ // Assign the results to the result context (possibly in parallel with other threads
+ // that could and ought to be assigning exactly the same values)
+ FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+ resultContext.storePrecalculatedZs(Y);
+
+ // Assign the result assignment progress descriptor
+ CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
+
+ // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
+ }
+ }
+
+ if (!skipToHandlingSubsequentRows)
+ {
+ if (skiptoCopyingResult)
+ {
+ // Extract the result values stored in the result context
+ const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+ resultContext.loadPrecalculatedZs(Y);
+
+ ptrBElement = B + currentBlock * block_step;
+ }
+
+ goAssigningTheResult = true;
+ }
+ }
+ }
+
+ if (goAssigningTheResult)
+ {
+ cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+ // Check if the assignment has not been completed yet
+ if (existingDescriptor != INVALID_CELLDESCRIPTOR)
+ {
+ // Assign the computation results to their places in the matrix
+ ptrBElement[0] = Y[0][0];
+ ptrBElement[1] = Y[1][0];
+ if (b_rows >= 2)
+ {
+ ptrBElement[rowSkip] = Y[0][1];
+ ptrBElement[1 + rowSkip] = Y[1][1];
+ }
+
+ dSASSERT(block_step == 2);
+ dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+ ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
+ dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
+
+ // And assign the completed status no matter what
+ CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
+ }
+ else
+ {
+ // everything is over -- just go handling next blocks
+ }
+ }
+
+ if (!stayWithinTheBlock)
+ {
+ completedBlocks = refBlockCompletionProgress;
+
+ if (completedBlocks == blockCount)
+ {
+ break;
+ }
+
+ currentBlock += 1;
+
+ bool lookaheadBoundaryReached = false;
+
+ if (currentBlock == blockCount || completedBlocks == 0)
+ {
+ lookaheadBoundaryReached = true;
+ }
+ else if (currentBlock >= completedBlocks + lookaheadRange)
+ {
+ lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
+ }
+ else if (currentBlock < completedBlocks)
+ {
+ // Treat detected row advancement as a row processed
+ // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
+
+ currentBlock = completedBlocks;
+ }
+
+ if (lookaheadBoundaryReached)
+ {
+ dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
+
+ // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
+ if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
+ {
+ // Abandon job if too few blocks remain
+ if (blockCount - completedBlocks <= ownThreadIndex)
+ {
+ break;
+ }
+
+ blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
+ }
+ else
+ {
+ // If there was some progress, just continue to the next pass
+ blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
+ }
+
+ currentBlock = completedBlocks;
+ }
+ }
+ }
+}
+
+
+template<unsigned int a_rows, unsigned int d_stride>
+/*static */
+void ThreadedEquationSolverLDLT::participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
+ FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex)
+{
+ dIASSERT(factorizationRow != 0);
+ dIASSERT(factorizationRow % 2 == 0);
+
+ /* scale the elements in a 2 x i block at A(i,0), and also */
+ /* compute Z = the outer product matrix that we'll need. */
+ dReal sameZ[a_rows] = { REAL(0.0), }, mixedZ[dMACRO_MAX(a_rows - 1, 1)] = { REAL(0.0), };
+ bool doneAnything = false;
+
+ const unsigned blockSize = deriveScalingAndFactorizingL1StripeBlockSize(a_rows);
+
+ const unsigned blockCount = deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(factorizationRow, blockSize);
+ dIASSERT(blockCount != 0);
+
+ unsigned blockIndex;
+ while ((blockIndex = ThrsafeIncrementIntUpToLimit(&factorizationContext->m_nextColumnIndex, blockCount)) != blockCount)
+ {
+ doneAnything = true;
+ unsigned blockStartRow = blockIndex * blockSize;
+
+ dReal *ptrAElement = ARow + blockStartRow;
+ dReal *ptrDElement = d + blockStartRow * d_stride;
+ for (unsigned columnCounter = blockIndex != blockCount - 1 ? blockSize : factorizationRow - blockStartRow; ; )
+ {
+ dReal p1, q1, p2, q2, dd;
+
+ p1 = ptrAElement[0];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[rowSkip];
+ }
+ dd = ptrDElement[0 * d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[0] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+
+ p1 = ptrAElement[1];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[1 + rowSkip];
+ }
+ dd = ptrDElement[1 * d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[1] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[1 + rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+
+ if (columnCounter > 6)
+ {
+ columnCounter -= 6;
+
+ ptrAElement += 6;
+ ptrDElement += 6 * d_stride;
+
+ p1 = ptrAElement[-4];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[-4 + rowSkip];
+ }
+ dd = ptrDElement[-4 * (int)d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[-4] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[-4 + rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+
+ p1 = ptrAElement[-3];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[-3 + rowSkip];
+ }
+ dd = ptrDElement[-3 * (int)d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[-3] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[-3 + rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+
+ p1 = ptrAElement[-2];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[-2 + rowSkip];
+ }
+ dd = ptrDElement[-2 * (int)d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[-2] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[-2 + rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+
+ p1 = ptrAElement[-1];
+ if (a_rows >= 2)
+ {
+ p2 = ptrAElement[-1 + rowSkip];
+ }
+ dd = ptrDElement[-1 * (int)d_stride];
+ q1 = p1 * dd;
+ if (a_rows >= 2)
+ {
+ q2 = p2 * dd;
+ }
+ ptrAElement[-1] = q1;
+ if (a_rows >= 2)
+ {
+ ptrAElement[-1 + rowSkip] = q2;
+ }
+ sameZ[0] += p1 * q1;
+ if (a_rows >= 2)
+ {
+ sameZ[1] += p2 * q2;
+ mixedZ[0] += p2 * q1;
+ }
+ }
+ else
+ {
+ ptrAElement += 2;
+ ptrDElement += 2 * d_stride;
+
+ if ((columnCounter -= 2) == 0)
+ {
+ break;
+ }
+ }
+ }
+ }
+
+ if (doneAnything)
+ {
+ unsigned partialSumThreadIndex;
+ for (bool exitLoop = false; !exitLoop; exitLoop = CooperativeAtomics::AtomicCompareExchangeUint32(&factorizationContext->m_sumThreadIndex, partialSumThreadIndex, ownThreadIndex + 1))
+ {
+ partialSumThreadIndex = factorizationContext->m_sumThreadIndex;
+
+ if (partialSumThreadIndex != 0)
+ {
+ const FactorizationFactorizeL1StripeThreadContext &partialSumContext = factorizationContext->m_threadContexts[partialSumThreadIndex - 1];
+ factorizationContext->m_threadContexts[ownThreadIndex].assignDataSum<a_rows>(sameZ, mixedZ, partialSumContext);
+ }
+ else
+ {
+ factorizationContext->m_threadContexts[ownThreadIndex].assignDataAlone<a_rows>(sameZ, mixedZ);
+ }
+ }
+ }
+
+ unsigned threadExitIndex = CooperativeAtomics::AtomicDecrementUint32(&factorizationContext->m_threadsRunning);
+ dIASSERT(threadExitIndex + 1U != 0);
+
+ if (threadExitIndex == 0)
+ {
+ // Let the last thread retrieve the sum and perform final computations
+ unsigned sumThreadIndex = factorizationContext->m_sumThreadIndex;
+ dIASSERT(sumThreadIndex != 0); // The rowIndex was asserted to be not zero, so at least one thread must have done something
+
+ const FactorizationFactorizeL1StripeThreadContext &sumContext = factorizationContext->m_threadContexts[sumThreadIndex - 1];
+ sumContext.retrieveData<a_rows>(sameZ, mixedZ);
+
+ dReal *ptrAElement = ARow + factorizationRow;
+ dReal *ptrDElement = d + factorizationRow * d_stride;
+
+ /* solve for diagonal 2 x 2 block at A(i,i) */
+ dReal Y11, Y21, Y22;
+
+ Y11 = ptrAElement[0] - sameZ[0];
+ if (a_rows >= 2)
+ {
+ Y21 = ptrAElement[rowSkip] - mixedZ[0];
+ Y22 = ptrAElement[1 + rowSkip] - sameZ[1];
+ }
+
+ /* factorize 2 x 2 block Y, ptrDElement */
+ /* factorize row 1 */
+ dReal dd = dRecip(Y11);
+
+ ptrDElement[0 * d_stride] = dd;
+ dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+
+ if (a_rows >= 2)
+ {
+ /* factorize row 2 */
+ dReal q2 = Y21 * dd;
+ ptrAElement[rowSkip] = q2;
+
+ dReal sum = Y21 * q2;
+ ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+ }
+ }
+}
+
+
+#endif // #ifndef _ODE_FASTLDLT_IMPL_H_