add ode

author: sanine <sanine.not@pm.me> 2022-10-01 20:59:36 -0500
committer: sanine <sanine.not@pm.me> 2022-10-01 20:59:36 -0500
commit: c5fc66ee58f2c60f2d226868bb1cf5b91badaf53 (patch)
tree: 277dd280daf10bf77013236b8edfa5f88708c7e0 /libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
parent: 1cf9cc3408af7008451f9133fb95af66a9697d15 (diff)
1 files changed, 1530 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h b/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
new file mode 100644
index 0000000..8f633d3
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
@@ -0,0 +1,1530 @@
+
+
+/*************************************************************************
+ *                                                                       *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
+ * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
+ *                                                                       *
+ * This library is free software; you can redistribute it and/or         *
+ * modify it under the terms of EITHER:                                  *
+ *   (1) The GNU Lesser General Public License as published by the Free  *
+ *       Software Foundation; either version 2.1 of the License, or (at  *
+ *       your option) any later version. The text of the GNU Lesser      *
+ *       General Public License is included with this library in the     *
+ *       file LICENSE.TXT.                                               *
+ *   (2) The BSD-style license that is included with this library in     *
+ *       the file LICENSE-BSD.TXT.                                       *
+ *                                                                       *
+ * This library is distributed in the hope that it will be useful,       *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
+ *                                                                       *
+ *************************************************************************/
+
+/*
+ * Code style improvements and optimizations by Oleh Derevenko ????-2019
+ * LDLT cooperative factorization code of ThreadedEquationSolverLDLT copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")  
+ */
+
+#ifndef _ODE_FASTLDLT_IMPL_H_
+#define _ODE_FASTLDLT_IMPL_H_
+
+
+#include "error.h"
+#include "common.h"
+
+
+static void solveL1Stripe_2 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned rowIndex, unsigned rowSkip);
+template<unsigned int d_stride>
+inline void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip);
+
+static void solveStripeL1_1 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned rowIndex);
+template<unsigned int d_stride>
+inline void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d);
+
+
+template<unsigned int d_stride>
+void factorMatrixAsLDLT(dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip)
+{
+    if (rowCount < 1) return;
+
+    dReal *ARow = A;
+    unsigned blockStartRow = 0;
+
+    const unsigned blockStep = 2;
+    const unsigned lastRowIndex = rowCount >= blockStep ? rowCount - blockStep + 1 : 0;
+
+    /* compute blocks of 2 rows */
+    bool subsequentPass = false;
+    for (; blockStartRow < lastRowIndex; subsequentPass = true, ARow += blockStep * rowSkip, blockStartRow += blockStep) 
+    {
+        if (subsequentPass)
+        {
+            /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
+            solveL1Stripe_2(A, ARow, blockStartRow, rowSkip);
+            scaleAndFactorizeL1Stripe_2<d_stride>(ARow, d, blockStartRow, rowSkip);
+        }
+        else
+        {
+            scaleAndFactorizeL1FirstRowStripe_2<d_stride>(ARow, d, rowSkip);
+        }
+        dSASSERT(blockStep == 2);
+        /* done factorizing 2 x 2 block */
+    }
+
+    /* compute the (less than 2) rows at the bottom */
+    if (!subsequentPass || blockStartRow == lastRowIndex)
+    {
+        dSASSERT(blockStep == 2); // for the blockStartRow == lastRowIndex comparison above
+
+        if (subsequentPass)
+        {
+            solveStripeL1_1(A, ARow, blockStartRow, rowSkip);
+            scaleAndFactorizeL1Stripe_1<d_stride>(ARow, d, blockStartRow);
+        }
+        else
+        {
+            scaleAndFactorizeL1FirstRowStripe_1<d_stride>(ARow, d);
+        }
+        dSASSERT(blockStep == 2);
+        /* done factorizing 1 x 1 block */
+    }
+}
+
+/* solve L*X=B, with B containing 2 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is rowSkip.
+ * B is an n*2 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also rowSkip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ */
+static 
+void solveL1Stripe_2(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
+{
+    dIASSERT(rowCount != 0);
+    dIASSERT(rowCount % 2 == 0);
+
+    /* compute all 2 x 2 blocks of X */
+    unsigned blockStartRow = 0;
+    for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount) 
+    {
+        const dReal *ptrLElement;
+        dReal *ptrBElement;
+
+        /* declare variables - Z matrix */
+        dReal Z11, Z12, Z21, Z22;
+
+        /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+        if (subsequentPass)
+        {
+            ptrLElement = L + blockStartRow * rowSkip;
+            ptrBElement = B;
+
+            /* set Z matrix to 0 */
+            Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
+
+            /* the inner loop that computes outer products and adds them to Z */
+            // The iteration starts with even number and decreases it by 2. So, it must end in zero
+            for (unsigned columnCounter = blockStartRow; ;) 
+            {
+                /* declare p and q vectors, etc */
+                dReal p1, q1, p2, q2;
+
+                /* compute outer product and add it to the Z matrix */
+                p1 = ptrLElement[0];
+                q1 = ptrBElement[0];
+                Z11 += p1 * q1;
+                q2 = ptrBElement[rowSkip];
+                Z12 += p1 * q2;
+                p2 = ptrLElement[rowSkip];
+                Z21 += p2 * q1;
+                Z22 += p2 * q2;
+
+                /* compute outer product and add it to the Z matrix */
+                p1 = ptrLElement[1];
+                q1 = ptrBElement[1];
+                Z11 += p1 * q1;
+                q2 = ptrBElement[1 + rowSkip];
+                Z12 += p1 * q2;
+                p2 = ptrLElement[1 + rowSkip];
+                Z21 += p2 * q1;
+                Z22 += p2 * q2;
+
+                if (columnCounter > 6)
+                {
+                    columnCounter -= 6;
+
+                    /* advance pointers */
+                    ptrLElement += 6;
+                    ptrBElement += 6;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-4];
+                    q1 = ptrBElement[-4];
+                    Z11 += p1 * q1;
+                    q2 = ptrBElement[-4 + rowSkip];
+                    Z12 += p1 * q2;
+                    p2 = ptrLElement[-4 + rowSkip];
+                    Z21 += p2 * q1;
+                    Z22 += p2 * q2;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-3];
+                    q1 = ptrBElement[-3];
+                    Z11 += p1 * q1;
+                    q2 = ptrBElement[-3 + rowSkip];
+                    Z12 += p1 * q2;
+                    p2 = ptrLElement[-3 + rowSkip];
+                    Z21 += p2 * q1;
+                    Z22 += p2 * q2;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-2];
+                    q1 = ptrBElement[-2];
+                    Z11 += p1 * q1;
+                    q2 = ptrBElement[-2 + rowSkip];
+                    Z12 += p1 * q2;
+                    p2 = ptrLElement[-2 + rowSkip];
+                    Z21 += p2 * q1;
+                    Z22 += p2 * q2;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-1];
+                    q1 = ptrBElement[-1];
+                    Z11 += p1 * q1;
+                    q2 = ptrBElement[-1 + rowSkip];
+                    Z12 += p1 * q2;
+                    p2 = ptrLElement[-1 + rowSkip];
+                    Z21 += p2 * q1;
+                    Z22 += p2 * q2;
+                }
+                else
+                {
+                    /* advance pointers */
+                    ptrLElement += 2;
+                    ptrBElement += 2;
+
+                    if ((columnCounter -= 2) == 0)
+                    {
+                        break;
+                    }
+                }
+                /* end of inner loop */
+            }
+        }
+        else
+        {
+            ptrLElement = L/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
+            ptrBElement = B;
+
+            /* set Z matrix to 0 */
+            Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
+        }
+
+        /* finish computing the X(i) block */
+        
+        dReal Y11 = ptrBElement[0] - Z11;
+        dReal Y12 = ptrBElement[rowSkip] - Z12;
+
+        dReal p2 = ptrLElement[rowSkip];
+
+        ptrBElement[0] = Y11;
+        ptrBElement[rowSkip] = Y12;
+
+        dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
+        dReal Y22 = ptrBElement[1 + rowSkip] - Z22 - p2 * Y12;
+
+        ptrBElement[1] = Y21;
+        ptrBElement[1 + rowSkip] = Y22;
+        /* end of outer loop */
+    }
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip)
+{
+    dIASSERT(factorizationRow != 0);
+    dIASSERT(factorizationRow % 2 == 0);
+
+    dReal *ptrAElement = ARow;
+    dReal *ptrDElement = d;
+
+    /* scale the elements in a 2 x i block at A(i,0), and also */
+    /* compute Z = the outer product matrix that we'll need. */
+    dReal Z11 = 0, Z21 = 0, Z22 = 0;
+
+    for (unsigned columnCounter = factorizationRow; ; ) 
+    {
+        dReal p1, q1, p2, q2, dd;
+
+        p1 = ptrAElement[0];
+        p2 = ptrAElement[rowSkip];
+        dd = ptrDElement[0 * d_stride];
+        q1 = p1 * dd;
+        q2 = p2 * dd;
+        ptrAElement[0] = q1;
+        ptrAElement[rowSkip] = q2;
+        Z11 += p1 * q1;
+        Z21 += p2 * q1;
+        Z22 += p2 * q2;
+
+        p1 = ptrAElement[1];
+        p2 = ptrAElement[1 + rowSkip];
+        dd = ptrDElement[1 * d_stride];
+        q1 = p1 * dd;
+        q2 = p2 * dd;
+        ptrAElement[1] = q1;
+        ptrAElement[1 + rowSkip] = q2;
+        Z11 += p1 * q1;
+        Z21 += p2 * q1;
+        Z22 += p2 * q2;
+
+        if (columnCounter > 6)
+        {
+            columnCounter -= 6;
+
+            ptrAElement += 6;
+            ptrDElement += 6 * d_stride;
+
+            p1 = ptrAElement[-4];
+            p2 = ptrAElement[-4 + rowSkip];
+            dd = ptrDElement[-4 * (int)d_stride];
+            q1 = p1 * dd;
+            q2 = p2 * dd;
+            ptrAElement[-4] = q1;
+            ptrAElement[-4 + rowSkip] = q2;
+            Z11 += p1 * q1;
+            Z21 += p2 * q1;
+            Z22 += p2 * q2;
+
+            p1 = ptrAElement[-3];
+            p2 = ptrAElement[-3 + rowSkip];
+            dd = ptrDElement[-3 * (int)d_stride];
+            q1 = p1 * dd;
+            q2 = p2 * dd;
+            ptrAElement[-3] = q1;
+            ptrAElement[-3 + rowSkip] = q2;
+            Z11 += p1 * q1;
+            Z21 += p2 * q1;
+            Z22 += p2 * q2;
+
+            p1 = ptrAElement[-2];
+            p2 = ptrAElement[-2 + rowSkip];
+            dd = ptrDElement[-2 * (int)d_stride];
+            q1 = p1 * dd;
+            q2 = p2 * dd;
+            ptrAElement[-2] = q1;
+            ptrAElement[-2 + rowSkip] = q2;
+            Z11 += p1 * q1;
+            Z21 += p2 * q1;
+            Z22 += p2 * q2;
+
+            p1 = ptrAElement[-1];
+            p2 = ptrAElement[-1 + rowSkip];
+            dd = ptrDElement[-1 * (int)d_stride];
+            q1 = p1 * dd;
+            q2 = p2 * dd;
+            ptrAElement[-1] = q1;
+            ptrAElement[-1 + rowSkip] = q2;
+            Z11 += p1 * q1;
+            Z21 += p2 * q1;
+            Z22 += p2 * q2;
+        }
+        else
+        {
+            ptrAElement += 2;
+            ptrDElement += 2 * d_stride;
+
+            if ((columnCounter -= 2) == 0)
+            {
+                break;
+            }
+        }
+    }
+
+    /* solve for diagonal 2 x 2 block at A(i,i) */
+    dReal Y11 = ptrAElement[0] - Z11;
+    dReal Y21 = ptrAElement[rowSkip] - Z21;
+    dReal Y22 = ptrAElement[1 + rowSkip] - Z22;
+
+    /* factorize 2 x 2 block Y, ptrDElement */
+    /* factorize row 1 */
+    dReal dd = dRecip(Y11);
+
+    ptrDElement[0 * d_stride] = dd;
+    dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+
+    /* factorize row 2 */
+    dReal q2 = Y21 * dd;
+    ptrAElement[rowSkip] = q2;
+
+    dReal sum = Y21 * q2;
+    ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip)
+{
+    dReal *ptrAElement = ARow;
+    dReal *ptrDElement = d;
+
+    /* solve for diagonal 2 x 2 block at A(0,0) */
+    dReal Y11 = ptrAElement[0]/* - Z11*/;
+    dReal Y21 = ptrAElement[rowSkip]/* - Z21*/;
+    dReal Y22 = ptrAElement[1 + rowSkip]/* - Z22*/;
+
+    /* factorize 2 x 2 block Y, ptrDElement */
+    /* factorize row 1 */
+    dReal dd = dRecip(Y11);
+
+    ptrDElement[0 * d_stride] = dd;
+    dIASSERT(ptrDElement == d/* + (sizeint)factorizationRow * d_stride*/);
+
+    /* factorize row 2 */
+    dReal q2 = Y21 * dd;
+    ptrAElement[rowSkip] = q2;
+
+    dReal sum = Y21 * q2;
+    ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+}
+
+
+/* solve L*X=B, with B containing 1 right hand sides.
+ * L is an n*n lower triangular matrix with ones on the diagonal.
+ * L is stored by rows and its leading dimension is lskip.
+ * B is an n*1 matrix that contains the right hand sides.
+ * B is stored by columns and its leading dimension is also lskip.
+ * B is overwritten with X.
+ * this processes blocks of 2*2.
+ * if this is in the factorizer source file, n must be a multiple of 2.
+ */
+static 
+void solveStripeL1_1(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
+{
+    dIASSERT(rowCount != 0);
+    dIASSERT(rowCount % 2 == 0);
+
+    /* compute all 2 x 1 blocks of X */
+    unsigned blockStartRow = 0;
+    for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount) 
+    {
+        const dReal *ptrLElement;
+        dReal *ptrBElement;
+
+        /* declare variables - Z matrix */
+        dReal Z11, Z21;
+
+        if (subsequentPass)
+        {
+            ptrLElement = L + (sizeint)blockStartRow * rowSkip;
+            ptrBElement = B;
+
+            /* set the Z matrix to 0 */
+            Z11 = 0; Z21 = 0;
+
+            /* compute all 2 x 1 block of X, from rows i..i+2-1 */
+            
+            /* the inner loop that computes outer products and adds them to Z */
+            // The iteration starts with even number and decreases it by 2. So, it must end in zero
+            for (unsigned columnCounter = blockStartRow; ; ) 
+            {
+                /* declare p and q vectors, etc */
+                dReal p1, q1, p2;
+
+                /* compute outer product and add it to the Z matrix */
+                p1 = ptrLElement[0];
+                q1 = ptrBElement[0];
+                Z11 += p1 * q1;
+                p2 = ptrLElement[rowSkip];
+                Z21 += p2 * q1;
+                
+                /* compute outer product and add it to the Z matrix */
+                p1 = ptrLElement[1];
+                q1 = ptrBElement[1];
+                Z11 += p1 * q1;
+                p2 = ptrLElement[1 + rowSkip];
+                Z21 += p2 * q1;
+
+                if (columnCounter > 6)
+                {
+                    columnCounter -= 6;
+
+                    /* advance pointers */
+                    ptrLElement += 6;
+                    ptrBElement += 6;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-4];
+                    q1 = ptrBElement[-4];
+                    Z11 += p1 * q1;
+                    p2 = ptrLElement[-4 + rowSkip];
+                    Z21 += p2 * q1;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-3];
+                    q1 = ptrBElement[-3];
+                    Z11 += p1 * q1;
+                    p2 = ptrLElement[-3 + rowSkip];
+                    Z21 += p2 * q1;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-2];
+                    q1 = ptrBElement[-2];
+                    Z11 += p1 * q1;
+                    p2 = ptrLElement[-2 + rowSkip];
+                    Z21 += p2 * q1;
+
+                    /* compute outer product and add it to the Z matrix */
+                    p1 = ptrLElement[-1];
+                    q1 = ptrBElement[-1];
+                    Z11 += p1 * q1;
+                    p2 = ptrLElement[-1 + rowSkip];
+                    Z21 += p2 * q1;
+                }
+                else
+                {
+                    /* advance pointers */
+                    ptrLElement += 2;
+                    ptrBElement += 2;
+
+                    if ((columnCounter -= 2) == 0)
+                    {
+                        break;
+                    }
+                }
+                /* end of inner loop */
+            }
+        }
+        else
+        {
+            ptrLElement = L/* + (sizeint)blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
+            ptrBElement = B;
+
+            /* set the Z matrix to 0 */
+            Z11 = 0; Z21 = 0;
+        }
+        
+        /* finish computing the X(i) block */
+        dReal p2 = ptrLElement[rowSkip];
+
+        dReal Y11 = ptrBElement[0] - Z11;
+        dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
+
+        ptrBElement[0] = Y11;
+        ptrBElement[1] = Y21;
+        /* end of outer loop */
+    }
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned factorizationRow)
+{
+    dReal *ptrAElement = ARow;
+    dReal *ptrDElement = d;
+
+    /* scale the elements in a 1 x i block at A(i,0), and also */
+    /* compute Z = the outer product matrix that we'll need. */
+    dReal Z11 = 0, Z22 = 0;
+
+    for (unsigned columnCounter = factorizationRow; ; ) 
+    {
+        dReal p1, p2, q1, q2, dd1, dd2;
+
+        p1 = ptrAElement[0];
+        p2 = ptrAElement[1];
+        dd1 = ptrDElement[0 * d_stride];
+        dd2 = ptrDElement[1 * d_stride];
+        q1 = p1 * dd1;
+        q2 = p2 * dd2;
+        ptrAElement[0] = q1;
+        ptrAElement[1] = q2;
+        Z11 += p1 * q1;
+        Z22 += p2 * q2;
+
+        if (columnCounter > 6)
+        {
+            columnCounter -= 6;
+
+            ptrAElement += 6;
+            ptrDElement += 6 * d_stride;
+
+            p1 = ptrAElement[-4];
+            p2 = ptrAElement[-3];
+            dd1 = ptrDElement[-4 * (int)d_stride];
+            dd2 = ptrDElement[-3 * (int)d_stride];
+            q1 = p1 * dd1;
+            q2 = p2 * dd2;
+            ptrAElement[-4] = q1;
+            ptrAElement[-3] = q2;
+            Z11 += p1 * q1;
+            Z22 += p2 * q2;
+
+            p1 = ptrAElement[-2];
+            p2 = ptrAElement[-1];
+            dd1 = ptrDElement[-2 * (int)d_stride];
+            dd2 = ptrDElement[-1 * (int)d_stride];
+            q1 = p1 * dd1;
+            q2 = p2 * dd2;
+            ptrAElement[-2] = q1;
+            ptrAElement[-1] = q2;
+            Z11 += p1 * q1;
+            Z22 += p2 * q2;
+        }
+        else
+        {
+            ptrAElement += 2;
+            ptrDElement += 2 * d_stride;
+
+            if ((columnCounter -= 2) == 0)
+            {
+                break;
+            }
+        }
+    }
+
+    dReal Y11 = ptrAElement[0] - (Z11 + Z22);
+
+    /* solve for diagonal 1 x 1 block at A(i,i) */
+    dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+    /* factorize 1 x 1 block Y, ptrDElement */
+    /* factorize row 1 */
+    ptrDElement[0 * d_stride] = dRecip(Y11);
+}
+
+template<unsigned int d_stride>
+void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d)
+{
+    dReal *ptrAElement = ARow;
+    dReal *ptrDElement = d;
+
+    dReal Y11 = ptrAElement[0];
+
+    /* solve for diagonal 1 x 1 block at A(0,0) */
+    /* factorize 1 x 1 block Y, ptrDElement */
+    /* factorize row 1 */
+    ptrDElement[0 * d_stride] = dRecip(Y11);
+}
+
+
+
+
+template<unsigned int block_step, unsigned int b_rows>
+/*static */
+void ThreadedEquationSolverLDLT::participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip, 
+    volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/, 
+    FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
+{
+    const unsigned lookaheadRange = 64;
+    BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
+
+    unsigned completedBlocks = refBlockCompletionProgress;
+    unsigned currentBlock = completedBlocks;
+    dIASSERT(completedBlocks <= blockCount);
+
+    for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
+    {
+        bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
+        bool goAssigningTheResult = false, stayWithinTheBlock = false;
+
+        dReal Z[block_step][b_rows];
+        dReal Y[block_step][b_rows];
+
+        dReal *ptrBElement;
+
+        CellContextInstance previousContextInstance;
+        unsigned completedColumnBlock;
+
+        for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
+        {
+            if (testDescriptor == INVALID_CELLDESCRIPTOR)
+            {
+                // Invalid descriptor is the indication that the row has been fully calculated
+                // Test if this was the last row and break out if so.
+                if (currentBlock + 1 == blockCount)
+                {
+                    exitLoop = true;
+                    break;
+                }
+
+                // Treat detected row advancement as a row processed
+                // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
+                break;
+            }
+            
+            CooperativeAtomics::AtomicReadReorderBarrier();
+            // It is necessary to read up to date completedBblocks value after the descriptor retrieval
+            // as otherwise the logic below breaks
+            completedBlocks = refBlockCompletionProgress;
+
+            if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
+            {
+                completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
+                dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
+                dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked
+
+                if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
+                {
+                    dIASSERT(completedBlocks < currentBlock);
+                    break;
+                }
+
+                if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
+                {
+                    if (completedColumnBlock != 0)
+                    {
+                        CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
+                        previousContextInstance = contextInstance;
+
+                        const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
+                        sourceContext.loadPrecalculatedZs(Z);
+                    }
+                    else
+                    {
+                        previousContextInstance = CCI__MIN;
+                        FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
+                    }
+
+                    goForLockedBlockPrimaryCalculation = true;
+                    break;
+                }
+
+                if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
+                {
+                    break;
+                }
+
+                testDescriptor = blockProgressDescriptors[currentBlock];
+            }
+            else
+            {
+                if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
+                {
+                    break;
+                }
+
+                cellindexint verificativeDescriptor;
+                bool verificationFailure = false;
+
+                completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
+                dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
+
+                if (completedColumnBlock != 0)
+                {
+                    CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
+                    const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
+                    sourceContext.loadPrecalculatedZs(Z);
+                }
+                else
+                {
+                    FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
+                }
+
+                if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
+                {
+                    // Make sure the descriptor is re-read after the precalculates
+                    CooperativeAtomics::AtomicReadReorderBarrier();
+                }
+
+                if (completedColumnBlock <= currentBlock)
+                {
+                    verificativeDescriptor = blockProgressDescriptors[currentBlock];
+                    verificationFailure = verificativeDescriptor != testDescriptor;
+                }
+
+                if (!verificationFailure)
+                {
+                    dIASSERT(completedColumnBlock <= currentBlock + 1);
+
+                    goForLockedBlockDuplicateCalculation = true;
+                    break;
+                }
+
+                testDescriptor = verificativeDescriptor;
+            }
+        }
+
+        if (exitLoop)
+        {
+            break;
+        }
+
+        if (goForLockedBlockPrimaryCalculation)
+        {
+            blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
+
+            // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
+            bool handleComputationTakenOver = false, rowEndReached = false;
+
+            const dReal *ptrLElement;
+            unsigned finalColumnBlock;
+
+            if (currentBlock != 0)
+            {
+                /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+                ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
+                ptrBElement = B + completedColumnBlock * block_step;
+
+                /* the inner loop that computes outer products and adds them to Z */
+                finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
+                dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
+
+                // The iteration starts with even number and decreases it by 2. So, it must end in zero
+                for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
+                {
+                    /* declare p and q vectors, etc */
+                    dReal p[block_step], q[b_rows];
+
+                    /* compute outer product and add it to the Z matrix */
+                    p[0] = ptrLElement[0];
+                    q[0] = ptrBElement[0];
+                    Z[0][0] += p[0] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        q[1] = ptrBElement[rowSkip];
+                        Z[0][1] += p[0] * q[1];
+                    }
+                    p[1] = ptrLElement[rowSkip];
+                    Z[1][0] += p[1] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        Z[1][1] += p[1] * q[1];
+                    }
+
+                    /* compute outer product and add it to the Z matrix */
+                    p[0] = ptrLElement[1];
+                    q[0] = ptrBElement[1];
+                    Z[0][0] += p[0] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        q[1] = ptrBElement[1 + rowSkip];
+                        Z[0][1] += p[0] * q[1];
+                    }
+                    p[1] = ptrLElement[1 + rowSkip];
+                    Z[1][0] += p[1] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        Z[1][1] += p[1] * q[1];
+                    }
+                    
+                    dSASSERT(block_step == 2);
+                    dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                    if (columnCounter > 2)
+                    {
+                        /* compute outer product and add it to the Z matrix */
+                        p[0] = ptrLElement[2];
+                        q[0] = ptrBElement[2];
+                        Z[0][0] += p[0] * q[0];
+                        if (b_rows >= 2)
+                        {
+                            q[1] = ptrBElement[2 + rowSkip];
+                            Z[0][1] += p[0] * q[1];
+                        }
+                        p[1] = ptrLElement[2 + rowSkip];
+                        Z[1][0] += p[1] * q[0];
+                        if (b_rows >= 2)
+                        {
+                            Z[1][1] += p[1] * q[1];
+                        }
+
+                        /* compute outer product and add it to the Z matrix */
+                        p[0] = ptrLElement[3];
+                        q[0] = ptrBElement[3];
+                        Z[0][0] += p[0] * q[0];
+                        if (b_rows >= 2)
+                        {
+                            q[1] = ptrBElement[3 + rowSkip];
+                            Z[0][1] += p[0] * q[1];
+                        }
+                        p[1] = ptrLElement[3 + rowSkip];
+                        Z[1][0] += p[1] * q[0];
+                        if (b_rows >= 2)
+                        {
+                            Z[1][1] += p[1] * q[1];
+                        }
+
+                        dSASSERT(block_step == 2);
+                        dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                        /* advance pointers */
+                        ptrLElement += 2 * block_step;
+                        ptrBElement += 2 * block_step;
+                        columnCounter -= 2;
+                    }
+                    else
+                    {
+                        /* advance pointers */
+                        ptrLElement += block_step;
+                        ptrBElement += block_step;
+                        /* end of inner loop */
+
+                        if (--columnCounter == 0)
+                        {
+                            if (finalColumnBlock == currentBlock)
+                            {
+                                rowEndReached = true;
+                                break;
+                            }
+
+                            // Take a look if any more rows have been completed...
+                            completedBlocks = refBlockCompletionProgress;
+                            dIASSERT(completedBlocks >= finalColumnBlock);
+
+                            if (completedBlocks == finalColumnBlock)
+                            {
+                                break;
+                            }
+
+                            // ...continue if so.
+                            unsigned columnCompletedSoFar = finalColumnBlock;
+                            finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
+                            columnCounter = finalColumnBlock - columnCompletedSoFar;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
+                ptrBElement = B/* + completedColumnBlock * block_step*/;
+
+                rowEndReached = true;
+            }
+
+            if (rowEndReached)
+            {
+                // Check whether there is still a need to proceed or if the computation has been taken over by another thread
+                cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
+                
+                if (blockProgressDescriptors[currentBlock] == oldDescriptor)
+                {
+                    /* finish computing the X(i) block */
+                    Y[0][0] = ptrBElement[0] - Z[0][0];
+                    if (b_rows >= 2)
+                    {
+                        Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
+                    }
+
+                    dReal p2 = ptrLElement[rowSkip];
+
+                    Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
+                    if (b_rows >= 2)
+                    {
+                        Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
+                    }
+
+                    dSASSERT(block_step == 2);
+                    dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                    // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
+                    CooperativeAtomics::AtomicReadReorderBarrier();
+                    
+                    // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet 
+                    // and the computation result is valid.
+                    if (blockProgressDescriptors[currentBlock] == oldDescriptor)
+                    {
+                        // Assign the results to the result context (possibly in parallel with other threads 
+                        // that could and ought to be assigning exactly the same values)
+                        FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+                        resultContext.storePrecalculatedZs(Y);
+
+                        // Assign the result assignment progress descriptor
+                        cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
+                        CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
+
+                        // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
+                        goAssigningTheResult = true;
+                    }
+                    else
+                    {
+                        // Otherwise, go on competing for copying the results
+                        handleComputationTakenOver = true;
+                    }
+                }
+                else
+                {
+                    handleComputationTakenOver = true;
+                }
+            }
+            else
+            {
+                // If the final column has not been reached yet, store current values to the context.
+                // Select the other context instance as the previous one might be read by other threads.
+                CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
+                FactorizationSolveL1StripeCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
+                destinationContext.storePrecalculatedZs(Z);
+
+                // Unlock the row until more columns can be used
+                cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
+                cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
+                // The descriptor might have been updated by a competing thread
+                if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
+                {
+                    // Adjust the ptrBElement to point to the result area...
+                    ptrBElement = B + currentBlock * block_step;
+                    // ...and go on handling the case
+                    handleComputationTakenOver = true;
+                }
+            }
+
+            if (handleComputationTakenOver)
+            {
+                cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+                // This can only happen if the row was (has become) the uppermost not fully completed one
+                // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
+                if (existingDescriptor != INVALID_CELLDESCRIPTOR)
+                {
+                    // If not fully completed this must be the final stage of the result assignment into the matrix
+                    dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
+
+                    // Go on competing copying the result as anyway the block is the topmost not completed one
+                    // and since there was competition for it, there is no other work that can be done right now.
+                    const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+                    resultContext.loadPrecalculatedZs(Y);
+
+                    goAssigningTheResult = true;
+                }
+                else 
+                {
+                    // everything is over -- just go handling next blocks
+                }
+            }
+        }
+        else if (goForLockedBlockDuplicateCalculation)
+        {
+            blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
+
+            bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
+
+            /* declare variables */
+            const dReal *ptrLElement;
+
+            if (completedColumnBlock < currentBlock)
+            {
+                /* compute all 2 x 2 block of X, from rows i..i+2-1 */
+                ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
+                ptrBElement = B + completedColumnBlock * block_step;
+
+                /* the inner loop that computes outer products and adds them to Z */
+                // The iteration starts with even number and decreases it by 2. So, it must end in zero
+                const unsigned finalColumnBlock = currentBlock;
+                dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
+
+                unsigned lastCompletedColumn = completedColumnBlock;
+                unsigned columnCounter = finalColumnBlock - completedColumnBlock;
+                for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
+                {
+                    /* declare p and q vectors, etc */
+                    dReal p[block_step], q[b_rows];
+
+                    /* compute outer product and add it to the Z matrix */
+                    p[0] = ptrLElement[0];
+                    q[0] = ptrBElement[0];
+                    Z[0][0] += p[0] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        q[1] = ptrBElement[rowSkip];
+                        Z[0][1] += p[0] * q[1];
+                    }
+                    p[1] = ptrLElement[rowSkip];
+                    Z[1][0] += p[1] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        Z[1][1] += p[1] * q[1];
+                    }
+
+                    /* compute outer product and add it to the Z matrix */
+                    p[0] = ptrLElement[1];
+                    q[0] = ptrBElement[1];
+                    Z[0][0] += p[0] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        q[1] = ptrBElement[1 + rowSkip];
+                        Z[0][1] += p[0] * q[1];
+                    }
+                    p[1] = ptrLElement[1 + rowSkip];
+                    Z[1][0] += p[1] * q[0];
+                    if (b_rows >= 2)
+                    {
+                        Z[1][1] += p[1] * q[1];
+                    }
+
+                    dSASSERT(block_step == 2);
+                    dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                    // Check if the primary solver thread has not made any progress
+                    cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
+                    unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
+                    
+                    if (newCompletedColumn != lastCompletedColumn)
+                    {
+                        // Check, this is the first change the current thread detects.
+                        // There is absolutely no reason in code for the computation to stop/resume twice 
+                        // while the current thread is competing.
+                        dIASSERT(lastCompletedColumn == completedColumnBlock);
+
+                        if (descriptorVerification == INVALID_CELLDESCRIPTOR)
+                        {
+                            skipToHandlingSubsequentRows = true;
+                            break;
+                        }
+
+                        if (newCompletedColumn == currentBlock + 1)
+                        {
+                            skiptoCopyingResult = true;
+                            break;
+                        }
+
+                        // Check if the current thread is behind
+                        if (newCompletedColumn > finalColumnBlock - columnCounter)
+                        {
+                            // If so, go starting over one more time
+                            blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
+                            stayWithinTheBlock = true;
+                            skipToHandlingSubsequentRows = true;
+                            break;
+                        }
+
+                        // If current thread is ahead, just save new completed column for further comparisons and go on calculating
+                        lastCompletedColumn = newCompletedColumn;
+                    }
+
+                    /* advance pointers */
+                    ptrLElement += block_step;
+                    ptrBElement += block_step;
+                    /* end of inner loop */
+                }
+            }
+            else if (completedColumnBlock > currentBlock)
+            {
+                dIASSERT(completedColumnBlock == currentBlock + 1);
+
+                skiptoCopyingResult = true;
+            }
+            else
+            {
+                dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
+
+                /* assign the pointers appropriately and go on computing the results */
+                ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
+                ptrBElement = B/* + completedColumnBlock * block_step*/;
+            }
+
+            if (!skipToHandlingSubsequentRows)
+            {
+                if (!skiptoCopyingResult)
+                {
+                    /* finish computing the X(i) block */
+                    Y[0][0] = ptrBElement[0] - Z[0][0];
+                    if (b_rows >= 2)
+                    {
+                        Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
+                    }
+
+                    dReal p2 = ptrLElement[rowSkip];
+
+                    Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
+                    if (b_rows >= 2)
+                    {
+                        Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
+                    }
+
+                    dSASSERT(block_step == 2);
+                    dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                    // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
+                    CooperativeAtomics::AtomicReadReorderBarrier();
+
+                    cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+
+                    if (existingDescriptor == INVALID_CELLDESCRIPTOR)
+                    {
+                        // Everything is over -- proceed to subsequent rows
+                        skipToHandlingSubsequentRows = true;
+                    }
+                    else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
+                    {
+                        // The values computed above may not be valid. Copy the values already in the result context.
+                        skiptoCopyingResult = true;
+                    }
+                    else
+                    {
+                        // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet 
+                        // and the computation result is valid.
+                        cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
+
+                        // Assign the results to the result context (possibly in parallel with other threads 
+                        // that could and ought to be assigning exactly the same values)
+                        FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+                        resultContext.storePrecalculatedZs(Y);
+
+                        // Assign the result assignment progress descriptor
+                        CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
+
+                        // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
+                    }
+                }
+
+                if (!skipToHandlingSubsequentRows)
+                {
+                    if (skiptoCopyingResult)
+                    {
+                        // Extract the result values stored in the result context
+                        const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
+                        resultContext.loadPrecalculatedZs(Y);
+
+                        ptrBElement = B + currentBlock * block_step;
+                    }
+
+                    goAssigningTheResult = true;
+                }
+            }
+        }
+
+        if (goAssigningTheResult)
+        {
+            cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
+            // Check if the assignment has not been completed yet
+            if (existingDescriptor != INVALID_CELLDESCRIPTOR)
+            {
+                // Assign the computation results to their places in the matrix
+                ptrBElement[0] = Y[0][0];
+                ptrBElement[1] = Y[1][0];
+                if (b_rows >= 2)
+                {
+                    ptrBElement[rowSkip] = Y[0][1];
+                    ptrBElement[1 + rowSkip] = Y[1][1];
+                }
+
+                dSASSERT(block_step == 2);
+                dSASSERT(b_rows >= 1 && b_rows <= 2);
+
+                ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
+                dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
+
+                // And assign the completed status no matter what
+                CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
+            }
+            else 
+            {
+                // everything is over -- just go handling next blocks
+            }
+        }
+
+        if (!stayWithinTheBlock)
+        {
+            completedBlocks = refBlockCompletionProgress;
+            
+            if (completedBlocks == blockCount)
+            {
+                break;
+            }
+
+            currentBlock += 1;
+
+            bool lookaheadBoundaryReached = false;
+
+            if (currentBlock == blockCount || completedBlocks == 0)
+            {
+                lookaheadBoundaryReached = true;
+            }
+            else if (currentBlock >= completedBlocks + lookaheadRange)
+            {
+                lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
+            }
+            else if (currentBlock < completedBlocks)
+            {
+                // Treat detected row advancement as a row processed
+                // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
+
+                currentBlock = completedBlocks;
+            }
+
+            if (lookaheadBoundaryReached)
+            {
+                dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
+
+                // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
+                if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
+                {
+                    // Abandon job if too few blocks remain
+                    if (blockCount - completedBlocks <= ownThreadIndex)
+                    {
+                        break;
+                    }
+
+                    blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
+                }
+                else
+                {
+                    // If there was some progress, just continue to the next pass
+                    blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
+                }
+
+                currentBlock = completedBlocks;
+            }
+        }
+    }
+}
+
+
+template<unsigned int a_rows, unsigned int d_stride>
+/*static */
+void ThreadedEquationSolverLDLT::participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
+    FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex)
+{
+    dIASSERT(factorizationRow != 0);
+    dIASSERT(factorizationRow % 2 == 0);
+
+    /* scale the elements in a 2 x i block at A(i,0), and also */
+    /* compute Z = the outer product matrix that we'll need. */
+    dReal sameZ[a_rows] = { REAL(0.0), }, mixedZ[dMACRO_MAX(a_rows - 1, 1)] = { REAL(0.0), };
+    bool doneAnything = false;
+
+    const unsigned blockSize = deriveScalingAndFactorizingL1StripeBlockSize(a_rows);
+
+    const unsigned blockCount = deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(factorizationRow, blockSize);
+    dIASSERT(blockCount != 0);
+
+    unsigned blockIndex;
+    while ((blockIndex = ThrsafeIncrementIntUpToLimit(&factorizationContext->m_nextColumnIndex, blockCount)) != blockCount) 
+    {
+        doneAnything = true;
+        unsigned blockStartRow = blockIndex * blockSize;
+
+        dReal *ptrAElement = ARow + blockStartRow;
+        dReal *ptrDElement = d + blockStartRow * d_stride;
+        for (unsigned columnCounter = blockIndex != blockCount - 1 ? blockSize : factorizationRow - blockStartRow; ; )
+        {
+            dReal p1, q1, p2, q2, dd;
+
+            p1 = ptrAElement[0];
+            if (a_rows >= 2)
+            {
+                p2 = ptrAElement[rowSkip];
+            }
+            dd = ptrDElement[0 * d_stride];
+            q1 = p1 * dd;
+            if (a_rows >= 2)
+            {
+                q2 = p2 * dd;
+            }
+            ptrAElement[0] = q1;
+            if (a_rows >= 2)
+            {
+                ptrAElement[rowSkip] = q2;
+            }
+            sameZ[0] += p1 * q1;
+            if (a_rows >= 2)
+            {
+                sameZ[1] += p2 * q2;
+                mixedZ[0] += p2 * q1;
+            }
+
+            p1 = ptrAElement[1];
+            if (a_rows >= 2)
+            {
+                p2 = ptrAElement[1 + rowSkip];
+            }
+            dd = ptrDElement[1 * d_stride];
+            q1 = p1 * dd;
+            if (a_rows >= 2)
+            {
+                q2 = p2 * dd;
+            }
+            ptrAElement[1] = q1;
+            if (a_rows >= 2)
+            {
+                ptrAElement[1 + rowSkip] = q2;
+            }
+            sameZ[0] += p1 * q1;
+            if (a_rows >= 2)
+            {
+                sameZ[1] += p2 * q2;
+                mixedZ[0] += p2 * q1;
+            }
+
+            if (columnCounter > 6)
+            {
+                columnCounter -= 6;
+
+                ptrAElement += 6;
+                ptrDElement += 6 * d_stride;
+
+                p1 = ptrAElement[-4];
+                if (a_rows >= 2)
+                {
+                    p2 = ptrAElement[-4 + rowSkip];
+                }
+                dd = ptrDElement[-4 * (int)d_stride];
+                q1 = p1 * dd;
+                if (a_rows >= 2)
+                {
+                    q2 = p2 * dd;
+                }
+                ptrAElement[-4] = q1;
+                if (a_rows >= 2)
+                {
+                    ptrAElement[-4 + rowSkip] = q2;
+                }
+                sameZ[0] += p1 * q1;
+                if (a_rows >= 2)
+                {
+                    sameZ[1] += p2 * q2;
+                    mixedZ[0] += p2 * q1;
+                }
+
+                p1 = ptrAElement[-3];
+                if (a_rows >= 2)
+                {
+                    p2 = ptrAElement[-3 + rowSkip];
+                }
+                dd = ptrDElement[-3 * (int)d_stride];
+                q1 = p1 * dd;
+                if (a_rows >= 2)
+                {
+                    q2 = p2 * dd;
+                }
+                ptrAElement[-3] = q1;
+                if (a_rows >= 2)
+                {
+                    ptrAElement[-3 + rowSkip] = q2;
+                }
+                sameZ[0] += p1 * q1;
+                if (a_rows >= 2)
+                {
+                    sameZ[1] += p2 * q2;
+                    mixedZ[0] += p2 * q1;
+                }
+
+                p1 = ptrAElement[-2];
+                if (a_rows >= 2)
+                {
+                    p2 = ptrAElement[-2 + rowSkip];
+                }
+                dd = ptrDElement[-2 * (int)d_stride];
+                q1 = p1 * dd;
+                if (a_rows >= 2)
+                {
+                    q2 = p2 * dd;
+                }
+                ptrAElement[-2] = q1;
+                if (a_rows >= 2)
+                {
+                    ptrAElement[-2 + rowSkip] = q2;
+                }
+                sameZ[0] += p1 * q1;
+                if (a_rows >= 2)
+                {
+                    sameZ[1] += p2 * q2;
+                    mixedZ[0] += p2 * q1;
+                }
+
+                p1 = ptrAElement[-1];
+                if (a_rows >= 2)
+                {
+                    p2 = ptrAElement[-1 + rowSkip];
+                }
+                dd = ptrDElement[-1 * (int)d_stride];
+                q1 = p1 * dd;
+                if (a_rows >= 2)
+                {
+                    q2 = p2 * dd;
+                }
+                ptrAElement[-1] = q1;
+                if (a_rows >= 2)
+                {
+                    ptrAElement[-1 + rowSkip] = q2;
+                }
+                sameZ[0] += p1 * q1;
+                if (a_rows >= 2)
+                {
+                    sameZ[1] += p2 * q2;
+                    mixedZ[0] += p2 * q1;
+                }
+            }
+            else
+            {
+                ptrAElement += 2;
+                ptrDElement += 2 * d_stride;
+
+                if ((columnCounter -= 2) == 0)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (doneAnything)
+    {
+        unsigned partialSumThreadIndex;
+        for (bool exitLoop = false; !exitLoop; exitLoop = CooperativeAtomics::AtomicCompareExchangeUint32(&factorizationContext->m_sumThreadIndex, partialSumThreadIndex, ownThreadIndex + 1))
+        {
+            partialSumThreadIndex = factorizationContext->m_sumThreadIndex;
+            
+            if (partialSumThreadIndex != 0)
+            {
+                const FactorizationFactorizeL1StripeThreadContext &partialSumContext = factorizationContext->m_threadContexts[partialSumThreadIndex - 1];
+                factorizationContext->m_threadContexts[ownThreadIndex].assignDataSum<a_rows>(sameZ, mixedZ, partialSumContext);
+            }
+            else
+            {
+                factorizationContext->m_threadContexts[ownThreadIndex].assignDataAlone<a_rows>(sameZ, mixedZ);
+            }
+        }
+    }
+
+    unsigned threadExitIndex = CooperativeAtomics::AtomicDecrementUint32(&factorizationContext->m_threadsRunning);
+    dIASSERT(threadExitIndex + 1U != 0);
+
+    if (threadExitIndex == 0)
+    {
+        // Let the last thread retrieve the sum and perform final computations
+        unsigned sumThreadIndex = factorizationContext->m_sumThreadIndex;
+        dIASSERT(sumThreadIndex != 0); // The rowIndex was asserted to be not zero, so at least one thread must have done something
+
+        const FactorizationFactorizeL1StripeThreadContext &sumContext = factorizationContext->m_threadContexts[sumThreadIndex - 1];
+        sumContext.retrieveData<a_rows>(sameZ, mixedZ);
+
+        dReal *ptrAElement = ARow + factorizationRow;
+        dReal *ptrDElement = d + factorizationRow * d_stride;
+
+        /* solve for diagonal 2 x 2 block at A(i,i) */
+        dReal Y11, Y21, Y22;
+        
+        Y11 = ptrAElement[0] - sameZ[0];
+        if (a_rows >= 2)
+        {
+            Y21 = ptrAElement[rowSkip] - mixedZ[0];
+            Y22 = ptrAElement[1 + rowSkip] - sameZ[1];
+        }
+
+        /* factorize 2 x 2 block Y, ptrDElement */
+        /* factorize row 1 */
+        dReal dd = dRecip(Y11);
+
+        ptrDElement[0 * d_stride] = dd;
+        dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
+
+        if (a_rows >= 2)
+        {
+            /* factorize row 2 */
+            dReal q2 = Y21 * dd;
+            ptrAElement[rowSkip] = q2;
+
+            dReal sum = Y21 * q2;
+            ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
+        }
+    }
+}
+
+
+#endif // #ifndef _ODE_FASTLDLT_IMPL_H_
author	sanine <sanine.not@pm.me>	2022-10-01 20:59:36 -0500
committer	sanine <sanine.not@pm.me>	2022-10-01 20:59:36 -0500
commit	c5fc66ee58f2c60f2d226868bb1cf5b91badaf53 (patch)
tree	277dd280daf10bf77013236b8edfa5f88708c7e0 /libs/ode-0.16.1/ode/src/fastldltfactor_impl.h
parent	1cf9cc3408af7008451f9133fb95af66a9697d15 (diff)