Diffstat (limited to 'libs/ode-0.16.1/ode/src/step.cpp')
-rw-r--r-- | libs/ode-0.16.1/ode/src/step.cpp | 1672
1 files changed, 1672 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/step.cpp b/libs/ode-0.16.1/ode/src/step.cpp
new file mode 100644
index 0000000..033e879
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/step.cpp
@@ -0,0 +1,1672 @@
+/*************************************************************************
+ * *
+ * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
+ * All rights reserved. Email: russ@q12.org Web: www.q12.org *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of EITHER: *
+ * (1) The GNU Lesser General Public License as published by the Free *
+ * Software Foundation; either version 2.1 of the License, or (at *
+ * your option) any later version. The text of the GNU Lesser *
+ * General Public License is included with this library in the *
+ * file LICENSE.TXT. *
+ * (2) The BSD-style license that is included with this library in *
+ * the file LICENSE-BSD.TXT. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
+ * *
+ *************************************************************************/
+
+#include <ode/odeconfig.h>
+#include <ode/rotation.h>
+#include <ode/timer.h>
+#include <ode/error.h>
+#include "config.h"
+#include "odemath.h"
+#include "matrix.h"
+#include "objects.h"
+#include "joints/joint.h"
+#include "lcp.h"
+#include "util.h"
+#include "threadingutils.h"
+
+#include <new>
+
+
+#define dMIN(A,B) ((A)>(B) ? (B) : (A))
+#define dMAX(A,B) ((B)>(A) ? (B) : (A))
+
+//****************************************************************************
+// misc defines
+
+//#define TIMING
+
+
+#ifdef TIMING
+#define IFTIMING(x) x
+#else
+#define IFTIMING(x) ((void)0)
+#endif
+
+
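+// A joint paired with the row count (m) and unbounded row count (nub) returned by its
+// getInfo1() call; these records are gathered and reordered in Stage0.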
+struct dJointWithInfo1
+{
+ dxJoint *joint;
+ dxJoint::Info1 info;
+};
+
+enum dxRHSCFMElement
+{
+ RCE_RHS = dxJoint::GI2_RHS,
+ RCE_CFM = dxJoint::GI2_CFM,
+
+ // Elements for array reuse
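+ // (after the LCP solve, the CFM slot of each pair holds the computed lambda; see dxStepIsland_Stage3)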
+ RLE_RHS = RCE_RHS,
+ RLE_LAMBDA = RCE_CFM,
+
+ RCE__RHS_CFM_MAX = dxJoint::GI2__RHS_CFM_MAX,
+ RLE__RHS_LAMBDA_MAX = RCE__RHS_CFM_MAX,
+};
+
+enum dxLoHiElement
+{
+ LHE_LO = dxJoint::GI2_LO,
+ LHE_HI = dxJoint::GI2_HI,
+
+ LHE__LO_HI_MAX = dxJoint::GI2__LO_HI_MAX,
+};
+
+enum dxJacobiVectorElement
+{
+ JVE__MIN,
+
+ JVE__L_MIN = JVE__MIN + dDA__L_MIN,
+
+ JVE_LX = JVE__L_MIN + dSA_X,
+ JVE_LY = JVE__L_MIN + dSA_Y,
+ JVE_LZ = JVE__L_MIN + dSA_Z,
+
+ JVE__L_MAX = JVE__L_MIN + dSA__MAX,
+
+ JVE__A_MIN = JVE__MIN + dDA__A_MIN,
+
+ JVE_AX = JVE__A_MIN + dSA_X,
+ JVE_AY = JVE__A_MIN + dSA_Y,
+ JVE_AZ = JVE__A_MIN + dSA_Z,
+
+ JVE__A_MAX = JVE__A_MIN + dSA__MAX,
+
+ JVE__MAX = JVE__MIN + dDA__MAX,
+
+ JVE__L_COUNT = JVE__L_MAX - JVE__L_MIN,
+ JVE__A_COUNT = JVE__A_MAX - JVE__A_MIN,
+};
+
+
+enum dxJacobiMatrixElement
+{
+ JME__MIN,
+
+ JME__J_MIN = JME__MIN,
+ JME__JL_MIN = JME__J_MIN + JVE__L_MIN,
+
+ JME_JLX = JME__J_MIN + JVE_LX,
+ JME_JLY = JME__J_MIN + JVE_LY,
+ JME_JLZ = JME__J_MIN + JVE_LZ,
+
+ JME__JL_MAX = JME__J_MIN + JVE__L_MAX,
+
+ JME__JA_MIN = JME__J_MIN + JVE__A_MIN,
+
+ JME_JAX = JME__J_MIN + JVE_AX,
+ JME_JAY = JME__J_MIN + JVE_AY,
+ JME_JAZ = JME__J_MIN + JVE_AZ,
+
+ JME__JA_MAX = JME__J_MIN + JVE__A_MAX,
+ JME__J_MAX = JME__J_MIN + JVE__MAX,
+
+ JME__MAX = JME__J_MAX,
+
+ JME__J_COUNT = JME__J_MAX - JME__J_MIN,
+};
+
+enum dxJInvMElement
+{
+ JIM__MIN,
+
+ JIM__L_MIN = JIM__MIN + dMD_LINEAR * dV3E__MAX,
+
+ JIM__L_AXES_MIN = JIM__L_MIN + dV3E__AXES_MIN,
+
+ JIM_LX = JIM__L_MIN + dV3E_X,
+ JIM_LY = JIM__L_MIN + dV3E_Y,
+ JIM_LZ = JIM__L_MIN + dV3E_Z,
+
+ JIM__L_AXES_MAX = JIM__L_MIN + dV3E__AXES_MAX,
+
+ JIM_LPAD = JIM__L_MIN + dV3E_PAD,
+
+ JIM__L_MAX = JIM__L_MIN + dV3E__MAX,
+
+ JIM__A_MIN = JIM__MIN + dMD_ANGULAR * dV3E__MAX,
+
+ JIM__A_AXES_MIN = JIM__A_MIN + dV3E__AXES_MIN,
+
+ JIM_AX = JIM__A_MIN + dV3E_X,
+ JIM_AY = JIM__A_MIN + dV3E_Y,
+ JIM_AZ = JIM__A_MIN + dV3E_Z,
+
+ JIM__A_AXES_MAX = JIM__A_MIN + dV3E__AXES_MAX,
+
+ JIM_APAD = JIM__A_MIN + dV3E_PAD,
+
+ JIM__A_MAX = JIM__A_MIN + dV3E__MAX,
+
+ JIM__MAX = JIM__MIN + dMD__MAX * dV3E__MAX,
+};
+
+enum dxContactForceElement
+{
+ CFE__MIN,
+
+ CFE__DYNAMICS_MIN = CFE__MIN,
+
+ CFE__L_MIN = CFE__DYNAMICS_MIN + dDA__L_MIN,
+
+ CFE_LX = CFE__DYNAMICS_MIN + dDA_LX,
+ CFE_LY = CFE__DYNAMICS_MIN + dDA_LY,
+ CFE_LZ = CFE__DYNAMICS_MIN + dDA_LZ,
+
+ CFE__L_MAX = CFE__DYNAMICS_MIN + dDA__L_MAX,
+
+ CFE__A_MIN = CFE__DYNAMICS_MIN + dDA__A_MIN,
+
+ CFE_AX = CFE__DYNAMICS_MIN + dDA_AX,
+ CFE_AY = CFE__DYNAMICS_MIN + dDA_AY,
+ CFE_AZ = CFE__DYNAMICS_MIN + dDA_AZ,
+
+ CFE__A_MAX = CFE__DYNAMICS_MIN + dDA__A_MAX,
+
+ CFE__DYNAMICS_MAX = CFE__DYNAMICS_MIN + dDA__MAX,
+
+ CFE__MAX = CFE__DYNAMICS_MAX,
+};
+
+
+#define AMATRIX_ALIGNMENT dMAX(64, EFFICIENT_ALIGNMENT)
+#define INVI_ALIGNMENT dMAX(32, EFFICIENT_ALIGNMENT)
+#define JINVM_ALIGNMENT dMAX(64, EFFICIENT_ALIGNMENT)
+
+struct dxStepperStage0Outputs
+{
+ sizeint ji_start;
+ sizeint ji_end;
+ unsigned int m;
+ unsigned int nub;
+};
+
+struct dxStepperStage1CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, void *stageMemArenaState, dReal *invI, dJointWithInfo1 *jointinfos)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_stageMemArenaState = stageMemArenaState;
+ m_invI = invI;
+ m_jointinfos = jointinfos;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ void *m_stageMemArenaState;
+ dReal *m_invI;
+ dJointWithInfo1 *m_jointinfos;
+ dxStepperStage0Outputs m_stage0Outputs;
+};
+
+struct dxStepperStage0BodiesCallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dReal *invI)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_invI = invI;
+ m_tagsTaken = 0;
+ m_gravityTaken = 0;
+ m_inertiaBodyIndex = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ dReal *m_invI;
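+ // One-shot flags claimed with ThrsafeExchange() so that exactly one thread tags the bodies
+ // and one applies gravity; m_inertiaBodyIndex distributes the per-body inertia work.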
+ atomicord32 m_tagsTaken;
+ atomicord32 m_gravityTaken;
+ volatile atomicord32 m_inertiaBodyIndex;
+};
+
+struct dxStepperStage0JointsCallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dJointWithInfo1 *jointinfos, dxStepperStage0Outputs *stage0Outputs)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_jointinfos = jointinfos;
+ m_stage0Outputs = stage0Outputs;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ dJointWithInfo1 *m_jointinfos;
+ dxStepperStage0Outputs *m_stage0Outputs;
+};
+
+static int dxStepIsland_Stage0_Bodies_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+// static int dxStepIsland_Stage0_Joints_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage1_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxStepIsland_Stage0_Bodies(dxStepperStage0BodiesCallContext *callContext);
+static void dxStepIsland_Stage0_Joints(dxStepperStage0JointsCallContext *callContext);
+static void dxStepIsland_Stage1(dxStepperStage1CallContext *callContext);
+
+
+struct dxStepperLocalContext
+{
+ void Initialize(dReal *invI, dJointWithInfo1 *jointinfos, unsigned int nj,
+ unsigned int m, unsigned int nub, const unsigned int *mindex, int *findex,
+ dReal *J, dReal *A, dReal *pairsRhsCfm, dReal *pairsLoHi,
+ atomicord32 *bodyStartJoints, atomicord32 *bodyJointLinks)
+ {
+ m_invI = invI;
+ m_jointinfos = jointinfos;
+ m_nj = nj;
+ m_m = m;
+ m_nub = nub;
+ m_mindex = mindex;
+ m_findex = findex;
+ m_J = J;
+ m_A = A;
+ m_pairsRhsCfm = pairsRhsCfm;
+ m_pairsLoHi = pairsLoHi;
+ m_bodyStartJoints = bodyStartJoints;
+ m_bodyJointLinks = bodyJointLinks;
+ }
+
+ dReal *m_invI;
+ dJointWithInfo1 *m_jointinfos;
+ unsigned int m_nj;
+ unsigned int m_m;
+ unsigned int m_nub;
+ const unsigned int *m_mindex;
+ int *m_findex;
+ dReal *m_J;
+ dReal *m_A;
+ dReal *m_pairsRhsCfm;
+ dReal *m_pairsLoHi;
+ atomicord32 *m_bodyStartJoints;
+ atomicord32 *m_bodyJointLinks;
+};
+
+struct dxStepperStage2CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext,
+ dReal *JinvM, dReal *rhs_tmp)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_JinvM = JinvM;
+ m_rhs_tmp = rhs_tmp;
+ m_ji_J = 0;
+ m_ji_Ainit = 0;
+ m_ji_JinvM = 0;
+ m_ji_Aaddjb = 0;
+ m_bi_rhs_tmp = 0;
+ m_ji_rhs = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxStepperLocalContext *m_localContext;
+ dReal *m_JinvM;
+ dReal *m_rhs_tmp;
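+ // Work distribution cursors: worker threads atomically claim joint/body indices from these
+ // via ThrsafeIncrementIntUpToLimit() in the Stage2 sub-stages.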
+ volatile atomicord32 m_ji_J;
+ volatile atomicord32 m_ji_Ainit;
+ volatile atomicord32 m_ji_JinvM;
+ volatile atomicord32 m_ji_Aaddjb;
+ volatile atomicord32 m_bi_rhs_tmp;
+ volatile atomicord32 m_ji_rhs;
+};
+
+struct dxStepperStage3CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext,
+ void *stage1MemArenaState)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_stage1MemArenaState = stage1MemArenaState;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxStepperLocalContext *m_localContext;
+ void *m_stage1MemArenaState;
+};
+
+struct dxStepperStage4CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext/*,
+ void *stage3MemarenaState*/)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ // m_stage3MemarenaState = stage3MemarenaState;
+ m_bi_constrForce = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxStepperLocalContext *m_localContext;
+ // void *m_stage3MemarenaState;
+ volatile atomicord32 m_bi_constrForce;
+};
+
+static int dxStepIsland_Stage2a_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage2aSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage2b_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage2bSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage2c_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxStepIsland_Stage3_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxStepIsland_Stage2a(dxStepperStage2CallContext *callContext);
+static void dxStepIsland_Stage2b(dxStepperStage2CallContext *callContext);
+static void dxStepIsland_Stage2c(dxStepperStage2CallContext *callContext);
+static void dxStepIsland_Stage3(dxStepperStage3CallContext *callContext);
+
+static int dxStepIsland_Stage4_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static void dxStepIsland_Stage4(dxStepperStage4CallContext *stage4CallContext);
+
+
+//****************************************************************************
+// special matrix multipliers
+
+
+// computes A += JinvM * J^T for one Jacobian block pair; only the linear and angular
+// components of each row are read, so any padding elements are ignored.
+
+static inline
+void MultiplyAddJinvMxJToA (dReal *Arow, const dReal *JinvMRow, const dReal *JRow,
+ unsigned int infomJinvM, unsigned int infomJ, unsigned int mskip)
+{
+ dIASSERT (infomJinvM > 0 && infomJ > 0 && Arow && JinvMRow && JRow);
+ const unsigned int mskip_minus_infomJ_plus_1 = mskip - infomJ + 1;
+ dIASSERT(mskip >= infomJ);
+ dReal *currA = Arow;
+ const dReal *currJinvM = JinvMRow;
+ for (unsigned int i = infomJinvM; ; ) {
+ dReal JiM0 = currJinvM[JIM_LX];
+ dReal JiM1 = currJinvM[JIM_LY];
+ dReal JiM2 = currJinvM[JIM_LZ];
+ dReal JiM4 = currJinvM[JIM_AX];
+ dReal JiM5 = currJinvM[JIM_AY];
+ dReal JiM6 = currJinvM[JIM_AZ];
+ const dReal *currJ = JRow;
+ for (unsigned int j = infomJ; ; ) {
+ dReal sum;
+ sum = JiM0 * currJ[JME_JLX];
+ sum += JiM1 * currJ[JME_JLY];
+ sum += JiM2 * currJ[JME_JLZ];
+ sum += JiM4 * currJ[JME_JAX];
+ sum += JiM5 * currJ[JME_JAY];
+ sum += JiM6 * currJ[JME_JAZ];
+ *currA += sum;
+ if (--j == 0) {
+ break;
+ }
+ ++currA;
+ currJ += JME__MAX;
+ }
+ if (--i == 0) {
+ break;
+ }
+ currJinvM += JIM__MAX;
+ currA += mskip_minus_infomJ_plus_1;
+ }
+}
+
+
+// computes rhs -= J * rhs_tmp for one Jacobian block; only the linear and angular
+// components of each J row are read, so any padding elements are ignored.
+
+static inline
+void MultiplySubJxRhsTmpFromRHS (dReal *rowRhsCfm, const dReal *JRow, const dReal *rowRhsTmp, unsigned int infom)
+{
+ dIASSERT (infom > 0 && rowRhsCfm && JRow && rowRhsTmp);
+ dReal *currRhs = rowRhsCfm + RCE_RHS;
+ const dReal *currJ = JRow;
+ const dReal RT_LX = rowRhsTmp[dDA_LX], RT_LY = rowRhsTmp[dDA_LY], RT_LZ = rowRhsTmp[dDA_LZ];
+ const dReal RT_AX = rowRhsTmp[dDA_AX], RT_AY = rowRhsTmp[dDA_AY], RT_AZ = rowRhsTmp[dDA_AZ];
+ for (unsigned int i = infom; ; ) {
+ dReal sum;
+ sum = currJ[JME_JLX] * RT_LX;
+ sum += currJ[JME_JLY] * RT_LY;
+ sum += currJ[JME_JLZ] * RT_LZ;
+ sum += currJ[JME_JAX] * RT_AX;
+ sum += currJ[JME_JAY] * RT_AY;
+ sum += currJ[JME_JAZ] * RT_AZ;
+ *currRhs -= sum;
+ if (--i == 0) {
+ break;
+ }
+ currRhs += RCE__RHS_CFM_MAX;
+ currJ += JME__MAX;
+ }
+}
+
+
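+// accumulates the J^T * lambda contribution of one joint/body Jacobian block into cforce and,
+// when a dJointFeedback is attached, stores the same force/torque into its f1/t1 or f2/t2 slots.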
+static inline
+void MultiplyAddJxLambdaToCForce(dReal cforce[CFE__MAX],
+ const dReal *JRow, const dReal *rowRhsLambda, unsigned int infom,
+ dJointFeedback *fb/*=NULL*/, unsigned jointBodyIndex)
+{
+ dIASSERT (infom > 0 && cforce && JRow && rowRhsLambda);
+ dReal sumLX = 0, sumLY = 0, sumLZ = 0, sumAX=0, sumAY = 0, sumAZ = 0;
+ const dReal *currJ = JRow, *currLambda = rowRhsLambda + RLE_LAMBDA;
+ for (unsigned int k = infom; ; ) {
+ const dReal lambda = *currLambda;
+ sumLX += currJ[JME_JLX] * lambda;
+ sumLY += currJ[JME_JLY] * lambda;
+ sumLZ += currJ[JME_JLZ] * lambda;
+ sumAX += currJ[JME_JAX] * lambda;
+ sumAY += currJ[JME_JAY] * lambda;
+ sumAZ += currJ[JME_JAZ] * lambda;
+ if (--k == 0) {
+ break;
+ }
+ currJ += JME__MAX;
+ currLambda += RLE__RHS_LAMBDA_MAX;
+ }
+ if (fb != NULL) {
+ if (jointBodyIndex == dJCB__MIN) {
+ fb->f1[dV3E_X] = sumLX;
+ fb->f1[dV3E_Y] = sumLY;
+ fb->f1[dV3E_Z] = sumLZ;
+ fb->t1[dV3E_X] = sumAX;
+ fb->t1[dV3E_Y] = sumAY;
+ fb->t1[dV3E_Z] = sumAZ;
+ }
+ else {
+ dIASSERT(jointBodyIndex == dJCB__MIN + 1);
+ dSASSERT(dJCB__MAX == 2);
+
+ fb->f2[dV3E_X] = sumLX;
+ fb->f2[dV3E_Y] = sumLY;
+ fb->f2[dV3E_Z] = sumLZ;
+ fb->t2[dV3E_X] = sumAX;
+ fb->t2[dV3E_Y] = sumAY;
+ fb->t2[dV3E_Z] = sumAZ;
+ }
+ }
+ cforce[CFE_LX] += sumLX;
+ cforce[CFE_LY] += sumLY;
+ cforce[CFE_LZ] += sumLZ;
+ cforce[CFE_AX] += sumAX;
+ cforce[CFE_AY] += sumAY;
+ cforce[CFE_AZ] += sumAZ;
+}
+
+
+//****************************************************************************
+
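+// The stepper is organized as a pipeline of stages so the work can be spread across threads:
+//   Stage0  - tag bodies, apply gravity, compute world-frame inverse inertia tensors
+//             (plus optional gyroscopic torques); gather and reorder joint Info1 data
+//   Stage1  - allocate the J, A, rhs/cfm, lo/hi and bookkeeping buffers
+//   Stage2a - build the Jacobian rows, rhs/cfm, lo/hi and findex via dxJoint::getInfo2()
+//   Stage2b - put CFM on A's diagonal, compute JinvM = J*invM and rhs_tmp = v/h + invM*fe
+//   Stage2c - accumulate A = JinvM*J', finish rhs and link joints to their bodies
+//   Stage3  - solve the LCP problem for lambda
+//   Stage4  - apply constraint forces, integrate velocities and positions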
+/*extern */
+void dxStepIsland(const dxStepperProcessingCallContext *callContext)
+{
+ IFTIMING(dTimerStart("preprocessing"));
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ dxWorld *world = callContext->m_world;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ unsigned int _nj = callContext->m_islandJointsCount;
+
+ dReal *invI = memarena->AllocateOveralignedArray<dReal>(dM3E__MAX * (sizeint)nb, INVI_ALIGNMENT);
+ // Reserve twice as much memory and start from the middle so that regardless of
+ // which direction the array grows there is sufficient room available.
+ const sizeint ji_reserve_count = 2 * (sizeint)_nj;
+ dJointWithInfo1 *const jointinfos = memarena->AllocateArray<dJointWithInfo1>(ji_reserve_count);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads != 0);
+
+ void *stagesMemArenaState = memarena->SaveState();
+
+ dxStepperStage1CallContext *stage1CallContext = (dxStepperStage1CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage1CallContext));
+ stage1CallContext->Initialize(callContext, stagesMemArenaState, invI, jointinfos);
+
+ dxStepperStage0BodiesCallContext *stage0BodiesCallContext = (dxStepperStage0BodiesCallContext *)memarena->AllocateBlock(sizeof(dxStepperStage0BodiesCallContext));
+ stage0BodiesCallContext->Initialize(callContext, invI);
+
+ dxStepperStage0JointsCallContext *stage0JointsCallContext = (dxStepperStage0JointsCallContext *)memarena->AllocateBlock(sizeof(dxStepperStage0JointsCallContext));
+ stage0JointsCallContext->Initialize(callContext, jointinfos, &stage1CallContext->m_stage0Outputs);
+
+ if (allowedThreads == 1)
+ {
+ dxStepIsland_Stage0_Bodies(stage0BodiesCallContext);
+ dxStepIsland_Stage0_Joints(stage0JointsCallContext);
+ dxStepIsland_Stage1(stage1CallContext);
+ }
+ else
+ {
+ unsigned bodyThreads = allowedThreads;
+ unsigned jointThreads = 1;
+
+ dCallReleaseeID stage1CallReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage1CallReleasee, bodyThreads + jointThreads, callContext->m_finalReleasee,
+ NULL, &dxStepIsland_Stage1_Callback, stage1CallContext, 0, "StepIsland Stage1");
+
+ world->PostThreadedCallsGroup(NULL, bodyThreads, stage1CallReleasee, &dxStepIsland_Stage0_Bodies_Callback, stage0BodiesCallContext, "StepIsland Stage0-Bodies");
+
+ dxStepIsland_Stage0_Joints(stage0JointsCallContext);
+ world->AlterThreadedCallDependenciesCount(stage1CallReleasee, -1);
+ dIASSERT(jointThreads == 1);
+ }
+}
+
+static
+int dxStepIsland_Stage0_Bodies_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage0BodiesCallContext *callContext = (dxStepperStage0BodiesCallContext *)_callContext;
+ dxStepIsland_Stage0_Bodies(callContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage0_Bodies(dxStepperStage0BodiesCallContext *callContext)
+{
+ dxBody * const *body = callContext->m_stepperCallContext->m_islandBodiesStart;
+ unsigned int nb = callContext->m_stepperCallContext->m_islandBodiesCount;
+
+ if (ThrsafeExchange(&callContext->m_tagsTaken, 1) == 0)
+ {
+ // number all bodies in the body list - set their tag values
+ for (unsigned int i=0; i<nb; i++) body[i]->tag = i;
+ }
+
+ if (ThrsafeExchange(&callContext->m_gravityTaken, 1) == 0)
+ {
+ dxWorld *world = callContext->m_stepperCallContext->m_world;
+
+ // add the gravity force to all bodies.
+ // since gravity normally has only one nonzero component, it's more efficient
+ // to run a separate loop for each individual component.
+ dxBody *const *const bodyend = body + nb;
+ dReal gravity_x = world->gravity[0];
+ if (gravity_x) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[dV3E_X] += b->mass.mass * gravity_x;
+ }
+ }
+ }
+ dReal gravity_y = world->gravity[1];
+ if (gravity_y) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[dV3E_Y] += b->mass.mass * gravity_y;
+ }
+ }
+ }
+ dReal gravity_z = world->gravity[2];
+ if (gravity_z) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[dV3E_Z] += b->mass.mass * gravity_z;
+ }
+ }
+ }
+ }
+
+ // for all bodies, compute the inverse inertia tensor in the global frame and,
+ // for gyroscopic bodies, compute the rotational force and add it to the torque
+ // accumulator. invI is a vertical stack of 3x4 matrices, one per body.
+ {
+ dReal *invIrow = callContext->m_invI;
+ unsigned int bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb);
+
+ for (unsigned int i = 0; i != nb; invIrow += dM3E__MAX, ++i) {
+ if (i == bodyIndex) {
+ dMatrix3 tmp;
+ dxBody *b = body[i];
+
+ // compute inverse inertia tensor in global frame
+ dMultiply2_333 (tmp, b->invI, b->posr.R);
+ dMultiply0_333 (invIrow, b->posr.R, tmp);
+
+ // Skip the gyroscopic torques for bodies that are not flagged as gyroscopic
+ // or that are kinematic (zero inverse mass)
+ if ((b->flags & dxBodyGyroscopic) && (b->invMass > 0)) {
+ dMatrix3 I;
+ // compute inertia tensor in global frame
+ dMultiply2_333 (tmp,b->mass.I,b->posr.R);
+ dMultiply0_333 (I,b->posr.R,tmp);
+ // compute rotational force
+#if 0
+ // Explicit computation
+ dMultiply0_331 (tmp,I,b->avel);
+ dSubtractVectorCross3(b->tacc,b->avel,tmp);
+#else
+ // Do the implicit computation based on
+ //"Stabilizing Gyroscopic Forces in Rigid Multibody Simulations"
+ // (Lacoursière 2006)
+ dReal h = callContext->m_stepperCallContext->m_stepSize; // Step size
+ dVector3 L; // Compute angular momentum
+ dMultiply0_331(L, I, b->avel);
+
+ // Compute a new effective 'inertia tensor'
+ // for the implicit step: the cross-product
+ // matrix of the angular momentum plus the
+ // old tensor scaled by the timestep.
+ // Itild may not be symmetric positive-definite,
+ // but we can still use it to compute implicit
+ // gyroscopic torques.
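+ // In matrix form the steps below amount to
+ //   Itild = h * crossMatrixMinus(L) + I
+ //   tau0  = (I * inv(Itild) - Identity) * (L / h)
+ // with tau0 then added to the torque accumulator.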
+ dMatrix3 Itild = { 0 };
+ dSetCrossMatrixMinus(Itild, L, dV3E__MAX);
+ for (int ii = dM3E__MIN; ii != dM3E__MAX; ++ii) {
+ Itild[ii] = Itild[ii] * h + I[ii];
+ }
+
+ // Scale momentum by inverse time to get
+ // a sort of "torque"
+ dScaleVector3(L, dRecip(h));
+ // Invert the pseudo-tensor
+ dMatrix3 itInv;
+ // This is a closed-form inversion.
+ // It's probably not numerically stable
+ // when dealing with small masses with
+ // a large asymmetry.
+ // An LU decomposition might be better.
+ if (dInvertMatrix3(itInv, Itild) != 0) {
+ // "Divide" the original tensor
+ // by the pseudo-tensor (on the right)
+ dMultiply0_333(Itild, I, itInv);
+ // Subtract an identity matrix
+ Itild[dM3E_XX] -= 1; Itild[dM3E_YY] -= 1; Itild[dM3E_ZZ] -= 1;
+
+ // This new inertia matrix rotates the
+ // momentum to get a new set of torques
+ // that will work correctly when applied
+ // to the old inertia matrix as explicit
+ // torques with a semi-implicit update
+ // step.
+ dVector3 tau0;
+ dMultiply0_331(tau0,Itild,L);
+
+ // Add the gyro torques to the torque
+ // accumulator
+ for (int ii = dSA__MIN; ii != dSA__MAX; ++ii) {
+ b->tacc[dV3E__AXES_MIN + ii] += tau0[dV3E__AXES_MIN + ii];
+ }
+ }
+#endif
+ }
+
+ bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb);
+ }
+ }
+ }
+}
+
+// static
+// int dxStepIsland_Stage0_Joints_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+// {
+// (void)callInstanceIndex; // unused
+// (void)callThisReleasee; // unused
+// dxStepperStage0JointsCallContext *callContext = (dxStepperStage0JointsCallContext *)_callContext;
+// dxStepIsland_Stage0_Joints(callContext);
+// return 1;
+// }
+
+static
+void dxStepIsland_Stage0_Joints(dxStepperStage0JointsCallContext *callContext)
+{
+ dxJoint * const *_joint = callContext->m_stepperCallContext->m_islandJointsStart;
+ dJointWithInfo1 *jointinfos = callContext->m_jointinfos;
+ unsigned int _nj = callContext->m_stepperCallContext->m_islandJointsCount;
+
+ // get m = total constraint dimension, nub = number of unbounded variables.
+ // create constraint offset array and number-of-rows array for all joints.
+ // the constraints are re-ordered as follows: the purely unbounded
+ // constraints, the mixed unbounded + LCP constraints, and last the purely
+ // LCP constraints. this assists the LCP solver to put all unbounded
+ // variables at the start for a quick factorization.
+ //
+ // joints with m=0 are inactive and are removed from the joints array
+ // entirely, so that the code that follows does not consider them.
+ // also number all active joints in the joint list (set their tag values).
+ // inactive joints receive a tag value of -1.
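+ //
+ // on completion the (double-sized) jointinfos array is laid out as
+ //   [ unused | unbounded | mixed | pure LCP | unused ]
+ // where [unb_start, mix_start), [mix_start, mix_end) and [mix_end, lcp_end) hold the three
+ // groups and ji_start/ji_end delimit the whole active range.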
+
+ sizeint ji_start, ji_end;
+ {
+ unsigned int mcurr = 0;
+ sizeint unb_start, mix_start, mix_end, lcp_end;
+ unb_start = mix_start = mix_end = lcp_end = _nj;
+
+ dJointWithInfo1 *jicurr = jointinfos + lcp_end;
+ dxJoint *const *const _jend = _joint + _nj;
+ dxJoint *const *_jcurr = _joint;
+ while (true) {
+ // -------------------------------------------------------------------------
+ // Switch to growing array forward
+ {
+ bool fwd_end_reached = false;
+ dJointWithInfo1 *jimixend = jointinfos + mix_end;
+ while (true) { // jicurr=dest, _jcurr=src
+ if (_jcurr == _jend) {
+ lcp_end = jicurr - jointinfos;
+ fwd_end_reached = true;
+ break;
+ }
+ dxJoint *j = *_jcurr++;
+ j->getInfo1 (&jicurr->info);
+ dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m);
+ if (jicurr->info.m != 0) {
+ mcurr += jicurr->info.m;
+ if (jicurr->info.nub == 0) { // A lcp info - a correct guess!!!
+ jicurr->joint = j;
+ ++jicurr;
+ } else if (jicurr->info.nub < jicurr->info.m) { // A mixed case
+ if (unb_start == mix_start) { // no unbounded infos yet - just move to opposite side of mixed-s
+ unb_start = mix_start = mix_start - 1;
+ dJointWithInfo1 *jimixstart = jointinfos + mix_start;
+ jimixstart->info = jicurr->info;
+ jimixstart->joint = j;
+ } else if (jimixend != jicurr) { // have to swap to the tail of mixed-s
+ dxJoint::Info1 tmp_info = jicurr->info;
+ *jicurr = *jimixend;
+ jimixend->info = tmp_info;
+ jimixend->joint = j;
+ ++jimixend; ++jicurr;
+ } else { // no need to swap as there are no LCP info-s yet
+ jicurr->joint = j;
+ jimixend = jicurr = jicurr + 1;
+ }
+ } else { // A purely unbounded case -- break out and proceed growing in opposite direction
+ unb_start = unb_start - 1;
+ dJointWithInfo1 *jiunbstart = jointinfos + unb_start;
+ jiunbstart->info = jicurr->info;
+ jiunbstart->joint = j;
+ lcp_end = jicurr - jointinfos;
+ mix_end = jimixend - jointinfos;
+ jicurr = jiunbstart - 1;
+ break;
+ }
+ } else {
+ j->tag = -1;
+ }
+ }
+ if (fwd_end_reached) {
+ break;
+ }
+ }
+ // -------------------------------------------------------------------------
+ // Switch to growing array backward
+ {
+ bool bkw_end_reached = false;
+ dJointWithInfo1 *jimixstart = jointinfos + mix_start - 1;
+ while (true) { // jicurr=dest, _jcurr=src
+ if (_jcurr == _jend) {
+ unb_start = (jicurr + 1) - jointinfos;
+ mix_start = (jimixstart + 1) - jointinfos;
+ bkw_end_reached = true;
+ break;
+ }
+ dxJoint *j = *_jcurr++;
+ j->getInfo1 (&jicurr->info);
+ dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m);
+ if (jicurr->info.m != 0) {
+ mcurr += jicurr->info.m;
+ if (jicurr->info.nub == jicurr->info.m) { // An unbounded info - a correct guess!!!
+ jicurr->joint = j;
+ --jicurr;
+ } else if (jicurr->info.nub != 0) { // A mixed case
+ if (mix_end == lcp_end) { // no lcp infos yet - just move to opposite side of mixed-s
+ dJointWithInfo1 *jimixend = jointinfos + mix_end;
+ lcp_end = mix_end = mix_end + 1;
+ jimixend->info = jicurr->info;
+ jimixend->joint = j;
+ } else if (jimixstart != jicurr) { // have to swap to the head of mixed-s
+ dxJoint::Info1 tmp_info = jicurr->info;
+ *jicurr = *jimixstart;
+ jimixstart->info = tmp_info;
+ jimixstart->joint = j;
+ --jimixstart; --jicurr;
+ } else { // no need to swap as there are no unbounded info-s yet
+ jicurr->joint = j;
+ jimixstart = jicurr = jicurr - 1;
+ }
+ } else { // A purely lcp case -- break out and proceed growing in opposite direction
+ dJointWithInfo1 *jilcpend = jointinfos + lcp_end;
+ lcp_end = lcp_end + 1;
+ jilcpend->info = jicurr->info;
+ jilcpend->joint = j;
+ unb_start = (jicurr + 1) - jointinfos;
+ mix_start = (jimixstart + 1) - jointinfos;
+ jicurr = jilcpend + 1;
+ break;
+ }
+ } else {
+ j->tag = -1;
+ }
+ }
+ if (bkw_end_reached) {
+ break;
+ }
+ }
+ }
+
+ callContext->m_stage0Outputs->m = mcurr;
+ callContext->m_stage0Outputs->nub = (unsigned)(mix_start - unb_start);
+ dIASSERT((sizeint)(mix_start - unb_start) <= (sizeint)UINT_MAX);
+ ji_start = unb_start;
+ ji_end = lcp_end;
+ }
+
+ {
+ const dJointWithInfo1 *jicurr = jointinfos + ji_start;
+ const dJointWithInfo1 *const jiend = jointinfos + ji_end;
+ for (unsigned int i = 0; jicurr != jiend; i++, ++jicurr) {
+ jicurr->joint->tag = i;
+ }
+ }
+
+ callContext->m_stage0Outputs->ji_start = ji_start;
+ callContext->m_stage0Outputs->ji_end = ji_end;
+}
+
+static
+int dxStepIsland_Stage1_Callback(void *_stage1CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage1CallContext *stage1CallContext = (dxStepperStage1CallContext *)_stage1CallContext;
+ dxStepIsland_Stage1(stage1CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage1(dxStepperStage1CallContext *stage1CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage1CallContext->m_stepperCallContext;
+ dJointWithInfo1 *_jointinfos = stage1CallContext->m_jointinfos;
+ dReal *invI = stage1CallContext->m_invI;
+ sizeint ji_start = stage1CallContext->m_stage0Outputs.ji_start;
+ sizeint ji_end = stage1CallContext->m_stage0Outputs.ji_end;
+ unsigned int m = stage1CallContext->m_stage0Outputs.m;
+ unsigned int nub = stage1CallContext->m_stage0Outputs.nub;
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ {
+ memarena->RestoreState(stage1CallContext->m_stageMemArenaState);
+ stage1CallContext = NULL; // WARNING! stage1CallContext is not valid after this point!
+ dIVERIFY(stage1CallContext == NULL); // To suppress compiler warnings about unused variable assignment
+
+ unsigned int _nj = callContext->m_islandJointsCount;
+ const sizeint ji_reserve_count = 2 * (sizeint)_nj;
+ memarena->ShrinkArray<dJointWithInfo1>(_jointinfos, ji_reserve_count, ji_end);
+ }
+
+ dJointWithInfo1 *jointinfos = _jointinfos + ji_start;
+ unsigned int nj = (unsigned int)(ji_end - ji_start);
+ dIASSERT((sizeint)(ji_end - ji_start) <= (sizeint)UINT_MAX);
+
+ unsigned int *mindex = NULL;
+ dReal *J = NULL, *A = NULL, *pairsRhsCfm = NULL, *pairsLoHi = NULL;
+ int *findex = NULL;
+ atomicord32 *bodyStartJoints = NULL, *bodyJointLinks = NULL;
+
+ // if there are constraints, compute constrForce
+ if (m > 0) {
+ mindex = memarena->AllocateArray<unsigned int>((sizeint)(nj + 1));
+ {
+ unsigned int *mcurr = mindex;
+ unsigned int moffs = 0;
+ mcurr[0] = moffs;
+ mcurr += 1;
+
+ const dJointWithInfo1 *const jiend = jointinfos + nj;
+ for (const dJointWithInfo1 *jicurr = jointinfos; jicurr != jiend; ++jicurr) {
+ //dxJoint *joint = jicurr->joint;
+ moffs += jicurr->info.m;
+ mcurr[0] = moffs;
+ mcurr += 1;
+ }
+ }
+
+ // create the interleaved constraint equation right hand side / constraint force
+ // mixing (rhs/cfm) pairs, the interleaved LCP low/high bound pairs, and the
+ // 'findex' vector.
+ findex = memarena->AllocateArray<int>(m);
+ J = memarena->AllocateArray<dReal>((sizeint)m * (2 * JME__MAX));
+ A = memarena->AllocateOveralignedArray<dReal>((sizeint)m * dPAD(m), AMATRIX_ALIGNMENT);
+ pairsRhsCfm = memarena->AllocateArray<dReal>((sizeint)m * RCE__RHS_CFM_MAX);
+ pairsLoHi = memarena->AllocateArray<dReal>((sizeint)m * LHE__LO_HI_MAX);
+ const unsigned int nb = callContext->m_islandBodiesCount;
+ bodyStartJoints = memarena->AllocateArray<atomicord32>(nb);
+ bodyJointLinks = memarena->AllocateArray<atomicord32>((sizeint)nj * dJCB__MAX);
+ dICHECK(nj < ~((atomicord32)0) / dJCB__MAX); // If larger joint counts are to be used, pointers (or sizeint) need to be stored rather than atomicord32 indices
+ }
+
+ dxStepperLocalContext *localContext = (dxStepperLocalContext *)memarena->AllocateBlock(sizeof(dxStepperLocalContext));
+ localContext->Initialize(invI, jointinfos, nj, m, nub, mindex, findex, J, A, pairsRhsCfm, pairsLoHi, bodyStartJoints, bodyJointLinks);
+
+ void *stage1MemarenaState = memarena->SaveState();
+ dxStepperStage3CallContext *stage3CallContext = (dxStepperStage3CallContext*)memarena->AllocateBlock(sizeof(dxStepperStage3CallContext));
+ stage3CallContext->Initialize(callContext, localContext, stage1MemarenaState);
+
+ if (m > 0) {
+ dReal *JinvM = memarena->AllocateOveralignedArray<dReal>((sizeint)m * (2 * JIM__MAX), JINVM_ALIGNMENT);
+ const unsigned int nb = callContext->m_islandBodiesCount;
+ dReal *rhs_tmp = memarena->AllocateArray<dReal>((sizeint)nb * dDA__MAX);
+
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage2CallContext));
+ stage2CallContext->Initialize(callContext, localContext, JinvM, rhs_tmp);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads != 0);
+
+ if (allowedThreads == 1) {
+ IFTIMING(dTimerNow("create J"));
+ dxStepIsland_Stage2a(stage2CallContext);
+ IFTIMING(dTimerNow("compute Adiag, JinvM and rhs_tmp"));
+ dxStepIsland_Stage2b(stage2CallContext);
+ IFTIMING(dTimerNow("compute A and rhs"));
+ dxStepIsland_Stage2c(stage2CallContext);
+ dxStepIsland_Stage3(stage3CallContext);
+ }
+ else {
+ dxWorld *world = callContext->m_world;
+ dCallReleaseeID stage3CallReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage3CallReleasee, 1, callContext->m_finalReleasee,
+ NULL, &dxStepIsland_Stage3_Callback, stage3CallContext, 0, "StepIsland Stage3");
+
+ dCallReleaseeID stage2bSyncReleasee;
+ world->PostThreadedCall(NULL, &stage2bSyncReleasee, 1, stage3CallReleasee,
+ NULL, &dxStepIsland_Stage2bSync_Callback, stage2CallContext, 0, "StepIsland Stage2b Sync");
+
+ dCallReleaseeID stage2aSyncReleasee;
+ world->PostThreadedCall(NULL, &stage2aSyncReleasee, allowedThreads, stage2bSyncReleasee,
+ NULL, &dxStepIsland_Stage2aSync_Callback, stage2CallContext, 0, "StepIsland Stage2a Sync");
+
+ dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{
+ world->PostThreadedCallsGroup(NULL, allowedThreads - 1, stage2aSyncReleasee, &dxStepIsland_Stage2a_Callback, stage2CallContext, "StepIsland Stage2a");
+ }
+ dxStepIsland_Stage2a(stage2CallContext);
+ world->AlterThreadedCallDependenciesCount(stage2aSyncReleasee, -1);
+ }
+ }
+ else {
+ dxStepIsland_Stage3(stage3CallContext);
+ }
+}
+
+
+static
+int dxStepIsland_Stage2a_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
+ dxStepIsland_Stage2a(stage2CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage2a(dxStepperStage2CallContext *stage2CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+ unsigned int nj = localContext->m_nj;
+ const unsigned int *mindex = localContext->m_mindex;
+
+ const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
+ dxWorld *world = callContext->m_world;
+
+ {
+ int *findex = localContext->m_findex;
+ dReal *J = localContext->m_J;
+ dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
+ dReal *pairsLoHi = localContext->m_pairsLoHi;
+
+ // get jacobian data from constraints. a (2*m)x8 matrix will be created
+ // to store the two jacobian blocks from each constraint. it has this
+ // format:
+ //
+ // l l l 0 a a a 0 \ .
+ // l l l 0 a a a 0 }-- jacobian body 1 block for joint 0 (3 rows)
+ // l l l 0 a a a 0 /
+ // l l l 0 a a a 0 \ .
+ // l l l 0 a a a 0 }-- jacobian body 2 block for joint 0 (3 rows)
+ // l l l 0 a a a 0 /
+ // l l l 0 a a a 0 }--- jacobian body 1 block for joint 1 (1 row)
+ // l l l 0 a a a 0 }--- jacobian body 2 block for joint 1 (1 row)
+ // etc...
+ //
+ // (lll) = linear jacobian data
+ // (aaa) = angular jacobian data
+ //
+
+ const dReal worldERP = world->global_erp;
+ const dReal worldCFM = world->global_cfm;
+
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_J, nj)) != nj) {
+ const unsigned ofsi = mindex[ji];
+ const unsigned int infom = mindex[ji + 1] - ofsi;
+
+ dReal *const JRow = J + (sizeint)ofsi * (2 * JME__MAX);
+ dReal *rowRhsCfm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
+ dReal *rowLoHi = pairsLoHi + (sizeint)ofsi * LHE__LO_HI_MAX;
+ {
+ dSetZero (JRow, infom * (2 * JME__MAX));
+
+ dReal *const endRhsCfm = rowRhsCfm + infom * RCE__RHS_CFM_MAX;
+ for (dReal *currRhsCfm = rowRhsCfm; currRhsCfm != endRhsCfm; currRhsCfm += RCE__RHS_CFM_MAX) {
+ currRhsCfm[RCE_RHS] = REAL(0.0);
+ currRhsCfm[RCE_CFM] = worldCFM;
+ }
+
+ dReal *const endLoHi = rowLoHi + infom * LHE__LO_HI_MAX;
+ for (dReal *currLoHi = rowLoHi; currLoHi != endLoHi; currLoHi += LHE__LO_HI_MAX) {
+ currLoHi[LHE_LO] = -dInfinity;
+ currLoHi[LHE_HI] = dInfinity;
+ }
+ }
+ int *findexRow = findex + ofsi;
+ dSetValue(findexRow, infom, -1);
+
+ dxJoint *joint = jointinfos[ji].joint;
+ joint->getInfo2(stepsizeRecip, worldERP, JME__MAX, JRow + JME__J_MIN, JRow + infom * JME__MAX + JME__J_MIN, RCE__RHS_CFM_MAX, rowRhsCfm, rowLoHi, findexRow);
+ dSASSERT((int)LHE__LO_HI_MAX == RCE__RHS_CFM_MAX); // To make sure same step fits for both pairs in the call to dxJoint::getInfo2() above
+
+ // findex iteration is compact and is not going to pollute caches - do it first
+ {
+ // adjust returned findex values for global index numbering
+ int *const findicesEnd = findexRow + infom;
+ for (int *findexCurr = findexRow; findexCurr != findicesEnd; ++findexCurr) {
+ int fival = *findexCurr;
+ if (fival != -1) {
+ *findexCurr = fival + ofsi;
+ }
+ }
+ }
+ {
+ dReal *const endRhsCfm = rowRhsCfm + infom * RCE__RHS_CFM_MAX;
+ for (dReal *currRhsCfm = rowRhsCfm; currRhsCfm != endRhsCfm; currRhsCfm += RCE__RHS_CFM_MAX) {
+ currRhsCfm[RCE_RHS] *= stepsizeRecip;
+ currRhsCfm[RCE_CFM] *= stepsizeRecip;
+ }
+ }
+ }
+ }
+}
+
+static
+int dxStepIsland_Stage2aSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+
+ dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{ // The allowed thread count is greater than one, as otherwise the current function would not have been scheduled for execution from the previous stage
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, allowedThreads - 1, callThisReleasee, &dxStepIsland_Stage2b_Callback, stage2CallContext, "StepIsland Stage2b");
+ }
+ dxStepIsland_Stage2b(stage2CallContext);
+
+ return 1;
+}
+
+static
+int dxStepIsland_Stage2b_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
+ dxStepIsland_Stage2b(stage2CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage2b(dxStepperStage2CallContext *stage2CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+ unsigned int nj = localContext->m_nj;
+ const unsigned int *mindex = localContext->m_mindex;
+
+ {
+ // Warning!!!
+ // This code depends on cfm elements and therefore must be in different sub-stage
+ // from Jacobian construction in Stage2a to ensure proper synchronization
+ // and avoid accessing numbers being modified.
+ // Warning!!!
+ dReal *A = localContext->m_A;
+ const dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
+ const unsigned m = localContext->m_m;
+
+ const unsigned int mskip = dPAD(m);
+
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_Ainit, nj)) != nj) {
+ const unsigned ofsi = mindex[ji];
+ const unsigned int infom = mindex[ji + 1] - ofsi;
+
+ dReal *Arow = A + (sizeint)mskip * ofsi;
+ dSetZero(Arow, (sizeint)mskip * infom);
+ dReal *Adiag = Arow + ofsi;
+ const dReal *rowRhsCfm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
+ for (unsigned int i = 0; i != infom; Adiag += mskip, ++i) {
+ Adiag[i] = (rowRhsCfm + i * RCE__RHS_CFM_MAX)[RCE_CFM];
+ }
+ }
+ }
+
+ {
+ // Warning!!!
+ // This code depends on J elements and therefore must be in different sub-stage
+ // from Jacobian construction in Stage2a to ensure proper synchronization
+ // and avoid accessing numbers being modified.
+ // Warning!!!
+ const dReal *invI = localContext->m_invI;
+ const dReal *J = localContext->m_J;
+ dReal *JinvM = stage2CallContext->m_JinvM;
+
+ // compute A = J*invM*J'. first compute JinvM = J*invM. this has the same
+ // format as J so we just go through the constraints in J multiplying by
+ // the appropriate scalars and matrices.
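+ // (per row, the linear part is scaled by the body's inverse mass and the angular part is
+ // multiplied by the body's world-frame inverse inertia tensor)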
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_JinvM, nj)) != nj) {
+ const unsigned ofsi = mindex[ji];
+ const unsigned int infom = mindex[ji + 1] - ofsi;
+
+ dReal *Jdst = JinvM + (sizeint)ofsi * (2 * JIM__MAX);
+ dSetZero(Jdst, infom * (2 * JIM__MAX));
+
+ const dReal *Jsrc = J + (sizeint)ofsi * (2 * JME__MAX);
+ dxJoint *joint = jointinfos[ji].joint;
+
+ dxBody *jb0 = joint->node[0].body;
+ if (true || jb0 != NULL) { // -- always true
+ dReal body_invMass0 = jb0->invMass;
+ const dReal *body_invI0 = invI + (sizeint)(unsigned int)jb0->tag * dM3E__MAX;
+ for (unsigned int j = infom; j != 0; --j) {
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) Jdst[JIM__L_AXES_MIN + k] = Jsrc[JME__JL_MIN + k] * body_invMass0;
+ dMultiply0_133(Jdst + JIM__A_AXES_MIN, Jsrc + JME__JA_MIN, body_invI0);
+ Jsrc += JME__MAX;
+ Jdst += JIM__MAX;
+ }
+ }
+
+ dxBody *jb1 = joint->node[1].body;
+ if (jb1 != NULL) {
+ dReal body_invMass1 = jb1->invMass;
+ const dReal *body_invI1 = invI + (sizeint)(unsigned int)jb1->tag * dM3E__MAX;
+ for (unsigned int j = infom; j != 0; --j) {
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) Jdst[JIM__L_AXES_MIN + k] = Jsrc[JME__JL_MIN + k] * body_invMass1;
+ dMultiply0_133 (Jdst + JIM__A_AXES_MIN, Jsrc + JME__JA_MIN, body_invI1);
+ Jsrc += JME__MAX;
+ Jdst += JIM__MAX;
+ }
+ }
+ }
+ }
+
+ {
+ // Warning!!!
+ // This code reads facc/tacc fields of body objects which (the fields)
+ // may be modified by dxJoint::getInfo2(). Therefore the code must be
+ // in different sub-stage from Jacobian construction in Stage2a
+ // to ensure proper synchronization and avoid accessing numbers being modified.
+ // Warning!!!
+ dxBody * const *const body = callContext->m_islandBodiesStart;
+ const unsigned int nb = callContext->m_islandBodiesCount;
+ const dReal *invI = localContext->m_invI;
+ atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
+ dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;
+
+ // compute the right hand side `rhs'
+ const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
+
+ // put v/h + invM*fe into rhs_tmp
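+ // i.e. rhs_tmp(linear)  = lvel/h + invMass * facc
+ //      rhs_tmp(angular) = avel/h + invI * tacc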
+ unsigned bi;
+ while ((bi = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_bi_rhs_tmp, nb)) != nb) {
+ dReal *tmp1curr = rhs_tmp + (sizeint)bi * dDA__MAX;
+ const dReal *invIrow = invI + (sizeint)bi * dM3E__MAX;
+ dxBody *b = body[bi];
+ // dSetZero(tmp1curr, 8); -- not needed
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) tmp1curr[dDA__L_MIN + j] = b->facc[dV3E__AXES_MIN + j] * b->invMass + b->lvel[dV3E__AXES_MIN + j] * stepsizeRecip;
+ dMultiply0_331 (tmp1curr + dDA__A_MIN, invIrow, b->tacc);
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) tmp1curr[dDA__A_MIN + k] += b->avel[dV3E__AXES_MIN + k] * stepsizeRecip;
+ // Initialize body start joint indices -- this will be needed later for building body related joint list in dxStepIsland_Stage2c
+ bodyStartJoints[bi] = 0;
+ }
+ }
+}
+
+static
+int dxStepIsland_Stage2bSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+
+ dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{ // The allowed thread count is greater than one, as otherwise the current function would not have been scheduled for execution from the previous stage
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, allowedThreads - 1, callThisReleasee, &dxStepIsland_Stage2c_Callback, stage2CallContext, "StepIsland Stage2c");
+ }
+ dxStepIsland_Stage2c(stage2CallContext);
+
+ return 1;
+}
+
+
+static
+int dxStepIsland_Stage2c_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
+ dxStepIsland_Stage2c(stage2CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage2c(dxStepperStage2CallContext *stage2CallContext)
+{
+ //const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+ unsigned int nj = localContext->m_nj;
+ const unsigned int *mindex = localContext->m_mindex;
+
+ {
+ // Warning!!!
+ // This code depends on A elements and JinvM elements and therefore
+ // must be in different sub-stage from A initialization and JinvM calculation in Stage2b
+ // to ensure proper synchronization and avoid accessing numbers being modified.
+ // Warning!!!
+ dReal *A = localContext->m_A;
+ const dReal *JinvM = stage2CallContext->m_JinvM;
+ const dReal *J = localContext->m_J;
+ const unsigned m = localContext->m_m;
+
+ // now compute A = JinvM * J'. A's rows and columns are grouped by joint,
+ // i.e. in the same way as the rows of J. block (i,j) of A is only nonzero
+ // if joints i and j have at least one body in common.
+ const unsigned int mskip = dPAD(m);
+
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_Aaddjb, nj)) != nj) {
+ const unsigned ofsi = mindex[ji];
+ const unsigned int infom = mindex[ji + 1] - ofsi;
+
+ dReal *Arow = A + (sizeint)mskip * ofsi;
+ const dReal *JinvMRow = JinvM + (sizeint)ofsi * (2 * JIM__MAX);
+ dxJoint *joint = jointinfos[ji].joint;
+
+ dxBody *jb0 = joint->node[0].body;
+ if (true || jb0 != NULL) { // -- always true
+ // compute diagonal block of A
+ const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
+ MultiplyAddJinvMxJToA (Arow + ofsi, JinvMRow, JRow, infom, infom, mskip);
+
+ for (dxJointNode *n0 = (ji != 0 ? jb0->firstjoint : NULL); n0; n0 = n0->next) {
+ // if joint was tagged as -1 then it is an inactive (m=0 or disabled)
+ // joint that should not be considered
+ int j0 = n0->joint->tag;
+ if (j0 != -1 && (unsigned)j0 < ji) {
+ const unsigned int jiother_ofsi = mindex[j0];
+ const unsigned int jiother_infom = mindex[j0 + 1] - jiother_ofsi;
+ const dJointWithInfo1 *jiother = jointinfos + j0;
+ unsigned int smart_infom = (jiother->joint->node[1].body == jb0) ? jiother_infom : 0;
+ // set block of A
+ const dReal *JOther = J + ((sizeint)jiother_ofsi * 2 + smart_infom) * JME__MAX;
+ MultiplyAddJinvMxJToA (Arow + jiother_ofsi, JinvMRow, JOther, infom, jiother_infom, mskip);
+ }
+ }
+ }
+
+ dxBody *jb1 = joint->node[1].body;
+ dIASSERT(jb1 != jb0);
+ if (jb1 != NULL) {
+ const dReal *JinvMOther = JinvMRow + infom * JIM__MAX;
+ // compute diagonal block of A
+ const dReal *JRow = J + ((sizeint)ofsi * 2 + infom) * JME__MAX;
+ MultiplyAddJinvMxJToA (Arow + ofsi, JinvMOther, JRow, infom, infom, mskip);
+
+ for (dxJointNode *n1 = (ji != 0 ? jb1->firstjoint : NULL); n1; n1 = n1->next) {
+ // if joint was tagged as -1 then it is an inactive (m=0 or disabled)
+ // joint that should not be considered
+ int j1 = n1->joint->tag;
+ if (j1 != -1 && (unsigned)j1 < ji) {
+ const unsigned int jiother_ofsi = mindex[j1];
+ const unsigned int jiother_infom = mindex[j1 + 1] - jiother_ofsi;
+ const dJointWithInfo1 *jiother = jointinfos + j1;
+ unsigned int smart_infom = (jiother->joint->node[1].body == jb1) ? jiother_infom : 0;
+ // set block of A
+ const dReal *JOther = J + ((sizeint)jiother_ofsi * 2 + smart_infom) * JME__MAX;
+ MultiplyAddJinvMxJToA (Arow + jiother_ofsi, JinvMOther, JOther, infom, jiother_infom, mskip);
+ }
+ }
+ }
+ }
+ }
+
+ {
+ // Warning!!!
+ // This code depends on rhs_tmp elements and therefore must be in
+ // different sub-stage from rhs_tmp calculation in Stage2b to ensure
+ // proper synchronization and avoid accessing numbers being modified.
+ // Warning!!!
+ const dReal *J = localContext->m_J;
+ const dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;
+ dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
+ atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
+ atomicord32 *bodyJointLinks = localContext->m_bodyJointLinks;
+
+ // compute the right hand side `rhs'
+ // put J*rhs_tmp into rhs
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_rhs, nj)) != nj) {
+ const unsigned ofsi = mindex[ji];
+ const unsigned int infom = mindex[ji + 1] - ofsi;
+
+ dReal *currRhsCfm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
+ const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
+
+ dxJoint *joint = jointinfos[ji].joint;
+
+ dxBody *jb0 = joint->node[0].body;
+ if (true || jb0 != NULL) { // -- always true
+ unsigned bodyIndex = (unsigned)jb0->tag;
+ MultiplySubJxRhsTmpFromRHS (currRhsCfm, JRow, rhs_tmp + (sizeint)bodyIndex * dDA__MAX, infom);
+
+ // Link joints connected to each body into a list to be used on results incorporation. The bodyStartJoints have been initialized in dxStepIsland_Stage2b.
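+ // (lock-free list prepend: retry until the compare-exchange on the list head succeeds)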
+ const atomicord32 linkIndex = (atomicord32)((sizeint)ji * dJCB__MAX + dJCB_FIRST_BODY); // It is asserted at links buffer allocation that the indices can't overflow atomicord32
+ for (atomicord32 oldStartIndex = bodyStartJoints[bodyIndex]; ; oldStartIndex = bodyStartJoints[bodyIndex]) {
+ bodyJointLinks[linkIndex] = oldStartIndex;
+ if (ThrsafeCompareExchange(&bodyStartJoints[bodyIndex], oldStartIndex, linkIndex + 1)) { // The link index is stored incremented to allow 0 as end indicator
+ break;
+ }
+ }
+ }
+
+ dxBody *jb1 = joint->node[1].body;
+ if (jb1 != NULL) {
+ unsigned bodyIndex = (unsigned)jb1->tag;
+ MultiplySubJxRhsTmpFromRHS (currRhsCfm, JRow + infom * JME__MAX, rhs_tmp + (sizeint)bodyIndex * dDA__MAX, infom);
+
+ // Link joints connected to each body into a list to be used on results incorporation. The bodyStartJoints have been initialized in dxStepIsland_Stage2b
+ const atomicord32 linkIndex = (atomicord32)((sizeint)ji * dJCB__MAX + dJCB_SECOND_BODY); // It is asserted at links buffer allocation that the indices can't overflow atomicord32
+ for (atomicord32 oldStartIndex = bodyStartJoints[bodyIndex]; ; oldStartIndex = bodyStartJoints[bodyIndex]) {
+ bodyJointLinks[linkIndex] = oldStartIndex;
+ if (ThrsafeCompareExchange(&bodyStartJoints[bodyIndex], oldStartIndex, linkIndex + 1)) { // The link index is stored incremented to allow 0 as end indicator
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+
+static
+int dxStepIsland_Stage3_Callback(void *_stage3CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage3CallContext *stage3CallContext = (dxStepperStage3CallContext *)_stage3CallContext;
+ dxStepIsland_Stage3(stage3CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage3(dxStepperStage3CallContext *stage3CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage3CallContext->m_stepperCallContext;
+ const dxStepperLocalContext *localContext = stage3CallContext->m_localContext;
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ memarena->RestoreState(stage3CallContext->m_stage1MemArenaState);
+ stage3CallContext = NULL; // WARNING! stage3CallContext is not valid after this point!
+ dIVERIFY(stage3CallContext == NULL); // To suppress unused variable assignment warnings
+
+ unsigned int m = localContext->m_m;
+ unsigned int nub = localContext->m_nub;
+ //const unsigned int *mindex = localContext->m_mindex;
+ int *findex = localContext->m_findex;
+ dReal *A = localContext->m_A;
+ dReal *pairsRhsLambda = localContext->m_pairsRhsCfm; // Reuse cfm buffer for lambdas as the former values are not needed any more
+ dReal *pairsLoHi = localContext->m_pairsLoHi;
+
+ if (m > 0) {
+ BEGIN_STATE_SAVE(memarena, lcpstate) {
+ IFTIMING(dTimerNow ("solve LCP problem"));
+
+ // solve the LCP problem and get lambda.
+ // this will destroy A but that's OK
+ dxSolveLCP (memarena, m, A, pairsRhsLambda, NULL, nub, pairsLoHi, findex);
+ dSASSERT((int)RLE__RHS_LAMBDA_MAX == PBX__MAX && (int)RLE_RHS == PBX_B && (int)RLE_LAMBDA == PBX_X);
+ dSASSERT((int)LHE__LO_HI_MAX == PLH__MAX && (int)LHE_LO == PLH_LO && (int)LHE_HI == PLH_HI);
+
+ } END_STATE_SAVE(memarena, lcpstate);
+ }
+
+ // void *stage3MemarenaState = memarena->SaveState();
+
+ dxStepperStage4CallContext *stage4CallContext = (dxStepperStage4CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage4CallContext));
+ stage4CallContext->Initialize(callContext, localContext/*, stage3MemarenaState*/);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads != 0);
+
+ if (allowedThreads == 1) {
+ IFTIMING(dTimerNow ("compute and apply constraint force"));
+ dxStepIsland_Stage4(stage4CallContext);
+ IFTIMING(dTimerEnd());
+
+ if (m > 0) {
+ IFTIMING(dTimerReport(stdout,1));
+ }
+ }
+ else {
+ dCallReleaseeID finalReleasee = callContext->m_finalReleasee;
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(finalReleasee, allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, allowedThreads - 1, finalReleasee, &dxStepIsland_Stage4_Callback, stage4CallContext, "StepIsland Stage4");
+ // Note: Adding another dependency for the finalReleasee is not necessary as it already depends on the current call
+ dxStepIsland_Stage4(stage4CallContext);
+ }
+}
+
+static
+int dxStepIsland_Stage4_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxStepperStage4CallContext *stage4CallContext = (dxStepperStage4CallContext *)_stage4CallContext;
+ dxStepIsland_Stage4(stage4CallContext);
+ return 1;
+}
+
+static
+void dxStepIsland_Stage4(dxStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ const dReal stepSize = callContext->m_stepSize;
+ dxBody *const *bodies = callContext->m_islandBodiesStart;
+ dReal *invI = localContext->m_invI;
+ dJointWithInfo1 *jointInfos = localContext->m_jointinfos;
+ dReal *J = localContext->m_J;
+ dReal *pairsRhsLambda = localContext->m_pairsRhsCfm;
+ const unsigned int *mIndex = localContext->m_mindex;
+ atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
+ atomicord32 *bodyJointLinks = localContext->m_bodyJointLinks;
+ const unsigned int nb = callContext->m_islandBodiesCount;
+
+ unsigned bi;
+ while ((bi = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_bi_constrForce, nb)) != nb) {
+ dVector3 angularForceAccumulator;
+ dxBody *b = bodies[bi];
+ const dReal *invIrow = invI + (sizeint)bi * dM3E__MAX;
+ dReal body_invMass_mul_stepSize = stepSize * b->invMass;
+
+ dReal bodyConstrForce[CFE__MAX];
+ bool constrForceAvailable = false;
+
+ unsigned linkIndex = bodyStartJoints != NULL ? bodyStartJoints[bi] : 0;
+ if (linkIndex != 0) {
+ dSetZero(bodyConstrForce, dARRAY_SIZE(bodyConstrForce));
+ }
+
+ // compute the constraint force as constrForce = J'*lambda
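+ // (this also stores the per-joint reaction force/torque into dJointFeedback where attached)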
+ for (; linkIndex != 0; constrForceAvailable = true, linkIndex = bodyJointLinks[linkIndex - 1]) {
+ unsigned jointIndex = (linkIndex - 1) / dJCB__MAX;
+ unsigned jointBodyIndex = (linkIndex - 1) % dJCB__MAX;
+
+ const dJointWithInfo1 *currJointInfo = jointInfos + jointIndex;
+ unsigned ofsi = mIndex[jointIndex];
+ dIASSERT(dIN_RANGE(jointIndex, 0, localContext->m_nj));
+
+ const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
+ const dReal *rowRhsLambda = pairsRhsLambda + (sizeint)ofsi * RLE__RHS_LAMBDA_MAX;
+
+ dxJoint *joint = currJointInfo->joint;
+ const unsigned int infom = currJointInfo->info.m;
+
+ // unsigned jRowExtraOffset = jointBodyIndex * infom * JME__MAX;
+ unsigned jRowExtraOffset = jointBodyIndex != dJCB__MIN ? infom * JME__MAX : 0;
+ dSASSERT(dJCB__MAX == 2);
+
+ dJointFeedback *fb = joint->feedback;
+ MultiplyAddJxLambdaToCForce(bodyConstrForce, JRow + jRowExtraOffset, rowRhsLambda, infom, fb, jointBodyIndex);
+ }
+
+ // compute the velocity update
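+ // i.e. lvel += stepSize * invMass * (constrForce_L + facc)
+ //      avel += stepSize * invI    * (constrForce_A + tacc)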
+ if (constrForceAvailable) {
+ // add fe to cforce and multiply cforce by stepSize
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) {
+ b->lvel[dV3E__AXES_MIN + j] += (bodyConstrForce[CFE__L_MIN + j] + b->facc[dV3E__AXES_MIN + j]) * body_invMass_mul_stepSize;
+ }
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) {
+ angularForceAccumulator[dV3E__AXES_MIN + k] = (bodyConstrForce[CFE__A_MIN + k] + b->tacc[dV3E__AXES_MIN + k]) * stepSize;
+ }
+ }
+ else {
+ // add fe to cforce and multiply cforce by stepSize
+ dAddVectorScaledVector3(b->lvel, b->lvel, b->facc, body_invMass_mul_stepSize);
+ dCopyScaledVector3(angularForceAccumulator, b->tacc, stepSize);
+ }
+
+ dMultiplyAdd0_331 (b->avel, invIrow, angularForceAccumulator + dV3E__AXES_MIN);
+
+ // update the position and orientation from the new linear/angular velocity
+ // (over the given time step)
+ dxStepBody (b, stepSize);
+
+ // zero all force accumulators
+ dZeroVector3(b->facc);
+ dZeroVector3(b->tacc);
+ }
+}
+
+
+//****************************************************************************
+
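+// Conservative estimate of the stepper arena usage; it mirrors the allocations performed by
+// dxStepIsland() and its stages, using the per-joint SureMaxInfo row counts as upper bounds.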
+/*extern */
+sizeint dxEstimateStepMemoryRequirements (dxBody * const *body, unsigned int nb, dxJoint * const *_joint, unsigned int _nj)
+{
+ (void)body; // unused
+ unsigned int nj, m;
+
+ {
+ unsigned int njcurr = 0, mcurr = 0;
+ dxJoint::SureMaxInfo info;
+ dxJoint *const *const _jend = _joint + _nj;
+ for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; ++_jcurr) {
+ dxJoint *j = *_jcurr;
+ j->getSureMaxInfo (&info);
+
+ unsigned int jm = info.max_m;
+ if (jm > 0) {
+ njcurr++;
+
+ mcurr += jm;
+ }
+ }
+ nj = njcurr; m = mcurr;
+ }
+
+ sizeint res = 0;
+
+ res += dOVERALIGNED_SIZE(sizeof(dReal) * dM3E__MAX * nb, INVI_ALIGNMENT); // for invI
+
+ {
+ sizeint sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * 2 * _nj); // for initial jointinfos
+
+ // The array can't grow to the right by more than nj elements
+ sizeint sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * ((sizeint)_nj + (sizeint)nj)); // for shrunk jointinfos
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dxStepperLocalContext)); //for dxStepperLocalContext
+ if (m > 0) {
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(unsigned int) * (nj + 1)); // for mindex
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 2 * JME__MAX * m); // for J
+ unsigned int mskip = dPAD(m);
+ sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * mskip * m, AMATRIX_ALIGNMENT); // for A
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * RCE__RHS_CFM_MAX * m); // for pairsRhsCfm
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * LHE__LO_HI_MAX * m); // for pairsLoHi
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(atomicord32) * nb); // for bodyStartJoints
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(atomicord32)* dJCB__MAX * nj); // for bodyJointLinks
+ }
+
+ {
+ sizeint sub2_res1 = dEFFICIENT_SIZE(sizeof(dxStepperStage3CallContext)); // for dxStepperStage3CallContext
+
+ sizeint sub2_res2 = 0;
+
+ sizeint sub2_res3 = dEFFICIENT_SIZE(sizeof(dxStepperStage4CallContext)); // for dxStepperStage4CallContext
+
+ if (m > 0) {
+ sub2_res1 += dOVERALIGNED_SIZE(sizeof(dReal) * 2 * JIM__MAX * m, JINVM_ALIGNMENT); // for JinvM
+ sub2_res1 += dEFFICIENT_SIZE(sizeof(dReal) * dDA__MAX * nb); // for rhs_tmp
+ sub2_res1 += dEFFICIENT_SIZE(sizeof(dxStepperStage2CallContext)); // for dxStepperStage2CallContext
+
+ sub2_res2 += dxEstimateSolveLCPMemoryReq(m, false);
+ }
+
+ sub1_res2 += dMAX(sub2_res1, dMAX(sub2_res2, sub2_res3));
+ }
+
+ sizeint sub1_res12_max = dMAX(sub1_res1, sub1_res2);
+ sizeint stage01_contexts = dEFFICIENT_SIZE(sizeof(dxStepperStage0BodiesCallContext))
+ + dEFFICIENT_SIZE(sizeof(dxStepperStage0JointsCallContext))
+ + dEFFICIENT_SIZE(sizeof(dxStepperStage1CallContext));
+ res += dMAX(sub1_res12_max, stage01_contexts);
+ }
+
+ return res;
+}
+
+
+/*extern */
+unsigned dxEstimateStepMaxCallCount(
+ unsigned /*activeThreadCount*/, unsigned allowedThreadCount)
+{
+ unsigned result = 1 // dxStepIsland itself
+ + (2 * allowedThreadCount + 2) // (dxStepIsland_Stage2a + dxStepIsland_Stage2b) * allowedThreadCount + 2 * dxStepIsland_Stage2?_Sync
+ + 1; // dxStepIsland_Stage3
+ return result;
+}