/*************************************************************************
 *                                                                       *
 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
 * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
 *                                                                       *
 * This library is free software; you can redistribute it and/or         *
 * modify it under the terms of EITHER:                                  *
 *   (1) The GNU Lesser General Public License as published by the Free  *
 *       Software Foundation; either version 2.1 of the License, or (at  *
 *       your option) any later version. The text of the GNU Lesser      *
 *       General Public License is included with this library in the     *
 *       file LICENSE.TXT.                                               *
 *   (2) The BSD-style license that is included with this library in     *
 *       the file LICENSE-BSD.TXT.                                       *
 *                                                                       *
 * This library is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
 * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
 *                                                                       *
 *************************************************************************/

#include <ode/odeconfig.h>
#include <ode/rotation.h>
#include <ode/timer.h>
#include <ode/error.h>
#include "config.h"
#include "odemath.h"
#include "matrix.h"
#include "objects.h"
#include "joints/joint.h"
#include "lcp.h"
#include "util.h"
#include "threadingutils.h"

#include <new>


#define dMIN(A,B)  ((A)>(B) ? (B) : (A))
#define dMAX(A,B)  ((B)>(A) ? (B) : (A))

//****************************************************************************
// misc defines

//#define TIMING


#ifdef TIMING
#define IFTIMING(x) x
#else
#define IFTIMING(x) ((void)0)
#endif


struct dJointWithInfo1
{
    dxJoint *joint;
    dxJoint::Info1 info;
};

enum dxRHSCFMElement
{
    RCE_RHS = dxJoint::GI2_RHS,
    RCE_CFM = dxJoint::GI2_CFM,
    
    // Elements for array reuse
    RLE_RHS = RCE_RHS,
    RLE_LAMBDA = RCE_CFM,

    RCE__RHS_CFM_MAX = dxJoint::GI2__RHS_CFM_MAX,
    RLE__RHS_LAMBDA_MAX = RCE__RHS_CFM_MAX,
};

enum dxLoHiElement
{
    LHE_LO = dxJoint::GI2_LO,
    LHE_HI = dxJoint::GI2_HI,

    LHE__LO_HI_MAX = dxJoint::GI2__LO_HI_MAX,
};

enum dxJacobiVectorElement
{
    JVE__MIN,

    JVE__L_MIN = JVE__MIN + dDA__L_MIN,

    JVE_LX = JVE__L_MIN + dSA_X,
    JVE_LY = JVE__L_MIN + dSA_Y,
    JVE_LZ = JVE__L_MIN + dSA_Z,

    JVE__L_MAX = JVE__L_MIN + dSA__MAX,

    JVE__A_MIN = JVE__MIN + dDA__A_MIN,

    JVE_AX = JVE__A_MIN + dSA_X,
    JVE_AY = JVE__A_MIN + dSA_Y,
    JVE_AZ = JVE__A_MIN + dSA_Z,

    JVE__A_MAX = JVE__A_MIN + dSA__MAX,

    JVE__MAX = JVE__MIN + dDA__MAX,

    JVE__L_COUNT = JVE__L_MAX - JVE__L_MIN,
    JVE__A_COUNT = JVE__A_MAX - JVE__A_MIN,
};


enum dxJacobiMatrixElement
{
    JME__MIN,

    JME__J_MIN = JME__MIN,
    JME__JL_MIN = JME__J_MIN + JVE__L_MIN,

    JME_JLX = JME__J_MIN + JVE_LX,
    JME_JLY = JME__J_MIN + JVE_LY,
    JME_JLZ = JME__J_MIN + JVE_LZ,

    JME__JL_MAX = JME__J_MIN + JVE__L_MAX,

    JME__JA_MIN = JME__J_MIN + JVE__A_MIN,

    JME_JAX = JME__J_MIN + JVE_AX,
    JME_JAY = JME__J_MIN + JVE_AY,
    JME_JAZ = JME__J_MIN + JVE_AZ,

    JME__JA_MAX = JME__J_MIN + JVE__A_MAX,
    JME__J_MAX = JME__J_MIN + JVE__MAX,

    JME__MAX = JME__J_MAX,

    JME__J_COUNT = JME__J_MAX - JME__J_MIN,
};

enum dxJInvMElement
{
    JIM__MIN,

    JIM__L_MIN = JIM__MIN + dMD_LINEAR * dV3E__MAX,

    JIM__L_AXES_MIN = JIM__L_MIN + dV3E__AXES_MIN,

    JIM_LX = JIM__L_MIN + dV3E_X,
    JIM_LY = JIM__L_MIN + dV3E_Y,
    JIM_LZ = JIM__L_MIN + dV3E_Z,

    JIM__L_AXES_MAX = JIM__L_MIN + dV3E__AXES_MAX,

    JIM_LPAD = JIM__L_MIN + dV3E_PAD,

    JIM__L_MAX = JIM__L_MIN + dV3E__MAX,

    JIM__A_MIN = JIM__MIN + dMD_ANGULAR * dV3E__MAX,

    JIM__A_AXES_MIN = JIM__A_MIN + dV3E__AXES_MIN,

    JIM_AX = JIM__A_MIN + dV3E_X,
    JIM_AY = JIM__A_MIN + dV3E_Y,
    JIM_AZ = JIM__A_MIN + dV3E_Z,

    JIM__A_AXES_MAX = JIM__A_MIN + dV3E__AXES_MAX,

    JIM_APAD = JIM__A_MIN + dV3E_PAD,

    JIM__A_MAX = JIM__A_MIN + dV3E__MAX,

    JIM__MAX = JIM__MIN + dMD__MAX * dV3E__MAX,
};

enum dxContactForceElement
{
    CFE__MIN,

    CFE__DYNAMICS_MIN = CFE__MIN,

    CFE__L_MIN = CFE__DYNAMICS_MIN + dDA__L_MIN,

    CFE_LX = CFE__DYNAMICS_MIN + dDA_LX,
    CFE_LY = CFE__DYNAMICS_MIN + dDA_LY,
    CFE_LZ = CFE__DYNAMICS_MIN + dDA_LZ,

    CFE__L_MAX = CFE__DYNAMICS_MIN + dDA__L_MAX,

    CFE__A_MIN = CFE__DYNAMICS_MIN + dDA__A_MIN,

    CFE_AX = CFE__DYNAMICS_MIN + dDA_AX,
    CFE_AY = CFE__DYNAMICS_MIN + dDA_AY,
    CFE_AZ = CFE__DYNAMICS_MIN + dDA_AZ,

    CFE__A_MAX = CFE__DYNAMICS_MIN + dDA__A_MAX,

    CFE__DYNAMICS_MAX = CFE__DYNAMICS_MIN + dDA__MAX,

    CFE__MAX = CFE__DYNAMICS_MAX,
};


#define AMATRIX_ALIGNMENT   dMAX(64, EFFICIENT_ALIGNMENT)
#define INVI_ALIGNMENT      dMAX(32, EFFICIENT_ALIGNMENT)
#define JINVM_ALIGNMENT     dMAX(64, EFFICIENT_ALIGNMENT)

struct dxStepperStage0Outputs
{
    sizeint                         ji_start;
    sizeint                         ji_end;
    unsigned int                    m;
    unsigned int                    nub;
};

struct dxStepperStage1CallContext
{
    void Initialize(const dxStepperProcessingCallContext *stepperCallContext, void *stageMemArenaState, dReal *invI, dJointWithInfo1 *jointinfos)
    {
        m_stepperCallContext = stepperCallContext;
        m_stageMemArenaState = stageMemArenaState;
        m_invI = invI;
        m_jointinfos = jointinfos;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    void                            *m_stageMemArenaState;
    dReal                           *m_invI;
    dJointWithInfo1                 *m_jointinfos;
    dxStepperStage0Outputs          m_stage0Outputs;
};

struct dxStepperStage0BodiesCallContext
{
    void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dReal *invI)
    {
        m_stepperCallContext = stepperCallContext;
        m_invI = invI;
        m_tagsTaken = 0;
        m_gravityTaken = 0;
        m_inertiaBodyIndex = 0;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    dReal                           *m_invI;
    atomicord32                     m_tagsTaken;
    atomicord32                     m_gravityTaken;
    volatile atomicord32            m_inertiaBodyIndex;
};

struct dxStepperStage0JointsCallContext
{
    void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dJointWithInfo1 *jointinfos, dxStepperStage0Outputs *stage0Outputs)
    {
        m_stepperCallContext = stepperCallContext;
        m_jointinfos = jointinfos;
        m_stage0Outputs = stage0Outputs;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    dJointWithInfo1                 *m_jointinfos;
    dxStepperStage0Outputs          *m_stage0Outputs;
};

static int dxStepIsland_Stage0_Bodies_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
// static int dxStepIsland_Stage0_Joints_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage1_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);

static void dxStepIsland_Stage0_Bodies(dxStepperStage0BodiesCallContext *callContext);
static void dxStepIsland_Stage0_Joints(dxStepperStage0JointsCallContext *callContext);
static void dxStepIsland_Stage1(dxStepperStage1CallContext *callContext);


struct dxStepperLocalContext
{
    void Initialize(dReal *invI, dJointWithInfo1 *jointinfos, unsigned int nj, 
        unsigned int m, unsigned int nub, const unsigned int *mindex, int *findex, 
        dReal *J, dReal *A, dReal *pairsRhsCfm, dReal *pairsLoHi, 
        atomicord32 *bodyStartJoints, atomicord32 *bodyJointLinks)
    {
        m_invI = invI;
        m_jointinfos = jointinfos;
        m_nj = nj;
        m_m = m;
        m_nub = nub;
        m_mindex = mindex;
        m_findex = findex; 
        m_J = J;
        m_A = A;
        m_pairsRhsCfm = pairsRhsCfm;
        m_pairsLoHi = pairsLoHi;
        m_bodyStartJoints = bodyStartJoints;
        m_bodyJointLinks = bodyJointLinks;
    }

    dReal                           *m_invI;
    dJointWithInfo1                 *m_jointinfos;
    unsigned int                    m_nj;
    unsigned int                    m_m;
    unsigned int                    m_nub;
    const unsigned int              *m_mindex;
    int                             *m_findex;
    dReal                           *m_J;
    dReal                           *m_A;
    dReal                           *m_pairsRhsCfm;
    dReal                           *m_pairsLoHi;
    atomicord32                     *m_bodyStartJoints;
    atomicord32                     *m_bodyJointLinks;
};

struct dxStepperStage2CallContext
{
    void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext, 
        dReal *JinvM, dReal *rhs_tmp)
    {
        m_stepperCallContext = callContext;
        m_localContext = localContext;
        m_JinvM = JinvM;
        m_rhs_tmp = rhs_tmp;
        m_ji_J = 0;
        m_ji_Ainit = 0;
        m_ji_JinvM = 0;
        m_ji_Aaddjb = 0;
        m_bi_rhs_tmp = 0;
        m_ji_rhs = 0;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    const dxStepperLocalContext     *m_localContext;
    dReal                           *m_JinvM;
    dReal                           *m_rhs_tmp;
    volatile atomicord32            m_ji_J;
    volatile atomicord32            m_ji_Ainit;
    volatile atomicord32            m_ji_JinvM;
    volatile atomicord32            m_ji_Aaddjb;
    volatile atomicord32            m_bi_rhs_tmp;
    volatile atomicord32            m_ji_rhs;
};

struct dxStepperStage3CallContext
{
    void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext, 
        void *stage1MemArenaState)
    {
        m_stepperCallContext = callContext;
        m_localContext = localContext;
        m_stage1MemArenaState = stage1MemArenaState;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    const dxStepperLocalContext     *m_localContext;
    void                            *m_stage1MemArenaState;
};

struct dxStepperStage4CallContext
{
    void Initialize(const dxStepperProcessingCallContext *callContext, const dxStepperLocalContext *localContext/*, 
        void *stage3MemarenaState*/)
    {
        m_stepperCallContext = callContext;
        m_localContext = localContext;
        // m_stage3MemarenaState = stage3MemarenaState;
        m_bi_constrForce = 0;
    }

    const dxStepperProcessingCallContext *m_stepperCallContext;
    const dxStepperLocalContext     *m_localContext;
    // void                            *m_stage3MemarenaState;
    volatile atomicord32            m_bi_constrForce;
};

static int dxStepIsland_Stage2a_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage2aSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage2b_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage2bSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage2c_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static int dxStepIsland_Stage3_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);

static void dxStepIsland_Stage2a(dxStepperStage2CallContext *callContext);
static void dxStepIsland_Stage2b(dxStepperStage2CallContext *callContext);
static void dxStepIsland_Stage2c(dxStepperStage2CallContext *callContext);
static void dxStepIsland_Stage3(dxStepperStage3CallContext *callContext);

static int dxStepIsland_Stage4_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
static void dxStepIsland_Stage4(dxStepperStage4CallContext *stage4CallContext);


//****************************************************************************
// special matrix multipliers


// this assumes the 4th and 8th rows of B and C are zero.

static inline 
void MultiplyAddJinvMxJToA (dReal *Arow, const dReal *JinvMRow, const dReal *JRow,
    unsigned int infomJinvM, unsigned int infomJ, unsigned int mskip)
{
    dIASSERT (infomJinvM > 0 && infomJ > 0 && Arow && JinvMRow && JRow);
    const unsigned int mskip_munus_infomJ_plus_1 = mskip - infomJ + 1;
    dIASSERT(mskip >= infomJ);
    dReal *currA = Arow;
    const dReal *currJinvM = JinvMRow;
    for (unsigned int i = infomJinvM; ; ) {
        dReal JiM0 = currJinvM[JIM_LX];
        dReal JiM1 = currJinvM[JIM_LY];
        dReal JiM2 = currJinvM[JIM_LZ];
        dReal JiM4 = currJinvM[JIM_AX];
        dReal JiM5 = currJinvM[JIM_AY];
        dReal JiM6 = currJinvM[JIM_AZ];
        const dReal *currJ = JRow;
        for (unsigned int j = infomJ; ; ) {
            dReal sum;
            sum  = JiM0 * currJ[JME_JLX];
            sum += JiM1 * currJ[JME_JLY];
            sum += JiM2 * currJ[JME_JLZ];
            sum += JiM4 * currJ[JME_JAX];
            sum += JiM5 * currJ[JME_JAY];
            sum += JiM6 * currJ[JME_JAZ];
            *currA += sum; 
            if (--j == 0) {
                break;
            }
            ++currA;
            currJ += JME__MAX;
        }
        if (--i == 0) {
            break;
        }
        currJinvM += JIM__MAX;
        currA += mskip_munus_infomJ_plus_1;
    }
}


// this assumes the 4th and 8th rows of B are zero.

static inline 
void MultiplySubJxRhsTmpFromRHS (dReal *rowRhsCfm, const dReal *JRow, const dReal *rowRhsTmp, unsigned int infom)
{
    dIASSERT (infom > 0 && rowRhsCfm && JRow && rowRhsTmp);
    dReal *currRhs = rowRhsCfm + RCE_RHS;
    const dReal *currJ = JRow;
    const dReal RT_LX = rowRhsTmp[dDA_LX], RT_LY = rowRhsTmp[dDA_LY], RT_LZ = rowRhsTmp[dDA_LZ];
    const dReal RT_AX = rowRhsTmp[dDA_AX], RT_AY = rowRhsTmp[dDA_AY], RT_AZ = rowRhsTmp[dDA_AZ];
    for (unsigned int i = infom; ; ) {
        dReal sum;
        sum  = currJ[JME_JLX] * RT_LX;
        sum += currJ[JME_JLY] * RT_LY;
        sum += currJ[JME_JLZ] * RT_LZ;
        sum += currJ[JME_JAX] * RT_AX;
        sum += currJ[JME_JAY] * RT_AY;
        sum += currJ[JME_JAZ] * RT_AZ;
        *currRhs -= sum;
        if (--i == 0) {
            break;
        }
        currRhs += RCE__RHS_CFM_MAX;
        currJ += JME__MAX;
    }
}


static inline 
void MultiplyAddJxLambdaToCForce(dReal cforce[CFE__MAX], 
    const dReal *JRow, const dReal *rowRhsLambda, unsigned int infom, 
    dJointFeedback *fb/*=NULL*/, unsigned jointBodyIndex)
{
    dIASSERT (infom > 0 && cforce && JRow && rowRhsLambda);
    dReal sumLX = 0, sumLY = 0, sumLZ = 0, sumAX=0, sumAY = 0, sumAZ = 0;
    const dReal *currJ = JRow, *currLambda = rowRhsLambda + RLE_LAMBDA;
    for (unsigned int k = infom; ; ) {
        const dReal lambda = *currLambda;
        sumLX += currJ[JME_JLX] * lambda;
        sumLY += currJ[JME_JLY] * lambda;
        sumLZ += currJ[JME_JLZ] * lambda;
        sumAX += currJ[JME_JAX] * lambda;
        sumAY += currJ[JME_JAY] * lambda;
        sumAZ += currJ[JME_JAZ] * lambda;
        if (--k == 0) {
            break;
        }
        currJ += JME__MAX;
        currLambda += RLE__RHS_LAMBDA_MAX;
    }
    if (fb != NULL) {
        if (jointBodyIndex == dJCB__MIN) {
            fb->f1[dV3E_X] = sumLX;
            fb->f1[dV3E_Y] = sumLY;
            fb->f1[dV3E_Z] = sumLZ;
            fb->t1[dV3E_X] = sumAX;
            fb->t1[dV3E_Y] = sumAY;
            fb->t1[dV3E_Z] = sumAZ;
        }
        else {
            dIASSERT(jointBodyIndex == dJCB__MIN + 1);
            dSASSERT(dJCB__MAX == 2);

            fb->f2[dV3E_X] = sumLX;
            fb->f2[dV3E_Y] = sumLY;
            fb->f2[dV3E_Z] = sumLZ;
            fb->t2[dV3E_X] = sumAX;
            fb->t2[dV3E_Y] = sumAY;
            fb->t2[dV3E_Z] = sumAZ;
        }
    }
    cforce[CFE_LX] += sumLX;
    cforce[CFE_LY] += sumLY;
    cforce[CFE_LZ] += sumLZ;
    cforce[CFE_AX] += sumAX;
    cforce[CFE_AY] += sumAY;
    cforce[CFE_AZ] += sumAZ;
}


//****************************************************************************

/*extern */
void dxStepIsland(const dxStepperProcessingCallContext *callContext)
{
    IFTIMING(dTimerStart("preprocessing"));

    dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
    dxWorld *world = callContext->m_world;
    unsigned int nb = callContext->m_islandBodiesCount;
    unsigned int _nj = callContext->m_islandJointsCount;

    dReal *invI = memarena->AllocateOveralignedArray<dReal>(dM3E__MAX * (sizeint)nb, INVI_ALIGNMENT);
    // Reserve twice as much memory and start from the middle so that regardless of 
    // what direction the array grows to there would be sufficient room available.
    const sizeint ji_reserve_count = 2 * (sizeint)_nj;
    dJointWithInfo1 *const jointinfos = memarena->AllocateArray<dJointWithInfo1>(ji_reserve_count);

    const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
    dIASSERT(allowedThreads != 0);

    void *stagesMemArenaState = memarena->SaveState();

    dxStepperStage1CallContext *stage1CallContext = (dxStepperStage1CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage1CallContext));
    stage1CallContext->Initialize(callContext, stagesMemArenaState, invI, jointinfos);

    dxStepperStage0BodiesCallContext *stage0BodiesCallContext = (dxStepperStage0BodiesCallContext *)memarena->AllocateBlock(sizeof(dxStepperStage0BodiesCallContext));
    stage0BodiesCallContext->Initialize(callContext, invI);
    
    dxStepperStage0JointsCallContext *stage0JointsCallContext = (dxStepperStage0JointsCallContext *)memarena->AllocateBlock(sizeof(dxStepperStage0JointsCallContext));
    stage0JointsCallContext->Initialize(callContext, jointinfos, &stage1CallContext->m_stage0Outputs);

    if (allowedThreads == 1)
    {
        dxStepIsland_Stage0_Bodies(stage0BodiesCallContext);
        dxStepIsland_Stage0_Joints(stage0JointsCallContext);
        dxStepIsland_Stage1(stage1CallContext);
    }
    else
    {
        unsigned bodyThreads = allowedThreads;
        unsigned jointThreads = 1;

        dCallReleaseeID stage1CallReleasee;
        world->PostThreadedCallForUnawareReleasee(NULL, &stage1CallReleasee, bodyThreads + jointThreads, callContext->m_finalReleasee, 
            NULL, &dxStepIsland_Stage1_Callback, stage1CallContext, 0, "StepIsland Stage1");

        world->PostThreadedCallsGroup(NULL, bodyThreads, stage1CallReleasee, &dxStepIsland_Stage0_Bodies_Callback, stage0BodiesCallContext, "StepIsland Stage0-Bodies");

        dxStepIsland_Stage0_Joints(stage0JointsCallContext);
        world->AlterThreadedCallDependenciesCount(stage1CallReleasee, -1);
        dIASSERT(jointThreads == 1);
    }
}    

static 
int dxStepIsland_Stage0_Bodies_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage0BodiesCallContext *callContext = (dxStepperStage0BodiesCallContext *)_callContext;
    dxStepIsland_Stage0_Bodies(callContext);
    return 1;
}

static 
void dxStepIsland_Stage0_Bodies(dxStepperStage0BodiesCallContext *callContext)
{
    dxBody * const *body = callContext->m_stepperCallContext->m_islandBodiesStart;
    unsigned int nb = callContext->m_stepperCallContext->m_islandBodiesCount;

    if (ThrsafeExchange(&callContext->m_tagsTaken, 1) == 0)
    {
        // number all bodies in the body list - set their tag values
        for (unsigned int i=0; i<nb; i++) body[i]->tag = i;
    }

    if (ThrsafeExchange(&callContext->m_gravityTaken, 1) == 0)
    {
        dxWorld *world = callContext->m_stepperCallContext->m_world;

        // add the gravity force to all bodies
        // since gravity does normally have only one component it's more efficient
        // to run three loops for each individual component
        dxBody *const *const bodyend = body + nb;
        dReal gravity_x = world->gravity[0];
        if (gravity_x) {
            for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
                dxBody *b = *bodycurr;
                if ((b->flags & dxBodyNoGravity) == 0) {
                    b->facc[dV3E_X] += b->mass.mass * gravity_x;
                }
            }
        }
        dReal gravity_y = world->gravity[1];
        if (gravity_y) {
            for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
                dxBody *b = *bodycurr;
                if ((b->flags & dxBodyNoGravity) == 0) {
                    b->facc[dV3E_Y] += b->mass.mass * gravity_y;
                }
            }
        }
        dReal gravity_z = world->gravity[2];
        if (gravity_z) {
            for (dxBody *const *bodycurr = body; bodycurr != bodyend; ++bodycurr) {
                dxBody *b = *bodycurr;
                if ((b->flags & dxBodyNoGravity) == 0) {
                    b->facc[dV3E_Z] += b->mass.mass * gravity_z;
                }
            }
        }
    }

    // for all bodies, compute the inertia tensor and its inverse in the global
    // frame, and compute the rotational force and add it to the torque
    // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body.
    {
        dReal *invIrow = callContext->m_invI;
        unsigned int bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb);

        for (unsigned int i = 0; i != nb; invIrow += dM3E__MAX, ++i) {
            if (i == bodyIndex) {
                dMatrix3 tmp;
                dxBody *b = body[i];

                // compute inverse inertia tensor in global frame
                dMultiply2_333 (tmp, b->invI, b->posr.R);
                dMultiply0_333 (invIrow, b->posr.R, tmp);

                // Don't apply gyroscopic torques to bodies
                // if not flagged or the body is kinematic
                if ((b->flags & dxBodyGyroscopic) && (b->invMass > 0)) {
                    dMatrix3 I;
                    // compute inertia tensor in global frame
                    dMultiply2_333 (tmp,b->mass.I,b->posr.R);
                    dMultiply0_333 (I,b->posr.R,tmp);
                    // compute rotational force
#if 0
                    // Explicit computation
                    dMultiply0_331 (tmp,I,b->avel);
                    dSubtractVectorCross3(b->tacc,b->avel,tmp);
#else
                    // Do the implicit computation based on 
                    //"Stabilizing Gyroscopic Forces in Rigid Multibody Simulations"
                    // (Lacoursière 2006)
                    dReal h = callContext->m_stepperCallContext->m_stepSize; // Step size
                    dVector3 L; // Compute angular momentum
                    dMultiply0_331(L, I, b->avel);
                    
                    // Compute a new effective 'inertia tensor'
                    // for the implicit step: the cross-product 
                    // matrix of the angular momentum plus the
                    // old tensor scaled by the timestep.  
                    // Itild may not be symmetric pos-definite, 
                    // but we can still use it to compute implicit
                    // gyroscopic torques.
                    dMatrix3 Itild = { 0 };  
                    dSetCrossMatrixMinus(Itild, L, dV3E__MAX);
                    for (int ii = dM3E__MIN; ii != dM3E__MAX; ++ii) {
                      Itild[ii] = Itild[ii] * h + I[ii];
                    }

                    // Scale momentum by inverse time to get 
                    // a sort of "torque"
                    dScaleVector3(L, dRecip(h)); 
                    // Invert the pseudo-tensor
                    dMatrix3 itInv;
                    // This is a closed-form inversion.
                    // It's probably not numerically stable
                    // when dealing with small masses with
                    // a large asymmetry.
                    // An LU decomposition might be better.
                    if (dInvertMatrix3(itInv, Itild) != 0) {
                        // "Divide" the original tensor
                        // by the pseudo-tensor (on the right)
                        dMultiply0_333(Itild, I, itInv);
                        // Subtract an identity matrix
                        Itild[dM3E_XX] -= 1; Itild[dM3E_YY] -= 1; Itild[dM3E_ZZ] -= 1;

                        // This new inertia matrix rotates the 
                        // momentum to get a new set of torques
                        // that will work correctly when applied
                        // to the old inertia matrix as explicit
                        // torques with a semi-implicit update
                        // step.
                        dVector3 tau0;
                        dMultiply0_331(tau0,Itild,L);
                        
                        // Add the gyro torques to the torque 
                        // accumulator
                        for (int ii = dSA__MIN; ii != dSA__MAX; ++ii) {
                          b->tacc[dV3E__AXES_MIN + ii] += tau0[dV3E__AXES_MIN + ii];
                        }
                    }
#endif
                }

                bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb);
            }
        }
    }
}

// static 
// int dxStepIsland_Stage0_Joints_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
// {
//     (void)callInstanceIndex; // unused
//     (void)callThisReleasee; // unused
//     dxStepperStage0JointsCallContext *callContext = (dxStepperStage0JointsCallContext *)_callContext;
//     dxStepIsland_Stage0_Joints(callContext);
//     return 1;
// }

static 
void dxStepIsland_Stage0_Joints(dxStepperStage0JointsCallContext *callContext)
{
    dxJoint * const *_joint = callContext->m_stepperCallContext->m_islandJointsStart;
    dJointWithInfo1 *jointinfos = callContext->m_jointinfos;
    unsigned int _nj = callContext->m_stepperCallContext->m_islandJointsCount;

    // get m = total constraint dimension, nub = number of unbounded variables.
    // create constraint offset array and number-of-rows array for all joints.
    // the constraints are re-ordered as follows: the purely unbounded
    // constraints, the mixed unbounded + LCP constraints, and last the purely
    // LCP constraints. this assists the LCP solver to put all unbounded
    // variables at the start for a quick factorization.
    //
    // joints with m=0 are inactive and are removed from the joints array
    // entirely, so that the code that follows does not consider them.
    // also number all active joints in the joint list (set their tag values).
    // inactive joints receive a tag value of -1.

    sizeint ji_start, ji_end;
    {
        unsigned int mcurr = 0;
        sizeint unb_start, mix_start, mix_end, lcp_end;
        unb_start = mix_start = mix_end = lcp_end = _nj;

        dJointWithInfo1 *jicurr = jointinfos + lcp_end;
        dxJoint *const *const _jend = _joint + _nj;
        dxJoint *const *_jcurr = _joint;
        while (true) {
            // -------------------------------------------------------------------------
            // Switch to growing array forward
            {
                bool fwd_end_reached = false;
                dJointWithInfo1 *jimixend = jointinfos + mix_end;
                while (true) {	// jicurr=dest, _jcurr=src
                    if (_jcurr == _jend) {
                        lcp_end = jicurr - jointinfos;
                        fwd_end_reached = true;
                        break;
                    }
                    dxJoint *j = *_jcurr++;
                    j->getInfo1 (&jicurr->info);
                    dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m);
                    if (jicurr->info.m != 0) {
                        mcurr += jicurr->info.m;
                        if (jicurr->info.nub == 0) { // A lcp info - a correct guess!!!
                            jicurr->joint = j;
                            ++jicurr;
                        } else if (jicurr->info.nub < jicurr->info.m) { // A mixed case
                            if (unb_start == mix_start) { // no unbounded infos yet - just move to opposite side of mixed-s
                                unb_start = mix_start = mix_start - 1;
                                dJointWithInfo1 *jimixstart = jointinfos + mix_start;
                                jimixstart->info = jicurr->info;
                                jimixstart->joint = j;
                            } else if (jimixend != jicurr) { // have to swap to the tail of mixed-s
                                dxJoint::Info1 tmp_info = jicurr->info;
                                *jicurr = *jimixend;
                                jimixend->info = tmp_info;
                                jimixend->joint = j;
                                ++jimixend; ++jicurr;
                            } else { // no need to swap as there are no LCP info-s yet
                                jicurr->joint = j;
                                jimixend = jicurr = jicurr + 1;
                            }
                        } else { // A purely unbounded case -- break out and proceed growing in opposite direction
                            unb_start = unb_start - 1;
                            dJointWithInfo1 *jiunbstart = jointinfos + unb_start;
                            jiunbstart->info = jicurr->info;
                            jiunbstart->joint = j;
                            lcp_end = jicurr - jointinfos;
                            mix_end = jimixend - jointinfos;
                            jicurr = jiunbstart - 1;
                            break;
                        }
                    } else {
                        j->tag = -1;
                    }
                }
                if (fwd_end_reached) {
                    break;
                }
            }
            // -------------------------------------------------------------------------
            // Switch to growing array backward
            {
                bool bkw_end_reached = false;
                dJointWithInfo1 *jimixstart = jointinfos + mix_start - 1;
                while (true) {	// jicurr=dest, _jcurr=src
                    if (_jcurr == _jend) {
                        unb_start = (jicurr + 1) - jointinfos;
                        mix_start = (jimixstart + 1) - jointinfos;
                        bkw_end_reached = true;
                        break;
                    }
                    dxJoint *j = *_jcurr++;
                    j->getInfo1 (&jicurr->info);
                    dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m);
                    if (jicurr->info.m != 0) {
                        mcurr += jicurr->info.m;
                        if (jicurr->info.nub == jicurr->info.m) { // An unbounded info - a correct guess!!!
                            jicurr->joint = j;
                            --jicurr;
                        } else if (jicurr->info.nub != 0) { // A mixed case
                            if (mix_end == lcp_end) { // no lcp infos yet - just move to opposite side of mixed-s
                                dJointWithInfo1 *jimixend = jointinfos + mix_end;
                                lcp_end = mix_end = mix_end + 1;
                                jimixend->info = jicurr->info;
                                jimixend->joint = j;
                            } else if (jimixstart != jicurr) { // have to swap to the head of mixed-s
                                dxJoint::Info1 tmp_info = jicurr->info;
                                *jicurr = *jimixstart;
                                jimixstart->info = tmp_info;
                                jimixstart->joint = j;
                                --jimixstart; --jicurr;
                            } else { // no need to swap as there are no unbounded info-s yet
                                jicurr->joint = j;
                                jimixstart = jicurr = jicurr - 1;
                            }
                        } else { // A purely lcp case -- break out and proceed growing in opposite direction
                            dJointWithInfo1 *jilcpend = jointinfos + lcp_end;
                            lcp_end = lcp_end + 1;
                            jilcpend->info = jicurr->info;
                            jilcpend->joint = j;
                            unb_start = (jicurr + 1) - jointinfos;
                            mix_start = (jimixstart + 1) - jointinfos;
                            jicurr = jilcpend + 1;
                            break;
                        }
                    } else {
                        j->tag = -1;
                    }
                }
                if (bkw_end_reached) {
                    break;
                }
            }
        }

        callContext->m_stage0Outputs->m = mcurr;
        callContext->m_stage0Outputs->nub = (unsigned)(mix_start - unb_start);
        dIASSERT((sizeint)(mix_start - unb_start) <= (sizeint)UINT_MAX);
        ji_start = unb_start;
        ji_end = lcp_end;
    }

    {
        const dJointWithInfo1 *jicurr = jointinfos + ji_start;
        const dJointWithInfo1 *const jiend = jointinfos + ji_end;
        for (unsigned int i = 0; jicurr != jiend; i++, ++jicurr) {
            jicurr->joint->tag = i;
        }
    }

    callContext->m_stage0Outputs->ji_start = ji_start;
    callContext->m_stage0Outputs->ji_end = ji_end;
}

static 
int dxStepIsland_Stage1_Callback(void *_stage1CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage1CallContext *stage1CallContext = (dxStepperStage1CallContext *)_stage1CallContext;
    dxStepIsland_Stage1(stage1CallContext);
    return 1;
}

static 
void dxStepIsland_Stage1(dxStepperStage1CallContext *stage1CallContext)
{
    const dxStepperProcessingCallContext *callContext = stage1CallContext->m_stepperCallContext;
    dJointWithInfo1 *_jointinfos = stage1CallContext->m_jointinfos;
    dReal *invI = stage1CallContext->m_invI;
    sizeint ji_start = stage1CallContext->m_stage0Outputs.ji_start;
    sizeint ji_end = stage1CallContext->m_stage0Outputs.ji_end;
    unsigned int m = stage1CallContext->m_stage0Outputs.m;
    unsigned int nub = stage1CallContext->m_stage0Outputs.nub;

    dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
    {
        memarena->RestoreState(stage1CallContext->m_stageMemArenaState);
        stage1CallContext = NULL; // WARNING! _stage1CallContext is not valid after this point!
        dIVERIFY(stage1CallContext == NULL); // To suppress compiler warnings about unused variable assignment

        unsigned int _nj = callContext->m_islandJointsCount;
        const sizeint ji_reserve_count = 2 * (sizeint)_nj;
        memarena->ShrinkArray<dJointWithInfo1>(_jointinfos, ji_reserve_count, ji_end);
    }

    dJointWithInfo1 *jointinfos = _jointinfos + ji_start;
    unsigned int nj = (unsigned int)(ji_end - ji_start);
    dIASSERT((sizeint)(ji_end - ji_start) <= (sizeint)UINT_MAX);

    unsigned int *mindex = NULL;
    dReal *J = NULL, *A = NULL, *pairsRhsCfm = NULL, *pairsLoHi = NULL;
    int *findex = NULL;
    atomicord32 *bodyStartJoints = NULL, *bodyJointLinks = NULL;

    // if there are constraints, compute constrForce
    if (m > 0) {
        mindex = memarena->AllocateArray<unsigned int>((sizeint)(nj + 1));
        {
            unsigned int *mcurr = mindex;
            unsigned int moffs = 0;
            mcurr[0] = moffs;
            mcurr += 1;

            const dJointWithInfo1 *const jiend = jointinfos + nj;
            for (const dJointWithInfo1 *jicurr = jointinfos; jicurr != jiend; ++jicurr) {
                //dxJoint *joint = jicurr->joint;
                moffs += jicurr->info.m;
                mcurr[0] = moffs;
                mcurr += 1;
            }
        }

        // create a constraint equation right hand side vector `c', a constraint
        // force mixing vector `cfm', and LCP low and high bound vectors, and an
        // 'findex' vector.
        findex = memarena->AllocateArray<int>(m);
        J = memarena->AllocateArray<dReal>((sizeint)m * (2 * JME__MAX));
        A = memarena->AllocateOveralignedArray<dReal>((sizeint)m * dPAD(m), AMATRIX_ALIGNMENT);
        pairsRhsCfm = memarena->AllocateArray<dReal>((sizeint)m * RCE__RHS_CFM_MAX);
        pairsLoHi = memarena->AllocateArray<dReal>((sizeint)m * LHE__LO_HI_MAX);
        const unsigned int nb = callContext->m_islandBodiesCount;
        bodyStartJoints = memarena->AllocateArray<atomicord32>(nb);
        bodyJointLinks = memarena->AllocateArray<atomicord32>((sizeint)nj * dJCB__MAX);
        dICHECK(nj < ~((atomicord32)0) / dJCB__MAX); // If larger joint counts are to be used, pointers (or sizeint) need to be stored rather than atomicord32 indices
    }

    dxStepperLocalContext *localContext = (dxStepperLocalContext *)memarena->AllocateBlock(sizeof(dxStepperLocalContext));
    localContext->Initialize(invI, jointinfos, nj, m, nub, mindex, findex, J, A, pairsRhsCfm, pairsLoHi, bodyStartJoints, bodyJointLinks);

    void *stage1MemarenaState = memarena->SaveState();
    dxStepperStage3CallContext *stage3CallContext = (dxStepperStage3CallContext*)memarena->AllocateBlock(sizeof(dxStepperStage3CallContext));
    stage3CallContext->Initialize(callContext, localContext, stage1MemarenaState);

    if (m > 0) {
        dReal *JinvM = memarena->AllocateOveralignedArray<dReal>((sizeint)m * (2 * JIM__MAX), JINVM_ALIGNMENT);
        const unsigned int nb = callContext->m_islandBodiesCount;
        dReal *rhs_tmp = memarena->AllocateArray<dReal>((sizeint)nb * dDA__MAX);

        dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage2CallContext));
        stage2CallContext->Initialize(callContext, localContext, JinvM, rhs_tmp);

        const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
        dIASSERT(allowedThreads != 0);

        if (allowedThreads == 1) {
            IFTIMING(dTimerNow("create J"));
            dxStepIsland_Stage2a(stage2CallContext);
            IFTIMING(dTimerNow("compute Adiag, JinvM and rhs_tmp"));
            dxStepIsland_Stage2b(stage2CallContext);
            IFTIMING(dTimerNow("compute A and rhs"));
            dxStepIsland_Stage2c(stage2CallContext);
            dxStepIsland_Stage3(stage3CallContext);
        }
        else {
            dxWorld *world = callContext->m_world;
            dCallReleaseeID stage3CallReleasee;
            world->PostThreadedCallForUnawareReleasee(NULL, &stage3CallReleasee, 1, callContext->m_finalReleasee, 
                NULL, &dxStepIsland_Stage3_Callback, stage3CallContext, 0, "StepIsland Stage3");

            dCallReleaseeID stage2bSyncReleasee;
            world->PostThreadedCall(NULL, &stage2bSyncReleasee, 1, stage3CallReleasee, 
                NULL, &dxStepIsland_Stage2bSync_Callback, stage2CallContext, 0, "StepIsland Stage2b Sync");

            dCallReleaseeID stage2aSyncReleasee;
            world->PostThreadedCall(NULL, &stage2aSyncReleasee, allowedThreads, stage2bSyncReleasee, 
                NULL, &dxStepIsland_Stage2aSync_Callback, stage2CallContext, 0, "StepIsland Stage2a Sync");

            dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{
                world->PostThreadedCallsGroup(NULL, allowedThreads - 1, stage2aSyncReleasee, &dxStepIsland_Stage2a_Callback, stage2CallContext, "StepIsland Stage2a");
            }
            dxStepIsland_Stage2a(stage2CallContext);
            world->AlterThreadedCallDependenciesCount(stage2aSyncReleasee, -1);
        }
    }
    else {
        dxStepIsland_Stage3(stage3CallContext);
    }
}


static 
int dxStepIsland_Stage2a_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
    dxStepIsland_Stage2a(stage2CallContext);
    return 1;
}

static 
void dxStepIsland_Stage2a(dxStepperStage2CallContext *stage2CallContext)
{
    const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
    const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
    dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
    unsigned int nj = localContext->m_nj;
    const unsigned int *mindex = localContext->m_mindex;

    const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
    dxWorld *world = callContext->m_world;

    {
        int *findex = localContext->m_findex;
        dReal *J = localContext->m_J;
        dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
        dReal *pairsLoHi = localContext->m_pairsLoHi;

        // get jacobian data from constraints. a (2*m)x8 matrix will be created
        // to store the two jacobian blocks from each constraint. it has this
        // format:
        //
        //   l l l 0 a a a 0  \    .
        //   l l l 0 a a a 0   }-- jacobian body 1 block for joint 0 (3 rows)
        //   l l l 0 a a a 0  /
        //   l l l 0 a a a 0  \    .
        //   l l l 0 a a a 0   }-- jacobian body 2 block for joint 0 (3 rows)
        //   l l l 0 a a a 0  /
        //   l l l 0 a a a 0  }--- jacobian body 1 block for joint 1 (1 row)
        //   l l l 0 a a a 0  }--- jacobian body 2 block for joint 1 (1 row)
        //   etc...
        //
        //   (lll) = linear jacobian data
        //   (aaa) = angular jacobian data
        //

        const dReal worldERP = world->global_erp;
        const dReal worldCFM = world->global_cfm;

        unsigned ji;
        while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_J, nj)) != nj) {
            const unsigned ofsi = mindex[ji];
            const unsigned int infom = mindex[ji + 1] - ofsi;

            dReal *const JRow = J + (sizeint)ofsi * (2 * JME__MAX);
            dReal *rowRhsCfm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
            dReal *rowLoHi = pairsLoHi + (sizeint)ofsi * LHE__LO_HI_MAX;
            {
                dSetZero (JRow, infom * (2 * JME__MAX));

                dReal *const endRhsCfm = rowRhsCfm + infom * RCE__RHS_CFM_MAX;
                for (dReal *currRhsCfm = rowRhsCfm; currRhsCfm != endRhsCfm; currRhsCfm += RCE__RHS_CFM_MAX) {
                    currRhsCfm[RCE_RHS] = REAL(0.0);
                    currRhsCfm[RCE_CFM] = worldCFM;
                }

                dReal *const endLoHi = rowLoHi + infom * LHE__LO_HI_MAX;
                for (dReal *currLoHi = rowLoHi; currLoHi != endLoHi; currLoHi += LHE__LO_HI_MAX) {
                    currLoHi[LHE_LO] = -dInfinity;
                    currLoHi[LHE_HI] = dInfinity;
                }
            }
            int *findexRow = findex + ofsi;
            dSetValue(findexRow, infom, -1);

            dxJoint *joint = jointinfos[ji].joint;
            joint->getInfo2(stepsizeRecip, worldERP, JME__MAX, JRow + JME__J_MIN, JRow + infom * JME__MAX + JME__J_MIN, RCE__RHS_CFM_MAX, rowRhsCfm, rowLoHi, findexRow);
            dSASSERT((int)LHE__LO_HI_MAX == RCE__RHS_CFM_MAX); // To make sure same step fits for both pairs in the call to dxJoint::getInfo2() above

            // findex iteration is compact and is not going to pollute caches - do it first
            {
                // adjust returned findex values for global index numbering
                int *const findicesEnd = findexRow + infom;
                for (int *findexCurr = findexRow; findexCurr != findicesEnd; ++findexCurr) {
                    int fival = *findexCurr;
                    if (fival != -1) {
                        *findexCurr = fival + ofsi;
                    }
                }
            }
            {
                dReal *const endRhsCfm = rowRhsCfm + infom * RCE__RHS_CFM_MAX;
                for (dReal *currRhsCfm = rowRhsCfm; currRhsCfm != endRhsCfm; currRhsCfm += RCE__RHS_CFM_MAX) {
                    currRhsCfm[RCE_RHS] *= stepsizeRecip;
                    currRhsCfm[RCE_CFM] *= stepsizeRecip;
                }
            }
        }
    }
}

static 
int dxStepIsland_Stage2aSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
    const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
    const unsigned allowedThreads = callContext->m_stepperAllowedThreads;

    dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{ // The allowed thread count is greater than one as otherwise current function would not be scheduled for execution from the previous stage
        dxWorld *world = callContext->m_world;
        world->AlterThreadedCallDependenciesCount(callThisReleasee, allowedThreads - 1);
        world->PostThreadedCallsGroup(NULL, allowedThreads - 1, callThisReleasee, &dxStepIsland_Stage2b_Callback, stage2CallContext, "StepIsland Stage2b");
    }
    dxStepIsland_Stage2b(stage2CallContext);

    return 1;
}

static 
int dxStepIsland_Stage2b_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
    dxStepIsland_Stage2b(stage2CallContext);
    return 1;
}

static 
void dxStepIsland_Stage2b(dxStepperStage2CallContext *stage2CallContext)
{
    const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
    const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
    dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
    unsigned int nj = localContext->m_nj;
    const unsigned int *mindex = localContext->m_mindex;

    {
        // Warning!!!
        // This code depends on cfm elements and therefore must be in different sub-stage 
        // from Jacobian construction in Stage2a to ensure proper synchronization 
        // and avoid accessing numbers being modified.
        // Warning!!!
        dReal *A = localContext->m_A;
        const dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
        const unsigned m = localContext->m_m;

        const unsigned int mskip = dPAD(m);

        unsigned ji;
        while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_Ainit, nj)) != nj) {
            const unsigned ofsi = mindex[ji];
            const unsigned int infom = mindex[ji + 1] - ofsi;

            dReal *Arow = A + (sizeint)mskip * ofsi;
            dSetZero(Arow, (sizeint)mskip * infom);
            dReal *Adiag = Arow + ofsi;
            const dReal *rowRfsCrm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
            for (unsigned int i = 0; i != infom; Adiag += mskip, ++i) {
                Adiag[i] = (rowRfsCrm + i * RCE__RHS_CFM_MAX)[RCE_CFM];
            }
        }
    }

    {
        // Warning!!!
        // This code depends on J elements and therefore must be in different sub-stage 
        // from Jacobian construction in Stage2a to ensure proper synchronization 
        // and avoid accessing numbers being modified.
        // Warning!!!
        const dReal *invI = localContext->m_invI;
        const dReal *J = localContext->m_J;
        dReal *JinvM = stage2CallContext->m_JinvM;

        // compute A = J*invM*J'. first compute JinvM = J*invM. this has the same
        // format as J so we just go through the constraints in J multiplying by
        // the appropriate scalars and matrices.
        unsigned ji;
        while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_JinvM, nj)) != nj) {
            const unsigned ofsi = mindex[ji];
            const unsigned int infom = mindex[ji + 1] - ofsi;

            dReal *Jdst = JinvM + (sizeint)ofsi * (2 * JIM__MAX);
            dSetZero(Jdst, infom * (2 * JIM__MAX));

            const dReal *Jsrc = J + (sizeint)ofsi * (2 * JME__MAX);
            dxJoint *joint = jointinfos[ji].joint;

            dxBody *jb0 = joint->node[0].body;
            if (true || jb0 != NULL) { // -- always true
                dReal body_invMass0 = jb0->invMass;
                const dReal *body_invI0 = invI + (sizeint)(unsigned int)jb0->tag * dM3E__MAX;
                for (unsigned int j = infom; j != 0; --j) {
                    for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) Jdst[JIM__L_AXES_MIN + k] = Jsrc[JME__JL_MIN + k] * body_invMass0;
                    dMultiply0_133(Jdst + JIM__A_AXES_MIN, Jsrc + JME__JA_MIN, body_invI0);
                    Jsrc += JME__MAX;
                    Jdst += JIM__MAX;
                }
            }

            dxBody *jb1 = joint->node[1].body;
            if (jb1 != NULL) {
                dReal body_invMass1 = jb1->invMass;
                const dReal *body_invI1 = invI + (sizeint)(unsigned int)jb1->tag * dM3E__MAX;
                for (unsigned int j = infom; j != 0; --j) {
                    for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) Jdst[JIM__L_AXES_MIN + k] = Jsrc[JME__JL_MIN + k] * body_invMass1;
                    dMultiply0_133 (Jdst + JIM__A_AXES_MIN, Jsrc + JME__JA_MIN, body_invI1);
                    Jsrc += JME__MAX;
                    Jdst += JIM__MAX;
                }
            }
        }
    }

    {
        // Warning!!!
        // This code reads facc/tacc fields of body objects which (the fields)
        // may be modified by dxJoint::getInfo2(). Therefore the code must be
        // in different sub-stage from Jacobian construction in Stage2a 
        // to ensure proper synchronization and avoid accessing numbers being modified.
        // Warning!!!
        dxBody * const *const body = callContext->m_islandBodiesStart;
        const unsigned int nb = callContext->m_islandBodiesCount;
        const dReal *invI = localContext->m_invI;
        atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
        dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;

        // compute the right hand side `rhs'
        const dReal stepsizeRecip = dRecip(callContext->m_stepSize);

        // put v/h + invM*fe into rhs_tmp
        unsigned bi;
        while ((bi = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_bi_rhs_tmp, nb)) != nb) {
            dReal *tmp1curr = rhs_tmp + (sizeint)bi * dDA__MAX;
            const dReal *invIrow = invI + (sizeint)bi * dM3E__MAX;
            dxBody *b = body[bi];
            // dSetZero(tmp1curr, 8); -- not needed
            for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) tmp1curr[dDA__L_MIN + j] = b->facc[dV3E__AXES_MIN + j] * b->invMass + b->lvel[dV3E__AXES_MIN + j] * stepsizeRecip;
            dMultiply0_331 (tmp1curr + dDA__A_MIN, invIrow, b->tacc);
            for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) tmp1curr[dDA__A_MIN + k] += b->avel[dV3E__AXES_MIN + k] * stepsizeRecip;
            // Initialize body start joint indices -- this will be needed later for building body related joint list in dxStepIsland_Stage2c
            bodyStartJoints[bi] = 0;
        }
    }
}

static 
int dxStepIsland_Stage2bSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
    const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
    const unsigned allowedThreads = callContext->m_stepperAllowedThreads;

    dIASSERT(allowedThreads > 1); /*if (allowedThreads > 1) */{ // The allowed thread count is greater than one as otherwise current function would not be scheduled for execution from the previous stage
        dxWorld *world = callContext->m_world;
        world->AlterThreadedCallDependenciesCount(callThisReleasee, allowedThreads - 1);
        world->PostThreadedCallsGroup(NULL, allowedThreads - 1, callThisReleasee, &dxStepIsland_Stage2c_Callback, stage2CallContext, "StepIsland Stage2c");
    }
    dxStepIsland_Stage2c(stage2CallContext);

    return 1;
}


static 
int dxStepIsland_Stage2c_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage2CallContext *stage2CallContext = (dxStepperStage2CallContext *)_stage2CallContext;
    dxStepIsland_Stage2c(stage2CallContext);
    return 1;
}

static 
void dxStepIsland_Stage2c(dxStepperStage2CallContext *stage2CallContext)
{
    //const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
    const dxStepperLocalContext *localContext = stage2CallContext->m_localContext;
    dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
    unsigned int nj = localContext->m_nj;
    const unsigned int *mindex = localContext->m_mindex;

    {
        // Warning!!!
        // This code depends on A elements and JinvM elements and therefore 
        // must be in different sub-stage from A initialization and JinvM calculation in Stage2b 
        // to ensure proper synchronization and avoid accessing numbers being modified.
        // Warning!!!
        dReal *A = localContext->m_A;
        const dReal *JinvM = stage2CallContext->m_JinvM;
        const dReal *J = localContext->m_J;
        const unsigned m = localContext->m_m;

        // now compute A = JinvM * J'. A's rows and columns are grouped by joint,
        // i.e. in the same way as the rows of J. block (i,j) of A is only nonzero
        // if joints i and j have at least one body in common. 
        const unsigned int mskip = dPAD(m);

        unsigned ji;
        while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_Aaddjb, nj)) != nj) {
            const unsigned ofsi = mindex[ji];
            const unsigned int infom = mindex[ji + 1] - ofsi;

            dReal *Arow = A + (sizeint)mskip * ofsi;
            const dReal *JinvMRow = JinvM + (sizeint)ofsi * (2 * JIM__MAX);
            dxJoint *joint = jointinfos[ji].joint;

            dxBody *jb0 = joint->node[0].body;
            if (true || jb0 != NULL) { // -- always true
                // compute diagonal block of A
                const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
                MultiplyAddJinvMxJToA (Arow + ofsi, JinvMRow, JRow, infom, infom, mskip);

                for (dxJointNode *n0 = (ji != 0 ? jb0->firstjoint : NULL); n0; n0 = n0->next) {
                    // if joint was tagged as -1 then it is an inactive (m=0 or disabled)
                    // joint that should not be considered
                    int j0 = n0->joint->tag;
                    if (j0 != -1 && (unsigned)j0 < ji) {
                        const unsigned int jiother_ofsi = mindex[j0];
                        const unsigned int jiother_infom = mindex[j0 + 1] - jiother_ofsi;
                        const dJointWithInfo1 *jiother = jointinfos + j0;
                        unsigned int smart_infom = (jiother->joint->node[1].body == jb0) ? jiother_infom : 0;
                        // set block of A
                        const dReal *JOther = J + ((sizeint)jiother_ofsi * 2 + smart_infom) * JME__MAX;
                        MultiplyAddJinvMxJToA (Arow + jiother_ofsi, JinvMRow, JOther, infom, jiother_infom, mskip);
                    }
                }
            }

            dxBody *jb1 = joint->node[1].body;
            dIASSERT(jb1 != jb0);
            if (jb1 != NULL) {
                const dReal *JinvMOther = JinvMRow + infom * JIM__MAX;
                // compute diagonal block of A
                const dReal *JRow = J + ((sizeint)ofsi * 2 + infom) * JME__MAX;
                MultiplyAddJinvMxJToA (Arow + ofsi, JinvMOther, JRow, infom, infom, mskip);

                for (dxJointNode *n1 = (ji != 0 ? jb1->firstjoint : NULL); n1; n1 = n1->next) {
                    // if joint was tagged as -1 then it is an inactive (m=0 or disabled)
                    // joint that should not be considered
                    int j1 = n1->joint->tag;
                    if (j1 != -1 && (unsigned)j1 < ji) {
                        const unsigned int jiother_ofsi = mindex[j1];
                        const unsigned int jiother_infom = mindex[j1 + 1] - jiother_ofsi;
                        const dJointWithInfo1 *jiother = jointinfos + j1;
                        unsigned int smart_infom = (jiother->joint->node[1].body == jb1) ? jiother_infom : 0;
                        // set block of A
                        const dReal *JOther = J + ((sizeint)jiother_ofsi * 2 + smart_infom) * JME__MAX;
                        MultiplyAddJinvMxJToA (Arow + jiother_ofsi, JinvMOther, JOther, infom, jiother_infom, mskip);
                    }
                }
            }
        }
    }

    {
        // Warning!!!
        // This code depends on rhs_tmp elements and therefore must be in 
        // different sub-stage from rhs_tmp calculation in Stage2b to ensure 
        // proper synchronization and avoid accessing numbers being modified.
        // Warning!!!
        const dReal *J = localContext->m_J;
        const dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;
        dReal *pairsRhsCfm = localContext->m_pairsRhsCfm;
        atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
        atomicord32 *bodyJointLinks = localContext->m_bodyJointLinks;

        // compute the right hand side `rhs'
        // put J*rhs_tmp into rhs
        unsigned ji;
        while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_rhs, nj)) != nj) {
            const unsigned ofsi = mindex[ji];
            const unsigned int infom = mindex[ji + 1] - ofsi;

            dReal *currRhsCfm = pairsRhsCfm + (sizeint)ofsi * RCE__RHS_CFM_MAX;
            const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
            
            dxJoint *joint = jointinfos[ji].joint;

            dxBody *jb0 = joint->node[0].body;
            if (true || jb0 != NULL) { // -- always true
                unsigned bodyIndex = (unsigned)jb0->tag;
                MultiplySubJxRhsTmpFromRHS (currRhsCfm, JRow, rhs_tmp + (sizeint)bodyIndex * dDA__MAX, infom);

                // Link joints connected to each body into a list to be used on results incorporation. The bodyStartJoints have been initialized in dxStepIsland_Stage2b.
                const atomicord32 linkIndex = (atomicord32)((sizeint)ji * dJCB__MAX + dJCB_FIRST_BODY); // It is asserted at links buffer allocation that the indices can't overflow atomicord32
                for (atomicord32 oldStartIndex = bodyStartJoints[bodyIndex]; ; oldStartIndex = bodyStartJoints[bodyIndex]) {
                    bodyJointLinks[linkIndex] = oldStartIndex;
                    if (ThrsafeCompareExchange(&bodyStartJoints[bodyIndex], oldStartIndex, linkIndex + 1)) { // The link index is stored incremented to allow 0 as end indicator
                        break;
                    }
                }
            }

            dxBody *jb1 = joint->node[1].body;
            if (jb1 != NULL) {
                unsigned bodyIndex = (unsigned)jb1->tag;
                MultiplySubJxRhsTmpFromRHS (currRhsCfm, JRow + infom * JME__MAX, rhs_tmp + (sizeint)bodyIndex * dDA__MAX, infom);

                // Link joints connected to each body into a list to be used on results incorporation. The bodyStartJoints have been initialized in dxStepIsland_Stage2b
                const atomicord32 linkIndex = (atomicord32)((sizeint)ji * dJCB__MAX + dJCB_SECOND_BODY); // It is asserted at links buffer allocation that the indices can't overflow atomicord32
                for (atomicord32 oldStartIndex = bodyStartJoints[bodyIndex]; ; oldStartIndex = bodyStartJoints[bodyIndex]) {
                    bodyJointLinks[linkIndex] = oldStartIndex;
                    if (ThrsafeCompareExchange(&bodyStartJoints[bodyIndex], oldStartIndex, linkIndex + 1)) { // The link index is stored incremented to allow 0 as end indicator
                        break;
                    }
                }
            }
        }
    }
}


static 
int dxStepIsland_Stage3_Callback(void *_stage3CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage3CallContext *stage3CallContext = (dxStepperStage3CallContext *)_stage3CallContext;
    dxStepIsland_Stage3(stage3CallContext);
    return 1;
}

static 
void dxStepIsland_Stage3(dxStepperStage3CallContext *stage3CallContext)
{
    const dxStepperProcessingCallContext *callContext = stage3CallContext->m_stepperCallContext;
    const dxStepperLocalContext *localContext = stage3CallContext->m_localContext;

    dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
    memarena->RestoreState(stage3CallContext->m_stage1MemArenaState);
    stage3CallContext = NULL; // WARNING! stage3CallContext is not valid after this point!
    dIVERIFY(stage3CallContext == NULL); // To suppress unused variable assignment warnings

    unsigned int m = localContext->m_m;
    unsigned int nub = localContext->m_nub;
    //const unsigned int *mindex = localContext->m_mindex;
    int *findex = localContext->m_findex;
    dReal *A = localContext->m_A;
    dReal *pairsRhsLambda = localContext->m_pairsRhsCfm; // Reuse cfm buffer for lambdas as the former values are not needed any more
    dReal *pairsLoHi = localContext->m_pairsLoHi;

    if (m > 0) {
        BEGIN_STATE_SAVE(memarena, lcpstate) {
            IFTIMING(dTimerNow ("solve LCP problem"));

            // solve the LCP problem and get lambda.
            // this will destroy A but that's OK
            dxSolveLCP (memarena, m, A, pairsRhsLambda, NULL, nub, pairsLoHi, findex);
            dSASSERT((int)RLE__RHS_LAMBDA_MAX == PBX__MAX && (int)RLE_RHS == PBX_B && (int)RLE_LAMBDA == PBX_X);
            dSASSERT((int)LHE__LO_HI_MAX == PLH__MAX && (int)LHE_LO == PLH_LO && (int)LHE_HI == PLH_HI);

        } END_STATE_SAVE(memarena, lcpstate);
    }

    // void *stage3MemarenaState = memarena->SaveState();

    dxStepperStage4CallContext *stage4CallContext = (dxStepperStage4CallContext *)memarena->AllocateBlock(sizeof(dxStepperStage4CallContext));
    stage4CallContext->Initialize(callContext, localContext/*, stage3MemarenaState*/);

    const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
    dIASSERT(allowedThreads != 0);

    if (allowedThreads == 1) {
        IFTIMING(dTimerNow ("compute and apply constraint force"));
        dxStepIsland_Stage4(stage4CallContext);
        IFTIMING(dTimerEnd());

        if (m > 0) {
            IFTIMING(dTimerReport(stdout,1));
        }
    }
    else {
        dCallReleaseeID finalReleasee = callContext->m_finalReleasee;
        dxWorld *world = callContext->m_world;
        world->AlterThreadedCallDependenciesCount(finalReleasee, allowedThreads - 1);
        world->PostThreadedCallsGroup(NULL, allowedThreads - 1, finalReleasee, &dxStepIsland_Stage4_Callback, stage4CallContext, "StepIsland Stage4");
        // Note: Adding another dependency for the finalReleasee is not necessary as it already depends on the current call
        dxStepIsland_Stage4(stage4CallContext);
    }
}

static 
int dxStepIsland_Stage4_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
{
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxStepperStage4CallContext *stage4CallContext = (dxStepperStage4CallContext *)_stage4CallContext;
    dxStepIsland_Stage4(stage4CallContext);
    return 1;
}

static 
void dxStepIsland_Stage4(dxStepperStage4CallContext *stage4CallContext)
{
    const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
    const dxStepperLocalContext *localContext = stage4CallContext->m_localContext;

    const dReal stepSize = callContext->m_stepSize;
    dxBody *const *bodies = callContext->m_islandBodiesStart;
    dReal *invI = localContext->m_invI;
    dJointWithInfo1 *jointInfos = localContext->m_jointinfos;
    dReal *J = localContext->m_J;
    dReal *pairsRhsLambda = localContext->m_pairsRhsCfm;
    const unsigned int *mIndex = localContext->m_mindex;
    atomicord32 *bodyStartJoints = localContext->m_bodyStartJoints;
    atomicord32 *bodyJointLinks = localContext->m_bodyJointLinks;
    const unsigned int nb = callContext->m_islandBodiesCount;

    unsigned bi;
    while ((bi = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_bi_constrForce, nb)) != nb) {
        dVector3 angularForceAccumulator;
        dxBody *b = bodies[bi];
        const dReal *invIrow = invI + (sizeint)bi * dM3E__MAX;
        dReal body_invMass_mul_stepSize = stepSize * b->invMass;

        dReal bodyConstrForce[CFE__MAX];
        bool constrForceAvailable = false;
        
        unsigned linkIndex = bodyStartJoints != NULL ? bodyStartJoints[bi] : 0;
        if (linkIndex != 0) {
            dSetZero(bodyConstrForce, dARRAY_SIZE(bodyConstrForce));
        }

        // compute the constraint force as constrForce = J'*lambda
        for (; linkIndex != 0; constrForceAvailable = true, linkIndex = bodyJointLinks[linkIndex - 1]) {
            unsigned jointIndex = (linkIndex - 1) / dJCB__MAX;
            unsigned jointBodyIndex = (linkIndex - 1) % dJCB__MAX;

            const dJointWithInfo1 *currJointInfo = jointInfos + jointIndex;
            unsigned ofsi = mIndex[jointIndex];
            dIASSERT(dIN_RANGE(jointIndex, 0, localContext->m_nj));

            const dReal *JRow = J + (sizeint)ofsi * (2 * JME__MAX);
            const dReal *rowRhsLambda = pairsRhsLambda + (sizeint)ofsi * RLE__RHS_LAMBDA_MAX;

            dxJoint *joint = currJointInfo->joint;
            const unsigned int infom = currJointInfo->info.m;

            // unsigned jRowExtraOffset = jointBodyIndex * infom * JME__MAX;
            unsigned jRowExtraOffset = jointBodyIndex != dJCB__MIN ? infom * JME__MAX : 0;
            dSASSERT(dJCB__MAX == 2);

            dJointFeedback *fb = joint->feedback;
            MultiplyAddJxLambdaToCForce(bodyConstrForce, JRow + jRowExtraOffset, rowRhsLambda, infom, fb, jointBodyIndex);
        }

        // compute the velocity update
        if (constrForceAvailable) {
            // add fe to cforce and multiply cforce by stepSize
            for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) {
                b->lvel[dV3E__AXES_MIN + j] += (bodyConstrForce[CFE__L_MIN + j] + b->facc[dV3E__AXES_MIN + j]) * body_invMass_mul_stepSize;
            }
            for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) {
                angularForceAccumulator[dV3E__AXES_MIN + k] = (bodyConstrForce[CFE__A_MIN + k] + b->tacc[dV3E__AXES_MIN + k]) * stepSize;
            }
        }
        else {
            // add fe to cforce and multiply cforce by stepSize
            dAddVectorScaledVector3(b->lvel, b->lvel, b->facc, body_invMass_mul_stepSize);
            dCopyScaledVector3(angularForceAccumulator, b->tacc, stepSize);
        }

        dMultiplyAdd0_331 (b->avel, invIrow, angularForceAccumulator + dV3E__AXES_MIN);

        // update the position and orientation from the new linear/angular velocity
        // (over the given time step)
        dxStepBody (b, stepSize);

        // zero all force accumulators
        dZeroVector3(b->facc);
        dZeroVector3(b->tacc);
    }
}


//****************************************************************************

/*extern */
sizeint dxEstimateStepMemoryRequirements (dxBody * const *body, unsigned int nb, dxJoint * const *_joint, unsigned int _nj)
{
    (void)body; // unused
    unsigned int nj, m;

    {
        unsigned int njcurr = 0, mcurr = 0;
        dxJoint::SureMaxInfo info;
        dxJoint *const *const _jend = _joint + _nj;
        for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; ++_jcurr) {	
            dxJoint *j = *_jcurr;
            j->getSureMaxInfo (&info);

            unsigned int jm = info.max_m;
            if (jm > 0) {
                njcurr++;

                mcurr += jm;
            }
        }
        nj = njcurr; m = mcurr;
    }

    sizeint res = 0;

    res += dOVERALIGNED_SIZE(sizeof(dReal) * dM3E__MAX * nb, INVI_ALIGNMENT); // for invI

    {
        sizeint sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * 2 * _nj); // for initial jointinfos

        // The array can't grow right more than by nj
        sizeint sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * ((sizeint)_nj + (sizeint)nj)); // for shrunk jointinfos
        sub1_res2 += dEFFICIENT_SIZE(sizeof(dxStepperLocalContext)); //for dxStepperLocalContext
        if (m > 0) {
            sub1_res2 += dEFFICIENT_SIZE(sizeof(unsigned int) * (nj + 1)); // for mindex
            sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex
            sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 2 * JME__MAX * m); // for J
            unsigned int mskip = dPAD(m);
            sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * mskip * m, AMATRIX_ALIGNMENT); // for A
            sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * RCE__RHS_CFM_MAX * m); // for pairsRhsCfm
            sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * LHE__LO_HI_MAX * m); // for pairsLoHi
            sub1_res2 += dEFFICIENT_SIZE(sizeof(atomicord32) * nb); // for bodyStartJoints
            sub1_res2 += dEFFICIENT_SIZE(sizeof(atomicord32)* dJCB__MAX * nj); // for bodyJointLinks
        }

        {
            sizeint sub2_res1 = dEFFICIENT_SIZE(sizeof(dxStepperStage3CallContext)); // for dxStepperStage3CallContext

            sizeint sub2_res2 = 0;

            sizeint sub2_res3 = dEFFICIENT_SIZE(sizeof(dxStepperStage4CallContext)); // for dxStepperStage4CallContext

            if (m > 0) {
                sub2_res1 += dOVERALIGNED_SIZE(sizeof(dReal) * 2 * JIM__MAX * m, JINVM_ALIGNMENT); // for JinvM
                sub2_res1 += dEFFICIENT_SIZE(sizeof(dReal) * dDA__MAX * nb); // for rhs_tmp
                sub2_res1 += dEFFICIENT_SIZE(sizeof(dxStepperStage2CallContext)); // for dxStepperStage2CallContext

                sub2_res2 += dxEstimateSolveLCPMemoryReq(m, false);
            }

            sub1_res2 += dMAX(sub2_res1, dMAX(sub2_res2, sub2_res3));
        }

        sizeint sub1_res12_max = dMAX(sub1_res1, sub1_res2);
        sizeint stage01_contexts = dEFFICIENT_SIZE(sizeof(dxStepperStage0BodiesCallContext))
            + dEFFICIENT_SIZE(sizeof(dxStepperStage0JointsCallContext))
            + dEFFICIENT_SIZE(sizeof(dxStepperStage1CallContext));
        res += dMAX(sub1_res12_max, stage01_contexts);
    }

    return res;
}


/*extern */
unsigned dxEstimateStepMaxCallCount(
    unsigned /*activeThreadCount*/, unsigned allowedThreadCount)
{
    unsigned result = 1 // dxStepIsland itself
        + (2 * allowedThreadCount + 2) // (dxStepIsland_Stage2a + dxStepIsland_Stage2b) * allowedThreadCount + 2 * dxStepIsland_Stage2?_Sync
        + 1; // dxStepIsland_Stage3
    return result;
}