Diffstat (limited to 'libs/ode-0.16.1/ode/src/quickstep.cpp')
-rw-r--r-- | libs/ode-0.16.1/ode/src/quickstep.cpp | 3344 |
1 files changed, 3344 insertions, 0 deletions
diff --git a/libs/ode-0.16.1/ode/src/quickstep.cpp b/libs/ode-0.16.1/ode/src/quickstep.cpp
new file mode 100644
index 0000000..046bc33
--- /dev/null
+++ b/libs/ode-0.16.1/ode/src/quickstep.cpp
@@ -0,0 +1,3344 @@
+/*************************************************************************
+ * *
+ * Open Dynamics Engine, Copyright (C) 2001-2003 Russell L. Smith. *
+ * All rights reserved. Email: russ@q12.org Web: www.q12.org *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of EITHER: *
+ * (1) The GNU Lesser General Public License as published by the Free *
+ * Software Foundation; either version 2.1 of the License, or (at *
+ * your option) any later version. The text of the GNU Lesser *
+ * General Public License is included with this library in the *
+ * file LICENSE.TXT. *
+ * (2) The BSD-style license that is included with this library in *
+ * the file LICENSE-BSD.TXT. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
+ * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
+ * *
+ *************************************************************************/
+
+#include <ode/common.h>
+#include <ode/rotation.h>
+#include <ode/timer.h>
+#include <ode/error.h>
+#include <ode/misc.h>
+#include "config.h"
+#include "matrix.h"
+#include "odemath.h"
+#include "objects.h"
+#include "joints/joint.h"
+#include "lcp.h"
+#include "util.h"
+#include "threadingutils.h"
+
+#include <new>
+
+
+//***************************************************************************
+// configuration
+
+// for the SOR and CG methods:
+// uncomment the following line to use warm starting. this definitely
+// helps for motor-driven joints. unfortunately it appears to hurt
+// with high-friction contacts using the SOR method. use with care
+
+// #define WARM_STARTING 1
+
+
+#define REORDERING_METHOD__DONT_REORDER 0
+#define REORDERING_METHOD__BY_ERROR 1
+#define REORDERING_METHOD__RANDOMLY 2
+
+// for the SOR method:
+// uncomment the following line to determine a new constraint-solving
+// order for each iteration. however, the qsort per iteration is expensive,
+// and the optimal order is somewhat problem dependent.
+// @@@ try the leaf->root ordering.
+
+// #define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__BY_ERROR
+
+
+// for the SOR method:
+// uncomment the following line to randomly reorder constraint rows
+// during the solution. depending on the situation, this can help a lot
+// or hardly at all, but it doesn't seem to hurt.
+
+#define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__RANDOMLY
+
+
+#if !defined(CONSTRAINTS_REORDERING_METHOD)
+#define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__DONT_REORDER
+#endif
+
+
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY
+#if !defined(RANDOM_CONSTRAINTS_REORDERING_FREQUENCY)
+#define RANDOM_CONSTRAINTS_REORDERING_FREQUENCY 8U
+#endif
+dSASSERT(RANDOM_CONSTRAINTS_REORDERING_FREQUENCY != 0);
+#endif
+
+enum dxRandomReorderStage
+{
+ RRS__MIN,
+
+ RRS_REORDERING = RRS__MIN,
+
+ RRS__MAX,
+};
+
+
+//***************************************************************************
+// macros, typedefs, forwards and inlines
+
+struct IndexError;
+
+
+#define dMIN(A,B) ((A)>(B) ? (B) : (A))
+#define dMAX(A,B) ((B)>(A) ? (B) : (A))
+
+
+#define dxQUICKSTEPISLAND_STAGE2B_STEP 16U
+#define dxQUICKSTEPISLAND_STAGE2C_STEP 32U
+
+#ifdef WARM_STARTING
+#define dxQUICKSTEPISLAND_STAGE4A_STEP 256U
+#else
+#define dxQUICKSTEPISLAND_STAGE4A_STEP 512U
+#endif
+
+#define dxQUICKSTEPISLAND_STAGE4LCP_IMJ_STEP 8U
+#define dxQUICKSTEPISLAND_STAGE4LCP_AD_STEP 8U
+
+#ifdef WARM_STARTING
+#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP 128U
+#define dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR 4
+#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_PREPARE (dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP * dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR)
+#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_COMPLETE (dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP)
+#else
+#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP (dxQUICKSTEPISLAND_STAGE4A_STEP / 2) // Average info.m is 3 for stage4a, while there are 6 reals per index in fc
+#endif
+
+#define dxQUICKSTEPISLAND_STAGE4B_STEP 256U
+
+#define dxQUICKSTEPISLAND_STAGE6A_STEP 16U
+#define dxQUICKSTEPISLAND_STAGE6B_STEP 1U
+
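+// Determine how many worker threads to use for a task of the given complexity:
+// allot one thread per step_size work items (rounding down) and cap the result
+// at max_threads. E.g. with step_size = 16, complexity = 100 and max_threads = 8
+// this yields 100 / 16 = 6 threads.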
+template<unsigned int step_size>
+inline unsigned int CalculateOptimalThreadsCount(unsigned int complexity, unsigned int max_threads)
+{
+ unsigned int raw_threads = dMAX(complexity, step_size) / step_size; // Round down on division
+ unsigned int optimum = dMIN(raw_threads, max_threads);
+ return optimum;
+}
+
+#define dxENCODE_INDEX(index) ((unsigned int)((index) + 1))
+#define dxDECODE_INDEX(code) ((unsigned int)((code) - 1))
+#define dxHEAD_INDEX 0
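+// The one-based encoding makes dxENCODE_INDEX(-1) == 0 == dxHEAD_INDEX, so a zero
+// code can serve as the head/terminator marker in the linked index lists
+// (bi_links/mi_links and the LCP dependency lists) used by the solver stages below.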
+
+//****************************************************************************
+// special matrix multipliers
+
+// multiply a block of the B matrix (q rows x 6 used columns, 12 dReal per row) by the C vector (length q)
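+// i.e. A[j] = sum over i of B[i*12 + j] * C[i] for j = 0..5; only the first six
+// of the twelve reals in each B row take part in the product.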
+static inline void Multiply1_12q1 (dReal *A, const dReal *B, const dReal *C, unsigned int q)
+{
+ dIASSERT (q>0 && A && B && C);
+
+ dReal a = 0;
+ dReal b = 0;
+ dReal c = 0;
+ dReal d = 0;
+ dReal e = 0;
+ dReal f = 0;
+ dReal s;
+
+ for(unsigned int i=0, k = 0; i<q; k += 12, i++)
+ {
+        s = C[i]; // C[i] is read into a temporary, so a possible overlap between C and the B[k+...] reads below is not an issue.
+
+ //For the rest of the loop, the only memory dependency (array) is from B[]
+ a += B[ k] * s;
+ b += B[1+k] * s;
+ c += B[2+k] * s;
+ d += B[3+k] * s;
+ e += B[4+k] * s;
+ f += B[5+k] * s;
+ }
+
+ A[0] = a;
+ A[1] = b;
+ A[2] = c;
+ A[3] = d;
+ A[4] = e;
+ A[5] = f;
+}
+
+//***************************************************************************
+// testing stuff
+
+#ifdef TIMING
+#define IFTIMING(x) x
+#else
+#define IFTIMING(x) ((void)0)
+#endif
+
+
+struct dJointWithInfo1
+{
+ dxJoint *joint;
+ dxJoint::Info1 info;
+};
+
+
+struct dxMIndexItem
+{
+ unsigned mIndex;
+ unsigned fbIndex;
+};
+
+struct dxJBodiesItem
+{
+ unsigned first;
+    int second; // The second body index is optional and can be -1
+};
+
+enum dxInvIRowElement
+{
+ IIE__MIN,
+
+ IIE__MATRIX_MIN = IIE__MIN,
+ IIE__MATRIX_MAX = IIE__MATRIX_MIN + dM3E__MAX,
+
+ IIE__MAX = IIE__MATRIX_MAX,
+};
+
+enum dxRHSCFMElement
+{
+ RCE_RHS = dxJoint::GI2_RHS,
+ RCE_CFM = dxJoint::GI2_CFM,
+
+ RCE__RHS_CFM_MAX = dxJoint::GI2__RHS_CFM_MAX,
+};
+
+enum dxLoHiElement
+{
+ LHE_LO = dxJoint::GI2_LO,
+ LHE_HI = dxJoint::GI2_HI,
+
+ LHE__LO_HI_MAX = dxJoint::GI2__LO_HI_MAX,
+};
+
+enum dxJacobiVectorElement
+{
+ JVE__MIN,
+
+ JVE__L_MIN = JVE__MIN + dDA__L_MIN,
+
+ JVE_LX = JVE__MIN + dDA_LX,
+ JVE_LY = JVE__MIN + dDA_LY,
+ JVE_LZ = JVE__MIN + dDA_LZ,
+
+ JVE__L_MAX = JVE__MIN + dDA__L_MAX,
+
+ JVE__A_MIN = JVE__MIN + dDA__A_MIN,
+
+ JVE_AX = JVE__MIN + dDA_AX,
+ JVE_AY = JVE__MIN + dDA_AY,
+ JVE_AZ = JVE__MIN + dDA_AZ,
+
+ JVE__A_MAX = JVE__MIN + dDA__A_MAX,
+
+ JVE__MAX = JVE__MIN + dDA__MAX,
+
+ JVE__L_COUNT = JVE__L_MAX - JVE__L_MIN,
+ JVE__A_COUNT = JVE__A_MAX - JVE__A_MIN,
+};
+
+enum dxJacobiMatrixElement
+{
+ JME__MIN,
+
+ JME__J1_MIN = JME__MIN,
+ JME__J1L_MIN = JME__J1_MIN + JVE__L_MIN,
+
+ JME_J1LX = JME__J1_MIN + JVE_LX,
+ JME_J1LY = JME__J1_MIN + JVE_LY,
+ JME_J1LZ = JME__J1_MIN + JVE_LZ,
+
+ JME__J1L_MAX = JME__J1_MIN + JVE__L_MAX,
+
+ JME__J1A_MIN = JME__J1_MIN + JVE__A_MIN,
+
+ JME_J1AX = JME__J1_MIN + JVE_AX,
+ JME_J1AY = JME__J1_MIN + JVE_AY,
+ JME_J1AZ = JME__J1_MIN + JVE_AZ,
+
+ JME__J1A_MAX = JME__J1_MIN + JVE__A_MAX,
+ JME__J1_MAX = JME__J1_MIN + JVE__MAX,
+
+ JME__RHS_CFM_MIN = JME__J1_MAX,
+ JME_RHS = JME__RHS_CFM_MIN + RCE_RHS,
+ JME_CFM = JME__RHS_CFM_MIN + RCE_CFM,
+ JME__RHS_CFM_MAX = JME__RHS_CFM_MIN + RCE__RHS_CFM_MAX,
+
+ JME__J2_MIN = JME__RHS_CFM_MAX,
+ JME__J2L_MIN = JME__J2_MIN + JVE__L_MIN,
+
+ JME_J2LX = JME__J2_MIN + JVE_LX,
+ JME_J2LY = JME__J2_MIN + JVE_LY,
+ JME_J2LZ = JME__J2_MIN + JVE_LZ,
+
+ JME__J2L_MAX = JME__J2_MIN + JVE__L_MAX,
+
+ JME__J2A_MIN = JME__J2_MIN + JVE__A_MIN,
+
+ JME_J2AX = JME__J2_MIN + JVE_AX,
+ JME_J2AY = JME__J2_MIN + JVE_AY,
+ JME_J2AZ = JME__J2_MIN + JVE_AZ,
+
+ JME__J2A_MAX = JME__J2_MIN + JVE__A_MAX,
+ JME__J2_MAX = JME__J2_MIN + JVE__MAX,
+
+ JME__LO_HI_MIN = JME__J2_MAX,
+ JME_LO = JME__LO_HI_MIN + LHE_LO,
+ JME_HI = JME__LO_HI_MIN + LHE_HI,
+ JME__LO_HI_MAX = JME__LO_HI_MIN + LHE__LO_HI_MAX,
+
+    JME__MAX = JME__LO_HI_MAX, // Isn't it lucky to end up with exactly 16 elements here? ;-)
+
+ JME__J1_COUNT = JME__J1_MAX - JME__J1_MIN,
+ JME__J2_COUNT = JME__J2_MAX - JME__J2_MIN,
+ JME__J_COUNT = JVE__MAX,
+};
+
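+// Together these offsets define the 16-real Jacobian row layout
+// (l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi) that Stage2a fills in.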
+dSASSERT(JME__J_COUNT == JME__J1_COUNT);
+dSASSERT(JME__J_COUNT == JME__J2_COUNT);
+
+enum dxJacobiCopyElement
+{
+ JCE__MIN,
+
+ JCE__J1_MIN = JCE__MIN,
+ JCE__J1L_MIN = JCE__J1_MIN,
+
+ JCE_J1LX = JCE__J1L_MIN,
+ JCE_J1LY,
+ JCE_J1LZ,
+
+ JCE__J1L_MAX,
+
+ JCE__J1A_MIN = JCE__J1L_MAX,
+
+ JCE_J1AX = JCE__J1A_MIN,
+ JCE_J1AY,
+ JCE_J1AZ,
+
+ JCE__J1A_MAX,
+ JCE__J1_MAX = JCE__J1A_MAX,
+
+ JCE__J2_MIN = JCE__J1_MAX,
+ JCE__J2L_MIN = JCE__J2_MIN,
+
+ JCE_J2LX = JCE__J2L_MIN,
+ JCE_J2LY,
+ JCE_J2LZ,
+
+ JCE__J2L_MAX,
+
+ JCE__J2A_MIN = JCE__J2L_MAX,
+
+ JCE_J2AX = JCE__J2A_MIN,
+ JCE_J2AY,
+ JCE_J2AZ,
+
+ JCE__J2A_MAX,
+ JCE__J2_MAX = JCE__J2A_MAX,
+
+ JCE__MAX = JCE__J2_MAX,
+
+ JCE__J1_COUNT = JCE__J1_MAX - JCE__J1_MIN,
+ JCE__J2_COUNT = JCE__J2_MAX - JCE__J2_MIN,
+ JCE__JMAX_COUNT = dMAX(JCE__J1_COUNT, JCE__J2_COUNT),
+};
+
+enum dxInvMJTElement
+{
+ IMJ__MIN,
+
+ IMJ__1_MIN = IMJ__MIN,
+
+ IMJ__1L_MIN = IMJ__1_MIN + JVE__L_MIN,
+
+ IMJ_1LX = IMJ__1_MIN + JVE_LX,
+ IMJ_1LY = IMJ__1_MIN + JVE_LY,
+ IMJ_1LZ = IMJ__1_MIN + JVE_LZ,
+
+ IMJ__1L_MAX = IMJ__1_MIN + JVE__L_MAX,
+
+ IMJ__1A_MIN = IMJ__1_MIN + JVE__A_MIN,
+
+ IMJ_1AX = IMJ__1_MIN + JVE_AX,
+ IMJ_1AY = IMJ__1_MIN + JVE_AY,
+ IMJ_1AZ = IMJ__1_MIN + JVE_AZ,
+
+ IMJ__1A_MAX = IMJ__1_MIN + JVE__A_MAX,
+
+ IMJ__1_MAX = IMJ__1_MIN + JVE__MAX,
+
+ IMJ__2_MIN = IMJ__1_MAX,
+
+ IMJ__2L_MIN = IMJ__2_MIN + JVE__L_MIN,
+
+ IMJ_2LX = IMJ__2_MIN + JVE_LX,
+ IMJ_2LY = IMJ__2_MIN + JVE_LY,
+ IMJ_2LZ = IMJ__2_MIN + JVE_LZ,
+
+ IMJ__2L_MAX = IMJ__2_MIN + JVE__L_MAX,
+
+ IMJ__2A_MIN = IMJ__2_MIN + JVE__A_MIN,
+
+ IMJ_2AX = IMJ__2_MIN + JVE_AX,
+ IMJ_2AY = IMJ__2_MIN + JVE_AY,
+ IMJ_2AZ = IMJ__2_MIN + JVE_AZ,
+
+ IMJ__2A_MAX = IMJ__2_MIN + JVE__A_MAX,
+
+ IMJ__2_MAX = IMJ__2_MIN + JVE__MAX,
+
+ IMJ__MAX = IMJ__2_MAX,
+};
+
+enum dxContactForceElement
+{
+ CFE__MIN,
+
+ CFE__DYNAMICS_MIN = CFE__MIN,
+
+ CFE__L_MIN = CFE__DYNAMICS_MIN + dDA__L_MIN,
+
+ CFE_LX = CFE__DYNAMICS_MIN + dDA_LX,
+ CFE_LY = CFE__DYNAMICS_MIN + dDA_LY,
+ CFE_LZ = CFE__DYNAMICS_MIN + dDA_LZ,
+
+ CFE__L_MAX = CFE__DYNAMICS_MIN + dDA__L_MAX,
+
+ CFE__A_MIN = CFE__DYNAMICS_MIN + dDA__A_MIN,
+
+ CFE_AX = CFE__DYNAMICS_MIN + dDA_AX,
+ CFE_AY = CFE__DYNAMICS_MIN + dDA_AY,
+ CFE_AZ = CFE__DYNAMICS_MIN + dDA_AZ,
+
+ CFE__A_MAX = CFE__DYNAMICS_MIN + dDA__A_MAX,
+
+ CFE__DYNAMICS_MAX = CFE__DYNAMICS_MIN + dDA__MAX,
+
+ CFE__MAX = CFE__DYNAMICS_MAX,
+};
+
+enum dxRHSElement
+{
+ RHS__MIN,
+
+ RHS__DYNAMICS_MIN = RHS__MIN,
+
+ RHS__L_MIN = RHS__DYNAMICS_MIN + dDA__L_MIN,
+
+ RHS_LX = RHS__DYNAMICS_MIN + dDA_LX,
+ RHS_LY = RHS__DYNAMICS_MIN + dDA_LY,
+ RHS_LZ = RHS__DYNAMICS_MIN + dDA_LZ,
+
+ RHS__L_MAX = RHS__DYNAMICS_MIN + dDA__L_MAX,
+
+ RHS__A_MIN = RHS__DYNAMICS_MIN + dDA__A_MIN,
+
+ RHS_AX = RHS__DYNAMICS_MIN + dDA_AX,
+ RHS_AY = RHS__DYNAMICS_MIN + dDA_AY,
+ RHS_AZ = RHS__DYNAMICS_MIN + dDA_AZ,
+
+ RHS__A_MAX = RHS__DYNAMICS_MIN + dDA__A_MAX,
+
+ RHS__DYNAMICS_MAX = RHS__DYNAMICS_MIN + dDA__MAX,
+
+ RHS__MAX = RHS__DYNAMICS_MAX,
+};
+
+
+#define JACOBIAN_ALIGNMENT dMAX(JME__MAX * sizeof(dReal), EFFICIENT_ALIGNMENT)
+dSASSERT(((JME__MAX - 1) & JME__MAX) == 0); // JME__MAX must be a power of two, otherwise there is no reason to over-align the Jacobian
+
+#define JCOPY_ALIGNMENT dMAX(32, EFFICIENT_ALIGNMENT)
+#define INVI_ALIGNMENT dMAX(32, EFFICIENT_ALIGNMENT)
+#define INVMJ_ALIGNMENT dMAX(32, EFFICIENT_ALIGNMENT)
+
+
+struct dxQuickStepperStage0Outputs
+{
+ unsigned int nj;
+ unsigned int m;
+ unsigned int mfb;
+};
+
+struct dxQuickStepperStage1CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, void *stageMemArenaState, dReal *invI, dJointWithInfo1 *jointinfos)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_stageMemArenaState = stageMemArenaState;
+ m_invI = invI;
+ m_jointinfos = jointinfos;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ void *m_stageMemArenaState;
+ dReal *m_invI;
+ dJointWithInfo1 *m_jointinfos;
+ dxQuickStepperStage0Outputs m_stage0Outputs;
+};
+
+struct dxQuickStepperStage0BodiesCallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dReal *invI)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_invI = invI;
+ m_tagsTaken = 0;
+ m_gravityTaken = 0;
+ m_inertiaBodyIndex = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ dReal *m_invI;
+ atomicord32 m_tagsTaken;
+ atomicord32 m_gravityTaken;
+ volatile atomicord32 m_inertiaBodyIndex;
+};
+
+struct dxQuickStepperStage0JointsCallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *stepperCallContext, dJointWithInfo1 *jointinfos, dxQuickStepperStage0Outputs *stage0Outputs)
+ {
+ m_stepperCallContext = stepperCallContext;
+ m_jointinfos = jointinfos;
+ m_stage0Outputs = stage0Outputs;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ dJointWithInfo1 *m_jointinfos;
+ dxQuickStepperStage0Outputs *m_stage0Outputs;
+};
+
+static int dxQuickStepIsland_Stage0_Bodies_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage0_Joints_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage1_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxQuickStepIsland_Stage0_Bodies(dxQuickStepperStage0BodiesCallContext *callContext);
+static void dxQuickStepIsland_Stage0_Joints(dxQuickStepperStage0JointsCallContext *callContext);
+static void dxQuickStepIsland_Stage1(dxQuickStepperStage1CallContext *callContext);
+
+
+struct dxQuickStepperLocalContext
+{
+ void Initialize(dReal *invI, dJointWithInfo1 *jointinfos, unsigned int nj,
+ unsigned int m, unsigned int mfb, const dxMIndexItem *mindex, dxJBodiesItem *jb, int *findex,
+ dReal *J, dReal *Jcopy)
+ {
+ m_invI = invI;
+ m_jointinfos = jointinfos;
+ m_nj = nj;
+ m_m = m;
+ m_mfb = mfb;
+ m_valid_findices = 0;
+ m_mindex = mindex;
+ m_jb = jb;
+ m_findex = findex;
+ m_J = J;
+ m_Jcopy = Jcopy;
+ }
+
+ dReal *m_invI;
+ dJointWithInfo1 *m_jointinfos;
+ unsigned int m_nj;
+ unsigned int m_m;
+ unsigned int m_mfb;
+ volatile atomicord32 m_valid_findices;
+ const dxMIndexItem *m_mindex;
+ dxJBodiesItem *m_jb;
+ int *m_findex;
+ dReal *m_J;
+ dReal *m_Jcopy;
+};
+
+struct dxQuickStepperStage3CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxQuickStepperLocalContext *localContext,
+ void *stage1MemArenaState)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_stage1MemArenaState = stage1MemArenaState;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxQuickStepperLocalContext *m_localContext;
+ void *m_stage1MemArenaState;
+};
+
+struct dxQuickStepperStage2CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, dxQuickStepperLocalContext *localContext,
+ dReal *rhs_tmp)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_rhs_tmp = rhs_tmp;
+ m_ji_J = 0;
+ m_ji_jb = 0;
+ m_bi = 0;
+ m_Jrhsi = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ dxQuickStepperLocalContext *m_localContext;
+ dReal *m_rhs_tmp;
+ volatile atomicord32 m_ji_J;
+ volatile atomicord32 m_ji_jb;
+ volatile atomicord32 m_bi;
+ volatile atomicord32 m_Jrhsi;
+};
+
+static int dxQuickStepIsland_Stage2a_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage2aSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage2b_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage2bSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage2c_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage3_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxQuickStepIsland_Stage2a(dxQuickStepperStage2CallContext *stage2CallContext);
+static void dxQuickStepIsland_Stage2b(dxQuickStepperStage2CallContext *stage2CallContext);
+static void dxQuickStepIsland_Stage2c(dxQuickStepperStage2CallContext *stage2CallContext);
+static void dxQuickStepIsland_Stage3(dxQuickStepperStage3CallContext *stage3CallContext);
+
+
+struct dxQuickStepperStage5CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxQuickStepperLocalContext *localContext,
+ void *stage3MemArenaState)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_stage3MemArenaState = stage3MemArenaState;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxQuickStepperLocalContext *m_localContext;
+ void *m_stage3MemArenaState;
+};
+
+struct dxQuickStepperStage4CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxQuickStepperLocalContext *localContext,
+ dReal *lambda, dReal *cforce, dReal *iMJ, IndexError *order, dReal *last_lambda, atomicord32 *bi_links_or_mi_levels, atomicord32 *mi_links)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_lambda = lambda;
+ m_cforce = cforce;
+ m_iMJ = iMJ;
+ m_order = order;
+ m_last_lambda = last_lambda;
+ m_bi_links_or_mi_levels = bi_links_or_mi_levels;
+ m_mi_links = mi_links;
+ m_LCP_IterationSyncReleasee = NULL;
+ m_LCP_IterationAllowedThreads = 0;
+ m_LCP_fcStartReleasee = NULL;
+ m_ji_4a = 0;
+ m_mi_iMJ = 0;
+ m_mi_fc = 0;
+ m_mi_Ad = 0;
+ m_LCP_iteration = 0;
+ m_cf_4b = 0;
+ m_ji_4b = 0;
+ }
+
+ void AssignLCP_IterationData(dCallReleaseeID releaseeInstance, unsigned int iterationAllowedThreads)
+ {
+ m_LCP_IterationSyncReleasee = releaseeInstance;
+ m_LCP_IterationAllowedThreads = iterationAllowedThreads;
+ }
+
+ void AssignLCP_fcStartReleasee(dCallReleaseeID releaseeInstance)
+ {
+ m_LCP_fcStartReleasee = releaseeInstance;
+ }
+
+ void AssignLCP_fcAllowedThreads(unsigned int prepareThreads, unsigned int completeThreads)
+ {
+ m_LCP_fcPrepareThreadsRemaining = prepareThreads;
+ m_LCP_fcCompleteThreadsTotal = completeThreads;
+ }
+
+ void ResetLCP_fcComputationIndex()
+ {
+ m_mi_fc = 0;
+ }
+
+ void ResetSOR_ConstraintsReorderVariables(unsigned reorderThreads)
+ {
+ m_SOR_reorderHeadTaken = 0;
+ m_SOR_reorderTailTaken = 0;
+ m_SOR_bi_zeroHeadTaken = 0;
+ m_SOR_bi_zeroTailTaken = 0;
+ m_SOR_mi_zeroHeadTaken = 0;
+ m_SOR_mi_zeroTailTaken = 0;
+ m_SOR_reorderThreadsRemaining = reorderThreads;
+ }
+
+ void RecordLCP_IterationStart(unsigned int totalThreads, dCallReleaseeID nextReleasee)
+ {
+ m_LCP_iterationThreadsTotal = totalThreads;
+ m_LCP_iterationThreadsRemaining = totalThreads;
+ m_LCP_iterationNextReleasee = nextReleasee;
+ }
+
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxQuickStepperLocalContext *m_localContext;
+ dReal *m_lambda;
+ dReal *m_cforce;
+ dReal *m_iMJ;
+ IndexError *m_order;
+ dReal *m_last_lambda;
+ atomicord32 *m_bi_links_or_mi_levels;
+ atomicord32 *m_mi_links;
+ dCallReleaseeID m_LCP_IterationSyncReleasee;
+ unsigned int m_LCP_IterationAllowedThreads;
+ dCallReleaseeID m_LCP_fcStartReleasee;
+ volatile atomicord32 m_ji_4a;
+ volatile atomicord32 m_mi_iMJ;
+ volatile atomicord32 m_mi_fc;
+ volatile atomicord32 m_LCP_fcPrepareThreadsRemaining;
+ unsigned int m_LCP_fcCompleteThreadsTotal;
+ volatile atomicord32 m_mi_Ad;
+ unsigned int m_LCP_iteration;
+ unsigned int m_LCP_iterationThreadsTotal;
+ volatile atomicord32 m_LCP_iterationThreadsRemaining;
+ dCallReleaseeID m_LCP_iterationNextReleasee;
+ volatile atomicord32 m_SOR_reorderHeadTaken;
+ volatile atomicord32 m_SOR_reorderTailTaken;
+ volatile atomicord32 m_SOR_bi_zeroHeadTaken;
+ volatile atomicord32 m_SOR_bi_zeroTailTaken;
+ volatile atomicord32 m_SOR_mi_zeroHeadTaken;
+ volatile atomicord32 m_SOR_mi_zeroTailTaken;
+ volatile atomicord32 m_SOR_reorderThreadsRemaining;
+ volatile atomicord32 m_cf_4b;
+ volatile atomicord32 m_ji_4b;
+};
+
+
+static int dxQuickStepIsland_Stage4a_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_iMJ_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_iMJSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_fcStart_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_fc_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+#ifdef WARM_STARTING
+static int dxQuickStepIsland_Stage4LCP_fcWarmComplete_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+#endif
+static int dxQuickStepIsland_Stage4LCP_Ad_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_ReorderPrep_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_IterationStart_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_ConstraintsReordering_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_ConstraintsReorderingSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_Iteration_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4LCP_IterationSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage4b_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage5_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxQuickStepIsland_Stage4a(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_iMJComputation(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee);
+#ifdef WARM_STARTING
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warm(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee);
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmZeroArrays(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmPrepare(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(dxQuickStepperStage4CallContext *stage4CallContext);
+#endif
+static void dxQuickStepIsland_Stage4LCP_MTfcComputation_cold(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_STfcComputation(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_AdComputation(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_ReorderPrep(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_ConstraintsReordering(dxQuickStepperStage4CallContext *stage4CallContext);
+static bool dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int iteration);
+static void dxQuickStepIsland_Stage4LCP_LinksArraysZeroing(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_DependencyMapForNewOrderRebuilding(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_MTIteration(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int initiallyKnownToBeCompletedLevel);
+static void dxQuickStepIsland_Stage4LCP_STIteration(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage4LCP_IterationStep(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int i);
+static void dxQuickStepIsland_Stage4b(dxQuickStepperStage4CallContext *stage4CallContext);
+static void dxQuickStepIsland_Stage5(dxQuickStepperStage5CallContext *stage5CallContext);
+
+
+struct dxQuickStepperStage6CallContext
+{
+ void Initialize(const dxStepperProcessingCallContext *callContext, const dxQuickStepperLocalContext *localContext)
+ {
+ m_stepperCallContext = callContext;
+ m_localContext = localContext;
+ m_bi_6a = 0;
+ m_bi_6b = 0;
+ }
+
+ const dxStepperProcessingCallContext *m_stepperCallContext;
+ const dxQuickStepperLocalContext *m_localContext;
+ volatile atomicord32 m_bi_6a;
+ volatile atomicord32 m_bi_6b;
+};
+
+static int dxQuickStepIsland_Stage6a_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage6aSync_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+static int dxQuickStepIsland_Stage6b_Callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
+
+static void dxQuickStepIsland_Stage6a(dxQuickStepperStage6CallContext *stage6CallContext);
+static void dxQuickStepIsland_Stage6_VelocityCheck(dxQuickStepperStage6CallContext *stage6CallContext);
+static void dxQuickStepIsland_Stage6b(dxQuickStepperStage6CallContext *stage6CallContext);
+
+//***************************************************************************
+// various common computations involving the matrix J
+
+// compute iMJ = inv(M)*J'
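+// Each constraint row contributes two 6-element blocks, one per attached body:
+// the linear part of the row is scaled by that body's inverse mass and the
+// angular part is premultiplied by the body's world-frame inverse inertia.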
+
+template<unsigned int step_size>
+void compute_invM_JT (volatile atomicord32 *mi_storage, dReal *iMJ,
+ unsigned int m, const dReal *J, const dxJBodiesItem *jb,
+ dxBody * const *body, const dReal *invI)
+{
+ unsigned int m_steps = (m + (step_size - 1)) / step_size;
+
+ unsigned mi_step;
+ while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
+ unsigned int mi = mi_step * step_size;
+ const unsigned int miend = mi + dMIN(step_size, m - mi);
+
+ dReal *iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX;
+ const dReal *J_ptr = J + (sizeint)mi * JME__MAX;
+ while (true) {
+ int b1 = jb[mi].first;
+ int b2 = jb[mi].second;
+
+ dReal k1 = body[(unsigned)b1]->invMass;
+ for (unsigned int j = 0; j != JVE__L_COUNT; j++) iMJ_ptr[IMJ__1L_MIN + j] = k1 * J_ptr[JME__J1L_MIN + j];
+ const dReal *invIrow1 = invI + (sizeint)(unsigned)b1 * IIE__MAX + IIE__MATRIX_MIN;
+ dMultiply0_331 (iMJ_ptr + IMJ__1A_MIN, invIrow1, J_ptr + JME__J1A_MIN);
+
+ if (b2 != -1) {
+ dReal k2 = body[(unsigned)b2]->invMass;
+ for (unsigned int j = 0; j != JVE__L_COUNT; ++j) iMJ_ptr[IMJ__2L_MIN + j] = k2 * J_ptr[JME__J2L_MIN + j];
+ const dReal *invIrow2 = invI + (sizeint)(unsigned)b2 * IIE__MAX + IIE__MATRIX_MIN;
+ dMultiply0_331 (iMJ_ptr + IMJ__2A_MIN, invIrow2, J_ptr + JME__J2A_MIN);
+ }
+
+ if (++mi == miend) {
+ break;
+ }
+ iMJ_ptr += IMJ__MAX;
+ J_ptr += JME__MAX;
+ }
+ }
+}
+
+#ifdef WARM_STARTING
+
+static
+void multiply_invM_JT_init_array(unsigned int nb, atomicord32 *bi_links/*=[nb]*/)
+{
+ // const unsigned businessIndex_none = dxENCODE_INDEX(-1);
+ // for (unsigned int bi = 0; bi != nb; ++bi) {
+ // bi_links[bi] = businessIndex_none;
+ // }
+ memset(bi_links, 0, nb * sizeof(bi_links[0]));
+}
+
+// compute out = inv(M)*J'*in.
+template<unsigned int step_size>
+void multiply_invM_JT_prepare(volatile atomicord32 *mi_storage,
+ unsigned int m, const dxJBodiesItem *jb, atomicord32 *bi_links/*=[nb]*/, atomicord32 *mi_links/*=[2*m]*/)
+{
+ unsigned int m_steps = (m + (step_size - 1)) / step_size;
+
+ unsigned mi_step;
+ while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
+ unsigned int mi = mi_step * step_size;
+ const unsigned int miend = mi + dMIN(step_size, m - mi);
+
+ while (true) {
+ int b1 = jb[mi].first;
+ int b2 = jb[mi].second;
+
+ const unsigned encoded_mi = dxENCODE_INDEX(mi);
+ unsigned oldIndex_b1 = ThrsafeExchange(&bi_links[b1], encoded_mi);
+ mi_links[(sizeint)mi * 2] = oldIndex_b1;
+
+ if (b2 != -1) {
+ unsigned oldIndex_b2 = ThrsafeExchange(&bi_links[b2], encoded_mi);
+ mi_links[(sizeint)mi * 2 + 1] = oldIndex_b2;
+ }
+
+ if (++mi == miend) {
+ break;
+ }
+ }
+ }
+}
+
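+// Completion pass: for every body, walk the singly-linked list of constraint rows
+// built by multiply_invM_JT_prepare (bi_links holds the most recently visited row
+// per body, mi_links chains back to the previous one) and accumulate in[mi] times
+// the corresponding iMJ block into the body's out entry.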
+template<unsigned int step_size, unsigned int out_offset, unsigned int out_stride>
+void multiply_invM_JT_complete(volatile atomicord32 *bi_storage, dReal *out,
+ unsigned int nb, const dReal *iMJ, const dxJBodiesItem *jb, const dReal *in,
+ atomicord32 *bi_links/*=[nb]*/, atomicord32 *mi_links/*=[2*m]*/)
+{
+ const unsigned businessIndex_none = dxENCODE_INDEX(-1);
+
+ unsigned int nb_steps = (nb + (step_size - 1)) / step_size;
+
+ unsigned bi_step;
+ while ((bi_step = ThrsafeIncrementIntUpToLimit(bi_storage, nb_steps)) != nb_steps) {
+ unsigned int bi = bi_step * step_size;
+ const unsigned int biend = bi + dMIN(step_size, nb - bi);
+
+ dReal *out_ptr = out + (sizeint)bi * out_stride + out_offset;
+ while (true) {
+ dReal psum0 = REAL(0.0), psum1 = REAL(0.0), psum2 = REAL(0.0), psum3 = REAL(0.0), psum4 = REAL(0.0), psum5 = REAL(0.0);
+
+ unsigned businessIndex = bi_links[bi];
+ while (businessIndex != businessIndex_none) {
+ unsigned int mi = dxDECODE_INDEX(businessIndex);
+ const dReal *iMJ_ptr;
+
+ if (bi == jb[mi].first) {
+ iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX + IMJ__1_MIN;
+ businessIndex = mi_links[(sizeint)mi * 2];
+ }
+ else {
+ dIASSERT(bi == jb[mi].second);
+
+ iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX + IMJ__2_MIN;
+ businessIndex = mi_links[(sizeint)mi * 2 + 1];
+ }
+
+ const dReal in_i = in[mi];
+ psum0 += in_i * iMJ_ptr[JVE_LX]; psum1 += in_i * iMJ_ptr[JVE_LY]; psum2 += in_i * iMJ_ptr[JVE_LZ];
+ psum3 += in_i * iMJ_ptr[JVE_AX]; psum4 += in_i * iMJ_ptr[JVE_AY]; psum5 += in_i * iMJ_ptr[JVE_AZ];
+ }
+
+ out_ptr[dDA_LX] = psum0; out_ptr[dDA_LY] = psum1; out_ptr[dDA_LZ] = psum2;
+ out_ptr[dDA_AX] = psum3; out_ptr[dDA_AY] = psum4; out_ptr[dDA_AZ] = psum5;
+
+ if (++bi == biend) {
+ break;
+ }
+ out_ptr += out_stride;
+ }
+ }
+}
+
+template<unsigned int out_offset, unsigned int out_stride>
+void _multiply_invM_JT (dReal *out,
+ unsigned int m, unsigned int nb, dReal *iMJ, const dxJBodiesItem *jb, const dReal *in)
+{
+ dSetZero (out, (sizeint)nb * out_stride);
+ const dReal *iMJ_ptr = iMJ;
+ for (unsigned int i=0; i<m; i++) {
+ int b1 = jb[i].first;
+ int b2 = jb[i].second;
+ const dReal in_i = in[i];
+
+ dReal *out_ptr = out + (sizeint)(unsigned)b1 * out_stride + out_offset;
+ for (unsigned int j = JVE__MIN; j != JVE__MAX; j++) out_ptr[j - JVE__MIN] += iMJ_ptr[IMJ__1_MIN + j] * in_i;
+ dSASSERT(out_stride - out_offset >= JVE__MAX);
+ dSASSERT(JVE__MAX == (int)dDA__MAX);
+
+ if (b2 != -1) {
+ out_ptr = out + (sizeint)(unsigned)b2 * out_stride + out_offset;
+ for (unsigned int j = JVE__MIN; j != JVE__MAX; j++) out_ptr[j - JVE__MIN] += iMJ_ptr[IMJ__2_MIN + j] * in_i;
+ dSASSERT(out_stride - out_offset >= JVE__MAX);
+ dSASSERT(JVE__MAX == (int)dDA__MAX);
+ }
+
+ iMJ_ptr += IMJ__MAX;
+ }
+}
+#endif
+
+// compute J*in and add the result to the RHS elements of the corresponding J rows.
+template<unsigned int step_size, unsigned int in_offset, unsigned int in_stride>
+void multiplyAdd_J (volatile atomicord32 *mi_storage,
+ unsigned int m, dReal *J, const dxJBodiesItem *jb, const dReal *in)
+{
+ unsigned int m_steps = (m + (step_size - 1)) / step_size;
+
+ unsigned mi_step;
+ while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
+ unsigned int mi = mi_step * step_size;
+ const unsigned int miend = mi + dMIN(step_size, m - mi);
+
+ dReal *J_ptr = J + (sizeint)mi * JME__MAX;
+ while (true) {
+ int b1 = jb[mi].first;
+ int b2 = jb[mi].second;
+ dReal sum = REAL(0.0);
+ const dReal *in_ptr = in + (sizeint)(unsigned)b1 * in_stride + in_offset;
+ for (unsigned int j = 0; j != JME__J1_COUNT; ++j) sum += J_ptr[j + JME__J1_MIN] * in_ptr[j];
+ dSASSERT(in_offset + JME__J1_COUNT <= in_stride);
+
+ if (b2 != -1) {
+ in_ptr = in + (sizeint)(unsigned)b2 * in_stride + in_offset;
+ for (unsigned int j = 0; j != JME__J2_COUNT; ++j) sum += J_ptr[j + JME__J2_MIN] * in_ptr[j];
+ dSASSERT(in_offset + JME__J2_COUNT <= in_stride);
+ }
+ J_ptr[JME_RHS] += sum;
+
+ if (++mi == miend) {
+ break;
+ }
+ J_ptr += JME__MAX;
+ }
+ }
+}
+
+
+struct IndexError {
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+ dReal error; // error to sort on
+#endif
+ int index; // row index
+};
+
+
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+
+static int compare_index_error (const void *a, const void *b)
+{
+ const IndexError *i1 = (IndexError*) a;
+ const IndexError *i2 = (IndexError*) b;
+ if (i1->error < i2->error) return -1;
+ if (i1->error > i2->error) return 1;
+ return 0;
+}
+
+#endif // #if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+
+static inline
+bool IsSORConstraintsReorderRequiredForIteration(unsigned iteration)
+{
+ bool result = false;
+
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+
+ result = true;
+
+
+#elif CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY
+
+    // Trigger a reorder pass on iteration 0 and then once every
+    // RANDOM_CONSTRAINTS_REORDERING_FREQUENCY iterations; the intent is to skip
+    // the actual randomization on the very first iteration
+ if (!dIN_RANGE(iteration, 0, RANDOM_CONSTRAINTS_REORDERING_FREQUENCY)
+ ? dIN_RANGE(iteration % RANDOM_CONSTRAINTS_REORDERING_FREQUENCY, RRS__MIN, RRS__MAX)
+ : iteration == 0) {
+ result = true;
+ }
+
+
+#else // #if CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__BY_ERROR && CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__RANDOMLY
+
+ if (iteration == 0) {
+ result = true;
+ }
+
+
+#endif
+
+ return result;
+}
+
+/*extern */
+void dxQuickStepIsland(const dxStepperProcessingCallContext *callContext)
+{
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ unsigned int _nj = callContext->m_islandJointsCount;
+
+ dReal *invI = memarena->AllocateOveralignedArray<dReal>((sizeint)nb * IIE__MAX, INVI_ALIGNMENT);
+ dJointWithInfo1 *const jointinfos = memarena->AllocateArray<dJointWithInfo1>(_nj);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads != 0);
+
+ void *stagesMemArenaState = memarena->SaveState();
+
+ dxQuickStepperStage1CallContext *stage1CallContext = (dxQuickStepperStage1CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage1CallContext));
+ stage1CallContext->Initialize(callContext, stagesMemArenaState, invI, jointinfos);
+
+ dxQuickStepperStage0BodiesCallContext *stage0BodiesCallContext = (dxQuickStepperStage0BodiesCallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage0BodiesCallContext));
+ stage0BodiesCallContext->Initialize(callContext, invI);
+
+ dxQuickStepperStage0JointsCallContext *stage0JointsCallContext = (dxQuickStepperStage0JointsCallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage0JointsCallContext));
+ stage0JointsCallContext->Initialize(callContext, jointinfos, &stage1CallContext->m_stage0Outputs);
+
+ if (allowedThreads == 1)
+ {
+ IFTIMING(dTimerStart("preprocessing"));
+ dxQuickStepIsland_Stage0_Bodies(stage0BodiesCallContext);
+ dxQuickStepIsland_Stage0_Joints(stage0JointsCallContext);
+ dxQuickStepIsland_Stage1(stage1CallContext);
+ }
+ else
+ {
+ unsigned bodyThreads = CalculateOptimalThreadsCount<1U>(nb, allowedThreads);
+ unsigned jointThreads = 1;
+
+ dxWorld *world = callContext->m_world;
+
+ dCallReleaseeID stage1CallReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage1CallReleasee, bodyThreads + jointThreads, callContext->m_finalReleasee,
+ NULL, &dxQuickStepIsland_Stage1_Callback, stage1CallContext, 0, "QuickStepIsland Stage1");
+
+        // It is preferable to post the single-threaded task first so that it gets started sooner
+ world->PostThreadedCall(NULL, NULL, 0, stage1CallReleasee, NULL, &dxQuickStepIsland_Stage0_Joints_Callback, stage0JointsCallContext, 0, "QuickStepIsland Stage0-Joints");
+ dIASSERT(jointThreads == 1);
+
+ if (bodyThreads > 1) {
+ world->PostThreadedCallsGroup(NULL, bodyThreads - 1, stage1CallReleasee, &dxQuickStepIsland_Stage0_Bodies_Callback, stage0BodiesCallContext, "QuickStepIsland Stage0-Bodies");
+ }
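+        // The current thread also takes part in the body processing itself and then
+        // releases its own dependency on the Stage1 synchronization call.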
+ dxQuickStepIsland_Stage0_Bodies(stage0BodiesCallContext);
+ world->AlterThreadedCallDependenciesCount(stage1CallReleasee, -1);
+ }
+}
+
+static
+int dxQuickStepIsland_Stage0_Bodies_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage0BodiesCallContext *callContext = (dxQuickStepperStage0BodiesCallContext *)_callContext;
+ dxQuickStepIsland_Stage0_Bodies(callContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage0_Bodies(dxQuickStepperStage0BodiesCallContext *callContext)
+{
+ dxBody * const *body = callContext->m_stepperCallContext->m_islandBodiesStart;
+ unsigned int nb = callContext->m_stepperCallContext->m_islandBodiesCount;
+
+ if (ThrsafeExchange(&callContext->m_tagsTaken, 1) == 0)
+ {
+ // number all bodies in the body list - set their tag values
+ for (unsigned int i=0; i<nb; i++) body[i]->tag = i;
+ }
+
+ if (ThrsafeExchange(&callContext->m_gravityTaken, 1) == 0)
+ {
+ dxWorld *world = callContext->m_stepperCallContext->m_world;
+
+ // add the gravity force to all bodies
+    // since gravity normally has only one non-zero component, it is more efficient
+    // to run a separate loop for each individual component
+ dxBody *const *const bodyend = body + nb;
+ dReal gravity_x = world->gravity[0];
+ if (gravity_x) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[0] += b->mass.mass * gravity_x;
+ }
+ }
+ }
+ dReal gravity_y = world->gravity[1];
+ if (gravity_y) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[1] += b->mass.mass * gravity_y;
+ }
+ }
+ }
+ dReal gravity_z = world->gravity[2];
+ if (gravity_z) {
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
+ dxBody *b = *bodycurr;
+ if ((b->flags & dxBodyNoGravity) == 0) {
+ b->facc[2] += b->mass.mass * gravity_z;
+ }
+ }
+ }
+ }
+
+ // for all bodies, compute the inertia tensor and its inverse in the global
+ // frame, and compute the rotational force and add it to the torque
+  // accumulator. invI is a vertical stack of 3x4 matrices, one per body.
+ {
+ dReal *invI = callContext->m_invI;
+ unsigned int bodyIndex;
+ while ((bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb)) != nb) {
+ dReal *invIrow = invI + (sizeint)bodyIndex * IIE__MAX;
+ dxBody *b = body[bodyIndex];
+
+ dMatrix3 tmp;
+ // compute inverse inertia tensor in global frame
+ dMultiply2_333 (tmp, b->invI, b->posr.R);
+ dMultiply0_333 (invIrow + IIE__MATRIX_MIN, b->posr.R, tmp);
+
+      // Don't apply gyroscopic torques to bodies that are not flagged
+      // for them, nor to kinematic bodies (invMass == 0)
+ if ((b->flags & dxBodyGyroscopic) && (b->invMass > 0)) {
+ dMatrix3 I;
+ // compute inertia tensor in global frame
+ dMultiply2_333 (tmp, b->mass.I, b->posr.R);
+ dMultiply0_333 (I, b->posr.R, tmp);
+ // compute rotational force
+#if 0
+ // Explicit computation
+ dMultiply0_331 (tmp, I, b->avel);
+ dSubtractVectorCross3(b->tacc, b->avel, tmp);
+#else
+ // Do the implicit computation based on
+ //"Stabilizing Gyroscopic Forces in Rigid Multibody Simulations"
+ // (Lacoursière 2006)
+ dReal h = callContext->m_stepperCallContext->m_stepSize; // Step size
+ dVector3 L; // Compute angular momentum
+ dMultiply0_331(L, I, b->avel);
+
+ // Compute a new effective 'inertia tensor'
+ // for the implicit step: the cross-product
+ // matrix of the angular momentum plus the
+ // old tensor scaled by the timestep.
+ // Itild may not be symmetric pos-definite,
+ // but we can still use it to compute implicit
+ // gyroscopic torques.
+ dMatrix3 Itild = { 0 };
+ dSetCrossMatrixMinus(Itild, L, 4);
+ for (int ii = dM3E__MIN; ii < dM3E__MAX; ++ii) {
+ Itild[ii] = Itild[ii] * h + I[ii];
+ }
+
+ // Scale momentum by inverse time to get
+ // a sort of "torque"
+ dScaleVector3(L, dRecip(h));
+ // Invert the pseudo-tensor
+ dMatrix3 itInv;
+ // This is a closed-form inversion.
+ // It's probably not numerically stable
+ // when dealing with small masses with
+ // a large asymmetry.
+ // An LU decomposition might be better.
+ if (dInvertMatrix3(itInv, Itild) != 0) {
+ // "Divide" the original tensor
+ // by the pseudo-tensor (on the right)
+ dMultiply0_333(Itild, I, itInv);
+ // Subtract an identity matrix
+ Itild[dM3E_XX] -= 1; Itild[dM3E_YY] -= 1; Itild[dM3E_ZZ] -= 1;
+
+ // This new inertia matrix rotates the
+ // momentum to get a new set of torques
+ // that will work correctly when applied
+ // to the old inertia matrix as explicit
+ // torques with a semi-implicit update
+ // step.
+ dVector3 tau0;
+ dMultiply0_331(tau0, Itild, L);
+
+ // Add the gyro torques to the torque
+ // accumulator
+ dAddVectors3(b->tacc, b->tacc, tau0);
+ }
+#endif
+ }
+ }
+ }
+}
+
+static
+int dxQuickStepIsland_Stage0_Joints_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage0JointsCallContext *callContext = (dxQuickStepperStage0JointsCallContext *)_callContext;
+ dxQuickStepIsland_Stage0_Joints(callContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage0_Joints(dxQuickStepperStage0JointsCallContext *callContext)
+{
+ dxJoint * const *_joint = callContext->m_stepperCallContext->m_islandJointsStart;
+ unsigned int _nj = callContext->m_stepperCallContext->m_islandJointsCount;
+
+ // get joint information (m = total constraint dimension, nub = number of unbounded variables).
+ // joints with m=0 are inactive and are removed from the joints array
+ // entirely, so that the code that follows does not consider them.
+ {
+ unsigned int mcurr = 0, mfbcurr = 0;
+ dJointWithInfo1 *jicurr = callContext->m_jointinfos;
+ dxJoint *const *const _jend = _joint + _nj;
+ for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) { // jicurr=dest, _jcurr=src
+ dxJoint *j = *_jcurr;
+ j->getInfo1 (&jicurr->info);
+ dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m);
+
+ unsigned int jm = jicurr->info.m;
+ if (jm != 0) {
+ mcurr += jm;
+ if (j->feedback != NULL) {
+ mfbcurr += jm;
+ }
+ jicurr->joint = j;
+ jicurr++;
+ }
+ }
+ callContext->m_stage0Outputs->m = mcurr;
+ callContext->m_stage0Outputs->mfb = mfbcurr;
+ callContext->m_stage0Outputs->nj = (unsigned int)(jicurr - callContext->m_jointinfos);
+ dIASSERT((sizeint)(jicurr - callContext->m_jointinfos) < UINT_MAX || (sizeint)(jicurr - callContext->m_jointinfos) == UINT_MAX); // to avoid "...always evaluates to true" warnings
+ }
+}
+
+static
+int dxQuickStepIsland_Stage1_Callback(void *_stage1CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage1CallContext *stage1CallContext = (dxQuickStepperStage1CallContext *)_stage1CallContext;
+ dxQuickStepIsland_Stage1(stage1CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage1(dxQuickStepperStage1CallContext *stage1CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage1CallContext->m_stepperCallContext;
+ dReal *invI = stage1CallContext->m_invI;
+ dJointWithInfo1 *jointinfos = stage1CallContext->m_jointinfos;
+ unsigned int nj = stage1CallContext->m_stage0Outputs.nj;
+ unsigned int m = stage1CallContext->m_stage0Outputs.m;
+ unsigned int mfb = stage1CallContext->m_stage0Outputs.mfb;
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ memarena->RestoreState(stage1CallContext->m_stageMemArenaState);
+    stage1CallContext = NULL; // WARNING! stage1CallContext is not valid after this point!
+ dIVERIFY(stage1CallContext == NULL); // To suppress unused variable assignment warnings
+
+ {
+ unsigned int _nj = callContext->m_islandJointsCount;
+ memarena->ShrinkArray<dJointWithInfo1>(jointinfos, _nj, nj);
+ }
+
+ dxMIndexItem *mindex = NULL;
+ dxJBodiesItem *jb = NULL;
+ int *findex = NULL;
+ dReal *J = NULL, *Jcopy = NULL;
+
+ // if there are constraints, compute the constraint force
+ if (m > 0) {
+ mindex = memarena->AllocateArray<dxMIndexItem>(nj + 1);
+ {
+ dxMIndexItem *mcurr = mindex;
+ unsigned int moffs = 0, mfboffs = 0;
+ mcurr->mIndex = moffs;
+ mcurr->fbIndex = mfboffs;
+ ++mcurr;
+
+ const dJointWithInfo1 *const jiend = jointinfos + nj;
+ for (const dJointWithInfo1 *jicurr = jointinfos; jicurr != jiend; ++jicurr) {
+ dxJoint *joint = jicurr->joint;
+ moffs += jicurr->info.m;
+ if (joint->feedback) { mfboffs += jicurr->info.m; }
+ mcurr->mIndex = moffs;
+ mcurr->fbIndex = mfboffs;
+ ++mcurr;
+ }
+ }
+
+ jb = memarena->AllocateArray<dxJBodiesItem>(m);
+ findex = memarena->AllocateArray<int>(m);
+ J = memarena->AllocateOveralignedArray<dReal>((sizeint)m * JME__MAX, JACOBIAN_ALIGNMENT);
+ Jcopy = memarena->AllocateOveralignedArray<dReal>((sizeint)mfb * JCE__MAX, JCOPY_ALIGNMENT);
+ }
+
+ dxQuickStepperLocalContext *localContext = (dxQuickStepperLocalContext *)memarena->AllocateBlock(sizeof(dxQuickStepperLocalContext));
+ localContext->Initialize(invI, jointinfos, nj, m, mfb, mindex, jb, findex, J, Jcopy);
+
+ void *stage1MemarenaState = memarena->SaveState();
+ dxQuickStepperStage3CallContext *stage3CallContext = (dxQuickStepperStage3CallContext*)memarena->AllocateBlock(sizeof(dxQuickStepperStage3CallContext));
+ stage3CallContext->Initialize(callContext, localContext, stage1MemarenaState);
+
+ if (m > 0) {
+ unsigned int nb = callContext->m_islandBodiesCount;
+        // create the constraint equation right hand side (`rhs') and constraint
+        // force mixing (`cfm') values, the LCP low and high bounds and the
+        // 'findex' vector; rhs, cfm, lo and hi are stored within the J rows and,
+        // together with findex, are filled by the Stage2 sub-stages launched below,
+        // while rhs_tmp holds the per-body intermediate terms.
+ dReal *rhs_tmp = memarena->AllocateArray<dReal>((sizeint)nb * RHS__MAX);
+
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext*)memarena->AllocateBlock(sizeof(dxQuickStepperStage2CallContext));
+ stage2CallContext->Initialize(callContext, localContext, rhs_tmp);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads != 0);
+
+ if (allowedThreads == 1)
+ {
+ IFTIMING (dTimerNow ("create J"));
+ dxQuickStepIsland_Stage2a(stage2CallContext);
+ IFTIMING (dTimerNow ("compute rhs_tmp"));
+ dxQuickStepIsland_Stage2b(stage2CallContext);
+ dxQuickStepIsland_Stage2c(stage2CallContext);
+ dxQuickStepIsland_Stage3(stage3CallContext);
+ }
+ else
+ {
+ dxWorld *world = callContext->m_world;
+
+ dCallReleaseeID stage3CallReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage3CallReleasee, 1, callContext->m_finalReleasee,
+ NULL, &dxQuickStepIsland_Stage3_Callback, stage3CallContext, 0, "QuickStepIsland Stage3");
+
+ dCallReleaseeID stage2bSyncReleasee;
+ world->PostThreadedCall(NULL, &stage2bSyncReleasee, 1, stage3CallReleasee,
+ NULL, &dxQuickStepIsland_Stage2bSync_Callback, stage2CallContext, 0, "QuickStepIsland Stage2b Sync");
+
+ unsigned stage2a_allowedThreads = CalculateOptimalThreadsCount<1U>(nj, allowedThreads);
+
+ dCallReleaseeID stage2aSyncReleasee;
+ world->PostThreadedCall(NULL, &stage2aSyncReleasee, stage2a_allowedThreads, stage2bSyncReleasee,
+ NULL, &dxQuickStepIsland_Stage2aSync_Callback, stage2CallContext, 0, "QuickStepIsland Stage2a Sync");
+
+ if (stage2a_allowedThreads > 1) {
+ world->PostThreadedCallsGroup(NULL, stage2a_allowedThreads - 1, stage2aSyncReleasee, &dxQuickStepIsland_Stage2a_Callback, stage2CallContext, "QuickStepIsland Stage2a");
+ }
+ dxQuickStepIsland_Stage2a(stage2CallContext);
+ world->AlterThreadedCallDependenciesCount(stage2aSyncReleasee, -1);
+ }
+ }
+ else {
+ dxQuickStepIsland_Stage3(stage3CallContext);
+ }
+}
+
+
+static
+int dxQuickStepIsland_Stage2a_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext;
+ dxQuickStepIsland_Stage2a(stage2CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage2a(dxQuickStepperStage2CallContext *stage2CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext;
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+ unsigned int nj = localContext->m_nj;
+ const dxMIndexItem *mindex = localContext->m_mindex;
+
+ const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
+ {
+ int *findex = localContext->m_findex;
+ dReal *J = localContext->m_J;
+ dReal *JCopy = localContext->m_Jcopy;
+
+ // get jacobian data from constraints. an m*16 matrix will be created
+ // to store the two jacobian blocks from each constraint. it has this
+ // format:
+ //
+ // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi \ .
+ // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi }-- jacobian for joint 0, body 1 and body 2 (3 rows)
+ // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi /
+ // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi }--- jacobian for joint 1, body 1 and body 2 (3 rows)
+ // etc...
+ //
+ // (lll) = linear jacobian data
+ // (aaa) = angular jacobian data
+ //
+ dxWorld *world = callContext->m_world;
+ const dReal worldERP = world->global_erp;
+ const dReal worldCFM = world->global_cfm;
+
+ unsigned validFIndices = 0;
+
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_J, nj)) != nj) {
+ const unsigned ofsi = mindex[ji].mIndex;
+ const unsigned int infom = mindex[ji + 1].mIndex - ofsi;
+
+ dReal *const JRow = J + (sizeint)ofsi * JME__MAX;
+ {
+ dReal *const JEnd = JRow + infom * JME__MAX;
+ for (dReal *JCurr = JRow; JCurr != JEnd; JCurr += JME__MAX) {
+ dSetZero(JCurr + JME__J1_MIN, JME__J1_COUNT);
+ JCurr[JME_RHS] = REAL(0.0);
+ JCurr[JME_CFM] = worldCFM;
+ dSetZero(JCurr + JME__J2_MIN, JME__J2_COUNT);
+ JCurr[JME_LO] = -dInfinity;
+ JCurr[JME_HI] = dInfinity;
+ dSASSERT(JME__J1_COUNT + 2 + JME__J2_COUNT + 2 == JME__MAX);
+ }
+ }
+ int *findexRow = findex + ofsi;
+ dSetValue(findexRow, infom, -1);
+
+ dxJoint *joint = jointinfos[ji].joint;
+ joint->getInfo2(stepsizeRecip, worldERP, JME__MAX, JRow + JME__J1_MIN, JRow + JME__J2_MIN, JME__MAX, JRow + JME__RHS_CFM_MIN, JRow + JME__LO_HI_MIN, findexRow);
+
+ // findex iteration is compact and is not going to pollute caches - do it first
+ {
+ // adjust returned findex values for global index numbering
+ int *const findicesEnd = findexRow + infom;
+ for (int *findexCurr = findexRow; findexCurr != findicesEnd; ++findexCurr) {
+ int fival = *findexCurr;
+ if (fival != -1) {
+ *findexCurr = fival + ofsi;
+ ++validFIndices;
+ }
+ }
+ }
+ {
+ dReal *const JEnd = JRow + infom * JME__MAX;
+ for (dReal *JCurr = JRow; JCurr != JEnd; JCurr += JME__MAX) {
+ JCurr[JME_RHS] *= stepsizeRecip;
+ JCurr[JME_CFM] *= stepsizeRecip;
+ }
+ }
+ {
+        // we need a copy of the Jacobian for joint feedbacks
+        // because it gets destroyed by the SOR solver;
+        // instead of saving the whole Jacobian, we can save just the rows
+        // for joints that requested feedback (which is normally much fewer)
+ unsigned mfbIndex = mindex[ji].fbIndex;
+ if (mfbIndex != mindex[ji + 1].fbIndex) {
+ dReal *const JEnd = JRow + infom * JME__MAX;
+ dReal *JCopyRow = JCopy + mfbIndex * JCE__MAX; // Random access by mfbIndex here! Do not optimize!
+ for (const dReal *JCurr = JRow; ; ) {
+ for (unsigned i = 0; i != JME__J1_COUNT; ++i) { JCopyRow[i + JCE__J1_MIN] = JCurr[i + JME__J1_MIN]; }
+ for (unsigned j = 0; j != JME__J2_COUNT; ++j) { JCopyRow[j + JCE__J2_MIN] = JCurr[j + JME__J2_MIN]; }
+ JCopyRow += JCE__MAX;
+ dSASSERT((unsigned)JCE__J1_COUNT == JME__J1_COUNT);
+ dSASSERT((unsigned)JCE__J2_COUNT == JME__J2_COUNT);
+ dSASSERT(JCE__J1_COUNT + JCE__J2_COUNT == JCE__MAX);
+
+ if ((JCurr += JME__MAX) == JEnd) {
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (validFIndices != 0) {
+ ThrsafeAdd(&localContext->m_valid_findices, validFIndices);
+ }
+ }
+
+ {
+ dxJBodiesItem *jb = localContext->m_jb;
+
+ // create an array of body numbers for each joint row
+ unsigned ji;
+ while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_jb, nj)) != nj) {
+ dxJoint *joint = jointinfos[ji].joint;
+ int b1 = (joint->node[0].body) ? (joint->node[0].body->tag) : -1;
+ int b2 = (joint->node[1].body) ? (joint->node[1].body->tag) : -1;
+
+ dxJBodiesItem *const jb_end = jb + mindex[ji + 1].mIndex;
+ dxJBodiesItem *jb_ptr = jb + mindex[ji].mIndex;
+ for (; jb_ptr != jb_end; ++jb_ptr) {
+ jb_ptr->first = b1;
+ jb_ptr->second = b2;
+ }
+ }
+ }
+}
+
+static
+int dxQuickStepIsland_Stage2aSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext;
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const unsigned int nb = callContext->m_islandBodiesCount;
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ unsigned int stage2b_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE2B_STEP>(nb, allowedThreads);
+
+ if (stage2b_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage2b_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage2b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage2b_Callback, stage2CallContext, "QuickStepIsland Stage2b");
+ }
+ dxQuickStepIsland_Stage2b(stage2CallContext);
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage2b_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext;
+ dxQuickStepIsland_Stage2b(stage2CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage2b(dxQuickStepperStage2CallContext *stage2CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext;
+
+ const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
+ {
+ // Warning!!!
+        // This code reads the facc/tacc fields of body objects, which
+        // may be modified by dxJoint::getInfo2(). Therefore the code must run
+        // in a different sub-stage from the Jacobian construction in Stage2a
+        // to ensure proper synchronization and avoid reading values while they are being modified.
+ // Warning!!!
+ dxBody * const *const body = callContext->m_islandBodiesStart;
+ const unsigned int nb = callContext->m_islandBodiesCount;
+ const dReal *invI = localContext->m_invI;
+ dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;
+
+ // compute the right hand side `rhs'
+
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE2B_STEP;
+ unsigned int nb_steps = (nb + (step_size - 1)) / step_size;
+
+ // put -(v/h + invM*fe) into rhs_tmp
+ unsigned bi_step;
+ while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_bi, nb_steps)) != nb_steps) {
+ unsigned int bi = bi_step * step_size;
+ const unsigned int biend = bi + dMIN(step_size, nb - bi);
+
+ dReal *rhscurr = rhs_tmp + (sizeint)bi * RHS__MAX;
+ const dReal *invIrow = invI + (sizeint)bi * IIE__MAX;
+ while (true) {
+ dxBody *b = body[bi];
+ dReal body_invMass = b->invMass;
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) rhscurr[RHS__L_MIN + j] = -(b->facc[dV3E__AXES_MIN + j] * body_invMass + b->lvel[dV3E__AXES_MIN + j] * stepsizeRecip);
+ dMultiply0_331 (rhscurr + RHS__A_MIN, invIrow + IIE__MATRIX_MIN, b->tacc);
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) rhscurr[RHS__A_MIN + k] = -(b->avel[dV3E__AXES_MIN + k] * stepsizeRecip) - rhscurr[RHS__A_MIN + k];
+
+ if (++bi == biend) {
+ break;
+ }
+ rhscurr += RHS__MAX;
+ invIrow += IIE__MAX;
+ }
+ }
+ }
+}
+
+static
+int dxQuickStepIsland_Stage2bSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext;
+ const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+
+ const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext;
+ unsigned int m = localContext->m_m;
+
+ unsigned int stage2c_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE2C_STEP>(m, allowedThreads);
+
+ if (stage2c_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage2c_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage2c_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage2c_Callback, stage2CallContext, "QuickStepIsland Stage2c");
+ }
+ dxQuickStepIsland_Stage2c(stage2CallContext);
+
+ return 1;
+}
+
+
+static
+int dxQuickStepIsland_Stage2c_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext;
+ dxQuickStepIsland_Stage2c(stage2CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage2c(dxQuickStepperStage2CallContext *stage2CallContext)
+{
+ //const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext;
+
+ //const dReal stepsizeRecip = dRecip(callContext->m_stepSize);
+ {
+ // Warning!!!
+        // This code depends on rhs_tmp and therefore must run in a different sub-stage
+        // from the rhs_tmp calculation in Stage2b to ensure proper synchronization
+        // and avoid reading values while they are being modified.
+ // Warning!!!
+ dReal *J = localContext->m_J;
+ const dxJBodiesItem *jb = localContext->m_jb;
+ const dReal *rhs_tmp = stage2CallContext->m_rhs_tmp;
+ const unsigned int m = localContext->m_m;
+
+ // add J*rhs_tmp to rhs
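+        // (rhs_tmp holds -(v/h + invM*fe) per body, so this step accumulates
+        // -J*(v/h + invM*fe) into each constraint row's right hand side.)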
+ multiplyAdd_J<dxQUICKSTEPISLAND_STAGE2C_STEP, RHS__DYNAMICS_MIN, RHS__MAX>(&stage2CallContext->m_Jrhsi, m, J, jb, rhs_tmp);
+ }
+}
+
+
+static
+int dxQuickStepIsland_Stage3_Callback(void *_stage3CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage3CallContext *stage3CallContext = (dxQuickStepperStage3CallContext *)_stage3CallContext;
+ dxQuickStepIsland_Stage3(stage3CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage3(dxQuickStepperStage3CallContext *stage3CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage3CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage3CallContext->m_localContext;
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ memarena->RestoreState(stage3CallContext->m_stage1MemArenaState);
+ stage3CallContext = NULL; // WARNING! stage3CallContext is not valid after this point!
+ dIVERIFY(stage3CallContext == NULL); // To suppress unused variable assignment warnings
+
+ void *stage3MemarenaState = memarena->SaveState();
+ dxQuickStepperStage5CallContext *stage5CallContext = (dxQuickStepperStage5CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage5CallContext));
+ stage5CallContext->Initialize(callContext, localContext, stage3MemarenaState);
+
+ unsigned int m = localContext->m_m;
+
+ if (m > 0) {
+ // load lambda from the value saved on the previous iteration
+ dReal *lambda = memarena->AllocateArray<dReal>(m);
+
+ unsigned int nb = callContext->m_islandBodiesCount;
+ dReal *cforce = memarena->AllocateArray<dReal>((sizeint)nb * CFE__MAX);
+ dReal *iMJ = memarena->AllocateOveralignedArray<dReal>((sizeint)m * IMJ__MAX, INVMJ_ALIGNMENT);
+ // order to solve constraint rows in
+ IndexError *order = memarena->AllocateArray<IndexError>(m);
+ dReal *last_lambda = NULL;
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+        // the lambda computed at the previous iteration.
+        // this is used to measure the error when we are reordering the indices.
+ last_lambda = memarena->AllocateArray<dReal>(m);
+#endif
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ bool singleThreadedExecution = allowedThreads == 1;
+ dIASSERT(allowedThreads >= 1);
+
+ atomicord32 *bi_links_or_mi_levels = NULL;
+ atomicord32 *mi_links = NULL;
+#if !dTHREADING_INTF_DISABLED
+ bi_links_or_mi_levels = memarena->AllocateArray<atomicord32>(dMAX(nb, m));
+ mi_links = memarena->AllocateArray<atomicord32>(2 * ((sizeint)m + 1));
+#else
+ dIASSERT(singleThreadedExecution);
+#endif
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage4CallContext));
+ stage4CallContext->Initialize(callContext, localContext, lambda, cforce, iMJ, order, last_lambda, bi_links_or_mi_levels, mi_links);
+
+ if (singleThreadedExecution)
+ {
+ dxQuickStepIsland_Stage4a(stage4CallContext);
+
+ IFTIMING (dTimerNow ("solving LCP problem"));
+ dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext);
+ dxQuickStepIsland_Stage4LCP_STfcComputation(stage4CallContext);
+ dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext);
+ dxQuickStepIsland_Stage4LCP_ReorderPrep(stage4CallContext);
+
+ dxWorld *world = callContext->m_world;
+ const unsigned int num_iterations = world->qs.num_iterations;
+ for (unsigned int iteration=0; iteration < num_iterations; iteration++) {
+ if (IsSORConstraintsReorderRequiredForIteration(iteration)) {
+ stage4CallContext->ResetSOR_ConstraintsReorderVariables(0);
+ dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(stage4CallContext, iteration);
+ }
+ dxQuickStepIsland_Stage4LCP_STIteration(stage4CallContext);
+ }
+
+ dxQuickStepIsland_Stage4b(stage4CallContext);
+ dxQuickStepIsland_Stage5(stage5CallContext);
+ }
+ else
+ {
+ dxWorld *world = callContext->m_world;
+
+ dCallReleaseeID stage5CallReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage5CallReleasee, 1, callContext->m_finalReleasee,
+ NULL, &dxQuickStepIsland_Stage5_Callback, stage5CallContext, 0, "QuickStepIsland Stage5");
+
+ dCallReleaseeID stage4LCP_IterationSyncReleasee;
+ world->PostThreadedCall(NULL, &stage4LCP_IterationSyncReleasee, 1, stage5CallReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_IterationSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Sync");
+
+ unsigned int stage4LCP_Iteration_allowedThreads = CalculateOptimalThreadsCount<1U>(m, allowedThreads);
+ stage4CallContext->AssignLCP_IterationData(stage4LCP_IterationSyncReleasee, stage4LCP_Iteration_allowedThreads);
+
+ dCallReleaseeID stage4LCP_IterationStartReleasee;
+ world->PostThreadedCall(NULL, &stage4LCP_IterationStartReleasee, 3, stage4LCP_IterationSyncReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_IterationStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Start");
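+            // (The three dependencies of the iteration start call are expected to be
+            // released by the fc start, iMJ sync and ReorderPrep calls posted below.)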
+
+ unsigned int nj = localContext->m_nj;
+ unsigned int stage4a_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4A_STEP>(nj, allowedThreads);
+
+ dCallReleaseeID stage4LCP_fcStartReleasee;
+            // Note: It is unnecessary to make fc dependent on 4a if there is no WARM_STARTING.
+            // However, the dependency is kept anyway to minimize the use of preprocessor conditions in the sources.
+ unsigned stage4LCP_fcDependenciesCountToUse = stage4a_allowedThreads;
+#ifdef WARM_STARTING
+ // Posted with extra dependency to be removed from dxQuickStepIsland_Stage4LCP_iMJSync_Callback
+ stage4LCP_fcDependenciesCountToUse += 1;
+#endif
+ world->PostThreadedCall(NULL, &stage4LCP_fcStartReleasee, stage4LCP_fcDependenciesCountToUse, stage4LCP_IterationStartReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_fcStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_fc Start");
+#ifdef WARM_STARTING
+ stage4CallContext->AssignLCP_fcStartReleasee(stage4LCP_fcStartReleasee);
+#endif
+
+ unsigned stage4LCP_iMJ_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4LCP_IMJ_STEP>(m, allowedThreads);
+
+ dCallReleaseeID stage4LCP_iMJSyncReleasee;
+ world->PostThreadedCall(NULL, &stage4LCP_iMJSyncReleasee, stage4LCP_iMJ_allowedThreads, stage4LCP_IterationStartReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_iMJSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_iMJ Sync");
+
+ world->PostThreadedCall(NULL, NULL, 0, stage4LCP_IterationStartReleasee, NULL, &dxQuickStepIsland_Stage4LCP_ReorderPrep_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_ReorderPrep");
+ world->PostThreadedCallsGroup(NULL, stage4a_allowedThreads, stage4LCP_fcStartReleasee, &dxQuickStepIsland_Stage4a_Callback, stage4CallContext, "QuickStepIsland Stage4a");
+
+ if (stage4LCP_iMJ_allowedThreads > 1) {
+ world->PostThreadedCallsGroup(NULL, stage4LCP_iMJ_allowedThreads - 1, stage4LCP_iMJSyncReleasee, &dxQuickStepIsland_Stage4LCP_iMJ_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_iMJ");
+ }
+ dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext);
+ world->AlterThreadedCallDependenciesCount(stage4LCP_iMJSyncReleasee, -1);
+ }
+ }
+ else {
+ dxQuickStepIsland_Stage5(stage5CallContext);
+ }
+}
+
+static
+int dxQuickStepIsland_Stage4a_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4a(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4a(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ dReal *lambda = stage4CallContext->m_lambda;
+ const dxMIndexItem *mindex = localContext->m_mindex;
+#ifdef WARM_STARTING
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+#endif
+ unsigned int nj = localContext->m_nj;
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4A_STEP;
+ unsigned int nj_steps = (nj + (step_size - 1)) / step_size;
+
+ unsigned ji_step;
+ while ((ji_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_ji_4a, nj_steps)) != nj_steps) {
+ unsigned int ji = ji_step * step_size;
+ dReal *lambdacurr = lambda + mindex[ji].mIndex;
+#ifdef WARM_STARTING
+ const dJointWithInfo1 *jicurr = jointinfos + ji;
+ const dJointWithInfo1 *const jiend = jicurr + dMIN(step_size, nj - ji);
+
+ do {
+ const dReal *joint_lambdas = jicurr->joint->lambda;
+ dReal *const lambdsnext = lambdacurr + jicurr->info.m;
+
+ while (true) {
+ // for warm starting, multiplication by 0.9 seems to be necessary to prevent
+ // jerkiness in motor-driven joints. I have no idea why this works.
+ *lambdacurr = *joint_lambdas * 0.9;
+
+ if (++lambdacurr == lambdsnext) {
+ break;
+ }
+
+ ++joint_lambdas;
+ }
+ }
+ while (++jicurr != jiend);
+#else
+ dReal *lambdsnext = lambda + mindex[ji + dMIN(step_size, nj - ji)].mIndex;
+ dSetZero(lambdacurr, lambdsnext - lambdacurr);
+#endif
+ }
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_iMJ_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_iMJComputation(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ dReal *iMJ = stage4CallContext->m_iMJ;
+ unsigned int m = localContext->m_m;
+ dReal *J = localContext->m_J;
+ const dxJBodiesItem *jb = localContext->m_jb;
+ dxBody * const *body = callContext->m_islandBodiesStart;
+ dReal *invI = localContext->m_invI;
+
+ // precompute iMJ = inv(M)*J'
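+    // (Each iMJ row mirrors the layout of a J row: the body-1 and body-2 blocks of J
+    // with the linear parts scaled by the respective body's inverse mass and the
+    // angular parts multiplied by its inverse inertia tensor.)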
+ compute_invM_JT<dxQUICKSTEPISLAND_STAGE4LCP_IMJ_STEP>(&stage4CallContext->m_mi_iMJ, iMJ, m, J, jb, body, invI);
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_iMJSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ unsigned int m = localContext->m_m;
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+
+ unsigned int stage4LCP_Ad_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4LCP_AD_STEP>(m, allowedThreads);
+
+#ifdef WARM_STARTING
+ {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(stage4CallContext->m_LCP_fcStartReleasee, -1);
+ }
+#endif
+
+ if (stage4LCP_Ad_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_Ad_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage4LCP_Ad_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_Ad_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_Ad");
+ }
+ dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext);
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_fcStart_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ unsigned int fcPrepareComplexity, fcCompleteComplexity;
+#ifdef WARM_STARTING
+ fcPrepareComplexity = localContext->m_m / dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR;
+ fcCompleteComplexity = callContext->m_islandBodiesCount;
+#else
+ fcPrepareComplexity = localContext->m_m;
+ fcCompleteComplexity = 0;
+#endif
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ unsigned int stage4LCP_fcPrepare_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP>(fcPrepareComplexity, allowedThreads);
+ unsigned int stage4LCP_fcComplete_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP>(fcCompleteComplexity, allowedThreads);
+ stage4CallContext->AssignLCP_fcAllowedThreads(stage4LCP_fcPrepare_allowedThreads, stage4LCP_fcComplete_allowedThreads);
+
+#ifdef WARM_STARTING
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_warmZeroArrays(stage4CallContext);
+#endif
+
+ if (stage4LCP_fcPrepare_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_fcPrepare_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage4LCP_fcPrepare_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_fc_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_fc");
+ }
+ dxQuickStepIsland_Stage4LCP_MTfcComputation(stage4CallContext, callThisReleasee);
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_fc_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4LCP_MTfcComputation(stage4CallContext, callThisReleasee);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee)
+{
+#ifdef WARM_STARTING
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_warm(stage4CallContext, callThisReleasee);
+#else
+ (void)callThisReleasee; // unused
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_cold(stage4CallContext);
+#endif
+}
+
+#ifdef WARM_STARTING
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation_warm(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee)
+{
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_warmPrepare(stage4CallContext);
+
+ if (ThrsafeExchangeAdd(&stage4CallContext->m_LCP_fcPrepareThreadsRemaining, (atomicord32)(-1)) == 1) {
+ stage4CallContext->ResetLCP_fcComputationIndex();
+
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ unsigned int stage4LCP_fcComplete_allowedThreads = stage4CallContext->m_LCP_fcCompleteThreadsTotal;
+
+ if (stage4LCP_fcComplete_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_fcComplete_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage4LCP_fcComplete_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_fcWarmComplete_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_fcWarmComplete");
+ }
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(stage4CallContext);
+ }
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmZeroArrays(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+
+ unsigned int nb = callContext->m_islandBodiesCount;
+ atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;
+
+ multiply_invM_JT_init_array(nb, bi_links);
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmPrepare(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ unsigned int m = localContext->m_m;
+ const dxJBodiesItem *jb = localContext->m_jb;
+
+ // Prepare to compute fc=(inv(M)*J')*lambda. we will incrementally maintain fc
+ // as we change lambda.
+ multiply_invM_JT_prepare<dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_PREPARE>(&stage4CallContext->m_mi_fc, m, jb, stage4CallContext->m_bi_links_or_mi_levels, stage4CallContext->m_mi_links);
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_fcWarmComplete_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+
+ dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(stage4CallContext);
+
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ dReal *fc = stage4CallContext->m_cforce;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ dReal *iMJ = stage4CallContext->m_iMJ;
+ const dxJBodiesItem *jb = localContext->m_jb;
+ dReal *lambda = stage4CallContext->m_lambda;
+
+ // Complete computation of fc=(inv(M)*J')*lambda. we will incrementally maintain fc
+ // as we change lambda.
+ multiply_invM_JT_complete<dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_COMPLETE, CFE__DYNAMICS_MIN, CFE__MAX>(&stage4CallContext->m_mi_fc, fc, nb, iMJ, jb, lambda, stage4CallContext->m_bi_links_or_mi_levels, stage4CallContext->m_mi_links);
+}
+
+#else // #ifndef WARM_STARTING
+
+static
+void dxQuickStepIsland_Stage4LCP_MTfcComputation_cold(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+
+ dReal *fc = stage4CallContext->m_cforce;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP;
+ unsigned int nb_steps = (nb + (step_size - 1)) / step_size;
+
+ unsigned bi_step;
+ while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_mi_fc, nb_steps)) != nb_steps) {
+ unsigned int bi = bi_step * step_size;
+ unsigned int bicnt = dMIN(step_size, nb - bi);
+ dSetZero(fc + (sizeint)bi * CFE__MAX, (sizeint)bicnt * CFE__MAX);
+ }
+}
+
+#endif // #ifndef WARM_STARTING
+
+
+static
+void dxQuickStepIsland_Stage4LCP_STfcComputation(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+#ifdef WARM_STARTING
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ dReal *fc = stage4CallContext->m_cforce;
+ unsigned int m = localContext->m_m;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ dReal *iMJ = stage4CallContext->m_iMJ;
+ const dxJBodiesItem *jb = localContext->m_jb;
+ dReal *lambda = stage4CallContext->m_lambda;
+
+ // compute fc=(inv(M)*J')*lambda. we will incrementally maintain fc
+ // as we change lambda.
+ _multiply_invM_JT<CFE__DYNAMICS_MIN, CFE__MAX>(fc, m, nb, iMJ, jb, lambda);
+#else
+ dReal *fc = stage4CallContext->m_cforce;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ unsigned int nb = callContext->m_islandBodiesCount;
+
+ dSetZero(fc, (sizeint)nb * CFE__MAX);
+#endif
+
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_Ad_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_AdComputation(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ const dxJBodiesItem *jb = localContext->m_jb;
+ dReal *J = localContext->m_J;
+ unsigned int m = localContext->m_m;
+
+ dxWorld *world = callContext->m_world;
+ dxQuickStepParameters *qs = &world->qs;
+ const dReal sor_w = qs->w; // SOR over-relaxation parameter
+
+ dReal *iMJ = stage4CallContext->m_iMJ;
+
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4LCP_AD_STEP;
+ unsigned int m_steps = (m + (step_size - 1)) / step_size;
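+    // For each row i the loop below computes Ad[i] = w / (J_i * inv(M) * J_i' + cfm[i])
+    // and pre-scales that row's J, rhs and cfm by Ad[i], so the iteration step does not
+    // have to divide by the effective diagonal on every sweep.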
+
+ unsigned mi_step;
+ while ((mi_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_mi_Ad, m_steps)) != m_steps) {
+ unsigned int mi = mi_step * step_size;
+ const unsigned int miend = mi + dMIN(step_size, m - mi);
+
+ const dReal *iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX;
+ dReal *J_ptr = J + (sizeint)mi * JME__MAX;
+ while (true) {
+ dReal sum = REAL(0.0);
+ {
+ for (unsigned int j = JVE__MIN; j != JVE__MAX; ++j) sum += iMJ_ptr[IMJ__1_MIN + j] * J_ptr[JME__J1_MIN + j];
+ dSASSERT(JME__J1_COUNT == (int)JVE__MAX);
+ }
+
+ int b2 = jb[mi].second;
+ if (b2 != -1) {
+ for (unsigned int k = JVE__MIN; k != JVE__MAX; ++k) sum += iMJ_ptr[IMJ__2_MIN + k] * J_ptr[JME__J2_MIN + k];
+ dSASSERT(JME__J2_COUNT == (int)JVE__MAX);
+ }
+
+ dReal cfm_i = J_ptr[JME_CFM];
+ dReal Ad_i = sor_w / (sum + cfm_i);
+
+            // NOTE: This may seem unnecessary, but it is indeed an optimization
+            // to move the multiplications by Ad[i] and cfm[i] out of the iteration loop.
+
+ // scale cfm, J and b by Ad
+ J_ptr[JME_CFM] = cfm_i * Ad_i;
+ J_ptr[JME_RHS] *= Ad_i;
+
+ {
+ for (unsigned int j = JVE__MIN; j != JVE__MAX; ++j) J_ptr[JME__J1_MIN + j] *= Ad_i;
+ dSASSERT(JME__J1_COUNT == (int)JVE__MAX);
+ }
+
+ if (b2 != -1) {
+ for (unsigned int k = JVE__MIN; k != JVE__MAX; ++k) J_ptr[JME__J2_MIN + k] *= Ad_i;
+ dSASSERT(JME__J2_COUNT == (int)JVE__MAX);
+ }
+
+ if (++mi == miend) {
+ break;
+ }
+ iMJ_ptr += IMJ__MAX;
+ J_ptr += JME__MAX;
+ }
+ }
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_ReorderPrep_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4LCP_ReorderPrep(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_ReorderPrep(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+ unsigned int m = localContext->m_m;
+ unsigned int valid_findices = localContext->m_valid_findices;
+
+ IndexError *order = stage4CallContext->m_order;
+
+ {
+ // make sure constraints with findex < 0 come first.
+ IndexError *orderhead = order, *ordertail = order + (m - valid_findices);
+ const int *findex = localContext->m_findex;
+
+ // Fill the array from both ends
+ for (unsigned int i = 0; i != m; ++i) {
+ if (findex[i] == -1) {
+ orderhead->index = i; // Place them at the front
+ ++orderhead;
+ } else {
+ ordertail->index = i; // Place them at the end
+ ++ordertail;
+ }
+ }
+ dIASSERT(orderhead == order + (m - valid_findices));
+ dIASSERT(ordertail == order + m);
+ }
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_IterationStart_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+
+ dxWorld *world = callContext->m_world;
+ dxQuickStepParameters *qs = &world->qs;
+
+ const unsigned int num_iterations = qs->num_iterations;
+ unsigned iteration = stage4CallContext->m_LCP_iteration;
+
+ if (iteration < num_iterations)
+ {
+ dCallReleaseeID nextReleasee;
+ dCallReleaseeID stage4LCP_IterationSyncReleasee = stage4CallContext->m_LCP_IterationSyncReleasee;
+ unsigned int stage4LCP_Iteration_allowedThreads = stage4CallContext->m_LCP_IterationAllowedThreads;
+
+ bool reorderRequired = false;
+
+ if (IsSORConstraintsReorderRequiredForIteration(iteration))
+ {
+ reorderRequired = true;
+ }
+
+ unsigned syncCallDependencies = reorderRequired ? 1 : stage4LCP_Iteration_allowedThreads;
+
+        // Increment the iteration counter in advance, since it must be incremented
+        // before the independent tasks (the reordering or the iteration) are posted
+        // (otherwise the next iteration may complete before the increment
+        // and the same iteration index may be used again).
+ stage4CallContext->m_LCP_iteration = iteration + 1;
+
+ if (iteration + 1 != num_iterations) {
+ dCallReleaseeID stage4LCP_IterationStartReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage4LCP_IterationStartReleasee, syncCallDependencies, stage4LCP_IterationSyncReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_IterationStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Start");
+ nextReleasee = stage4LCP_IterationStartReleasee;
+ }
+ else {
+ world->AlterThreadedCallDependenciesCount(stage4LCP_IterationSyncReleasee, syncCallDependencies);
+ nextReleasee = stage4LCP_IterationSyncReleasee;
+ }
+
+ if (reorderRequired) {
+ const unsigned int reorderThreads = 2;
+ dIASSERT(callContext->m_stepperAllowedThreads >= 2); // Otherwise the single-threaded execution path would be taken
+
+ stage4CallContext->ResetSOR_ConstraintsReorderVariables(reorderThreads);
+
+ dCallReleaseeID stage4LCP_ConstraintsReorderingSyncReleasee;
+ world->PostThreadedCall(NULL, &stage4LCP_ConstraintsReorderingSyncReleasee, reorderThreads, nextReleasee,
+ NULL, &dxQuickStepIsland_Stage4LCP_ConstraintsReorderingSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_ConstraintsReordering Sync");
+
+ if (reorderThreads > 1) {
+ world->PostThreadedCallsGroup(NULL, reorderThreads - 1, stage4LCP_ConstraintsReorderingSyncReleasee, &dxQuickStepIsland_Stage4LCP_ConstraintsReordering_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_ConstraintsReordering");
+ }
+ dxQuickStepIsland_Stage4LCP_ConstraintsReordering(stage4CallContext);
+ world->AlterThreadedCallDependenciesCount(stage4LCP_ConstraintsReorderingSyncReleasee, -1);
+ }
+ else {
+ dIASSERT(iteration != 0); {
+ dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(stage4CallContext);
+ }
+
+ stage4CallContext->RecordLCP_IterationStart(stage4LCP_Iteration_allowedThreads, nextReleasee);
+
+ unsigned knownToBeCompletedLevel = dxHEAD_INDEX;
+ if (stage4LCP_Iteration_allowedThreads > 1) {
+ world->PostThreadedCallsIndexOverridenGroup(NULL, stage4LCP_Iteration_allowedThreads - 1, nextReleasee, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration");
+ }
+ dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, knownToBeCompletedLevel);
+ world->AlterThreadedCallDependenciesCount(nextReleasee, -1);
+ }
+ }
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_ConstraintsReordering_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4LCP_ConstraintsReordering(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_ConstraintsReordering(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ unsigned int iteration = stage4CallContext->m_LCP_iteration - 1; // Iteration is pre-incremented before scheduled tasks are released for execution
+ if (dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(stage4CallContext, iteration)) {
+
+ dxQuickStepIsland_Stage4LCP_LinksArraysZeroing(stage4CallContext);
+ if (ThrsafeExchangeAdd(&stage4CallContext->m_SOR_reorderThreadsRemaining, (atomicord32)(-1)) == 1) { // If last thread has exited the reordering routine...
+ // Rebuild the object dependency map
+ dxQuickStepIsland_Stage4LCP_DependencyMapForNewOrderRebuilding(stage4CallContext);
+ }
+ }
+ else {
+        // NOTE: So far, this branch is only taken in the CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR case
+ if (ThrsafeExchangeAdd(&stage4CallContext->m_SOR_reorderThreadsRemaining, (atomicord32)(-1)) == 1) { // If last thread has exited the reordering routine...
+ dIASSERT(iteration != 0);
+ dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(stage4CallContext);
+ }
+ }
+}
+
+static
+bool dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int iteration)
+{
+ bool result = false;
+
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+
+ struct ConstraintsReorderingHelper
+ {
+ void operator ()(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int startIndex, unsigned int endIndex)
+ {
+ const dReal *lambda = stage4CallContext->m_lambda;
+ dReal *last_lambda = stage4CallContext->m_last_lambda;
+ IndexError *order = stage4CallContext->m_order;
+
+ for (unsigned int index = startIndex; index != endIndex; ++index) {
+ unsigned int i = order[index].index;
+ dReal lambda_i = lambda[i];
+ if (lambda_i != REAL(0.0)) {
+ //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max;
+ order[index].error = dFabs(lambda_i - last_lambda[i]);
+ }
+ else if (last_lambda[i] != REAL(0.0)) {
+ //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max;
+ order[index].error = dFabs(/*lambda_i - */last_lambda[i]); // lambda_i == 0
+ }
+ else {
+ order[index].error = dInfinity;
+ }
+ // Finally copy the lambda for the next iteration
+ last_lambda[i] = lambda_i;
+ }
+ qsort (order + startIndex, endIndex - startIndex, sizeof(IndexError), &compare_index_error);
+ }
+ };
+
+ if (iteration > 1) { // Only reorder starting from iteration #2
+ // sort the constraints so that the ones converging slowest
+ // get solved last. use the absolute (not relative) error.
+        /*
+         * A full reorder needs to be done.
+         * Even though this contradicts the initial idea of moving dependent constraints
+         * to the end of the order, the algorithm does not work well the other way.
+         * It looks like the iterative method needs a shake-up after it has already found
+         * some initial approximations, and the errors thus incurred help it converge even better.
+         */
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) {
+ // Process the head
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+ ConstraintsReorderingHelper()(stage4CallContext, 0, localContext->m_m);
+ }
+
+ result = true;
+ }
+ else if (iteration == 1) {
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) {
+ // Process the first half
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+ unsigned int startIndex = 0;
+ unsigned int indicesCount = localContext->m_m / 2;
+ // Just copy the lambdas for the next iteration
+ memcpy(stage4CallContext->m_last_lambda + startIndex, stage4CallContext->m_lambda + startIndex, indicesCount * sizeof(dReal));
+ }
+
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderTailTaken, 1) == 0) {
+ // Process the second half
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+ unsigned int startIndex = localContext->m_m / 2;
+ unsigned int indicesCount = localContext->m_m - startIndex;
+ // Just copy the lambdas for the next iteration
+ memcpy(stage4CallContext->m_last_lambda + startIndex, stage4CallContext->m_lambda + startIndex, indicesCount * sizeof(dReal));
+ }
+
+ // result = false; -- already 'false'
+ }
+ else /*if (iteration < 1) */{
+ result = true; // return true on 0th iteration to build dependency map for the initial order
+ }
+
+
+#elif CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY
+
+ if (iteration != 0) {
+ dIASSERT(!dIN_RANGE(iteration, 0, RANDOM_CONSTRAINTS_REORDERING_FREQUENCY));
+
+ dIASSERT(iteration % RANDOM_CONSTRAINTS_REORDERING_FREQUENCY == RRS_REORDERING); {
+ struct ConstraintsReorderingHelper
+ {
+ void operator ()(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int startIndex, unsigned int indicesCount)
+ {
+ IndexError *order = stage4CallContext->m_order + startIndex;
+
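+                    // In-place Fisher-Yates style shuffle of this slice of order[]:
+                    // each element is swapped with a randomly chosen earlier position (inclusive).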
+ for (unsigned int index = 1; index < indicesCount; ++index) {
+ int swapIndex = dRandInt(index + 1);
+ IndexError tmp = order[index];
+ order[index] = order[swapIndex];
+ order[swapIndex] = tmp;
+ }
+ }
+ };
+
+            /*
+             * A full reorder needs to be done.
+             * Even though this contradicts the initial idea of moving dependent constraints
+             * to the end of the order, the algorithm does not work well the other way.
+             * It looks like the iterative method needs a shake-up after it has already found
+             * some initial approximations, and the errors thus incurred help it converge even better.
+             */
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) {
+ // Process the head
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+ ConstraintsReorderingHelper()(stage4CallContext, 0, localContext->m_m);
+ }
+ }
+ dIASSERT((RRS__MAX, true)); // A reference to RRS__MAX to be located by Find in Files
+ }
+ else {
+ // Just return true and skip the randomization for the very first iteration
+ }
+
+ result = true;
+
+#else // #if CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__BY_ERROR && CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__RANDOMLY
+
+ dIASSERT(iteration == 0); // The reordering request is only returned for the first iteration
+ result = true;
+
+
+#endif
+
+ return result;
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_LinksArraysZeroing(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
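+    // Each links array is zeroed in two halves; the Thrsafe flags below ensure every
+    // half is claimed by exactly one of the threads executing the reordering.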
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_bi_zeroHeadTaken, 1) == 0) {
+ atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/
+ unsigned int nb = callContext->m_islandBodiesCount;
+ memset(bi_links, 0, sizeof(bi_links[0]) * (nb / 2));
+ }
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_bi_zeroTailTaken, 1) == 0) {
+ atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/
+ unsigned int nb = callContext->m_islandBodiesCount;
+ memset(bi_links + nb / 2, 0, sizeof(bi_links[0]) * (nb - nb / 2));
+ }
+
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_mi_zeroHeadTaken, 1) == 0) {
+ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/
+ unsigned int m = localContext->m_m;
+ memset(mi_links, 0, sizeof(mi_links[0]) * (m + 1));
+ }
+ if (ThrsafeExchange(&stage4CallContext->m_SOR_mi_zeroTailTaken, 1) == 0) {
+ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/
+ unsigned int m = localContext->m_m;
+ memset(mi_links + (m + 1), 0, sizeof(mi_links[0]) * (m + 1));
+ }
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_DependencyMapForNewOrderRebuilding(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/
+ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/
+
+ IndexError *order = stage4CallContext->m_order;
+ const dxJBodiesItem *jb = localContext->m_jb;
+
+ unsigned int m = localContext->m_m;
+ for (unsigned int i = 0; i != m; ++i) {
+ unsigned int index = order[i].index;
+
+ int b1 = jb[index].first;
+ int b2 = jb[index].second;
+
+        unsigned int encoded_i = dxENCODE_INDEX(i);
+
+        unsigned int encoded_depi = bi_links[(unsigned int)b1];
+        bi_links[(unsigned int)b1] = encoded_i;
+
+        if (b2 != -1 && b2 != b1) {
+            if (encoded_depi < (unsigned int)bi_links[(unsigned int)b2]) {
+                encoded_depi = bi_links[(unsigned int)b2];
+            }
+            bi_links[(unsigned int)b2] = encoded_i;
+        }
+
+        // OD: There is also a dependency on findex[index];
+        // however, findex can only refer to rows of the same joint,
+        // and hence that row has the same bodies. Since the
+        // indices are sorted in a way that the meaningful findex values
+        // always come last, the dependency on findex[index] is going to
+        // be implicitly satisfied via the matching bodies at smaller "i"s.
+
+        // Check that the dependency targets an earlier "i"
+        dIASSERT(encoded_depi < encoded_i);
+
+        unsigned encoded_downi = mi_links[(sizeint)encoded_depi * 2 + 1];
+        mi_links[(sizeint)encoded_depi * 2 + 1] = encoded_i; // Link i as a down-dependency of depi
+        mi_links[(sizeint)encoded_i * 2 + 0] = encoded_downi; // Link the previous down-chain as the level-dependency with i
+ }
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ atomicord32 *mi_levels = stage4CallContext->m_bi_links_or_mi_levels;/*=[m]*/
+ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/
+
+ // NOTE!
+ // OD: The mi_links array is not zero-filled before the reconstruction.
+ // Iteration ends with all the down links zeroed. And since down links
+ // are moved to the next level links when parent-child relations are established,
+ // the horizontal levels are properly terminated.
+ // The leaf nodes had their links zero-initialized initially
+ // and those zeros remain intact during the solving. This way the down links
+ // are properly terminated as well.
+    // This is rather obscure and error prone and would deserve at least an assertion check,
+    // but the simplest assertion approach imaginable would be zero-filling and
+    // building another tree, with a memory buffer comparison afterwards.
+    // That would obviously be overkill.
+    //
+    // NOTE!
+    // OD: This routine could be threaded. However, having two threads messing around
+    // in one integer array with random access, kicking each other's memory lines
+    // out of cache, would probably work worse than letting a single thread do the whole job.
+ unsigned int m = localContext->m_m;
+ for (unsigned int i = 0; i != m; ++i) {
+ unsigned int currentLevelRoot = mi_levels[i];
+ unsigned int currentLevelFirstLink = mi_links[2 * (sizeint)currentLevelRoot + 1];
+ unsigned int encoded_i = dxENCODE_INDEX(i);
+ mi_links[2 * (sizeint)currentLevelRoot + 1] = encoded_i;
+ mi_links[2 * (sizeint)encoded_i + 0] = currentLevelFirstLink;
+ }
+
+ // Additionally reset available level root's list head
+ mi_links[2 * dxHEAD_INDEX + 0] = dxHEAD_INDEX;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_ConstraintsReorderingSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ unsigned int stage4LCP_Iteration_allowedThreads = stage4CallContext->m_LCP_IterationAllowedThreads;
+
+ stage4CallContext->RecordLCP_IterationStart(stage4LCP_Iteration_allowedThreads, callThisReleasee);
+
+ unsigned knownToBeCompletedLevel = dxHEAD_INDEX;
+ if (stage4LCP_Iteration_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_Iteration_allowedThreads - 1);
+ world->PostThreadedCallsIndexOverridenGroup(NULL, stage4LCP_Iteration_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration");
+ }
+ dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, knownToBeCompletedLevel);
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_Iteration_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ unsigned int initiallyKnownToBeCompletedLevel = (unsigned int)callInstanceIndex;
+ dIASSERT(initiallyKnownToBeCompletedLevel == callInstanceIndex); // A truncation check...
+
+ dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, initiallyKnownToBeCompletedLevel);
+ return 1;
+}
+
+/*
+ * +0 +0
+ * Root───┬─────────────────┬──...
+ * +1│ +1│
+ * ┌┴┐+0 ┌─┐+0 .
+ * │A├─────┤B├─...
+ * └┬┘ └┬┘
+ * +1│ +1│
+ * ┌┴┐+0 .
+ * │C├─...
+ * └┬┘
+ * +1│
+ * .
+ *
+ * Lower tree levels depend on their parents. Same level nodes are independent with respect to each other.
+ *
+ * 1. B is linked in place of A
+ * 2. A is processed
+ * 3. C is inserted at the Root level
+ *
+ * The tree starts with a single child subtree at the root level ("down" link of slot #0 is used for that).
+ * Then, additional "C" nodes are added to the root level by building horizontal links via the slots of
+ * their former parent "A"s that have become free.
+ * The "level" link of slot #0 is used to find the root level head.
+ *
+ * Since the tree is altered during iteration, mi_levels records each node's parent so that the tree can be reconstructed.
+ */
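+// A sketch of the mi_links encoding assumed above and below: mi_links is treated as
+// an array of pairs, with mi_links[2*k + 0] acting as the "next entry on the same level"
+// link and mi_links[2*k + 1] as the "first child" (down) link of entry k; slot
+// dxHEAD_INDEX serves as the list head, and all stored indices are encoded and decoded
+// via dxENCODE_INDEX()/dxDECODE_INDEX().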
+static
+void dxQuickStepIsland_Stage4LCP_MTIteration(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int initiallyKnownToBeCompletedLevel)
+{
+ atomicord32 *mi_levels = stage4CallContext->m_bi_links_or_mi_levels;
+ atomicord32 *mi_links = stage4CallContext->m_mi_links;
+
+ unsigned int knownToBeCompletedLevel = initiallyKnownToBeCompletedLevel;
+
+ while (true) {
+ unsigned int initialLevelRoot = mi_links[2 * dxHEAD_INDEX + 0];
+ if (initialLevelRoot != dxHEAD_INDEX && initialLevelRoot == knownToBeCompletedLevel) {
+ // No work is (currently) available
+ break;
+ }
+
+ for (unsigned int currentLevelRoot = initialLevelRoot; ; currentLevelRoot = mi_links[2 * (sizeint)currentLevelRoot + 0]) {
+ while (true) {
+ const unsigned invalid_link = dxENCODE_INDEX(-1);
+
+ unsigned currentLevelFirstLink = mi_links[2 * (sizeint)currentLevelRoot + 1];
+ if (currentLevelFirstLink == invalid_link) {
+ break;
+ }
+
+ // Try to extract first record from linked list
+ unsigned currentLevelNextLink = mi_links[2 * (sizeint)currentLevelFirstLink + 0];
+ if (ThrsafeCompareExchange(&mi_links[2 * (sizeint)currentLevelRoot + 1], currentLevelFirstLink, currentLevelNextLink)) {
+ // if succeeded, execute selected iteration step...
+ dxQuickStepIsland_Stage4LCP_IterationStep(stage4CallContext, dxDECODE_INDEX(currentLevelFirstLink));
+
+ // Check if there are any dependencies
+ unsigned level0DownLink = mi_links[2 * (sizeint)currentLevelFirstLink + 1];
+ if (level0DownLink != invalid_link) {
+ // ...and if yes, insert the record into the list of available level roots
+ unsigned int levelRootsFirst;
+ do {
+ levelRootsFirst = mi_links[2 * dxHEAD_INDEX + 0];
+ mi_links[2 * (sizeint)currentLevelFirstLink + 0] = levelRootsFirst;
+ }
+ while (!ThrsafeCompareExchange(&mi_links[2 * dxHEAD_INDEX + 0], levelRootsFirst, currentLevelFirstLink));
+
+ // If another level was added and some threads have already exited...
+ unsigned int threadsTotal = stage4CallContext->m_LCP_iterationThreadsTotal;
+ unsigned int threadsRemaining = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_LCP_iterationThreadsRemaining, threadsTotal);
+ if (threadsRemaining != threadsTotal) {
+                        // ...go on and schedule one more...
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ dxWorld *world = callContext->m_world;
+ // ...passing knownToBeCompletedLevel as the initial one for the spawned call
+ world->PostThreadedCallForUnawareReleasee(NULL, NULL, 0, stage4CallContext->m_LCP_iterationNextReleasee, NULL, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration");
+                        // NOTE: it is hard to predict whether it is reasonable to re-post a call
+                        // each time a new level is added (provided some calls have already exited, of course).
+                        // The efficiency very much depends on the dependency patterns between levels
+                        // (i.e. on the amount of available work added with each level).
+                        // The strategy of re-posting exited calls as frequently as possible
+                        // potentially wastes execution cycles in some cores for the sake
+                        // of keeping the other cores as busy as possible and not letting all the
+                        // work be executed by just a subset of the cores. When large amounts of
+                        // available work emerge (work that does not depend on anything and is
+                        // ready to be executed immediately), this strategy transitions into
+                        // the full set of cores being busy executing useful work. If the amounts of work
+                        // emerging from added levels are small, the strategy should lead to
+                        // approximately the same efficiency as if the work were done by only a subset of the cores,
+                        // with the remaining cores wasting (some) cycles re-scheduling calls
+                        // to those busy cores rather than being idle or handling other islands.
+ }
+ }
+
+ // Finally record the root index of current record's level
+ mi_levels[dxDECODE_INDEX(currentLevelFirstLink)] = currentLevelRoot;
+ }
+ }
+
+ if (currentLevelRoot == knownToBeCompletedLevel) {
+ break;
+ }
+ dIASSERT(currentLevelRoot != dxHEAD_INDEX); // Zero level is expected to be the deepest one in the list and execution must not loop past it.
+ }
+ // Save the level root we started from as known to be completed
+ knownToBeCompletedLevel = initialLevelRoot;
+ }
+
+ // Decrement running threads count on exit
+ ThrsafeAdd(&stage4CallContext->m_LCP_iterationThreadsRemaining, (atomicord32)(-1));
+}
+
+static
+void dxQuickStepIsland_Stage4LCP_STIteration(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ unsigned int m = localContext->m_m;
+ for (unsigned int i = 0; i != m; ++i) {
+ dxQuickStepIsland_Stage4LCP_IterationStep(stage4CallContext, i);
+ }
+}
+
+//***************************************************************************
+// SOR-LCP method
+
+// nb is the number of bodies in the body array.
+// J is an m*16 matrix of constraint rows with rhs, cfm, lo and hi in padding
+// jb is an array of first and second body numbers for each constraint row
+// invI is the global frame inverse inertia for each body (stacked 3x3 matrices)
+//
+// this returns lambda and fc (the constraint force).
+// note: fc is returned as inv(M)*J'*lambda, the constraint force is actually J'*lambda
+//
+// b, lo and hi are modified on exit
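+//
+// Informally, each per-row step of the iteration below amounts to (with rhs, cfm and
+// the J row already pre-scaled by Ad[i] in dxQuickStepIsland_Stage4LCP_AdComputation):
+//   delta     = rhs[i] - lambda[i]*cfm[i] - J_i . fc(b1,b2)
+//   lambda[i] = clamp(lambda[i] + delta, lo_act, hi_act)  (delta adjusted to the clamped change)
+//   fc(b1)   += delta * iMJ_i(b1);   fc(b2) += delta * iMJ_i(b2)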
+
+static
+void dxQuickStepIsland_Stage4LCP_IterationStep(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int i)
+{
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ IndexError *order = stage4CallContext->m_order;
+ unsigned int index = order[i].index;
+
+ dReal *fc_ptr1;
+ dReal *fc_ptr2 = NULL;
+ dReal delta;
+
+ dReal *lambda = stage4CallContext->m_lambda;
+ dReal old_lambda = lambda[index];
+
+ dReal *J = localContext->m_J;
+ const dReal *J_ptr = J + (sizeint)index * JME__MAX;
+
+ {
+ delta = J_ptr[JME_RHS] - old_lambda * J_ptr[JME_CFM];
+
+ dReal *fc = stage4CallContext->m_cforce;
+
+ const dxJBodiesItem *jb = localContext->m_jb;
+ int b2 = jb[index].second;
+ int b1 = jb[index].first;
+
+ // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case
+ fc_ptr1 = fc + (sizeint)(unsigned)b1 * CFE__MAX;
+ delta -= fc_ptr1[CFE_LX] * J_ptr[JME_J1LX] + fc_ptr1[CFE_LY] * J_ptr[JME_J1LY] +
+ fc_ptr1[CFE_LZ] * J_ptr[JME_J1LZ] + fc_ptr1[CFE_AX] * J_ptr[JME_J1AX] +
+ fc_ptr1[CFE_AY] * J_ptr[JME_J1AY] + fc_ptr1[CFE_AZ] * J_ptr[JME_J1AZ];
+ // @@@ potential optimization: handle 1-body constraints in a separate
+ // loop to avoid the cost of test & jump?
+ if (b2 != -1) {
+ fc_ptr2 = fc + (sizeint)(unsigned)b2 * CFE__MAX;
+ delta -= fc_ptr2[CFE_LX] * J_ptr[JME_J2LX] + fc_ptr2[CFE_LY] * J_ptr[JME_J2LY] +
+ fc_ptr2[CFE_LZ] * J_ptr[JME_J2LZ] + fc_ptr2[CFE_AX] * J_ptr[JME_J2AX] +
+ fc_ptr2[CFE_AY] * J_ptr[JME_J2AY] + fc_ptr2[CFE_AZ] * J_ptr[JME_J2AZ];
+ }
+ }
+
+ {
+ dReal hi_act, lo_act;
+
+ // set the limits for this constraint.
+ // this is the place where the QuickStep method differs from the
+ // direct LCP solving method, since that method only performs this
+        // limit adjustment once per time step, whereas this method performs it
+        // once per iteration per constraint row.
+ // the constraints are ordered so that all lambda[] values needed have
+ // already been computed.
+ const int *findex = localContext->m_findex;
+ if (findex[index] != -1) {
+ hi_act = dFabs (J_ptr[JME_HI] * lambda[(unsigned)findex[index]]);
+ lo_act = -hi_act;
+ } else {
+ hi_act = J_ptr[JME_HI];
+ lo_act = J_ptr[JME_LO];
+ }
+
+ // compute lambda and clamp it to [lo,hi].
+ // @@@ potential optimization: does SSE have clamping instructions
+ // to save test+jump penalties here?
+ dReal new_lambda = old_lambda + delta;
+ if (new_lambda < lo_act) {
+ delta = lo_act - old_lambda;
+ lambda[index] = lo_act;
+ }
+ else if (new_lambda > hi_act) {
+ delta = hi_act - old_lambda;
+ lambda[index] = hi_act;
+ }
+ else {
+ lambda[index] = new_lambda;
+ }
+ }
+
+ //@@@ a trick that may or may not help
+ //dReal ramp = (1-((dReal)(iteration+1)/(dReal)num_iterations));
+ //delta *= ramp;
+
+ {
+ dReal *iMJ = stage4CallContext->m_iMJ;
+ const dReal *iMJ_ptr = iMJ + (sizeint)index * IMJ__MAX;
+ // update fc.
+ // @@@ potential optimization: SIMD for this and the b2 >= 0 case
+ fc_ptr1[CFE_LX] += delta * iMJ_ptr[IMJ_1LX];
+ fc_ptr1[CFE_LY] += delta * iMJ_ptr[IMJ_1LY];
+ fc_ptr1[CFE_LZ] += delta * iMJ_ptr[IMJ_1LZ];
+ fc_ptr1[CFE_AX] += delta * iMJ_ptr[IMJ_1AX];
+ fc_ptr1[CFE_AY] += delta * iMJ_ptr[IMJ_1AY];
+ fc_ptr1[CFE_AZ] += delta * iMJ_ptr[IMJ_1AZ];
+ // @@@ potential optimization: handle 1-body constraints in a separate
+ // loop to avoid the cost of test & jump?
+ if (fc_ptr2) {
+ fc_ptr2[CFE_LX] += delta * iMJ_ptr[IMJ_2LX];
+ fc_ptr2[CFE_LY] += delta * iMJ_ptr[IMJ_2LY];
+ fc_ptr2[CFE_LZ] += delta * iMJ_ptr[IMJ_2LZ];
+ fc_ptr2[CFE_AX] += delta * iMJ_ptr[IMJ_2AX];
+ fc_ptr2[CFE_AY] += delta * iMJ_ptr[IMJ_2AY];
+ fc_ptr2[CFE_AZ] += delta * iMJ_ptr[IMJ_2AZ];
+ }
+ }
+}
+
+static inline
+bool IsStage4bJointInfosIterationRequired(const dxQuickStepperLocalContext *localContext)
+{
+ return
+#ifdef WARM_STARTING
+ true ||
+#endif
+ localContext->m_mfb > 0;
+}
+
+static
+int dxQuickStepIsland_Stage4LCP_IterationSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ unsigned int stage4b_allowedThreads = 1;
+ if (IsStage4bJointInfosIterationRequired(localContext)) {
+ unsigned int allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads >= stage4b_allowedThreads);
+ stage4b_allowedThreads += CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE4B_STEP>(localContext->m_nj, allowedThreads - stage4b_allowedThreads);
+ }
+
+ if (stage4b_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4b_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage4b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4b_Callback, stage4CallContext, "QuickStepIsland Stage4b");
+ }
+ dxQuickStepIsland_Stage4b(stage4CallContext);
+
+ return 1;
+}
+
+static
+int dxQuickStepIsland_Stage4b_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
+ dxQuickStepIsland_Stage4b(stage4CallContext);
+ return 1;
+}
+
+static
+void dxQuickStepIsland_Stage4b(dxQuickStepperStage4CallContext *stage4CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext;
+
+ if (ThrsafeExchange(&stage4CallContext->m_cf_4b, 1) == 0) {
+ dxBody * const *body = callContext->m_islandBodiesStart;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ const dReal *cforce = stage4CallContext->m_cforce;
+ dReal stepsize = callContext->m_stepSize;
+ // add stepsize * cforce to the body velocity
+ const dReal *cforcecurr = cforce;
+ dxBody *const *const bodyend = body + nb;
+ for (dxBody *const *bodycurr = body; bodycurr != bodyend; cforcecurr += CFE__MAX, bodycurr++) {
+ dxBody *b = *bodycurr;
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; j++) {
+ b->lvel[dV3E__AXES_MIN + j] += stepsize * cforcecurr[CFE__L_MIN + j];
+ b->avel[dV3E__AXES_MIN + j] += stepsize * cforcecurr[CFE__A_MIN + j];
+ }
+ }
+ }
+
+
+ // note that the SOR method overwrites rhs and J at this point, so
+ // they should not be used again.
+
+ if (IsStage4bJointInfosIterationRequired(localContext)) {
+ dReal data[JVE__MAX];
+ const dReal *Jcopy = localContext->m_Jcopy;
+ const dReal *lambda = stage4CallContext->m_lambda;
+ const dxMIndexItem *mindex = localContext->m_mindex;
+ dJointWithInfo1 *jointinfos = localContext->m_jointinfos;
+
+ unsigned int nj = localContext->m_nj;
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4B_STEP;
+ unsigned int nj_steps = (nj + (step_size - 1)) / step_size;
+
+ unsigned ji_step;
+ while ((ji_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_ji_4b, nj_steps)) != nj_steps) {
+ unsigned int ji = ji_step * step_size;
+ const unsigned int jiend = ji + dMIN(step_size, nj - ji);
+
+ const dReal *Jcopycurr = Jcopy + (sizeint)mindex[ji].fbIndex * JCE__MAX;
+
+ while (true) {
+ // straightforward computation of joint constraint forces:
+ // multiply related lambdas with respective J' block for joints
+ // where feedback was requested
+ const unsigned int fb_infom = mindex[ji + 1].fbIndex - mindex[ji].fbIndex;
+ if (fb_infom != 0) {
+ dIASSERT(fb_infom == mindex[ji + 1].mIndex - mindex[ji].mIndex);
+
+ const dReal *lambdacurr = lambda + mindex[ji].mIndex;
+ dxJoint *joint = jointinfos[ji].joint;
+
+#ifdef WARM_STARTING
+ memcpy(joint->lambda, lambdacurr, fb_infom * sizeof(dReal));
+#endif
+
+ dJointFeedback *fb = joint->feedback;
+
+ if (joint->node[1].body) {
+ Multiply1_12q1 (data, Jcopycurr + JCE__J2_MIN, lambdacurr, fb_infom);
+ dSASSERT(JCE__MAX == 12);
+
+ fb->f2[dSA_X] = data[JVE_LX];
+ fb->f2[dSA_Y] = data[JVE_LY];
+ fb->f2[dSA_Z] = data[JVE_LZ];
+ fb->t2[dSA_X] = data[JVE_AX];
+ fb->t2[dSA_Y] = data[JVE_AY];
+ fb->t2[dSA_Z] = data[JVE_AZ];
+ }
+
+ Multiply1_12q1 (data, Jcopycurr + JCE__J1_MIN, lambdacurr, fb_infom);
+ dSASSERT(JCE__MAX == 12);
+
+ fb->f1[dSA_X] = data[JVE_LX];
+ fb->f1[dSA_Y] = data[JVE_LY];
+ fb->f1[dSA_Z] = data[JVE_LZ];
+ fb->t1[dSA_X] = data[JVE_AX];
+ fb->t1[dSA_Y] = data[JVE_AY];
+ fb->t1[dSA_Z] = data[JVE_AZ];
+
+ Jcopycurr += fb_infom * JCE__MAX;
+ }
+ else {
+#ifdef WARM_STARTING
+ const dReal *lambdacurr = lambda + mindex[ji].mIndex;
+ const unsigned int infom = mindex[ji + 1].mIndex - mindex[ji].mIndex;
+ dxJoint *joint = jointinfos[ji].joint;
+ memcpy(joint->lambda, lambdacurr, infom * sizeof(dReal));
+#endif
+ }
+
+ if (++ji == jiend) {
+ break;
+ }
+ }
+ }
+ }
+}
+
+static
+int dxQuickStepIsland_Stage5_Callback(void *_stage5CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage5CallContext *stage5CallContext = (dxQuickStepperStage5CallContext *)_stage5CallContext;
+ dxQuickStepIsland_Stage5(stage5CallContext);
+ return 1;
+}
+
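+// Stage 5: rolls the memory arena back to the state captured in Stage 3,
+// allocates the Stage 6 call context and schedules the final velocity and
+// position updates (Stage 6a/6b), either inline or across worker threads.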
+static
+void dxQuickStepIsland_Stage5(dxQuickStepperStage5CallContext *stage5CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage5CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage5CallContext->m_localContext;
+
+ dxWorldProcessMemArena *memarena = callContext->m_stepperArena;
+ memarena->RestoreState(stage5CallContext->m_stage3MemArenaState);
+    stage5CallContext = NULL; // WARNING! stage5CallContext is not valid after this point!
+ dIVERIFY(stage5CallContext == NULL); // To suppress unused variable assignment warnings
+
+ dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage6CallContext));
+ stage6CallContext->Initialize(callContext, localContext);
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ dIASSERT(allowedThreads >= 1);
+
+ if (allowedThreads == 1) {
+ IFTIMING (dTimerNow ("compute velocity update"));
+ dxQuickStepIsland_Stage6a(stage6CallContext);
+ dxQuickStepIsland_Stage6_VelocityCheck(stage6CallContext);
+ IFTIMING (dTimerNow ("update position and tidy up"));
+ dxQuickStepIsland_Stage6b(stage6CallContext);
+ IFTIMING (dTimerEnd());
+        IFTIMING (if (localContext->m_m > 0) dTimerReport (stdout,1));
+ }
+ else {
+ unsigned int nb = callContext->m_islandBodiesCount;
+ unsigned int stage6a_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE6A_STEP>(nb, allowedThreads);
+
+ dxWorld *world = callContext->m_world;
+
+ dCallReleaseeID stage6aSyncReleasee;
+ world->PostThreadedCallForUnawareReleasee(NULL, &stage6aSyncReleasee, stage6a_allowedThreads, callContext->m_finalReleasee,
+ NULL, &dxQuickStepIsland_Stage6aSync_Callback, stage6CallContext, 0, "QuickStepIsland Stage6a Sync");
+
+ if (stage6a_allowedThreads > 1) {
+ world->PostThreadedCallsGroup(NULL, stage6a_allowedThreads - 1, stage6aSyncReleasee, &dxQuickStepIsland_Stage6a_Callback, stage6CallContext, "QuickStepIsland Stage6a");
+ }
+ dxQuickStepIsland_Stage6a(stage6CallContext);
+ world->AlterThreadedCallDependenciesCount(stage6aSyncReleasee, -1);
+ }
+}
+
+
+static
+int dxQuickStepIsland_Stage6a_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext;
+ dxQuickStepIsland_Stage6a(stage6CallContext);
+ return 1;
+}
+
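+// Stage 6a: applies the externally accumulated forces to a slice of bodies,
+// i.e. lvel += stepsize * invMass * facc and avel += stepsize * invI * tacc,
+// with slices claimed through the shared m_bi_6a counter.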
+static
+void dxQuickStepIsland_Stage6a(dxQuickStepperStage6CallContext *stage6CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext;
+ const dxQuickStepperLocalContext *localContext = stage6CallContext->m_localContext;
+
+ dReal stepsize = callContext->m_stepSize;
+ dReal *invI = localContext->m_invI;
+ dxBody * const *body = callContext->m_islandBodiesStart;
+
+ unsigned int nb = callContext->m_islandBodiesCount;
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE6A_STEP;
+ unsigned int nb_steps = (nb + (step_size - 1)) / step_size;
+
+ unsigned bi_step;
+ while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage6CallContext->m_bi_6a, nb_steps)) != nb_steps) {
+ unsigned int bi = bi_step * step_size;
+ unsigned int bicnt = dMIN(step_size, nb - bi);
+
+ const dReal *invIrow = invI + (sizeint)bi * IIE__MAX;
+ dxBody *const *bodycurr = body + bi;
+ dxBody *const *bodyend = bodycurr + bicnt;
+ while (true) {
+ // compute the velocity update:
+ // add stepsize * invM * fe to the body velocity
+ dxBody *b = *bodycurr;
+ dReal body_invMass_mul_stepsize = stepsize * b->invMass;
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) {
+ b->lvel[dV3E__AXES_MIN + j] += body_invMass_mul_stepsize * b->facc[dV3E__AXES_MIN + j];
+ b->tacc[dV3E__AXES_MIN + j] *= stepsize;
+ }
+ dMultiplyAdd0_331 (b->avel, invIrow + IIE__MATRIX_MIN, b->tacc);
+
+ if (++bodycurr == bodyend) {
+ break;
+ }
+ invIrow += IIE__MAX;
+ }
+ }
+}
+
+
+static
+int dxQuickStepIsland_Stage6aSync_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext;
+ dxQuickStepIsland_Stage6_VelocityCheck(stage6CallContext);
+
+ const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext;
+
+ const unsigned allowedThreads = callContext->m_stepperAllowedThreads;
+ unsigned int nb = callContext->m_islandBodiesCount;
+ unsigned int stage6b_allowedThreads = CalculateOptimalThreadsCount<dxQUICKSTEPISLAND_STAGE6B_STEP>(nb, allowedThreads);
+
+ if (stage6b_allowedThreads > 1) {
+ dxWorld *world = callContext->m_world;
+ world->AlterThreadedCallDependenciesCount(callThisReleasee, stage6b_allowedThreads - 1);
+ world->PostThreadedCallsGroup(NULL, stage6b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage6b_Callback, stage6CallContext, "QuickStepIsland Stage6b");
+ }
+ dxQuickStepIsland_Stage6b(stage6CallContext);
+
+ return 1;
+}
+
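+// Optional diagnostic (CHECK_VELOCITY_OBEYS_CONSTRAINT): sums |J * v| over all
+// constraint rows and prints the total; for perfectly satisfied velocity
+// constraints the reported error would be zero.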
+static
+void dxQuickStepIsland_Stage6_VelocityCheck(dxQuickStepperStage6CallContext *stage6CallContext)
+{
+ (void)stage6CallContext; // can be unused
+#ifdef CHECK_VELOCITY_OBEYS_CONSTRAINT
+ const dxQuickStepperLocalContext *localContext = stage6CallContext->m_localContext;
+
+ unsigned int m = localContext->m_m;
+ if (m > 0) {
+ const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext;
+ dxBody * const *body = callContext->m_islandBodiesStart;
+ dReal *J = localContext->m_J;
+ const dxJBodiesItem *jb = localContext->m_jb;
+
+ dReal error = 0;
+ const dReal* J_ptr = J;
+ for (unsigned int i = 0; i < m; ++i) {
+ int b1 = jb[i].first;
+ int b2 = jb[i].second;
+ dReal sum = 0;
+ dxBody *bodycurr = body[(unsigned)b1];
+ for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) sum += J_ptr[JME__J1L_MIN + j] * bodycurr->lvel[dV3E__AXES_MIN + j] + J_ptr[JME__J1A_MIN + j] * bodycurr->avel[dV3E__AXES_MIN + j];
+ if (b2 != -1) {
+ dxBody *bodycurr = body[(unsigned)b2];
+ for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) sum += J_ptr[JME__J2L_MIN + k] * bodycurr->lvel[dV3E__AXES_MIN + k] + J_ptr[JME__J2A_MIN + k] * bodycurr->avel[dV3E__AXES_MIN + k];
+ }
+ J_ptr += JME__MAX;
+ error += dFabs(sum);
+ }
+ printf ("velocity error = %10.6e\n", error);
+ }
+#endif
+}
+
+static
+int dxQuickStepIsland_Stage6b_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee)
+{
+ (void)callInstanceIndex; // unused
+ (void)callThisReleasee; // unused
+ dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext;
+ dxQuickStepIsland_Stage6b(stage6CallContext);
+ return 1;
+}
+
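+// Stage 6b: integrates each body's position and orientation over the timestep
+// via dxStepBody() and clears its force and torque accumulators.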
+static
+void dxQuickStepIsland_Stage6b(dxQuickStepperStage6CallContext *stage6CallContext)
+{
+ const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext;
+
+ dReal stepsize = callContext->m_stepSize;
+ dxBody * const *body = callContext->m_islandBodiesStart;
+
+ // update the position and orientation from the new linear/angular velocity
+ // (over the given timestep)
+ unsigned int nb = callContext->m_islandBodiesCount;
+ const unsigned int step_size = dxQUICKSTEPISLAND_STAGE6B_STEP;
+ unsigned int nb_steps = (nb + (step_size - 1)) / step_size;
+
+ unsigned bi_step;
+ while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage6CallContext->m_bi_6b, nb_steps)) != nb_steps) {
+ unsigned int bi = bi_step * step_size;
+ unsigned int bicnt = dMIN(step_size, nb - bi);
+
+ dxBody *const *bodycurr = body + bi;
+ dxBody *const *bodyend = bodycurr + bicnt;
+ while (true) {
+ dxBody *b = *bodycurr;
+ dxStepBody (b, stepsize);
+ dZeroVector3 (b->facc);
+ dZeroVector3 (b->tacc);
+ if (++bodycurr == bodyend) {
+ break;
+ }
+ }
+ }
+}
+
+
+
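+// Estimates the worst-case arena size for one quickstep island by mirroring the
+// stepper's allocation pattern: joint/row counts are taken from SureMaxInfo and
+// the maximum is taken over allocations of stages that never coexist.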
+/*extern */
+sizeint dxEstimateQuickStepMemoryRequirements (dxBody * const *body,
+ unsigned int nb,
+ dxJoint * const *_joint,
+ unsigned int _nj)
+{
+ (void)body; // unused
+ unsigned int nj, m, mfb;
+
+ {
+ unsigned int njcurr = 0, mcurr = 0, mfbcurr = 0;
+ dxJoint::SureMaxInfo info;
+ dxJoint *const *const _jend = _joint + _nj;
+ for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) {
+ dxJoint *j = *_jcurr;
+ j->getSureMaxInfo (&info);
+
+ unsigned int jm = info.max_m;
+ if (jm > 0) {
+ njcurr++;
+
+ mcurr += jm;
+ if (j->feedback)
+ mfbcurr += jm;
+ }
+ }
+ nj = njcurr; m = mcurr; mfb = mfbcurr;
+ }
+
+ sizeint res = 0;
+
+ res += dOVERALIGNED_SIZE(sizeof(dReal) * IIE__MAX * nb, INVI_ALIGNMENT); // for invI
+
+ {
+ sizeint sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * _nj); // for initial jointinfos
+
+ sizeint sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * nj); // for shrunk jointinfos
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dxQuickStepperLocalContext)); // for dxQuickStepLocalContext
+ if (m > 0) {
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dxMIndexItem) * (nj + 1)); // for mindex
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dxJBodiesItem) * m); // for jb
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex
+ sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * JME__MAX * m, JACOBIAN_ALIGNMENT); // for J
+ sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * JCE__MAX * mfb, JCOPY_ALIGNMENT); // for Jcopy
+ {
+ sizeint sub2_res1 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage3CallContext)); // for dxQuickStepperStage3CallContext
+ sub2_res1 += dEFFICIENT_SIZE(sizeof(dReal) * RHS__MAX * nb); // for rhs_tmp
+ sub2_res1 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage2CallContext)); // for dxQuickStepperStage2CallContext
+
+ sizeint sub2_res2 = 0;
+ {
+ sizeint sub3_res1 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage5CallContext)); // for dxQuickStepperStage5CallContext;
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * CFE__MAX * nb); // for cforce
+ sub3_res1 += dOVERALIGNED_SIZE(sizeof(dReal) * IMJ__MAX * m, INVMJ_ALIGNMENT); // for iMJ
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(IndexError) * m); // for order
+#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * m); // for last_lambda
+#endif
+#if !dTHREADING_INTF_DISABLED
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(atomicord32) * dMAX(nb, m)); // for bi_links_or_mi_levels
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(atomicord32) * 2 * ((sizeint)m + 1)); // for mi_links
+#endif
+ sub3_res1 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage4CallContext)); // for dxQuickStepperStage4CallContext;
+
+ sizeint sub3_res2 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage6CallContext)); // for dxQuickStepperStage6CallContext;
+
+ sub2_res2 += dMAX(sub3_res1, sub3_res2);
+ }
+
+ sub1_res2 += dMAX(sub2_res1, sub2_res2);
+ }
+ }
+ else {
+ sub1_res2 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage3CallContext)); // for dxQuickStepperStage3CallContext
+ }
+
+ sizeint sub1_res12_max = dMAX(sub1_res1, sub1_res2);
+ sizeint stage01_contexts = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage0BodiesCallContext))
+ + dEFFICIENT_SIZE(sizeof(dxQuickStepperStage0JointsCallContext))
+ + dEFFICIENT_SIZE(sizeof(dxQuickStepperStage1CallContext));
+ res += dMAX(sub1_res12_max, stage01_contexts);
+ }
+
+ return res;
+}
+
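+// Returns an upper bound on the number of threaded calls a single quickstep
+// island may post, so that enough call slots can be reserved in advance.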
+/*extern */
+unsigned dxEstimateQuickStepMaxCallCount(unsigned activeThreadCount, unsigned allowedThreadCount)
+{
+ (void)activeThreadCount; // unused
+ unsigned result = 1 // dxQuickStepIsland itself
+ + 5 + (2 * allowedThreadCount + 1) // for Stage4 related schedules
+ + 1 // dxStepIsland_Stage5
+ + allowedThreadCount; // Reserve
+ return result;
+}
+