/*************************************************************************
 *                                                                       *
 * Open Dynamics Engine, Copyright (C) 2001-2003 Russell L. Smith.       *
 * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
 *                                                                       *
 * This library is free software; you can redistribute it and/or         *
 * modify it under the terms of EITHER:                                  *
 *   (1) The GNU Lesser General Public License as published by the Free  *
 *       Software Foundation; either version 2.1 of the License, or (at  *
 *       your option) any later version. The text of the GNU Lesser      *
 *       General Public License is included with this library in the     *
 *       file LICENSE.TXT.                                               *
 *   (2) The BSD-style license that is included with this library in     *
 *       the file LICENSE-BSD.TXT.                                       *
 *                                                                       *
 * This library is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
 * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
 *                                                                       *
 *************************************************************************/

#include <ode/common.h>
#include <ode/rotation.h>
#include <ode/timer.h>
#include <ode/error.h>
#include <ode/matrix.h>
#include <ode/misc.h>
#include "config.h"
#include "matrix.h"
#include "odemath.h"
#include "objects.h"
#include "joints/joint.h"
#include "lcp.h"
#include "util.h"
#include "threadingutils.h"

#include <new>

//***************************************************************************
// configuration

// for the SOR and CG methods:
// uncomment the following line to use warm starting. this definitely
// helps for motor-driven joints. unfortunately it appears to hurt
// with high-friction contacts using the SOR method. use with care
// #define WARM_STARTING 1

#define REORDERING_METHOD__DONT_REORDER 0
#define REORDERING_METHOD__BY_ERROR     1
#define REORDERING_METHOD__RANDOMLY     2

// for the SOR method:
// uncomment the following line to determine a new constraint-solving
// order for each iteration. however, the qsort per iteration is expensive,
// and the optimal order is somewhat problem dependent.
// @@@ try the leaf->root ordering.
// #define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__BY_ERROR

// for the SOR method:
// uncomment the following line to randomly reorder constraint rows
// during the solution. depending on the situation, this can help a lot
// or hardly at all, but it doesn't seem to hurt.
#define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__RANDOMLY

#if !defined(CONSTRAINTS_REORDERING_METHOD)
#define CONSTRAINTS_REORDERING_METHOD REORDERING_METHOD__DONT_REORDER
#endif

#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY
#if !defined(RANDOM_CONSTRAINTS_REORDERING_FREQUENCY)
#define RANDOM_CONSTRAINTS_REORDERING_FREQUENCY 8U
#endif
dSASSERT(RANDOM_CONSTRAINTS_REORDERING_FREQUENCY != 0);
#endif

enum dxRandomReorderStage {
    RRS__MIN,
    RRS_REORDERING = RRS__MIN,
    RRS__MAX,
};

//***************************************************************************
// macros, typedefs, forwards and inlines

struct IndexError;

#define dMIN(A,B)  ((A)>(B) ? (B) : (A))
#define dMAX(A,B)  ((B)>(A) ? (B) : (A))

#define dxQUICKSTEPISLAND_STAGE2B_STEP  16U
#define dxQUICKSTEPISLAND_STAGE2C_STEP  32U

#ifdef WARM_STARTING
#define dxQUICKSTEPISLAND_STAGE4A_STEP  256U
#else
#define dxQUICKSTEPISLAND_STAGE4A_STEP  512U
#endif

#define dxQUICKSTEPISLAND_STAGE4LCP_IMJ_STEP  8U
#define dxQUICKSTEPISLAND_STAGE4LCP_AD_STEP   8U

#ifdef WARM_STARTING
#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP   128U
#define dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR 4
#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_PREPARE (dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP * dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR)
#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP_COMPLETE (dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP)
#else
#define dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP (dxQUICKSTEPISLAND_STAGE4A_STEP / 2) // Average info.m is 3 for stage4a, while there are 6 reals per index in fc
#endif

#define dxQUICKSTEPISLAND_STAGE4B_STEP  256U
#define dxQUICKSTEPISLAND_STAGE6A_STEP  16U
#define dxQUICKSTEPISLAND_STAGE6B_STEP  1U

template<unsigned int step_size>
inline unsigned int CalculateOptimalThreadsCount(unsigned int complexity, unsigned int max_threads)
{
    unsigned int raw_threads = dMAX(complexity, step_size) / step_size; // Round down on division
    unsigned int optimum = dMIN(raw_threads, max_threads);
    return optimum;
}

#define dxENCODE_INDEX(index)   ((unsigned int)((index) + 1))
#define dxDECODE_INDEX(code)    ((unsigned int)((code) - 1))
#define dxHEAD_INDEX            0

//****************************************************************************
// special matrix multipliers

// multiply block of B matrix (q x 6) with 12 dReal per row with C vector (q)
static inline void Multiply1_12q1 (dReal *A, const dReal *B, const dReal *C, unsigned int q)
{
    dIASSERT (q>0 && A && B && C);

    dReal a = 0;
    dReal b = 0;
    dReal c = 0;
    dReal d = 0;
    dReal e = 0;
    dReal f = 0;
    dReal s;

    for (unsigned int i = 0, k = 0; i < q; k += 12, i++) {
        s = C[i];
        a += B[k + 0] * s;
        b += B[k + 1] * s;
        c += B[k + 2] * s;
        d += B[k + 3] * s;
        e += B[k + 4] * s;
        f += B[k + 5] * s;
    }

    A[0] = a;
    A[1] = b;
    A[2] = c;
    A[3] = d;
    A[4] = e;
    A[5] = f;
}

template<unsigned int step_size>
void compute_invM_JT (volatile atomicord32 *mi_storage, dReal *iMJ, unsigned int m,
    const dReal *J, const dxJBodiesItem *jb, dxBody * const *body, const dReal *invI)
{
    unsigned int m_steps = (m + (step_size - 1)) / step_size;

    unsigned mi_step;
    while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
        unsigned int mi = mi_step * step_size;
        const unsigned int miend = mi + dMIN(step_size, m - mi);

        dReal *iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX;
        const dReal *J_ptr = J + (sizeint)mi * JME__MAX;
        while (true) {
            int b1 = jb[mi].first;
            int b2 = jb[mi].second;

            dReal k1 = body[(unsigned)b1]->invMass;
            for (unsigned int j = 0; j != JVE__L_COUNT; j++) iMJ_ptr[IMJ__1L_MIN + j] = k1 * J_ptr[JME__J1L_MIN + j];
            const dReal *invIrow1 = invI + (sizeint)(unsigned)b1 * IIE__MAX + IIE__MATRIX_MIN;
            dMultiply0_331 (iMJ_ptr + IMJ__1A_MIN, invIrow1, J_ptr + JME__J1A_MIN);

            if (b2 != -1) {
                dReal k2 = body[(unsigned)b2]->invMass;
                for (unsigned int j = 0; j != JVE__L_COUNT; ++j) iMJ_ptr[IMJ__2L_MIN + j] = k2 * J_ptr[JME__J2L_MIN + j];
                const dReal *invIrow2 = invI + (sizeint)(unsigned)b2 * IIE__MAX + IIE__MATRIX_MIN;
                dMultiply0_331 (iMJ_ptr + IMJ__2A_MIN, invIrow2, J_ptr + JME__J2A_MIN);
            }

            if (++mi == miend) {
                break;
            }

            iMJ_ptr += IMJ__MAX;
            J_ptr += JME__MAX;
        }
    }
}

#ifdef WARM_STARTING
static void multiply_invM_JT_init_array(unsigned int nb, atomicord32 *bi_links/*=[nb]*/)
{
    // const unsigned businessIndex_none = dxENCODE_INDEX(-1);
    // for (unsigned int bi = 0; bi != nb; ++bi) {
    //     bi_links[bi] = businessIndex_none;
    // }
    memset(bi_links, 0, nb * sizeof(bi_links[0]));
}

// compute out = inv(M)*J'*in.
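// Added explanatory sketch (not part of the original sources): the prepare/complete
// pair below computes out = inv(M)*J'*in without per-body atomics by threading every
// constraint row into a per-body singly linked list. bi_links[b] ends up holding the
// encoded index of the last row touching body b, and mi_links[2*mi + 0/1] holds the
// previously linked row for that row's first/second body. The completion pass then
// walks each body's chain and accumulates the six per-body components. A rough
// single-threaded equivalent of one body's accumulation (same encoding convention):
//
//   unsigned link = bi_links[bi];
//   while (link != dxENCODE_INDEX(-1)) {
//       unsigned mi = dxDECODE_INDEX(link);
//       bool viaFirstBody = ((int)bi == jb[mi].first);
//       const dReal *row = iMJ + (sizeint)mi * IMJ__MAX + (viaFirstBody ? IMJ__1_MIN : IMJ__2_MIN);
//       for (unsigned j = JVE__MIN; j != JVE__MAX; ++j) out[j] += in[mi] * row[j];
//       link = mi_links[(sizeint)mi * 2 + (viaFirstBody ? 0 : 1)];
//   }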
template<unsigned int step_size>
void multiply_invM_JT_prepare(volatile atomicord32 *mi_storage, unsigned int m, const dxJBodiesItem *jb,
    atomicord32 *bi_links/*=[nb]*/, atomicord32 *mi_links/*=[2*m]*/)
{
    unsigned int m_steps = (m + (step_size - 1)) / step_size;

    unsigned mi_step;
    while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
        unsigned int mi = mi_step * step_size;
        const unsigned int miend = mi + dMIN(step_size, m - mi);

        while (true) {
            int b1 = jb[mi].first;
            int b2 = jb[mi].second;

            const unsigned encoded_mi = dxENCODE_INDEX(mi);
            unsigned oldIndex_b1 = ThrsafeExchange(&bi_links[b1], encoded_mi);
            mi_links[(sizeint)mi * 2] = oldIndex_b1;

            if (b2 != -1) {
                unsigned oldIndex_b2 = ThrsafeExchange(&bi_links[b2], encoded_mi);
                mi_links[(sizeint)mi * 2 + 1] = oldIndex_b2;
            }

            if (++mi == miend) {
                break;
            }
        }
    }
}

template<unsigned int step_size, unsigned int out_offset, unsigned int out_stride>
void multiply_invM_JT_complete(volatile atomicord32 *bi_storage, dReal *out, unsigned int nb,
    const dReal *iMJ, const dxJBodiesItem *jb, const dReal *in,
    atomicord32 *bi_links/*=[nb]*/, atomicord32 *mi_links/*=[2*m]*/)
{
    const unsigned businessIndex_none = dxENCODE_INDEX(-1);

    unsigned int nb_steps = (nb + (step_size - 1)) / step_size;

    unsigned bi_step;
    while ((bi_step = ThrsafeIncrementIntUpToLimit(bi_storage, nb_steps)) != nb_steps) {
        unsigned int bi = bi_step * step_size;
        const unsigned int biend = bi + dMIN(step_size, nb - bi);

        dReal *out_ptr = out + (sizeint)bi * out_stride + out_offset;
        while (true) {
            dReal psum0 = REAL(0.0), psum1 = REAL(0.0), psum2 = REAL(0.0);
            dReal psum3 = REAL(0.0), psum4 = REAL(0.0), psum5 = REAL(0.0);

            unsigned businessIndex = bi_links[bi];
            while (businessIndex != businessIndex_none) {
                unsigned int mi = dxDECODE_INDEX(businessIndex);
                const dReal *iMJ_ptr;

                if (bi == jb[mi].first) {
                    iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX + IMJ__1_MIN;
                    businessIndex = mi_links[(sizeint)mi * 2];
                }
                else {
                    dIASSERT(bi == jb[mi].second);

                    iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX + IMJ__2_MIN;
                    businessIndex = mi_links[(sizeint)mi * 2 + 1];
                }

                const dReal in_i = in[mi];
                psum0 += in_i * iMJ_ptr[JVE_LX];
                psum1 += in_i * iMJ_ptr[JVE_LY];
                psum2 += in_i * iMJ_ptr[JVE_LZ];
                psum3 += in_i * iMJ_ptr[JVE_AX];
                psum4 += in_i * iMJ_ptr[JVE_AY];
                psum5 += in_i * iMJ_ptr[JVE_AZ];
            }

            out_ptr[dDA_LX] = psum0;
            out_ptr[dDA_LY] = psum1;
            out_ptr[dDA_LZ] = psum2;
            out_ptr[dDA_AX] = psum3;
            out_ptr[dDA_AY] = psum4;
            out_ptr[dDA_AZ] = psum5;

            if (++bi == biend) {
                break;
            }

            out_ptr += out_stride;
        }
    }
}

template<unsigned int out_offset, unsigned int out_stride>
void _multiply_invM_JT (dReal *out, unsigned int m, unsigned int nb, dReal *iMJ, const dxJBodiesItem *jb, const dReal *in)
{
    dSetZero (out, (sizeint)nb * out_stride);

    const dReal *iMJ_ptr = iMJ;
    for (unsigned int i = 0; i < m; i++) {
        int b1 = jb[i].first;
        int b2 = jb[i].second;

        const dReal in_i = in[i];

        dReal *out_ptr = out + (sizeint)(unsigned)b1 * out_stride + out_offset;
        for (unsigned int j = JVE__MIN; j != JVE__MAX; j++) out_ptr[j - JVE__MIN] += iMJ_ptr[IMJ__1_MIN + j] * in_i;
        dSASSERT(out_stride - out_offset >= JVE__MAX);
        dSASSERT(JVE__MAX == (int)dDA__MAX);

        if (b2 != -1) {
            out_ptr = out + (sizeint)(unsigned)b2 * out_stride + out_offset;
            for (unsigned int j = JVE__MIN; j != JVE__MAX; j++) out_ptr[j - JVE__MIN] += iMJ_ptr[IMJ__2_MIN + j] * in_i;
            dSASSERT(out_stride - out_offset >= JVE__MAX);
            dSASSERT(JVE__MAX == (int)dDA__MAX);
        }

        iMJ_ptr += IMJ__MAX;
    }
}
#endif

// compute out = J*in.
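// Added explanatory note (not part of the original sources): for a row mi that couples
// bodies b1 and b2, the sum accumulated by multiplyAdd_J below is the constraint-space
// product
//
//   (J*in)_mi = J1_mi . in[b1] + J2_mi . in[b2]
//
// where J1_mi/J2_mi are the row's 6-component (linear + angular) Jacobian blocks and
// in[b] is the 6-component per-body vector selected by in_offset/in_stride. The result
// is accumulated into the row's JME_RHS slot rather than written to a separate output.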
template<unsigned int step_size, unsigned int in_offset, unsigned int in_stride>
void multiplyAdd_J (volatile atomicord32 *mi_storage, unsigned int m, dReal *J, const dxJBodiesItem *jb, const dReal *in)
{
    unsigned int m_steps = (m + (step_size - 1)) / step_size;

    unsigned mi_step;
    while ((mi_step = ThrsafeIncrementIntUpToLimit(mi_storage, m_steps)) != m_steps) {
        unsigned int mi = mi_step * step_size;
        const unsigned int miend = mi + dMIN(step_size, m - mi);

        dReal *J_ptr = J + (sizeint)mi * JME__MAX;
        while (true) {
            int b1 = jb[mi].first;
            int b2 = jb[mi].second;

            dReal sum = REAL(0.0);

            const dReal *in_ptr = in + (sizeint)(unsigned)b1 * in_stride + in_offset;
            for (unsigned int j = 0; j != JME__J1_COUNT; ++j) sum += J_ptr[j + JME__J1_MIN] * in_ptr[j];
            dSASSERT(in_offset + JME__J1_COUNT <= in_stride);

            if (b2 != -1) {
                in_ptr = in + (sizeint)(unsigned)b2 * in_stride + in_offset;
                for (unsigned int j = 0; j != JME__J2_COUNT; ++j) sum += J_ptr[j + JME__J2_MIN] * in_ptr[j];
                dSASSERT(in_offset + JME__J2_COUNT <= in_stride);
            }

            J_ptr[JME_RHS] += sum;

            if (++mi == miend) {
                break;
            }

            J_ptr += JME__MAX;
        }
    }
}

struct IndexError {
#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
    dReal error;    // error to sort on
#endif
    int index;      // row index
};

#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR

static int compare_index_error (const void *a, const void *b)
{
    const IndexError *i1 = (IndexError*) a;
    const IndexError *i2 = (IndexError*) b;
    if (i1->error < i2->error) return -1;
    if (i1->error > i2->error) return 1;
    return 0;
}

#endif // #if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR

static inline bool IsSORConstraintsReorderRequiredForIteration(unsigned iteration)
{
    bool result = false;
#if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR
    result = true;
#elif CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY
    // This logic is intended to skip randomization on the very first iteration
    if (!dIN_RANGE(iteration, 0, RANDOM_CONSTRAINTS_REORDERING_FREQUENCY) ?
dIN_RANGE(iteration % RANDOM_CONSTRAINTS_REORDERING_FREQUENCY, RRS__MIN, RRS__MAX) : iteration == 0) { result = true; } #else // #if CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__BY_ERROR && CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__RANDOMLY if (iteration == 0) { result = true; } #endif return result; } /*extern */ void dxQuickStepIsland(const dxStepperProcessingCallContext *callContext) { dxWorldProcessMemArena *memarena = callContext->m_stepperArena; unsigned int nb = callContext->m_islandBodiesCount; unsigned int _nj = callContext->m_islandJointsCount; dReal *invI = memarena->AllocateOveralignedArray((sizeint)nb * IIE__MAX, INVI_ALIGNMENT); dJointWithInfo1 *const jointinfos = memarena->AllocateArray(_nj); const unsigned allowedThreads = callContext->m_stepperAllowedThreads; dIASSERT(allowedThreads != 0); void *stagesMemArenaState = memarena->SaveState(); dxQuickStepperStage1CallContext *stage1CallContext = (dxQuickStepperStage1CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage1CallContext)); stage1CallContext->Initialize(callContext, stagesMemArenaState, invI, jointinfos); dxQuickStepperStage0BodiesCallContext *stage0BodiesCallContext = (dxQuickStepperStage0BodiesCallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage0BodiesCallContext)); stage0BodiesCallContext->Initialize(callContext, invI); dxQuickStepperStage0JointsCallContext *stage0JointsCallContext = (dxQuickStepperStage0JointsCallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage0JointsCallContext)); stage0JointsCallContext->Initialize(callContext, jointinfos, &stage1CallContext->m_stage0Outputs); if (allowedThreads == 1) { IFTIMING(dTimerStart("preprocessing")); dxQuickStepIsland_Stage0_Bodies(stage0BodiesCallContext); dxQuickStepIsland_Stage0_Joints(stage0JointsCallContext); dxQuickStepIsland_Stage1(stage1CallContext); } else { unsigned bodyThreads = CalculateOptimalThreadsCount<1U>(nb, allowedThreads); unsigned jointThreads = 1; dxWorld *world = callContext->m_world; dCallReleaseeID stage1CallReleasee; world->PostThreadedCallForUnawareReleasee(NULL, &stage1CallReleasee, bodyThreads + jointThreads, callContext->m_finalReleasee, NULL, &dxQuickStepIsland_Stage1_Callback, stage1CallContext, 0, "QuickStepIsland Stage1"); // It is preferable to post single threaded task first to be started sooner world->PostThreadedCall(NULL, NULL, 0, stage1CallReleasee, NULL, &dxQuickStepIsland_Stage0_Joints_Callback, stage0JointsCallContext, 0, "QuickStepIsland Stage0-Joints"); dIASSERT(jointThreads == 1); if (bodyThreads > 1) { world->PostThreadedCallsGroup(NULL, bodyThreads - 1, stage1CallReleasee, &dxQuickStepIsland_Stage0_Bodies_Callback, stage0BodiesCallContext, "QuickStepIsland Stage0-Bodies"); } dxQuickStepIsland_Stage0_Bodies(stage0BodiesCallContext); world->AlterThreadedCallDependenciesCount(stage1CallReleasee, -1); } } static int dxQuickStepIsland_Stage0_Bodies_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage0BodiesCallContext *callContext = (dxQuickStepperStage0BodiesCallContext *)_callContext; dxQuickStepIsland_Stage0_Bodies(callContext); return 1; } static void dxQuickStepIsland_Stage0_Bodies(dxQuickStepperStage0BodiesCallContext *callContext) { dxBody * const *body = callContext->m_stepperCallContext->m_islandBodiesStart; unsigned int nb = callContext->m_stepperCallContext->m_islandBodiesCount; if (ThrsafeExchange(&callContext->m_tagsTaken, 1) == 0) 
{ // number all bodies in the body list - set their tag values for (unsigned int i=0; itag = i; } if (ThrsafeExchange(&callContext->m_gravityTaken, 1) == 0) { dxWorld *world = callContext->m_stepperCallContext->m_world; // add the gravity force to all bodies // since gravity does normally have only one component it's more efficient // to run three loops for each individual component dxBody *const *const bodyend = body + nb; dReal gravity_x = world->gravity[0]; if (gravity_x) { for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { dxBody *b = *bodycurr; if ((b->flags & dxBodyNoGravity) == 0) { b->facc[0] += b->mass.mass * gravity_x; } } } dReal gravity_y = world->gravity[1]; if (gravity_y) { for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { dxBody *b = *bodycurr; if ((b->flags & dxBodyNoGravity) == 0) { b->facc[1] += b->mass.mass * gravity_y; } } } dReal gravity_z = world->gravity[2]; if (gravity_z) { for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { dxBody *b = *bodycurr; if ((b->flags & dxBodyNoGravity) == 0) { b->facc[2] += b->mass.mass * gravity_z; } } } } // for all bodies, compute the inertia tensor and its inverse in the global // frame, and compute the rotational force and add it to the torque // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body. { dReal *invI = callContext->m_invI; unsigned int bodyIndex; while ((bodyIndex = ThrsafeIncrementIntUpToLimit(&callContext->m_inertiaBodyIndex, nb)) != nb) { dReal *invIrow = invI + (sizeint)bodyIndex * IIE__MAX; dxBody *b = body[bodyIndex]; dMatrix3 tmp; // compute inverse inertia tensor in global frame dMultiply2_333 (tmp, b->invI, b->posr.R); dMultiply0_333 (invIrow + IIE__MATRIX_MIN, b->posr.R, tmp); // Don't apply gyroscopic torques to bodies // if not flagged or the body is kinematic if ((b->flags & dxBodyGyroscopic) && (b->invMass > 0)) { dMatrix3 I; // compute inertia tensor in global frame dMultiply2_333 (tmp, b->mass.I, b->posr.R); dMultiply0_333 (I, b->posr.R, tmp); // compute rotational force #if 0 // Explicit computation dMultiply0_331 (tmp, I, b->avel); dSubtractVectorCross3(b->tacc, b->avel, tmp); #else // Do the implicit computation based on //"Stabilizing Gyroscopic Forces in Rigid Multibody Simulations" // (Lacoursière 2006) dReal h = callContext->m_stepperCallContext->m_stepSize; // Step size dVector3 L; // Compute angular momentum dMultiply0_331(L, I, b->avel); // Compute a new effective 'inertia tensor' // for the implicit step: the cross-product // matrix of the angular momentum plus the // old tensor scaled by the timestep. // Itild may not be symmetric pos-definite, // but we can still use it to compute implicit // gyroscopic torques. dMatrix3 Itild = { 0 }; dSetCrossMatrixMinus(Itild, L, 4); for (int ii = dM3E__MIN; ii < dM3E__MAX; ++ii) { Itild[ii] = Itild[ii] * h + I[ii]; } // Scale momentum by inverse time to get // a sort of "torque" dScaleVector3(L, dRecip(h)); // Invert the pseudo-tensor dMatrix3 itInv; // This is a closed-form inversion. // It's probably not numerically stable // when dealing with small masses with // a large asymmetry. // An LU decomposition might be better. 
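            // Added recap of the formula being implemented (not part of the original
            // comments; a paraphrase of the steps above): with L = I*w the angular
            // momentum and h the step size, the code forms the pseudo-tensor
            // Itild = I - h*[L]x (dSetCrossMatrixMinus gives the negated cross-product
            // matrix of L), and, when Itild is invertible, applies
            //
            //   tau0 = (I * Itild^-1 - Identity) * (L / h)
            //
            // as an additional explicit torque, which approximates the implicit
            // (stable) gyroscopic update of Lacoursiere 2006 within the ordinary
            // semi-implicit integration step.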
if (dInvertMatrix3(itInv, Itild) != 0) { // "Divide" the original tensor // by the pseudo-tensor (on the right) dMultiply0_333(Itild, I, itInv); // Subtract an identity matrix Itild[dM3E_XX] -= 1; Itild[dM3E_YY] -= 1; Itild[dM3E_ZZ] -= 1; // This new inertia matrix rotates the // momentum to get a new set of torques // that will work correctly when applied // to the old inertia matrix as explicit // torques with a semi-implicit update // step. dVector3 tau0; dMultiply0_331(tau0, Itild, L); // Add the gyro torques to the torque // accumulator dAddVectors3(b->tacc, b->tacc, tau0); } #endif } } } } static int dxQuickStepIsland_Stage0_Joints_Callback(void *_callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage0JointsCallContext *callContext = (dxQuickStepperStage0JointsCallContext *)_callContext; dxQuickStepIsland_Stage0_Joints(callContext); return 1; } static void dxQuickStepIsland_Stage0_Joints(dxQuickStepperStage0JointsCallContext *callContext) { dxJoint * const *_joint = callContext->m_stepperCallContext->m_islandJointsStart; unsigned int _nj = callContext->m_stepperCallContext->m_islandJointsCount; // get joint information (m = total constraint dimension, nub = number of unbounded variables). // joints with m=0 are inactive and are removed from the joints array // entirely, so that the code that follows does not consider them. { unsigned int mcurr = 0, mfbcurr = 0; dJointWithInfo1 *jicurr = callContext->m_jointinfos; dxJoint *const *const _jend = _joint + _nj; for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) { // jicurr=dest, _jcurr=src dxJoint *j = *_jcurr; j->getInfo1 (&jicurr->info); dIASSERT (/*jicurr->info.m >= 0 && */jicurr->info.m <= 6 && /*jicurr->info.nub >= 0 && */jicurr->info.nub <= jicurr->info.m); unsigned int jm = jicurr->info.m; if (jm != 0) { mcurr += jm; if (j->feedback != NULL) { mfbcurr += jm; } jicurr->joint = j; jicurr++; } } callContext->m_stage0Outputs->m = mcurr; callContext->m_stage0Outputs->mfb = mfbcurr; callContext->m_stage0Outputs->nj = (unsigned int)(jicurr - callContext->m_jointinfos); dIASSERT((sizeint)(jicurr - callContext->m_jointinfos) < UINT_MAX || (sizeint)(jicurr - callContext->m_jointinfos) == UINT_MAX); // to avoid "...always evaluates to true" warnings } } static int dxQuickStepIsland_Stage1_Callback(void *_stage1CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage1CallContext *stage1CallContext = (dxQuickStepperStage1CallContext *)_stage1CallContext; dxQuickStepIsland_Stage1(stage1CallContext); return 1; } static void dxQuickStepIsland_Stage1(dxQuickStepperStage1CallContext *stage1CallContext) { const dxStepperProcessingCallContext *callContext = stage1CallContext->m_stepperCallContext; dReal *invI = stage1CallContext->m_invI; dJointWithInfo1 *jointinfos = stage1CallContext->m_jointinfos; unsigned int nj = stage1CallContext->m_stage0Outputs.nj; unsigned int m = stage1CallContext->m_stage0Outputs.m; unsigned int mfb = stage1CallContext->m_stage0Outputs.mfb; dxWorldProcessMemArena *memarena = callContext->m_stepperArena; memarena->RestoreState(stage1CallContext->m_stageMemArenaState); stage1CallContext = NULL; // WARNING! _stage1CallContext is not valid after this point! 
dIVERIFY(stage1CallContext == NULL); // To suppress unused variable assignment warnings { unsigned int _nj = callContext->m_islandJointsCount; memarena->ShrinkArray(jointinfos, _nj, nj); } dxMIndexItem *mindex = NULL; dxJBodiesItem *jb = NULL; int *findex = NULL; dReal *J = NULL, *Jcopy = NULL; // if there are constraints, compute the constraint force if (m > 0) { mindex = memarena->AllocateArray(nj + 1); { dxMIndexItem *mcurr = mindex; unsigned int moffs = 0, mfboffs = 0; mcurr->mIndex = moffs; mcurr->fbIndex = mfboffs; ++mcurr; const dJointWithInfo1 *const jiend = jointinfos + nj; for (const dJointWithInfo1 *jicurr = jointinfos; jicurr != jiend; ++jicurr) { dxJoint *joint = jicurr->joint; moffs += jicurr->info.m; if (joint->feedback) { mfboffs += jicurr->info.m; } mcurr->mIndex = moffs; mcurr->fbIndex = mfboffs; ++mcurr; } } jb = memarena->AllocateArray(m); findex = memarena->AllocateArray(m); J = memarena->AllocateOveralignedArray((sizeint)m * JME__MAX, JACOBIAN_ALIGNMENT); Jcopy = memarena->AllocateOveralignedArray((sizeint)mfb * JCE__MAX, JCOPY_ALIGNMENT); } dxQuickStepperLocalContext *localContext = (dxQuickStepperLocalContext *)memarena->AllocateBlock(sizeof(dxQuickStepperLocalContext)); localContext->Initialize(invI, jointinfos, nj, m, mfb, mindex, jb, findex, J, Jcopy); void *stage1MemarenaState = memarena->SaveState(); dxQuickStepperStage3CallContext *stage3CallContext = (dxQuickStepperStage3CallContext*)memarena->AllocateBlock(sizeof(dxQuickStepperStage3CallContext)); stage3CallContext->Initialize(callContext, localContext, stage1MemarenaState); if (m > 0) { unsigned int nb = callContext->m_islandBodiesCount; // create a constraint equation right hand side vector `rhs', a constraint // force mixing vector `cfm', and LCP low and high bound vectors, and an // 'findex' vector. 
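        // Added overview (not part of the original comments): rhs_tmp allocated below is an
        // intermediate with one 6-component entry per body. Stage2a fills the Jacobian J
        // (plus cfm/lo/hi/findex) from each joint's getInfo2() and scales the returned
        // rhs/cfm by 1/h; Stage2b writes rhs_tmp[b] = -(v[b]/h + invM[b]*fe[b]) for every
        // body; Stage2c then accumulates rhs[i] += J_i . rhs_tmp into each row, so each
        // row's right-hand side ends up as its getInfo2() value over h minus
        // J_i*(v/h + invM*fe).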
dReal *rhs_tmp = memarena->AllocateArray((sizeint)nb * RHS__MAX); dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext*)memarena->AllocateBlock(sizeof(dxQuickStepperStage2CallContext)); stage2CallContext->Initialize(callContext, localContext, rhs_tmp); const unsigned allowedThreads = callContext->m_stepperAllowedThreads; dIASSERT(allowedThreads != 0); if (allowedThreads == 1) { IFTIMING (dTimerNow ("create J")); dxQuickStepIsland_Stage2a(stage2CallContext); IFTIMING (dTimerNow ("compute rhs_tmp")); dxQuickStepIsland_Stage2b(stage2CallContext); dxQuickStepIsland_Stage2c(stage2CallContext); dxQuickStepIsland_Stage3(stage3CallContext); } else { dxWorld *world = callContext->m_world; dCallReleaseeID stage3CallReleasee; world->PostThreadedCallForUnawareReleasee(NULL, &stage3CallReleasee, 1, callContext->m_finalReleasee, NULL, &dxQuickStepIsland_Stage3_Callback, stage3CallContext, 0, "QuickStepIsland Stage3"); dCallReleaseeID stage2bSyncReleasee; world->PostThreadedCall(NULL, &stage2bSyncReleasee, 1, stage3CallReleasee, NULL, &dxQuickStepIsland_Stage2bSync_Callback, stage2CallContext, 0, "QuickStepIsland Stage2b Sync"); unsigned stage2a_allowedThreads = CalculateOptimalThreadsCount<1U>(nj, allowedThreads); dCallReleaseeID stage2aSyncReleasee; world->PostThreadedCall(NULL, &stage2aSyncReleasee, stage2a_allowedThreads, stage2bSyncReleasee, NULL, &dxQuickStepIsland_Stage2aSync_Callback, stage2CallContext, 0, "QuickStepIsland Stage2a Sync"); if (stage2a_allowedThreads > 1) { world->PostThreadedCallsGroup(NULL, stage2a_allowedThreads - 1, stage2aSyncReleasee, &dxQuickStepIsland_Stage2a_Callback, stage2CallContext, "QuickStepIsland Stage2a"); } dxQuickStepIsland_Stage2a(stage2CallContext); world->AlterThreadedCallDependenciesCount(stage2aSyncReleasee, -1); } } else { dxQuickStepIsland_Stage3(stage3CallContext); } } static int dxQuickStepIsland_Stage2a_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext; dxQuickStepIsland_Stage2a(stage2CallContext); return 1; } static void dxQuickStepIsland_Stage2a(dxQuickStepperStage2CallContext *stage2CallContext) { const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext; dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext; dJointWithInfo1 *jointinfos = localContext->m_jointinfos; unsigned int nj = localContext->m_nj; const dxMIndexItem *mindex = localContext->m_mindex; const dReal stepsizeRecip = dRecip(callContext->m_stepSize); { int *findex = localContext->m_findex; dReal *J = localContext->m_J; dReal *JCopy = localContext->m_Jcopy; // get jacobian data from constraints. an m*16 matrix will be created // to store the two jacobian blocks from each constraint. it has this // format: // // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi \ . // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi }-- jacobian for joint 0, body 1 and body 2 (3 rows) // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi / // l1 l1 l1 a1 a1 a1 rhs cfm l2 l2 l2 a2 a2 a2 lo hi }--- jacobian for joint 1, body 1 and body 2 (3 rows) // etc... 
// // (lll) = linear jacobian data // (aaa) = angular jacobian data // dxWorld *world = callContext->m_world; const dReal worldERP = world->global_erp; const dReal worldCFM = world->global_cfm; unsigned validFIndices = 0; unsigned ji; while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_J, nj)) != nj) { const unsigned ofsi = mindex[ji].mIndex; const unsigned int infom = mindex[ji + 1].mIndex - ofsi; dReal *const JRow = J + (sizeint)ofsi * JME__MAX; { dReal *const JEnd = JRow + infom * JME__MAX; for (dReal *JCurr = JRow; JCurr != JEnd; JCurr += JME__MAX) { dSetZero(JCurr + JME__J1_MIN, JME__J1_COUNT); JCurr[JME_RHS] = REAL(0.0); JCurr[JME_CFM] = worldCFM; dSetZero(JCurr + JME__J2_MIN, JME__J2_COUNT); JCurr[JME_LO] = -dInfinity; JCurr[JME_HI] = dInfinity; dSASSERT(JME__J1_COUNT + 2 + JME__J2_COUNT + 2 == JME__MAX); } } int *findexRow = findex + ofsi; dSetValue(findexRow, infom, -1); dxJoint *joint = jointinfos[ji].joint; joint->getInfo2(stepsizeRecip, worldERP, JME__MAX, JRow + JME__J1_MIN, JRow + JME__J2_MIN, JME__MAX, JRow + JME__RHS_CFM_MIN, JRow + JME__LO_HI_MIN, findexRow); // findex iteration is compact and is not going to pollute caches - do it first { // adjust returned findex values for global index numbering int *const findicesEnd = findexRow + infom; for (int *findexCurr = findexRow; findexCurr != findicesEnd; ++findexCurr) { int fival = *findexCurr; if (fival != -1) { *findexCurr = fival + ofsi; ++validFIndices; } } } { dReal *const JEnd = JRow + infom * JME__MAX; for (dReal *JCurr = JRow; JCurr != JEnd; JCurr += JME__MAX) { JCurr[JME_RHS] *= stepsizeRecip; JCurr[JME_CFM] *= stepsizeRecip; } } { // we need a copy of Jacobian for joint feedbacks // because it gets destroyed by SOR solver // instead of saving all Jacobian, we can save just rows // for joints, that requested feedback (which is normally much less) unsigned mfbIndex = mindex[ji].fbIndex; if (mfbIndex != mindex[ji + 1].fbIndex) { dReal *const JEnd = JRow + infom * JME__MAX; dReal *JCopyRow = JCopy + mfbIndex * JCE__MAX; // Random access by mfbIndex here! Do not optimize! for (const dReal *JCurr = JRow; ; ) { for (unsigned i = 0; i != JME__J1_COUNT; ++i) { JCopyRow[i + JCE__J1_MIN] = JCurr[i + JME__J1_MIN]; } for (unsigned j = 0; j != JME__J2_COUNT; ++j) { JCopyRow[j + JCE__J2_MIN] = JCurr[j + JME__J2_MIN]; } JCopyRow += JCE__MAX; dSASSERT((unsigned)JCE__J1_COUNT == JME__J1_COUNT); dSASSERT((unsigned)JCE__J2_COUNT == JME__J2_COUNT); dSASSERT(JCE__J1_COUNT + JCE__J2_COUNT == JCE__MAX); if ((JCurr += JME__MAX) == JEnd) { break; } } } } } if (validFIndices != 0) { ThrsafeAdd(&localContext->m_valid_findices, validFIndices); } } { dxJBodiesItem *jb = localContext->m_jb; // create an array of body numbers for each joint row unsigned ji; while ((ji = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_ji_jb, nj)) != nj) { dxJoint *joint = jointinfos[ji].joint; int b1 = (joint->node[0].body) ? (joint->node[0].body->tag) : -1; int b2 = (joint->node[1].body) ? 
(joint->node[1].body->tag) : -1; dxJBodiesItem *const jb_end = jb + mindex[ji + 1].mIndex; dxJBodiesItem *jb_ptr = jb + mindex[ji].mIndex; for (; jb_ptr != jb_end; ++jb_ptr) { jb_ptr->first = b1; jb_ptr->second = b2; } } } } static int dxQuickStepIsland_Stage2aSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext; const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext; const unsigned int nb = callContext->m_islandBodiesCount; const unsigned allowedThreads = callContext->m_stepperAllowedThreads; unsigned int stage2b_allowedThreads = CalculateOptimalThreadsCount(nb, allowedThreads); if (stage2b_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage2b_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage2b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage2b_Callback, stage2CallContext, "QuickStepIsland Stage2b"); } dxQuickStepIsland_Stage2b(stage2CallContext); return 1; } static int dxQuickStepIsland_Stage2b_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext; dxQuickStepIsland_Stage2b(stage2CallContext); return 1; } static void dxQuickStepIsland_Stage2b(dxQuickStepperStage2CallContext *stage2CallContext) { const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext; const dReal stepsizeRecip = dRecip(callContext->m_stepSize); { // Warning!!! // This code reads facc/tacc fields of body objects which (the fields) // may be modified by dxJoint::getInfo2(). Therefore the code must be // in different sub-stage from Jacobian construction in Stage2a // to ensure proper synchronization and avoid accessing numbers being modified. // Warning!!! 
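        // Added note on the computation below (not part of the original comments): for every
        // body b the linear slots RHS__L_MIN..+2 receive -(facc[b]*invMass + lvel[b]/h) and
        // the angular slots RHS__A_MIN..+2 receive -(avel[b]/h + invI[b]*tacc[b]), i.e.
        //
        //   rhs_tmp[b] = -(v[b]/h + M[b]^-1 * fext[b])
        //
        // packed as one RHS__MAX-sized entry per body.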
dxBody * const *const body = callContext->m_islandBodiesStart; const unsigned int nb = callContext->m_islandBodiesCount; const dReal *invI = localContext->m_invI; dReal *rhs_tmp = stage2CallContext->m_rhs_tmp; // compute the right hand side `rhs' const unsigned int step_size = dxQUICKSTEPISLAND_STAGE2B_STEP; unsigned int nb_steps = (nb + (step_size - 1)) / step_size; // put -(v/h + invM*fe) into rhs_tmp unsigned bi_step; while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage2CallContext->m_bi, nb_steps)) != nb_steps) { unsigned int bi = bi_step * step_size; const unsigned int biend = bi + dMIN(step_size, nb - bi); dReal *rhscurr = rhs_tmp + (sizeint)bi * RHS__MAX; const dReal *invIrow = invI + (sizeint)bi * IIE__MAX; while (true) { dxBody *b = body[bi]; dReal body_invMass = b->invMass; for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) rhscurr[RHS__L_MIN + j] = -(b->facc[dV3E__AXES_MIN + j] * body_invMass + b->lvel[dV3E__AXES_MIN + j] * stepsizeRecip); dMultiply0_331 (rhscurr + RHS__A_MIN, invIrow + IIE__MATRIX_MIN, b->tacc); for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) rhscurr[RHS__A_MIN + k] = -(b->avel[dV3E__AXES_MIN + k] * stepsizeRecip) - rhscurr[RHS__A_MIN + k]; if (++bi == biend) { break; } rhscurr += RHS__MAX; invIrow += IIE__MAX; } } } } static int dxQuickStepIsland_Stage2bSync_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext; const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext; const unsigned allowedThreads = callContext->m_stepperAllowedThreads; const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext; unsigned int m = localContext->m_m; unsigned int stage2c_allowedThreads = CalculateOptimalThreadsCount(m, allowedThreads); if (stage2c_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage2c_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage2c_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage2c_Callback, stage2CallContext, "QuickStepIsland Stage2c"); } dxQuickStepIsland_Stage2c(stage2CallContext); return 1; } static int dxQuickStepIsland_Stage2c_Callback(void *_stage2CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage2CallContext *stage2CallContext = (dxQuickStepperStage2CallContext *)_stage2CallContext; dxQuickStepIsland_Stage2c(stage2CallContext); return 1; } static void dxQuickStepIsland_Stage2c(dxQuickStepperStage2CallContext *stage2CallContext) { //const dxStepperProcessingCallContext *callContext = stage2CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage2CallContext->m_localContext; //const dReal stepsizeRecip = dRecip(callContext->m_stepSize); { // Warning!!! // This code depends on rhs_tmp and therefore must be in different sub-stage // from rhs_tmp calculation in Stage2b to ensure proper synchronization // and avoid accessing numbers being modified. // Warning!!! 
dReal *J = localContext->m_J; const dxJBodiesItem *jb = localContext->m_jb; const dReal *rhs_tmp = stage2CallContext->m_rhs_tmp; const unsigned int m = localContext->m_m; // add J*rhs_tmp to rhs multiplyAdd_J(&stage2CallContext->m_Jrhsi, m, J, jb, rhs_tmp); } } static int dxQuickStepIsland_Stage3_Callback(void *_stage3CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage3CallContext *stage3CallContext = (dxQuickStepperStage3CallContext *)_stage3CallContext; dxQuickStepIsland_Stage3(stage3CallContext); return 1; } static void dxQuickStepIsland_Stage3(dxQuickStepperStage3CallContext *stage3CallContext) { const dxStepperProcessingCallContext *callContext = stage3CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage3CallContext->m_localContext; dxWorldProcessMemArena *memarena = callContext->m_stepperArena; memarena->RestoreState(stage3CallContext->m_stage1MemArenaState); stage3CallContext = NULL; // WARNING! stage3CallContext is not valid after this point! dIVERIFY(stage3CallContext == NULL); // To suppress unused variable assignment warnings void *stage3MemarenaState = memarena->SaveState(); dxQuickStepperStage5CallContext *stage5CallContext = (dxQuickStepperStage5CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage5CallContext)); stage5CallContext->Initialize(callContext, localContext, stage3MemarenaState); unsigned int m = localContext->m_m; if (m > 0) { // load lambda from the value saved on the previous iteration dReal *lambda = memarena->AllocateArray(m); unsigned int nb = callContext->m_islandBodiesCount; dReal *cforce = memarena->AllocateArray((sizeint)nb * CFE__MAX); dReal *iMJ = memarena->AllocateOveralignedArray((sizeint)m * IMJ__MAX, INVMJ_ALIGNMENT); // order to solve constraint rows in IndexError *order = memarena->AllocateArray(m); dReal *last_lambda = NULL; #if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR // the lambda computed at the previous iteration. // this is used to measure error for when we are reordering the indexes. 
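        // Added note (not part of the original comments): last_lambda is only used when
        // CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR. In that mode each
        // row's lambda from the previous sweep is kept so that a per-row error measure can
        // be derived from how much lambda changed, and the rows are qsort'ed with
        // compare_index_error before the next sweep so the solving order adapts to the problem.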
last_lambda = memarena->AllocateArray(m); #endif const unsigned allowedThreads = callContext->m_stepperAllowedThreads; bool singleThreadedExecution = allowedThreads == 1; dIASSERT(allowedThreads >= 1); atomicord32 *bi_links_or_mi_levels = NULL; atomicord32 *mi_links = NULL; #if !dTHREADING_INTF_DISABLED bi_links_or_mi_levels = memarena->AllocateArray(dMAX(nb, m)); mi_links = memarena->AllocateArray(2 * ((sizeint)m + 1)); #else dIASSERT(singleThreadedExecution); #endif dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage4CallContext)); stage4CallContext->Initialize(callContext, localContext, lambda, cforce, iMJ, order, last_lambda, bi_links_or_mi_levels, mi_links); if (singleThreadedExecution) { dxQuickStepIsland_Stage4a(stage4CallContext); IFTIMING (dTimerNow ("solving LCP problem")); dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext); dxQuickStepIsland_Stage4LCP_STfcComputation(stage4CallContext); dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext); dxQuickStepIsland_Stage4LCP_ReorderPrep(stage4CallContext); dxWorld *world = callContext->m_world; const unsigned int num_iterations = world->qs.num_iterations; for (unsigned int iteration=0; iteration < num_iterations; iteration++) { if (IsSORConstraintsReorderRequiredForIteration(iteration)) { stage4CallContext->ResetSOR_ConstraintsReorderVariables(0); dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(stage4CallContext, iteration); } dxQuickStepIsland_Stage4LCP_STIteration(stage4CallContext); } dxQuickStepIsland_Stage4b(stage4CallContext); dxQuickStepIsland_Stage5(stage5CallContext); } else { dxWorld *world = callContext->m_world; dCallReleaseeID stage5CallReleasee; world->PostThreadedCallForUnawareReleasee(NULL, &stage5CallReleasee, 1, callContext->m_finalReleasee, NULL, &dxQuickStepIsland_Stage5_Callback, stage5CallContext, 0, "QuickStepIsland Stage5"); dCallReleaseeID stage4LCP_IterationSyncReleasee; world->PostThreadedCall(NULL, &stage4LCP_IterationSyncReleasee, 1, stage5CallReleasee, NULL, &dxQuickStepIsland_Stage4LCP_IterationSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Sync"); unsigned int stage4LCP_Iteration_allowedThreads = CalculateOptimalThreadsCount<1U>(m, allowedThreads); stage4CallContext->AssignLCP_IterationData(stage4LCP_IterationSyncReleasee, stage4LCP_Iteration_allowedThreads); dCallReleaseeID stage4LCP_IterationStartReleasee; world->PostThreadedCall(NULL, &stage4LCP_IterationStartReleasee, 3, stage4LCP_IterationSyncReleasee, NULL, &dxQuickStepIsland_Stage4LCP_IterationStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Start"); unsigned int nj = localContext->m_nj; unsigned int stage4a_allowedThreads = CalculateOptimalThreadsCount(nj, allowedThreads); dCallReleaseeID stage4LCP_fcStartReleasee; // Note: It is unnecessary to make fc dependent on 4a if there is no WARM_STARTING // However I'm doing so to minimize use of preprocessor conditions in sources unsigned stage4LCP_fcDependenciesCountToUse = stage4a_allowedThreads; #ifdef WARM_STARTING // Posted with extra dependency to be removed from dxQuickStepIsland_Stage4LCP_iMJSync_Callback stage4LCP_fcDependenciesCountToUse += 1; #endif world->PostThreadedCall(NULL, &stage4LCP_fcStartReleasee, stage4LCP_fcDependenciesCountToUse, stage4LCP_IterationStartReleasee, NULL, &dxQuickStepIsland_Stage4LCP_fcStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_fc Start"); #ifdef WARM_STARTING 
stage4CallContext->AssignLCP_fcStartReleasee(stage4LCP_fcStartReleasee); #endif unsigned stage4LCP_iMJ_allowedThreads = CalculateOptimalThreadsCount(m, allowedThreads); dCallReleaseeID stage4LCP_iMJSyncReleasee; world->PostThreadedCall(NULL, &stage4LCP_iMJSyncReleasee, stage4LCP_iMJ_allowedThreads, stage4LCP_IterationStartReleasee, NULL, &dxQuickStepIsland_Stage4LCP_iMJSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_iMJ Sync"); world->PostThreadedCall(NULL, NULL, 0, stage4LCP_IterationStartReleasee, NULL, &dxQuickStepIsland_Stage4LCP_ReorderPrep_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_ReorderPrep"); world->PostThreadedCallsGroup(NULL, stage4a_allowedThreads, stage4LCP_fcStartReleasee, &dxQuickStepIsland_Stage4a_Callback, stage4CallContext, "QuickStepIsland Stage4a"); if (stage4LCP_iMJ_allowedThreads > 1) { world->PostThreadedCallsGroup(NULL, stage4LCP_iMJ_allowedThreads - 1, stage4LCP_iMJSyncReleasee, &dxQuickStepIsland_Stage4LCP_iMJ_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_iMJ"); } dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext); world->AlterThreadedCallDependenciesCount(stage4LCP_iMJSyncReleasee, -1); } } else { dxQuickStepIsland_Stage5(stage5CallContext); } } static int dxQuickStepIsland_Stage4a_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4a(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4a(dxQuickStepperStage4CallContext *stage4CallContext) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; dReal *lambda = stage4CallContext->m_lambda; const dxMIndexItem *mindex = localContext->m_mindex; #ifdef WARM_STARTING dJointWithInfo1 *jointinfos = localContext->m_jointinfos; #endif unsigned int nj = localContext->m_nj; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4A_STEP; unsigned int nj_steps = (nj + (step_size - 1)) / step_size; unsigned ji_step; while ((ji_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_ji_4a, nj_steps)) != nj_steps) { unsigned int ji = ji_step * step_size; dReal *lambdacurr = lambda + mindex[ji].mIndex; #ifdef WARM_STARTING const dJointWithInfo1 *jicurr = jointinfos + ji; const dJointWithInfo1 *const jiend = jicurr + dMIN(step_size, nj - ji); do { const dReal *joint_lambdas = jicurr->joint->lambda; dReal *const lambdsnext = lambdacurr + jicurr->info.m; while (true) { // for warm starting, multiplication by 0.9 seems to be necessary to prevent // jerkiness in motor-driven joints. I have no idea why this works. 
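                // Added note (not part of the original comments): with WARM_STARTING enabled the
                // solver seeds each row with the lambda saved on the joint from the previous step,
                // i.e. lambda_0 = 0.9 * lambda_prev, instead of starting the iteration from zero
                // (the #else branch below). The 0.9 damping factor is empirical, as the original
                // comment above admits.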
*lambdacurr = *joint_lambdas * 0.9; if (++lambdacurr == lambdsnext) { break; } ++joint_lambdas; } } while (++jicurr != jiend); #else dReal *lambdsnext = lambda + mindex[ji + dMIN(step_size, nj - ji)].mIndex; dSetZero(lambdacurr, lambdsnext - lambdacurr); #endif } } static int dxQuickStepIsland_Stage4LCP_iMJ_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_iMJComputation(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4LCP_iMJComputation(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; dReal *iMJ = stage4CallContext->m_iMJ; unsigned int m = localContext->m_m; dReal *J = localContext->m_J; const dxJBodiesItem *jb = localContext->m_jb; dxBody * const *body = callContext->m_islandBodiesStart; dReal *invI = localContext->m_invI; // precompute iMJ = inv(M)*J' compute_invM_JT(&stage4CallContext->m_mi_iMJ, iMJ, m, J, jb, body, invI); } static int dxQuickStepIsland_Stage4LCP_iMJSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int m = localContext->m_m; const unsigned allowedThreads = callContext->m_stepperAllowedThreads; unsigned int stage4LCP_Ad_allowedThreads = CalculateOptimalThreadsCount(m, allowedThreads); #ifdef WARM_STARTING { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(stage4CallContext->m_LCP_fcStartReleasee, -1); } #endif if (stage4LCP_Ad_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_Ad_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage4LCP_Ad_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_Ad_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_Ad"); } dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext); return 1; } static int dxQuickStepIsland_Stage4LCP_fcStart_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int fcPrepareComplexity, fcCompleteComplexity; #ifdef WARM_STARTING fcPrepareComplexity = localContext->m_m / dxQUICKSTEPISLAND_STAGE4LCP_FC_COMPLETE_TO_PREPARE_COMPLEXITY_DIVISOR; fcCompleteComplexity = callContext->m_islandBodiesCount; #else fcPrepareComplexity = localContext->m_m; fcCompleteComplexity = 0; #endif const unsigned allowedThreads = callContext->m_stepperAllowedThreads; unsigned int stage4LCP_fcPrepare_allowedThreads = CalculateOptimalThreadsCount(fcPrepareComplexity, allowedThreads); unsigned int 
stage4LCP_fcComplete_allowedThreads = CalculateOptimalThreadsCount(fcCompleteComplexity, allowedThreads); stage4CallContext->AssignLCP_fcAllowedThreads(stage4LCP_fcPrepare_allowedThreads, stage4LCP_fcComplete_allowedThreads); #ifdef WARM_STARTING dxQuickStepIsland_Stage4LCP_MTfcComputation_warmZeroArrays(stage4CallContext); #endif if (stage4LCP_fcPrepare_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_fcPrepare_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage4LCP_fcPrepare_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_fc_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_fc"); } dxQuickStepIsland_Stage4LCP_MTfcComputation(stage4CallContext, callThisReleasee); return 1; } static int dxQuickStepIsland_Stage4LCP_fc_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_MTfcComputation(stage4CallContext, callThisReleasee); return 1; } static void dxQuickStepIsland_Stage4LCP_MTfcComputation(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee) { #ifdef WARM_STARTING dxQuickStepIsland_Stage4LCP_MTfcComputation_warm(stage4CallContext, callThisReleasee); #else (void)callThisReleasee; // unused dxQuickStepIsland_Stage4LCP_MTfcComputation_cold(stage4CallContext); #endif } #ifdef WARM_STARTING static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warm(dxQuickStepperStage4CallContext *stage4CallContext, dCallReleaseeID callThisReleasee) { dxQuickStepIsland_Stage4LCP_MTfcComputation_warmPrepare(stage4CallContext); if (ThrsafeExchangeAdd(&stage4CallContext->m_LCP_fcPrepareThreadsRemaining, (atomicord32)(-1)) == 1) { stage4CallContext->ResetLCP_fcComputationIndex(); const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; unsigned int stage4LCP_fcComplete_allowedThreads = stage4CallContext->m_LCP_fcCompleteThreadsTotal; if (stage4LCP_fcComplete_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_fcComplete_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage4LCP_fcComplete_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_fcWarmComplete_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_fcWarmComplete"); } dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(stage4CallContext); } } static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmZeroArrays(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; unsigned int nb = callContext->m_islandBodiesCount; atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels; multiply_invM_JT_init_array(nb, bi_links); } static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmPrepare(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int m = localContext->m_m; const dxJBodiesItem *jb = localContext->m_jb; // Prepare to compute fc=(inv(M)*J')*lambda. we will incrementally maintain fc // as we change lambda. 
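    // Added note (not part of the original comments): fc holds one 6-component
    // "constraint force contribution" entry per body (CFE__MAX reals each). Once the
    // prepare/complete pass establishes fc = inv(M)*J'*lambda for the warm-start lambda,
    // the SOR iteration keeps it consistent incrementally: whenever a row's lambda_i
    // changes by delta, the two affected bodies are updated roughly as
    //
    //   fc[b] += iMJ_row(b, i) * delta
    //
    // so fc never has to be recomputed from scratch inside the iteration loop.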
multiply_invM_JT_prepare(&stage4CallContext->m_mi_fc, m, jb, stage4CallContext->m_bi_links_or_mi_levels, stage4CallContext->m_mi_links); } static int dxQuickStepIsland_Stage4LCP_fcWarmComplete_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4LCP_MTfcComputation_warmComplete(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; dReal *fc = stage4CallContext->m_cforce; unsigned int nb = callContext->m_islandBodiesCount; dReal *iMJ = stage4CallContext->m_iMJ; const dxJBodiesItem *jb = localContext->m_jb; dReal *lambda = stage4CallContext->m_lambda; // Complete computation of fc=(inv(M)*J')*lambda. we will incrementally maintain fc // as we change lambda. multiply_invM_JT_complete(&stage4CallContext->m_mi_fc, fc, nb, iMJ, jb, lambda, stage4CallContext->m_bi_links_or_mi_levels, stage4CallContext->m_mi_links); } #else // #ifndef WARM_STARTING static void dxQuickStepIsland_Stage4LCP_MTfcComputation_cold(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; dReal *fc = stage4CallContext->m_cforce; unsigned int nb = callContext->m_islandBodiesCount; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4LCP_FC_STEP; unsigned int nb_steps = (nb + (step_size - 1)) / step_size; unsigned bi_step; while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_mi_fc, nb_steps)) != nb_steps) { unsigned int bi = bi_step * step_size; unsigned int bicnt = dMIN(step_size, nb - bi); dSetZero(fc + (sizeint)bi * CFE__MAX, (sizeint)bicnt * CFE__MAX); } } #endif // #ifndef WARM_STARTING static void dxQuickStepIsland_Stage4LCP_STfcComputation(dxQuickStepperStage4CallContext *stage4CallContext) { #ifdef WARM_STARTING const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; dReal *fc = stage4CallContext->m_cforce; unsigned int m = localContext->m_m; unsigned int nb = callContext->m_islandBodiesCount; dReal *iMJ = stage4CallContext->m_iMJ; const dxJBodiesItem *jb = localContext->m_jb; dReal *lambda = stage4CallContext->m_lambda; // compute fc=(inv(M)*J')*lambda. we will incrementally maintain fc // as we change lambda. 
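    // Added note (not part of the original comments): this is the single-threaded
    // counterpart of the MT fc computation above. With WARM_STARTING the initial
    // fc = inv(M)*J'*lambda is formed directly from the saved lambda; without it,
    // lambda starts at zero, so fc is simply cleared (the #else branch below) and the
    // identity fc = inv(M)*J'*lambda holds trivially at the start of the iteration.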
_multiply_invM_JT(fc, m, nb, iMJ, jb, lambda); #else dReal *fc = stage4CallContext->m_cforce; const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; unsigned int nb = callContext->m_islandBodiesCount; dSetZero(fc, (sizeint)nb * CFE__MAX); #endif } static int dxQuickStepIsland_Stage4LCP_Ad_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_AdComputation(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4LCP_AdComputation(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; const dxJBodiesItem *jb = localContext->m_jb; dReal *J = localContext->m_J; unsigned int m = localContext->m_m; dxWorld *world = callContext->m_world; dxQuickStepParameters *qs = &world->qs; const dReal sor_w = qs->w; // SOR over-relaxation parameter dReal *iMJ = stage4CallContext->m_iMJ; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4LCP_AD_STEP; unsigned int m_steps = (m + (step_size - 1)) / step_size; unsigned mi_step; while ((mi_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_mi_Ad, m_steps)) != m_steps) { unsigned int mi = mi_step * step_size; const unsigned int miend = mi + dMIN(step_size, m - mi); const dReal *iMJ_ptr = iMJ + (sizeint)mi * IMJ__MAX; dReal *J_ptr = J + (sizeint)mi * JME__MAX; while (true) { dReal sum = REAL(0.0); { for (unsigned int j = JVE__MIN; j != JVE__MAX; ++j) sum += iMJ_ptr[IMJ__1_MIN + j] * J_ptr[JME__J1_MIN + j]; dSASSERT(JME__J1_COUNT == (int)JVE__MAX); } int b2 = jb[mi].second; if (b2 != -1) { for (unsigned int k = JVE__MIN; k != JVE__MAX; ++k) sum += iMJ_ptr[IMJ__2_MIN + k] * J_ptr[JME__J2_MIN + k]; dSASSERT(JME__J2_COUNT == (int)JVE__MAX); } dReal cfm_i = J_ptr[JME_CFM]; dReal Ad_i = sor_w / (sum + cfm_i); // NOTE: This may seem unnecessary but it's indeed an optimization // to move multiplication by Ad[i] and cfm[i] out of iteration loop. // scale cfm, J and b by Ad J_ptr[JME_CFM] = cfm_i * Ad_i; J_ptr[JME_RHS] *= Ad_i; { for (unsigned int j = JVE__MIN; j != JVE__MAX; ++j) J_ptr[JME__J1_MIN + j] *= Ad_i; dSASSERT(JME__J1_COUNT == (int)JVE__MAX); } if (b2 != -1) { for (unsigned int k = JVE__MIN; k != JVE__MAX; ++k) J_ptr[JME__J2_MIN + k] *= Ad_i; dSASSERT(JME__J2_COUNT == (int)JVE__MAX); } if (++mi == miend) { break; } iMJ_ptr += IMJ__MAX; J_ptr += JME__MAX; } } } static int dxQuickStepIsland_Stage4LCP_ReorderPrep_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_ReorderPrep(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4LCP_ReorderPrep(dxQuickStepperStage4CallContext *stage4CallContext) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int m = localContext->m_m; unsigned int valid_findices = localContext->m_valid_findices; IndexError *order = stage4CallContext->m_order; { // make sure constraints with findex < 0 come first. 
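        // Added note (not part of the original comments): rows whose findex is -1 do not
        // depend on any other row; rows with findex >= 0 (typically friction rows) have
        // force limits scaled by the lambda of the row they reference (typically the
        // contact-normal row). Packing all findex == -1 rows at the front and the
        // dependent rows at the back guarantees every referenced row is solved earlier
        // in the same sweep. For one contact with rows {normal n, friction f1, f2} and
        // findex[f1] = findex[f2] = n, the resulting order places n before f1 and f2.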
        IndexError *orderhead = order, *ordertail = order + (m - valid_findices);
        const int *findex = localContext->m_findex;
        // Fill the array from both ends
        for (unsigned int i = 0; i != m; ++i) {
            if (findex[i] == -1) {
                orderhead->index = i; // Place them at the front
                ++orderhead;
            } else {
                ordertail->index = i; // Place them at the end
                ++ordertail;
            }
        }
        dIASSERT(orderhead == order + (m - valid_findices));
        dIASSERT(ordertail == order + m);
    }
}

static int dxQuickStepIsland_Stage4LCP_IterationStart_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) {
    (void)callInstanceIndex; // unused
    (void)callThisReleasee; // unused
    dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext;
    const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext;
    dxWorld *world = callContext->m_world;
    dxQuickStepParameters *qs = &world->qs;
    const unsigned int num_iterations = qs->num_iterations;
    unsigned iteration = stage4CallContext->m_LCP_iteration;
    if (iteration < num_iterations) {
        dCallReleaseeID nextReleasee;
        dCallReleaseeID stage4LCP_IterationSyncReleasee = stage4CallContext->m_LCP_IterationSyncReleasee;
        unsigned int stage4LCP_Iteration_allowedThreads = stage4CallContext->m_LCP_IterationAllowedThreads;
        bool reorderRequired = false;
        if (IsSORConstraintsReorderRequiredForIteration(iteration)) {
            reorderRequired = true;
        }
        unsigned syncCallDependencies = reorderRequired ? 1 : stage4LCP_Iteration_allowedThreads;
        // Increment iterations counter in advance as anyway it needs to be incremented
        // before independent tasks (the reordering or the iteration) are posted
        // (otherwise next iteration may complete before the increment
        // and the same iteration index may be used again).
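        // Annotation (added commentary): per pass, the scheduling below amounts to either
        //   IterationStart -> ConstraintsReordering x N -> ConstraintsReorderingSync -> Iteration x M -> nextReleasee
        // when a reordering is due for this iteration, or
        //   IterationStart -> Iteration x M -> nextReleasee
        // otherwise, where nextReleasee is the next IterationStart (or the final IterationSync on the last pass).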
stage4CallContext->m_LCP_iteration = iteration + 1; if (iteration + 1 != num_iterations) { dCallReleaseeID stage4LCP_IterationStartReleasee; world->PostThreadedCallForUnawareReleasee(NULL, &stage4LCP_IterationStartReleasee, syncCallDependencies, stage4LCP_IterationSyncReleasee, NULL, &dxQuickStepIsland_Stage4LCP_IterationStart_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_Iteration Start"); nextReleasee = stage4LCP_IterationStartReleasee; } else { world->AlterThreadedCallDependenciesCount(stage4LCP_IterationSyncReleasee, syncCallDependencies); nextReleasee = stage4LCP_IterationSyncReleasee; } if (reorderRequired) { const unsigned int reorderThreads = 2; dIASSERT(callContext->m_stepperAllowedThreads >= 2); // Otherwise the single-threaded execution path would be taken stage4CallContext->ResetSOR_ConstraintsReorderVariables(reorderThreads); dCallReleaseeID stage4LCP_ConstraintsReorderingSyncReleasee; world->PostThreadedCall(NULL, &stage4LCP_ConstraintsReorderingSyncReleasee, reorderThreads, nextReleasee, NULL, &dxQuickStepIsland_Stage4LCP_ConstraintsReorderingSync_Callback, stage4CallContext, 0, "QuickStepIsland Stage4LCP_ConstraintsReordering Sync"); if (reorderThreads > 1) { world->PostThreadedCallsGroup(NULL, reorderThreads - 1, stage4LCP_ConstraintsReorderingSyncReleasee, &dxQuickStepIsland_Stage4LCP_ConstraintsReordering_Callback, stage4CallContext, "QuickStepIsland Stage4LCP_ConstraintsReordering"); } dxQuickStepIsland_Stage4LCP_ConstraintsReordering(stage4CallContext); world->AlterThreadedCallDependenciesCount(stage4LCP_ConstraintsReorderingSyncReleasee, -1); } else { dIASSERT(iteration != 0); { dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(stage4CallContext); } stage4CallContext->RecordLCP_IterationStart(stage4LCP_Iteration_allowedThreads, nextReleasee); unsigned knownToBeCompletedLevel = dxHEAD_INDEX; if (stage4LCP_Iteration_allowedThreads > 1) { world->PostThreadedCallsIndexOverridenGroup(NULL, stage4LCP_Iteration_allowedThreads - 1, nextReleasee, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration"); } dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, knownToBeCompletedLevel); world->AlterThreadedCallDependenciesCount(nextReleasee, -1); } } return 1; } static int dxQuickStepIsland_Stage4LCP_ConstraintsReordering_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4LCP_ConstraintsReordering(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4LCP_ConstraintsReordering(dxQuickStepperStage4CallContext *stage4CallContext) { unsigned int iteration = stage4CallContext->m_LCP_iteration - 1; // Iteration is pre-incremented before scheduled tasks are released for execution if (dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(stage4CallContext, iteration)) { dxQuickStepIsland_Stage4LCP_LinksArraysZeroing(stage4CallContext); if (ThrsafeExchangeAdd(&stage4CallContext->m_SOR_reorderThreadsRemaining, (atomicord32)(-1)) == 1) { // If last thread has exited the reordering routine... 
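// Annotation (added commentary): ThrsafeExchangeAdd returns the counter value *before* the addition,
// so the thread that sees 1 here is the last of the m_SOR_reorderThreadsRemaining participants to
// leave the shuffling code; only that thread rebuilds the dependency map, which keeps the rebuild
// itself single-threaded.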
// Rebuild the object dependency map dxQuickStepIsland_Stage4LCP_DependencyMapForNewOrderRebuilding(stage4CallContext); } } else { // NOTE: So far, this branch is only called in CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR case if (ThrsafeExchangeAdd(&stage4CallContext->m_SOR_reorderThreadsRemaining, (atomicord32)(-1)) == 1) { // If last thread has exited the reordering routine... dIASSERT(iteration != 0); dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(stage4CallContext); } } } static bool dxQuickStepIsland_Stage4LCP_ConstraintsShuffling(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int iteration) { bool result = false; #if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR struct ConstraintsReorderingHelper { void operator ()(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int startIndex, unsigned int endIndex) { const dReal *lambda = stage4CallContext->m_lambda; dReal *last_lambda = stage4CallContext->m_last_lambda; IndexError *order = stage4CallContext->m_order; for (unsigned int index = startIndex; index != endIndex; ++index) { unsigned int i = order[index].index; dReal lambda_i = lambda[i]; if (lambda_i != REAL(0.0)) { //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max; order[index].error = dFabs(lambda_i - last_lambda[i]); } else if (last_lambda[i] != REAL(0.0)) { //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max; order[index].error = dFabs(/*lambda_i - */last_lambda[i]); // lambda_i == 0 } else { order[index].error = dInfinity; } // Finally copy the lambda for the next iteration last_lambda[i] = lambda_i; } qsort (order + startIndex, endIndex - startIndex, sizeof(IndexError), &compare_index_error); } }; if (iteration > 1) { // Only reorder starting from iteration #2 // sort the constraints so that the ones converging slowest // get solved last. use the absolute (not relative) error. /* * Full reorder needs to be done. * Even though this contradicts the initial idea of moving dependent constraints * to the order end the algorithm does not work the other way well. * It looks like the iterative method needs a shake after it already found * some initial approximations and those incurred errors help it to converge even better. 
*/ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) { // Process the head const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; ConstraintsReorderingHelper()(stage4CallContext, 0, localContext->m_m); } result = true; } else if (iteration == 1) { if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) { // Process the first half const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int startIndex = 0; unsigned int indicesCount = localContext->m_m / 2; // Just copy the lambdas for the next iteration memcpy(stage4CallContext->m_last_lambda + startIndex, stage4CallContext->m_lambda + startIndex, indicesCount * sizeof(dReal)); } if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderTailTaken, 1) == 0) { // Process the second half const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int startIndex = localContext->m_m / 2; unsigned int indicesCount = localContext->m_m - startIndex; // Just copy the lambdas for the next iteration memcpy(stage4CallContext->m_last_lambda + startIndex, stage4CallContext->m_lambda + startIndex, indicesCount * sizeof(dReal)); } // result = false; -- already 'false' } else /*if (iteration < 1) */{ result = true; // return true on 0th iteration to build dependency map for the initial order } #elif CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__RANDOMLY if (iteration != 0) { dIASSERT(!dIN_RANGE(iteration, 0, RANDOM_CONSTRAINTS_REORDERING_FREQUENCY)); dIASSERT(iteration % RANDOM_CONSTRAINTS_REORDERING_FREQUENCY == RRS_REORDERING); { struct ConstraintsReorderingHelper { void operator ()(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int startIndex, unsigned int indicesCount) { IndexError *order = stage4CallContext->m_order + startIndex; for (unsigned int index = 1; index < indicesCount; ++index) { int swapIndex = dRandInt(index + 1); IndexError tmp = order[index]; order[index] = order[swapIndex]; order[swapIndex] = tmp; } } }; /* * Full reorder needs to be done. * Even though this contradicts the initial idea of moving dependent constraints * to the order end the algorithm does not work the other way well. * It looks like the iterative method needs a shake after it already found * some initial approximations and those incurred errors help it to converge even better. 
*/ if (ThrsafeExchange(&stage4CallContext->m_SOR_reorderHeadTaken, 1) == 0) { // Process the head const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; ConstraintsReorderingHelper()(stage4CallContext, 0, localContext->m_m); } } dIASSERT((RRS__MAX, true)); // A reference to RRS__MAX to be located by Find in Files } else { // Just return true and skip the randomization for the very first iteration } result = true; #else // #if CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__BY_ERROR && CONSTRAINTS_REORDERING_METHOD != REORDERING_METHOD__RANDOMLY dIASSERT(iteration == 0); // The reordering request is only returned for the first iteration result = true; #endif return result; } static void dxQuickStepIsland_Stage4LCP_LinksArraysZeroing(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; if (ThrsafeExchange(&stage4CallContext->m_SOR_bi_zeroHeadTaken, 1) == 0) { atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/ unsigned int nb = callContext->m_islandBodiesCount; memset(bi_links, 0, sizeof(bi_links[0]) * (nb / 2)); } if (ThrsafeExchange(&stage4CallContext->m_SOR_bi_zeroTailTaken, 1) == 0) { atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/ unsigned int nb = callContext->m_islandBodiesCount; memset(bi_links + nb / 2, 0, sizeof(bi_links[0]) * (nb - nb / 2)); } if (ThrsafeExchange(&stage4CallContext->m_SOR_mi_zeroHeadTaken, 1) == 0) { atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/ unsigned int m = localContext->m_m; memset(mi_links, 0, sizeof(mi_links[0]) * (m + 1)); } if (ThrsafeExchange(&stage4CallContext->m_SOR_mi_zeroTailTaken, 1) == 0) { atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/ unsigned int m = localContext->m_m; memset(mi_links + (m + 1), 0, sizeof(mi_links[0]) * (m + 1)); } } static void dxQuickStepIsland_Stage4LCP_DependencyMapForNewOrderRebuilding(dxQuickStepperStage4CallContext *stage4CallContext) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; atomicord32 *bi_links = stage4CallContext->m_bi_links_or_mi_levels;/*=[nb]*/ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/ IndexError *order = stage4CallContext->m_order; const dxJBodiesItem *jb = localContext->m_jb; unsigned int m = localContext->m_m; for (unsigned int i = 0; i != m; ++i) { unsigned int index = order[i].index; int b1 = jb[index].first; int b2 = jb[index].second; unsigned int encioded_i = dxENCODE_INDEX(i); unsigned int encoded_depi = bi_links[(unsigned int)b1]; bi_links[(unsigned int)b1] = encioded_i; if (b2 != -1 && b2 != b1) { if (encoded_depi < (unsigned int)bi_links[(unsigned int)b2]) { encoded_depi = bi_links[(unsigned int)b2]; } bi_links[(unsigned int)b2] = encioded_i; } // OD: There is also a dependency on findex[index], // however the findex can only refer to the rows of the same joint // and hence that index is going to have the same bodies. Since the // indices are sorted in a way that the meaningful findex values // always come last, the dependency of findex[index] is going to // be implicitly satisfied via matching bodies at smaller "i"s. 
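// Annotation (added commentary): dxENCODE_INDEX just adds 1, so the value 0 (== dxHEAD_INDEX) can
// double both as "no constraint seen for this body yet" in bi_links[] and as the tree head in
// mi_links[]; plain zero-filling in dxQuickStepIsland_Stage4LCP_LinksArraysZeroing is then enough
// to reset both arrays. bi_links[b] tracks the encoded order-position of the latest row touching
// body b, and for each row the most recently recorded of its bodies' entries becomes the parent
// ("depi"). mi_links[2*k + 0] is the sibling ("same level") link and mi_links[2*k + 1] the
// first-child ("down") link of encoded node k. E.g. (illustrative): two rows 0 and 1 sharing one
// body end up chained as head -> row0 -> row1, so row1 is only released for solving after row0
// has been processed.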
// Check that the dependency targets an earlier "i" dIASSERT(encoded_depi < encioded_i); unsigned encoded_downi = mi_links[(sizeint)encoded_depi * 2 + 1]; mi_links[(sizeint)encoded_depi * 2 + 1] = encioded_i; // Link i as down-dependency for depi mi_links[(sizeint)encioded_i * 2 + 0] = encoded_downi; // Link previous down-chain as the level-dependency with i } } static void dxQuickStepIsland_Stage4LCP_DependencyMapFromSavedLevelsReconstruction(dxQuickStepperStage4CallContext *stage4CallContext) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; atomicord32 *mi_levels = stage4CallContext->m_bi_links_or_mi_levels;/*=[m]*/ atomicord32 *mi_links = stage4CallContext->m_mi_links;/*=[2*(m + 1)]*/ // NOTE! // OD: The mi_links array is not zero-filled before the reconstruction. // Iteration ends with all the down links zeroed. And since down links // are moved to the next level links when parent-child relations are established, // the horizontal levels are properly terminated. // The leaf nodes had their links zero-initialized initially // and those zeros remain intact during the solving. This way the down links // are properly terminated as well. // This is very obscure and error prone and would need an assertion check at least // but the simplest assertion approach I can imagine would be // zero filling and building another tree with the memory buffer comparison afterwards. // That would be stupid, obviously. // // NOTE! // OD: This routine can be threaded. However having two threads messing // in one integer array with random access and kicking each other memory lines // out of cache would probably work worse than letting a single thread do the whole job. unsigned int m = localContext->m_m; for (unsigned int i = 0; i != m; ++i) { unsigned int currentLevelRoot = mi_levels[i]; unsigned int currentLevelFirstLink = mi_links[2 * (sizeint)currentLevelRoot + 1]; unsigned int encoded_i = dxENCODE_INDEX(i); mi_links[2 * (sizeint)currentLevelRoot + 1] = encoded_i; mi_links[2 * (sizeint)encoded_i + 0] = currentLevelFirstLink; } // Additionally reset available level root's list head mi_links[2 * dxHEAD_INDEX + 0] = dxHEAD_INDEX; } static int dxQuickStepIsland_Stage4LCP_ConstraintsReorderingSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; unsigned int stage4LCP_Iteration_allowedThreads = stage4CallContext->m_LCP_IterationAllowedThreads; stage4CallContext->RecordLCP_IterationStart(stage4LCP_Iteration_allowedThreads, callThisReleasee); unsigned knownToBeCompletedLevel = dxHEAD_INDEX; if (stage4LCP_Iteration_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4LCP_Iteration_allowedThreads - 1); world->PostThreadedCallsIndexOverridenGroup(NULL, stage4LCP_Iteration_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration"); } dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, knownToBeCompletedLevel); return 1; } static int dxQuickStepIsland_Stage4LCP_Iteration_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { 
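// Annotation (added commentary): for these worker calls the call instance index is overridden
// (see the PostThreadedCallsIndexOverridenGroup calls above) to carry the level already known to
// be completed, so each spawned worker can resume the level walk from that point instead of
// starting over from dxHEAD_INDEX.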
(void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; unsigned int initiallyKnownToBeCompletedLevel = (unsigned int)callInstanceIndex; dIASSERT(initiallyKnownToBeCompletedLevel == callInstanceIndex); // A truncation check... dxQuickStepIsland_Stage4LCP_MTIteration(stage4CallContext, initiallyKnownToBeCompletedLevel); return 1; } /* * +0 +0 * Root───┬─────────────────┬──... * +1│ +1│ * ┌┴┐+0 ┌─┐+0 . * │A├─────┤B├─... * └┬┘ └┬┘ * +1│ +1│ * ┌┴┐+0 . * │C├─... * └┬┘ * +1│ * . * * Lower tree levels depend on their parents. Same level nodes are independent with respect to each other. * * 1. B is linked in place of A * 2. A is processed * 3. C is inserted at the Root level * * The tree starts with a single child subtree at the root level ("down" link of slot #0 is used for that). * Then, additional "C" nodes are added to the root level by building horizontal link via slots of * their former parent "A"s that had become free. * The "level" link of slot #0 is used to find the root level head. * * Since the tree is altered during iteration, mi_levels record each node parents so that the tree could be reconstructed. */ static void dxQuickStepIsland_Stage4LCP_MTIteration(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int initiallyKnownToBeCompletedLevel) { atomicord32 *mi_levels = stage4CallContext->m_bi_links_or_mi_levels; atomicord32 *mi_links = stage4CallContext->m_mi_links; unsigned int knownToBeCompletedLevel = initiallyKnownToBeCompletedLevel; while (true) { unsigned int initialLevelRoot = mi_links[2 * dxHEAD_INDEX + 0]; if (initialLevelRoot != dxHEAD_INDEX && initialLevelRoot == knownToBeCompletedLevel) { // No work is (currently) available break; } for (unsigned int currentLevelRoot = initialLevelRoot; ; currentLevelRoot = mi_links[2 * (sizeint)currentLevelRoot + 0]) { while (true) { const unsigned invalid_link = dxENCODE_INDEX(-1); unsigned currentLevelFirstLink = mi_links[2 * (sizeint)currentLevelRoot + 1]; if (currentLevelFirstLink == invalid_link) { break; } // Try to extract first record from linked list unsigned currentLevelNextLink = mi_links[2 * (sizeint)currentLevelFirstLink + 0]; if (ThrsafeCompareExchange(&mi_links[2 * (sizeint)currentLevelRoot + 1], currentLevelFirstLink, currentLevelNextLink)) { // if succeeded, execute selected iteration step... dxQuickStepIsland_Stage4LCP_IterationStep(stage4CallContext, dxDECODE_INDEX(currentLevelFirstLink)); // Check if there are any dependencies unsigned level0DownLink = mi_links[2 * (sizeint)currentLevelFirstLink + 1]; if (level0DownLink != invalid_link) { // ...and if yes, insert the record into the list of available level roots unsigned int levelRootsFirst; do { levelRootsFirst = mi_links[2 * dxHEAD_INDEX + 0]; mi_links[2 * (sizeint)currentLevelFirstLink + 0] = levelRootsFirst; } while (!ThrsafeCompareExchange(&mi_links[2 * dxHEAD_INDEX + 0], levelRootsFirst, currentLevelFirstLink)); // If another level was added and some threads have already exited... unsigned int threadsTotal = stage4CallContext->m_LCP_iterationThreadsTotal; unsigned int threadsRemaining = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_LCP_iterationThreadsRemaining, threadsTotal); if (threadsRemaining != threadsTotal) { // ...go on an schedule one more... 
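// Annotation (added commentary): ThrsafeIncrementIntUpToLimit returns the previous counter value
// and increments it unless it is already at the limit. A value below m_LCP_iterationThreadsTotal
// means at least one worker has already run out of visible work and exited (workers decrement the
// counter on exit), so a single replacement call is posted below for the newly published level.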
const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; dxWorld *world = callContext->m_world; // ...passing knownToBeCompletedLevel as the initial one for the spawned call world->PostThreadedCallForUnawareReleasee(NULL, NULL, 0, stage4CallContext->m_LCP_iterationNextReleasee, NULL, &dxQuickStepIsland_Stage4LCP_Iteration_Callback, stage4CallContext, knownToBeCompletedLevel, "QuickStepIsland Stage4LCP_Iteration"); // NOTE: it's hard to predict whether it is reasonable to re-post a call // each time a new level is added (provided some calls have already exited, of course). // The efficiency very much depends on dependencies patterns between levels // (i.e. it depends on the amount of available work added with each level). // The strategy of re-posting exited calls as frequently as possible // leads to potential wasting execution cycles in some cores for the aid // of keeping other cores busy as much as possible and not letting all the // work be executed by just a partial cores subset. With emergency of large // available work amounts (the work that is not dependent on anything and // ready to be executed immediately) this strategy is going to transit into // full cores set being busy executing useful work. If amounts of work // emerging from added levels are small, the strategy should lead to // approximately the same efficiency as if the work was done by only a cores subset // with the remaining cores wasting (some) cycles for re-scheduling calls // to those busy cores rather than being idle or handling other islands. } } // Finally record the root index of current record's level mi_levels[dxDECODE_INDEX(currentLevelFirstLink)] = currentLevelRoot; } } if (currentLevelRoot == knownToBeCompletedLevel) { break; } dIASSERT(currentLevelRoot != dxHEAD_INDEX); // Zero level is expected to be the deepest one in the list and execution must not loop past it. } // Save the level root we started from as known to be completed knownToBeCompletedLevel = initialLevelRoot; } // Decrement running threads count on exit ThrsafeAdd(&stage4CallContext->m_LCP_iterationThreadsRemaining, (atomicord32)(-1)); } static void dxQuickStepIsland_Stage4LCP_STIteration(dxQuickStepperStage4CallContext *stage4CallContext) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int m = localContext->m_m; for (unsigned int i = 0; i != m; ++i) { dxQuickStepIsland_Stage4LCP_IterationStep(stage4CallContext, i); } } //*************************************************************************** // SOR-LCP method // nb is the number of bodies in the body array. // J is an m*16 matrix of constraint rows with rhs, cfm, lo and hi in padding // jb is an array of first and second body numbers for each constraint row // invI is the global frame inverse inertia for each body (stacked 3x3 matrices) // // this returns lambda and fc (the constraint force). 
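// Annotation (added commentary): together with the per-row scaling done in
// dxQuickStepIsland_Stage4LCP_AdComputation (rhs, cfm and the J row are pre-multiplied by
// Ad_i = w / (J_i * inv(M) * J_i' + cfm_i)), each dxQuickStepIsland_Stage4LCP_IterationStep call
// below amounts to one projected SOR / Gauss-Seidel update of row i:
//
//   delta     = rhs_i - cfm_i * lambda_i - J_i * fc    (rhs_i, cfm_i, J_i already scaled by Ad_i)
//   lambda_i  = clamp(lambda_i + delta, lo_i, hi_i)    (for friction rows hi_i = |hi * lambda[findex_i]|, lo_i = -hi_i)
//   fc       += inv(M) * J_i' * (clamped delta)        (touches only the one or two bodies of row i)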
// note: fc is returned as inv(M)*J'*lambda, the constraint force is actually J'*lambda // // b, lo and hi are modified on exit static void dxQuickStepIsland_Stage4LCP_IterationStep(dxQuickStepperStage4CallContext *stage4CallContext, unsigned int i) { const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; IndexError *order = stage4CallContext->m_order; unsigned int index = order[i].index; dReal *fc_ptr1; dReal *fc_ptr2 = NULL; dReal delta; dReal *lambda = stage4CallContext->m_lambda; dReal old_lambda = lambda[index]; dReal *J = localContext->m_J; const dReal *J_ptr = J + (sizeint)index * JME__MAX; { delta = J_ptr[JME_RHS] - old_lambda * J_ptr[JME_CFM]; dReal *fc = stage4CallContext->m_cforce; const dxJBodiesItem *jb = localContext->m_jb; int b2 = jb[index].second; int b1 = jb[index].first; // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case fc_ptr1 = fc + (sizeint)(unsigned)b1 * CFE__MAX; delta -= fc_ptr1[CFE_LX] * J_ptr[JME_J1LX] + fc_ptr1[CFE_LY] * J_ptr[JME_J1LY] + fc_ptr1[CFE_LZ] * J_ptr[JME_J1LZ] + fc_ptr1[CFE_AX] * J_ptr[JME_J1AX] + fc_ptr1[CFE_AY] * J_ptr[JME_J1AY] + fc_ptr1[CFE_AZ] * J_ptr[JME_J1AZ]; // @@@ potential optimization: handle 1-body constraints in a separate // loop to avoid the cost of test & jump? if (b2 != -1) { fc_ptr2 = fc + (sizeint)(unsigned)b2 * CFE__MAX; delta -= fc_ptr2[CFE_LX] * J_ptr[JME_J2LX] + fc_ptr2[CFE_LY] * J_ptr[JME_J2LY] + fc_ptr2[CFE_LZ] * J_ptr[JME_J2LZ] + fc_ptr2[CFE_AX] * J_ptr[JME_J2AX] + fc_ptr2[CFE_AY] * J_ptr[JME_J2AY] + fc_ptr2[CFE_AZ] * J_ptr[JME_J2AZ]; } } { dReal hi_act, lo_act; // set the limits for this constraint. // this is the place where the QuickStep method differs from the // direct LCP solving method, since that method only performs this // limit adjustment once per time step, whereas this method performs // once per iteration per constraint row. // the constraints are ordered so that all lambda[] values needed have // already been computed. const int *findex = localContext->m_findex; if (findex[index] != -1) { hi_act = dFabs (J_ptr[JME_HI] * lambda[(unsigned)findex[index]]); lo_act = -hi_act; } else { hi_act = J_ptr[JME_HI]; lo_act = J_ptr[JME_LO]; } // compute lambda and clamp it to [lo,hi]. // @@@ potential optimization: does SSE have clamping instructions // to save test+jump penalties here? dReal new_lambda = old_lambda + delta; if (new_lambda < lo_act) { delta = lo_act - old_lambda; lambda[index] = lo_act; } else if (new_lambda > hi_act) { delta = hi_act - old_lambda; lambda[index] = hi_act; } else { lambda[index] = new_lambda; } } //@@@ a trick that may or may not help //dReal ramp = (1-((dReal)(iteration+1)/(dReal)num_iterations)); //delta *= ramp; { dReal *iMJ = stage4CallContext->m_iMJ; const dReal *iMJ_ptr = iMJ + (sizeint)index * IMJ__MAX; // update fc. // @@@ potential optimization: SIMD for this and the b2 >= 0 case fc_ptr1[CFE_LX] += delta * iMJ_ptr[IMJ_1LX]; fc_ptr1[CFE_LY] += delta * iMJ_ptr[IMJ_1LY]; fc_ptr1[CFE_LZ] += delta * iMJ_ptr[IMJ_1LZ]; fc_ptr1[CFE_AX] += delta * iMJ_ptr[IMJ_1AX]; fc_ptr1[CFE_AY] += delta * iMJ_ptr[IMJ_1AY]; fc_ptr1[CFE_AZ] += delta * iMJ_ptr[IMJ_1AZ]; // @@@ potential optimization: handle 1-body constraints in a separate // loop to avoid the cost of test & jump? 
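// Annotation (added commentary): fc_ptr2 is left NULL for rows whose second body slot is -1
// (constraints against the static environment), so only the first body's cached inv(M)*J'*lambda
// contribution is updated for such rows.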
if (fc_ptr2) { fc_ptr2[CFE_LX] += delta * iMJ_ptr[IMJ_2LX]; fc_ptr2[CFE_LY] += delta * iMJ_ptr[IMJ_2LY]; fc_ptr2[CFE_LZ] += delta * iMJ_ptr[IMJ_2LZ]; fc_ptr2[CFE_AX] += delta * iMJ_ptr[IMJ_2AX]; fc_ptr2[CFE_AY] += delta * iMJ_ptr[IMJ_2AY]; fc_ptr2[CFE_AZ] += delta * iMJ_ptr[IMJ_2AZ]; } } } static inline bool IsStage4bJointInfosIterationRequired(const dxQuickStepperLocalContext *localContext) { return #ifdef WARM_STARTING true || #endif localContext->m_mfb > 0; } static int dxQuickStepIsland_Stage4LCP_IterationSync_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; unsigned int stage4b_allowedThreads = 1; if (IsStage4bJointInfosIterationRequired(localContext)) { unsigned int allowedThreads = callContext->m_stepperAllowedThreads; dIASSERT(allowedThreads >= stage4b_allowedThreads); stage4b_allowedThreads += CalculateOptimalThreadsCount(localContext->m_nj, allowedThreads - stage4b_allowedThreads); } if (stage4b_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage4b_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage4b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage4b_Callback, stage4CallContext, "QuickStepIsland Stage4b"); } dxQuickStepIsland_Stage4b(stage4CallContext); return 1; } static int dxQuickStepIsland_Stage4b_Callback(void *_stage4CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage4CallContext *stage4CallContext = (dxQuickStepperStage4CallContext *)_stage4CallContext; dxQuickStepIsland_Stage4b(stage4CallContext); return 1; } static void dxQuickStepIsland_Stage4b(dxQuickStepperStage4CallContext *stage4CallContext) { const dxStepperProcessingCallContext *callContext = stage4CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage4CallContext->m_localContext; if (ThrsafeExchange(&stage4CallContext->m_cf_4b, 1) == 0) { dxBody * const *body = callContext->m_islandBodiesStart; unsigned int nb = callContext->m_islandBodiesCount; const dReal *cforce = stage4CallContext->m_cforce; dReal stepsize = callContext->m_stepSize; // add stepsize * cforce to the body velocity const dReal *cforcecurr = cforce; dxBody *const *const bodyend = body + nb; for (dxBody *const *bodycurr = body; bodycurr != bodyend; cforcecurr += CFE__MAX, bodycurr++) { dxBody *b = *bodycurr; for (unsigned int j = dSA__MIN; j != dSA__MAX; j++) { b->lvel[dV3E__AXES_MIN + j] += stepsize * cforcecurr[CFE__L_MIN + j]; b->avel[dV3E__AXES_MIN + j] += stepsize * cforcecurr[CFE__A_MIN + j]; } } } // note that the SOR method overwrites rhs and J at this point, so // they should not be used again. 
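// Annotation (added commentary): since J has been rescaled by Ad during the LCP solve (and, as
// noted above, must not be reused), the feedback forces below are computed from the unscaled
// per-joint copy Jcopy: for every joint that requested feedback, Multiply1_12q1 forms J'*lambda
// over that joint's rows and the result is split into the f1/t1 (body 1) and f2/t2 (body 2)
// force/torque pairs.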
if (IsStage4bJointInfosIterationRequired(localContext)) { dReal data[JVE__MAX]; const dReal *Jcopy = localContext->m_Jcopy; const dReal *lambda = stage4CallContext->m_lambda; const dxMIndexItem *mindex = localContext->m_mindex; dJointWithInfo1 *jointinfos = localContext->m_jointinfos; unsigned int nj = localContext->m_nj; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE4B_STEP; unsigned int nj_steps = (nj + (step_size - 1)) / step_size; unsigned ji_step; while ((ji_step = ThrsafeIncrementIntUpToLimit(&stage4CallContext->m_ji_4b, nj_steps)) != nj_steps) { unsigned int ji = ji_step * step_size; const unsigned int jiend = ji + dMIN(step_size, nj - ji); const dReal *Jcopycurr = Jcopy + (sizeint)mindex[ji].fbIndex * JCE__MAX; while (true) { // straightforward computation of joint constraint forces: // multiply related lambdas with respective J' block for joints // where feedback was requested const unsigned int fb_infom = mindex[ji + 1].fbIndex - mindex[ji].fbIndex; if (fb_infom != 0) { dIASSERT(fb_infom == mindex[ji + 1].mIndex - mindex[ji].mIndex); const dReal *lambdacurr = lambda + mindex[ji].mIndex; dxJoint *joint = jointinfos[ji].joint; #ifdef WARM_STARTING memcpy(joint->lambda, lambdacurr, fb_infom * sizeof(dReal)); #endif dJointFeedback *fb = joint->feedback; if (joint->node[1].body) { Multiply1_12q1 (data, Jcopycurr + JCE__J2_MIN, lambdacurr, fb_infom); dSASSERT(JCE__MAX == 12); fb->f2[dSA_X] = data[JVE_LX]; fb->f2[dSA_Y] = data[JVE_LY]; fb->f2[dSA_Z] = data[JVE_LZ]; fb->t2[dSA_X] = data[JVE_AX]; fb->t2[dSA_Y] = data[JVE_AY]; fb->t2[dSA_Z] = data[JVE_AZ]; } Multiply1_12q1 (data, Jcopycurr + JCE__J1_MIN, lambdacurr, fb_infom); dSASSERT(JCE__MAX == 12); fb->f1[dSA_X] = data[JVE_LX]; fb->f1[dSA_Y] = data[JVE_LY]; fb->f1[dSA_Z] = data[JVE_LZ]; fb->t1[dSA_X] = data[JVE_AX]; fb->t1[dSA_Y] = data[JVE_AY]; fb->t1[dSA_Z] = data[JVE_AZ]; Jcopycurr += fb_infom * JCE__MAX; } else { #ifdef WARM_STARTING const dReal *lambdacurr = lambda + mindex[ji].mIndex; const unsigned int infom = mindex[ji + 1].mIndex - mindex[ji].mIndex; dxJoint *joint = jointinfos[ji].joint; memcpy(joint->lambda, lambdacurr, infom * sizeof(dReal)); #endif } if (++ji == jiend) { break; } } } } } static int dxQuickStepIsland_Stage5_Callback(void *_stage5CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage5CallContext *stage5CallContext = (dxQuickStepperStage5CallContext *)_stage5CallContext; dxQuickStepIsland_Stage5(stage5CallContext); return 1; } static void dxQuickStepIsland_Stage5(dxQuickStepperStage5CallContext *stage5CallContext) { const dxStepperProcessingCallContext *callContext = stage5CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage5CallContext->m_localContext; dxWorldProcessMemArena *memarena = callContext->m_stepperArena; memarena->RestoreState(stage5CallContext->m_stage3MemArenaState); stage5CallContext = NULL; // WARNING! stage3CallContext is not valid after this point! 
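// Annotation (added commentary, assumption about the arena layout): RestoreState above rewinds the
// stepper arena to the checkpoint taken in stage 3, which releases everything allocated after it,
// presumably including the dxQuickStepperStage5CallContext this function was called with; that is
// why the pointer is nulled and must not be dereferenced past this point.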
dIVERIFY(stage5CallContext == NULL); // To suppress unused variable assignment warnings dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)memarena->AllocateBlock(sizeof(dxQuickStepperStage6CallContext)); stage6CallContext->Initialize(callContext, localContext); const unsigned allowedThreads = callContext->m_stepperAllowedThreads; dIASSERT(allowedThreads >= 1); if (allowedThreads == 1) { IFTIMING (dTimerNow ("compute velocity update")); dxQuickStepIsland_Stage6a(stage6CallContext); dxQuickStepIsland_Stage6_VelocityCheck(stage6CallContext); IFTIMING (dTimerNow ("update position and tidy up")); dxQuickStepIsland_Stage6b(stage6CallContext); IFTIMING (dTimerEnd()); IFTIMING (if (m > 0) dTimerReport (stdout,1)); } else { unsigned int nb = callContext->m_islandBodiesCount; unsigned int stage6a_allowedThreads = CalculateOptimalThreadsCount(nb, allowedThreads); dxWorld *world = callContext->m_world; dCallReleaseeID stage6aSyncReleasee; world->PostThreadedCallForUnawareReleasee(NULL, &stage6aSyncReleasee, stage6a_allowedThreads, callContext->m_finalReleasee, NULL, &dxQuickStepIsland_Stage6aSync_Callback, stage6CallContext, 0, "QuickStepIsland Stage6a Sync"); if (stage6a_allowedThreads > 1) { world->PostThreadedCallsGroup(NULL, stage6a_allowedThreads - 1, stage6aSyncReleasee, &dxQuickStepIsland_Stage6a_Callback, stage6CallContext, "QuickStepIsland Stage6a"); } dxQuickStepIsland_Stage6a(stage6CallContext); world->AlterThreadedCallDependenciesCount(stage6aSyncReleasee, -1); } } static int dxQuickStepIsland_Stage6a_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext; dxQuickStepIsland_Stage6a(stage6CallContext); return 1; } static void dxQuickStepIsland_Stage6a(dxQuickStepperStage6CallContext *stage6CallContext) { const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext; const dxQuickStepperLocalContext *localContext = stage6CallContext->m_localContext; dReal stepsize = callContext->m_stepSize; dReal *invI = localContext->m_invI; dxBody * const *body = callContext->m_islandBodiesStart; unsigned int nb = callContext->m_islandBodiesCount; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE6A_STEP; unsigned int nb_steps = (nb + (step_size - 1)) / step_size; unsigned bi_step; while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage6CallContext->m_bi_6a, nb_steps)) != nb_steps) { unsigned int bi = bi_step * step_size; unsigned int bicnt = dMIN(step_size, nb - bi); const dReal *invIrow = invI + (sizeint)bi * IIE__MAX; dxBody *const *bodycurr = body + bi; dxBody *const *bodyend = bodycurr + bicnt; while (true) { // compute the velocity update: // add stepsize * invM * fe to the body velocity dxBody *b = *bodycurr; dReal body_invMass_mul_stepsize = stepsize * b->invMass; for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) { b->lvel[dV3E__AXES_MIN + j] += body_invMass_mul_stepsize * b->facc[dV3E__AXES_MIN + j]; b->tacc[dV3E__AXES_MIN + j] *= stepsize; } dMultiplyAdd0_331 (b->avel, invIrow + IIE__MATRIX_MIN, b->tacc); if (++bodycurr == bodyend) { break; } invIrow += IIE__MAX; } } } static int dxQuickStepIsland_Stage6aSync_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused 
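// Annotation (added commentary): this sync point runs once all Stage6a calls have finished; it
// performs the (conditionally compiled) constraint-velocity check and then fans Stage6b out over
// CalculateOptimalThreadsCount(nb, allowedThreads) threads, mirroring the Stage6a fan-out above.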
dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext; dxQuickStepIsland_Stage6_VelocityCheck(stage6CallContext); const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext; const unsigned allowedThreads = callContext->m_stepperAllowedThreads; unsigned int nb = callContext->m_islandBodiesCount; unsigned int stage6b_allowedThreads = CalculateOptimalThreadsCount(nb, allowedThreads); if (stage6b_allowedThreads > 1) { dxWorld *world = callContext->m_world; world->AlterThreadedCallDependenciesCount(callThisReleasee, stage6b_allowedThreads - 1); world->PostThreadedCallsGroup(NULL, stage6b_allowedThreads - 1, callThisReleasee, &dxQuickStepIsland_Stage6b_Callback, stage6CallContext, "QuickStepIsland Stage6b"); } dxQuickStepIsland_Stage6b(stage6CallContext); return 1; } static void dxQuickStepIsland_Stage6_VelocityCheck(dxQuickStepperStage6CallContext *stage6CallContext) { (void)stage6CallContext; // can be unused #ifdef CHECK_VELOCITY_OBEYS_CONSTRAINT const dxQuickStepperLocalContext *localContext = stage6CallContext->m_localContext; unsigned int m = localContext->m_m; if (m > 0) { const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext; dxBody * const *body = callContext->m_islandBodiesStart; dReal *J = localContext->m_J; const dxJBodiesItem *jb = localContext->m_jb; dReal error = 0; const dReal* J_ptr = J; for (unsigned int i = 0; i < m; ++i) { int b1 = jb[i].first; int b2 = jb[i].second; dReal sum = 0; dxBody *bodycurr = body[(unsigned)b1]; for (unsigned int j = dSA__MIN; j != dSA__MAX; ++j) sum += J_ptr[JME__J1L_MIN + j] * bodycurr->lvel[dV3E__AXES_MIN + j] + J_ptr[JME__J1A_MIN + j] * bodycurr->avel[dV3E__AXES_MIN + j]; if (b2 != -1) { dxBody *bodycurr = body[(unsigned)b2]; for (unsigned int k = dSA__MIN; k != dSA__MAX; ++k) sum += J_ptr[JME__J2L_MIN + k] * bodycurr->lvel[dV3E__AXES_MIN + k] + J_ptr[JME__J2A_MIN + k] * bodycurr->avel[dV3E__AXES_MIN + k]; } J_ptr += JME__MAX; error += dFabs(sum); } printf ("velocity error = %10.6e\n", error); } #endif } static int dxQuickStepIsland_Stage6b_Callback(void *_stage6CallContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee) { (void)callInstanceIndex; // unused (void)callThisReleasee; // unused dxQuickStepperStage6CallContext *stage6CallContext = (dxQuickStepperStage6CallContext *)_stage6CallContext; dxQuickStepIsland_Stage6b(stage6CallContext); return 1; } static void dxQuickStepIsland_Stage6b(dxQuickStepperStage6CallContext *stage6CallContext) { const dxStepperProcessingCallContext *callContext = stage6CallContext->m_stepperCallContext; dReal stepsize = callContext->m_stepSize; dxBody * const *body = callContext->m_islandBodiesStart; // update the position and orientation from the new linear/angular velocity // (over the given timestep) unsigned int nb = callContext->m_islandBodiesCount; const unsigned int step_size = dxQUICKSTEPISLAND_STAGE6B_STEP; unsigned int nb_steps = (nb + (step_size - 1)) / step_size; unsigned bi_step; while ((bi_step = ThrsafeIncrementIntUpToLimit(&stage6CallContext->m_bi_6b, nb_steps)) != nb_steps) { unsigned int bi = bi_step * step_size; unsigned int bicnt = dMIN(step_size, nb - bi); dxBody *const *bodycurr = body + bi; dxBody *const *bodyend = bodycurr + bicnt; while (true) { dxBody *b = *bodycurr; dxStepBody (b, stepsize); dZeroVector3 (b->facc); dZeroVector3 (b->tacc); if (++bodycurr == bodyend) { break; } } } } /*extern */ sizeint dxEstimateQuickStepMemoryRequirements (dxBody * 
const *body, unsigned int nb, dxJoint * const *_joint, unsigned int _nj) { (void)body; // unused unsigned int nj, m, mfb; { unsigned int njcurr = 0, mcurr = 0, mfbcurr = 0; dxJoint::SureMaxInfo info; dxJoint *const *const _jend = _joint + _nj; for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) { dxJoint *j = *_jcurr; j->getSureMaxInfo (&info); unsigned int jm = info.max_m; if (jm > 0) { njcurr++; mcurr += jm; if (j->feedback) mfbcurr += jm; } } nj = njcurr; m = mcurr; mfb = mfbcurr; } sizeint res = 0; res += dOVERALIGNED_SIZE(sizeof(dReal) * IIE__MAX * nb, INVI_ALIGNMENT); // for invI { sizeint sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * _nj); // for initial jointinfos sizeint sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * nj); // for shrunk jointinfos sub1_res2 += dEFFICIENT_SIZE(sizeof(dxQuickStepperLocalContext)); // for dxQuickStepLocalContext if (m > 0) { sub1_res2 += dEFFICIENT_SIZE(sizeof(dxMIndexItem) * (nj + 1)); // for mindex sub1_res2 += dEFFICIENT_SIZE(sizeof(dxJBodiesItem) * m); // for jb sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * JME__MAX * m, JACOBIAN_ALIGNMENT); // for J sub1_res2 += dOVERALIGNED_SIZE(sizeof(dReal) * JCE__MAX * mfb, JCOPY_ALIGNMENT); // for Jcopy { sizeint sub2_res1 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage3CallContext)); // for dxQuickStepperStage3CallContext sub2_res1 += dEFFICIENT_SIZE(sizeof(dReal) * RHS__MAX * nb); // for rhs_tmp sub2_res1 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage2CallContext)); // for dxQuickStepperStage2CallContext sizeint sub2_res2 = 0; { sizeint sub3_res1 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage5CallContext)); // for dxQuickStepperStage5CallContext; sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * CFE__MAX * nb); // for cforce sub3_res1 += dOVERALIGNED_SIZE(sizeof(dReal) * IMJ__MAX * m, INVMJ_ALIGNMENT); // for iMJ sub3_res1 += dEFFICIENT_SIZE(sizeof(IndexError) * m); // for order #if CONSTRAINTS_REORDERING_METHOD == REORDERING_METHOD__BY_ERROR sub3_res1 += dEFFICIENT_SIZE(sizeof(dReal) * m); // for last_lambda #endif #if !dTHREADING_INTF_DISABLED sub3_res1 += dEFFICIENT_SIZE(sizeof(atomicord32) * dMAX(nb, m)); // for bi_links_or_mi_levels sub3_res1 += dEFFICIENT_SIZE(sizeof(atomicord32) * 2 * ((sizeint)m + 1)); // for mi_links #endif sub3_res1 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage4CallContext)); // for dxQuickStepperStage4CallContext; sizeint sub3_res2 = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage6CallContext)); // for dxQuickStepperStage6CallContext; sub2_res2 += dMAX(sub3_res1, sub3_res2); } sub1_res2 += dMAX(sub2_res1, sub2_res2); } } else { sub1_res2 += dEFFICIENT_SIZE(sizeof(dxQuickStepperStage3CallContext)); // for dxQuickStepperStage3CallContext } sizeint sub1_res12_max = dMAX(sub1_res1, sub1_res2); sizeint stage01_contexts = dEFFICIENT_SIZE(sizeof(dxQuickStepperStage0BodiesCallContext)) + dEFFICIENT_SIZE(sizeof(dxQuickStepperStage0JointsCallContext)) + dEFFICIENT_SIZE(sizeof(dxQuickStepperStage1CallContext)); res += dMAX(sub1_res12_max, stage01_contexts); } return res; } /*extern */ unsigned dxEstimateQuickStepMaxCallCount(unsigned activeThreadCount, unsigned allowedThreadCount) { (void)activeThreadCount; // unused unsigned result = 1 // dxQuickStepIsland itself + 5 + (2 * allowedThreadCount + 1) // for Stage4 related schedules + 1 // dxStepIsland_Stage5 + allowedThreadCount; // Reserve return result; }