From c5fc66ee58f2c60f2d226868bb1cf5b91badaf53 Mon Sep 17 00:00:00 2001 From: sanine Date: Sat, 1 Oct 2022 20:59:36 -0500 Subject: add ode --- libs/ode-0.16.1/ode/src/lcp.cpp | 1317 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1317 insertions(+) create mode 100644 libs/ode-0.16.1/ode/src/lcp.cpp (limited to 'libs/ode-0.16.1/ode/src/lcp.cpp') diff --git a/libs/ode-0.16.1/ode/src/lcp.cpp b/libs/ode-0.16.1/ode/src/lcp.cpp new file mode 100644 index 0000000..58db0bd --- /dev/null +++ b/libs/ode-0.16.1/ode/src/lcp.cpp @@ -0,0 +1,1317 @@ +/************************************************************************* + * * + * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. * + * All rights reserved. Email: russ@q12.org Web: www.q12.org * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of EITHER: * + * (1) The GNU Lesser General Public License as published by the Free * + * Software Foundation; either version 2.1 of the License, or (at * + * your option) any later version. The text of the GNU Lesser * + * General Public License is included with this library in the * + * file LICENSE.TXT. * + * (2) The BSD-style license that is included with this library in * + * the file LICENSE-BSD.TXT. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files * + * LICENSE.TXT and LICENSE-BSD.TXT for more details. * + * * + *************************************************************************/ + +/* + + +THE ALGORITHM +------------- + +solve A*x = b+w, with x and w subject to certain LCP conditions. +each x(i),w(i) must lie on one of the three line segments in the following +diagram. each line segment corresponds to one index set : + + w(i) + /|\ | : + | | : + | |i in N : + w>0 | |state[i]=0 : + | | : + | | : i in C + w=0 + +-----------------------+ + | : | + | : | + w<0 | : |i in N + | : |state[i]=1 + | : | + | : | + +-------|-----------|-----------|----------> x(i) + lo 0 hi + +the Dantzig algorithm proceeds as follows: + for i=1:n + * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or + negative towards the line. as this is done, the other (x(j),w(j)) + for j= 0. this makes the algorithm a bit +simpler, because the starting point for x(i),w(i) is always on the dotted +line x=0 and x will only ever increase in one direction, so it can only hit +two out of the three line segments. + + +NOTES +----- + +this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m". +the implementation is split into an LCP problem object (dLCP) and an LCP +driver function. most optimization occurs in the dLCP object. + +a naive implementation of the algorithm requires either a lot of data motion +or a lot of permutation-array lookup, because we are constantly re-ordering +rows and columns. to avoid this and make a more optimized algorithm, a +non-trivial data structure is used to represent the matrix A (this is +implemented in the fast version of the dLCP object). + +during execution of this algorithm, some indexes in A are clamped (set C), +some are non-clamped (set N), and some are "don't care" (where x=0). +A,x,b,w (and other problem vectors) are permuted such that the clamped +indexes are first, the unclamped indexes are next, and the don't-care +indexes are last. this permutation is recorded in the array `p'. +initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped, +the corresponding elements of p are swapped. + +because the C and N elements are grouped together in the rows of A, we can do +lots of work with a fast dot product function. if A,x,etc were not permuted +and we only had a permutation array, then those dot products would be much +slower as we would have a permutation array lookup in some inner loops. + +A is accessed through an array of row pointers, so that element (i,j) of the +permuted matrix is A[i][j]. this makes row swapping fast. for column swapping +we still have to actually move the data. + +during execution of this algorithm we maintain an L*D*L' factorization of +the clamped submatrix of A (call it `AC') which is the top left nC*nC +submatrix of A. there are two ways we could arrange the rows/columns in AC. + +(1) AC is always permuted such that L*D*L' = AC. this causes a problem +when a row/column is removed from C, because then all the rows/columns of A +between the deleted index and the end of C need to be rotated downward. +this results in a lot of data motion and slows things down. +(2) L*D*L' is actually a factorization of a *permutation* of AC (which is +itself a permutation of the underlying A). this is what we do - the +permutation is recorded in the vector C. call this permutation A[C,C]. +when a row/column is removed from C, all we have to do is swap two +rows/columns and manipulate C. + +*/ + +#include +#include +#include // for testing +#include "config.h" +#include "lcp.h" +#include "util.h" +#include "matrix.h" +#include "mat.h" // for testing +#include "threaded_solver_ldlt.h" + +#include "fastdot_impl.h" +#include "fastldltfactor_impl.h" +#include "fastldltsolve_impl.h" + + +//*************************************************************************** +// code generation parameters + +// LCP debugging (mostly for fast dLCP) - this slows things down a lot +//#define DEBUG_LCP + +#define dLCP_FAST // use fast dLCP object + +#define NUB_OPTIMIZATIONS // use NUB optimizations + + +// option 1 : matrix row pointers (less data copying) +#define ROWPTRS +#define ATYPE dReal ** +#define AROW(i) (m_A[i]) + +// option 2 : no matrix row pointers (slightly faster inner loops) +//#define NOROWPTRS +//#define ATYPE dReal * +//#define AROW(i) (m_A+(i)*m_nskip) + + +//*************************************************************************** + +#define dMIN(A,B) ((A)>(B) ? (B) : (A)) +#define dMAX(A,B) ((B)>(A) ? (B) : (A)) + + +#define LMATRIX_ALIGNMENT dMAX(64, EFFICIENT_ALIGNMENT) + +//*************************************************************************** + + +// transfer b-values to x-values +template +inline +void transfer_b_to_x(dReal pairsbx[PBX__MAX], unsigned n) +{ + dReal *const endbx = pairsbx + (sizeint)n * PBX__MAX; + for (dReal *currbx = pairsbx; currbx != endbx; currbx += PBX__MAX) { + currbx[PBX_X] = currbx[PBX_B]; + if (zero_b) { + currbx[PBX_B] = REAL(0.0); + } + } +} + +// swap row/column i1 with i2 in the n*n matrix A. the leading dimension of +// A is nskip. this only references and swaps the lower triangle. +// if `do_fast_row_swaps' is nonzero and row pointers are being used, then +// rows will be swapped by exchanging row pointers. otherwise the data will +// be copied. + +static +void swapRowsAndCols (ATYPE A, unsigned n, unsigned i1, unsigned i2, unsigned nskip, + int do_fast_row_swaps) +{ + dAASSERT (A && n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n && + nskip >= n && i1 < i2); + +# ifdef ROWPTRS + dReal *A_i1 = A[i1]; + dReal *A_i2 = A[i2]; + for (unsigned i=i1+1; i0 && i1 < n && i2 < n && nskip >= n && i1 <= i2); + + if (i1 != i2) { + swapRowsAndCols (A, n, i1, i2, nskip, do_fast_row_swaps); + + dxSwap((pairsbx + (sizeint)i1 * PBX__MAX)[PBX_B], (pairsbx + (sizeint)i2 * PBX__MAX)[PBX_B]); + dxSwap((pairsbx + (sizeint)i1 * PBX__MAX)[PBX_X], (pairsbx + (sizeint)i2 * PBX__MAX)[PBX_X]); + dSASSERT(PBX__MAX == 2); + + dxSwap(w[i1], w[i2]); + + dxSwap((pairslh + (sizeint)i1 * PLH__MAX)[PLH_LO], (pairslh + (sizeint)i2 * PLH__MAX)[PLH_LO]); + dxSwap((pairslh + (sizeint)i1 * PLH__MAX)[PLH_HI], (pairslh + (sizeint)i2 * PLH__MAX)[PLH_HI]); + dSASSERT(PLH__MAX == 2); + + dxSwap(p[i1], p[i2]); + dxSwap(state[i1], state[i2]); + + if (findex != NULL) { + dxSwap(findex[i1], findex[i2]); + } + } +} + + +// for debugging - check that L,d is the factorization of A[C,C]. +// A[C,C] has size nC*nC and leading dimension nskip. +// L has size nC*nC and leading dimension nskip. +// d has size nC. + +#ifdef DEBUG_LCP + +static +void checkFactorization (ATYPE A, dReal *_L, dReal *_d, + unsigned nC, unsigned *C, unsigned nskip) +{ + unsigned i, j; + if (nC == 0) return; + + // get A1=A, copy the lower triangle to the upper triangle, get A2=A[C,C] + dMatrix A1 (nC, nC); + for (i=0; i < nC; i++) { + for (j = 0; j <= i; j++) A1(i, j) = A1(j, i) = AROW(i)[j]; + } + dMatrix A2 = A1.select (nC, C, nC, C); + + // printf ("A1=\n"); A1.print(); printf ("\n"); + // printf ("A2=\n"); A2.print(); printf ("\n"); + + // compute A3 = L*D*L' + dMatrix L (nC, nC, _L, nskip, 1); + dMatrix D (nC, nC); + for (i = 0; i < nC; i++) D(i, i) = 1.0 / _d[i]; + L.clearUpperTriangle(); + for (i = 0; i < nC; i++) L(i, i) = 1; + dMatrix A3 = L * D * L.transpose(); + + // printf ("L=\n"); L.print(); printf ("\n"); + // printf ("D=\n"); D.print(); printf ("\n"); + // printf ("A3=\n"); A2.print(); printf ("\n"); + + // compare A2 and A3 + dReal diff = A2.maxDifference (A3); + if (diff > 1e-8) + dDebug (0, "L*D*L' check, maximum difference = %.6e\n", diff); +} + +#endif + + +// for debugging + +#ifdef DEBUG_LCP + +static +void checkPermutations (unsigned i, unsigned n, unsigned nC, unsigned nN, unsigned *p, unsigned *C) +{ + unsigned j,k; + dIASSERT (/*nC >= 0 && nN >= 0 && */(nC + nN) == i && i < n); + for (k=0; k= 0 && p[k] < i); + for (k=i; k + dReal AiC_times_qC (unsigned i, dReal *q) const { return calculateLargeVectorDot (AROW(i), q, m_nC); } + template + dReal AiN_times_qN (unsigned i, dReal *q) const { return calculateLargeVectorDot (AROW(i) + m_nC, q + (sizeint)m_nC * q_stride, m_nN); } + void pN_equals_ANC_times_qC (dReal *p, dReal *q); + void pN_plusequals_ANi (dReal *p, unsigned i, bool dir_positive); + template + void pC_plusequals_s_times_qC (dReal *p, dReal s, dReal *q); + void pN_plusequals_s_times_qN (dReal *p, dReal s, dReal *q); + void solve1 (dReal *a, unsigned i, bool dir_positive, int only_transfer=0); + void unpermute_X(); + void unpermute_W(); +}; + + +dLCP::dLCP (unsigned _n, unsigned _nskip, unsigned _nub, dReal *_Adata, dReal *_pairsbx, dReal *_w, + dReal *_pairslh, dReal *_L, dReal *_d, + dReal *_Dell, dReal *_ell, dReal *_tmp, + bool *_state, int *_findex, unsigned *_p, unsigned *_C, dReal **Arows): + m_n(_n), m_nskip(_nskip), m_nub(_nub), m_nC(0), m_nN(0), +# ifdef ROWPTRS + m_A(Arows), +#else + m_A(_Adata), +#endif + m_pairsbx(_pairsbx), m_w(_w), m_pairslh(_pairslh), + m_L(_L), m_d(_d), m_Dell(_Dell), m_ell(_ell), m_tmp(_tmp), + m_state(_state), m_findex(_findex), m_p(_p), m_C(_C) +{ + dxtSetZero(m_pairsbx + PBX_X, m_n); + + { +# ifdef ROWPTRS + // make matrix row pointers + dReal *aptr = _Adata; + ATYPE A = m_A; + const unsigned n = m_n, nskip = m_nskip; + for (unsigned k=0; k nub + { + const unsigned n = m_n; + const unsigned nub = m_nub; + if (nub < n) { + for (unsigned k=0; k<100; k++) { + unsigned i1,i2; + do { + i1 = dRandInt(n-nub)+nub; + i2 = dRandInt(n-nub)+nub; + } + while (i1 > i2); + //printf ("--> %d %d\n",i1,i2); + swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, n, i1, i2, m_nskip, 0); + } + } + */ + + // permute the problem so that *all* the unbounded variables are at the + // start, i.e. look for unbounded variables not included in `nub'. we can + // potentially push up `nub' this way and get a bigger initial factorization. + // note that when we swap rows/cols here we must not just swap row pointers, + // as the initial factorization relies on the data being all in one chunk. + // variables that have findex >= 0 are *not* considered to be unbounded even + // if lo=-inf and hi=inf - this is because these limits may change during the + // solution process. + + { + int *findex = m_findex; + dReal *pairslh = m_pairslh; + const unsigned n = m_n; + for (unsigned k = m_nub; k < n; ++k) { + if (findex && findex[k] >= 0) continue; + if ((pairslh + (sizeint)k * PLH__MAX)[PLH_LO] == -dInfinity && (pairslh + (sizeint)k * PLH__MAX)[PLH_HI] == dInfinity) { + swapProblem (m_A, m_pairsbx, m_w, pairslh, m_p, m_state, findex, n, m_nub, k, m_nskip, 0); + m_nub++; + } + } + } + + // if there are unbounded variables at the start, factorize A up to that + // point and solve for x. this puts all indexes 0..nub-1 into C. + if (m_nub > 0) { + const unsigned nub = m_nub; + { + dReal *Lrow = m_L; + const unsigned nskip = m_nskip; + for (unsigned j = 0; j < nub; Lrow += nskip, ++j) memcpy(Lrow, AROW(j), (j + 1) * sizeof(dReal)); + } + transfer_b_to_x (m_pairsbx, nub); + factorMatrixAsLDLT<1> (m_L, m_d, nub, m_nskip); + solveEquationSystemWithLDLT<1, PBX__MAX> (m_L, m_d, m_pairsbx + PBX_X, nub, m_nskip); + dSetZero (m_w, nub); + { + unsigned *C = m_C; + for (unsigned k = 0; k < nub; ++k) C[k] = k; + } + m_nC = nub; + } + + // permute the indexes > nub such that all findex variables are at the end + if (m_findex) { + const unsigned nub = m_nub; + int *findex = m_findex; + unsigned num_at_end = 0; + for (unsigned k = m_n; k > nub; ) { + --k; + if (findex[k] >= 0) { + swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, findex, m_n, k, m_n - 1 - num_at_end, m_nskip, 1); + num_at_end++; + } + } + } + + // print info about indexes + /* + { + const unsigned n = m_n; + const unsigned nub = m_nub; + for (unsigned k=0; k 0) { + // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C)) + dReal *const Ltgt = m_L + (sizeint)m_nskip * nC, *ell = m_ell; + memcpy(Ltgt, ell, nC * sizeof(dReal)); + + dReal ell_Dell_dot = dxDot(m_ell, m_Dell, nC); + dReal AROW_i_i = AROW(i)[i] != ell_Dell_dot ? AROW(i)[i] : dNextAfter(AROW(i)[i], dInfinity); // A hack to avoid getting a zero in the denominator + m_d[nC] = dRecip (AROW_i_i - ell_Dell_dot); + } + else { + m_d[0] = dRecip (AROW(i)[i]); + } + + swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, nC, i, m_nskip, 1); + + m_C[nC] = nC; + m_nC = nC + 1; // nC value is outdated after this line + } + +# ifdef DEBUG_LCP + checkFactorization (m_A, m_L, m_d, m_nC, m_C, m_nskip); + if (i < (m_n-1)) checkPermutations (i+1, m_n, m_nC, m_nN, m_p, m_C); +# endif +} + + +void dLCP::transfer_i_from_N_to_C (unsigned i) +{ + { + const unsigned nC = m_nC; + if (nC > 0) { + { + dReal *const aptr = AROW(i); + dReal *Dell = m_Dell; + const unsigned *C = m_C; +# ifdef NUB_OPTIMIZATIONS + // if nub>0, initial part of aptr unpermuted + const unsigned nub = m_nub; + unsigned j=0; + for ( ; j(m_L, m_Dell, nC, m_nskip); + + dReal ell_Dell_dot = REAL(0.0); + dReal *const Ltgt = m_L + (sizeint)m_nskip * nC; + dReal *ell = m_ell, *Dell = m_Dell, *d = m_d; + for (unsigned j = 0; j < nC; ++j) { + dReal ell_j, Dell_j = Dell[j]; + Ltgt[j] = ell[j] = ell_j = Dell_j * d[j]; + ell_Dell_dot += ell_j * Dell_j; + } + + dReal AROW_i_i = AROW(i)[i] != ell_Dell_dot ? AROW(i)[i] : dNextAfter(AROW(i)[i], dInfinity); // A hack to avoid getting a zero in the denominator + m_d[nC] = dRecip (AROW_i_i - ell_Dell_dot); + } + else { + m_d[0] = dRecip (AROW(i)[i]); + } + + swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, nC, i, m_nskip, 1); + + m_C[nC] = nC; + m_nN--; + m_nC = nC + 1; // nC value is outdated after this line + } + + // @@@ TO DO LATER + // if we just finish here then we'll go back and re-solve for + // delta_x. but actually we can be more efficient and incrementally + // update delta_x here. but if we do this, we wont have ell and Dell + // to use in updating the factorization later. + +# ifdef DEBUG_LCP + checkFactorization (m_A,m_L,m_d,m_nC,m_C,m_nskip); +# endif +} + + +void dLCP::transfer_i_from_C_to_N (unsigned i, void *tmpbuf) +{ + { + unsigned *C = m_C; + // remove a row/column from the factorization, and adjust the + // indexes (black magic!) + int last_idx = -1; + const unsigned nC = m_nC; + unsigned j = 0; + for ( ; j < nC; ++j) { + if (C[j] == nC - 1) { + last_idx = j; + } + if (C[j] == i) { + dxLDLTRemove (m_A, C, m_L, m_d, m_n, nC, j, m_nskip, tmpbuf); + unsigned k; + if (last_idx == -1) { + for (k = j + 1 ; k < nC; ++k) { + if (C[k] == nC - 1) { + break; + } + } + dIASSERT (k < nC); + } + else { + k = last_idx; + } + C[k] = C[j]; + if (j != (nC - 1)) memmove (C + j, C + j + 1, (nC - j - 1) * sizeof(C[0])); + break; + } + } + dIASSERT (j < nC); + + swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, i, nC - 1, m_nskip, 1); + + m_nN++; + m_nC = nC - 1; // nC value is outdated after this line + } + +# ifdef DEBUG_LCP + checkFactorization (m_A, m_L, m_d, m_nC, m_C, m_nskip); +# endif +} + + +void dLCP::pN_equals_ANC_times_qC (dReal *p, dReal *q) +{ + // we could try to make this matrix-vector multiplication faster using + // outer product matrix tricks, e.g. with the dMultidotX() functions. + // but i tried it and it actually made things slower on random 100x100 + // problems because of the overhead involved. so we'll stick with the + // simple method for now. + const unsigned nC = m_nC; + dReal *ptgt = p + nC; + const unsigned nN = m_nN; + for (unsigned i = 0; i < nN; ++i) { + ptgt[i] = dxDot (AROW(i + nC), q, nC); + } +} + + +void dLCP::pN_plusequals_ANi (dReal *p, unsigned i, bool dir_positive) +{ + const unsigned nC = m_nC; + dReal *aptr = AROW(i) + nC; + dReal *ptgt = p + nC; + if (dir_positive) { + const unsigned nN = m_nN; + for (unsigned j=0; j < nN; ++j) ptgt[j] += aptr[j]; + } + else { + const unsigned nN = m_nN; + for (unsigned j=0; j < nN; ++j) ptgt[j] -= aptr[j]; + } +} + +template +void dLCP::pC_plusequals_s_times_qC (dReal *p, dReal s, dReal *q) +{ + const unsigned nC = m_nC; + dReal *q_end = q + nC; + for (; q != q_end; p += p_stride, ++q) { + *p += s * (*q); + } +} + +void dLCP::pN_plusequals_s_times_qN (dReal *p, dReal s, dReal *q) +{ + const unsigned nC = m_nC; + dReal *ptgt = p + nC, *qsrc = q + nC; + const unsigned nN = m_nN; + for (unsigned i = 0; i < nN; ++i) { + ptgt[i] += s * qsrc[i]; + } +} + +void dLCP::solve1 (dReal *a, unsigned i, bool dir_positive, int only_transfer) +{ + // the `Dell' and `ell' that are computed here are saved. if index i is + // later added to the factorization then they can be reused. + // + // @@@ question: do we need to solve for entire delta_x??? yes, but + // only if an x goes below 0 during the step. + + const unsigned nC = m_nC; + if (nC > 0) { + { + dReal *Dell = m_Dell; + unsigned *C = m_C; + dReal *aptr = AROW(i); +# ifdef NUB_OPTIMIZATIONS + // if nub>0, initial part of aptr[] is guaranteed unpermuted + const unsigned nub = m_nub; + unsigned j = 0; + for ( ; j < nub; ++j) Dell[j] = aptr[j]; + for ( ; j < nC; ++j) Dell[j] = aptr[C[j]]; +# else + for (unsigned j = 0; j < nC; ++j) Dell[j] = aptr[C[j]]; +# endif + } + solveL1Straight<1>(m_L, m_Dell, nC, m_nskip); + { + dReal *ell = m_ell, *Dell = m_Dell, *d = m_d; + for (unsigned j = 0; j < nC; ++j) ell[j] = Dell[j] * d[j]; + } + + if (!only_transfer) { + dReal *tmp = m_tmp, *ell = m_ell; + { + for (unsigned j = 0; j < nC; ++j) tmp[j] = ell[j]; + } + solveL1Transposed<1>(m_L, tmp, nC, m_nskip); + if (dir_positive) { + unsigned *C = m_C; + dReal *tmp = m_tmp; + for (unsigned j = 0; j < nC; ++j) a[C[j]] = -tmp[j]; + } else { + unsigned *C = m_C; + dReal *tmp = m_tmp; + for (unsigned j = 0; j < nC; ++j) a[C[j]] = tmp[j]; + } + } + } +} + + +void dLCP::unpermute_X() +{ + unsigned *p = m_p; + dReal *pairsbx = m_pairsbx; + const unsigned n = m_n; + for (unsigned j = 0; j < n; ++j) { + unsigned k = p[j]; + if (k != j) { + // p[j] = j; -- not going to be checked anymore anyway + dReal x_j = (pairsbx + (sizeint)j * PBX__MAX)[PBX_X]; + for (;;) { + dxSwap(x_j, (pairsbx + (sizeint)k * PBX__MAX)[PBX_X]); + + unsigned orig_k = p[k]; + p[k] = k; + if (orig_k == j) { + break; + } + k = orig_k; + } + (pairsbx + (sizeint)j * PBX__MAX)[PBX_X] = x_j; + } + } +} + +void dLCP::unpermute_W() +{ + memcpy (m_tmp, m_w, m_n * sizeof(dReal)); + + const unsigned *p = m_p; + dReal *w = m_w, *tmp = m_tmp; + const unsigned n = m_n; + for (unsigned j = 0; j < n; ++j) { + unsigned k = p[j]; + w[k] = tmp[j]; + } +} + +#endif // dLCP_FAST + + +static void dxSolveLCP_AllUnbounded (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX]); +static void dxSolveLCP_Generic (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX], + dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex); + +/*extern */ +void dxSolveLCP (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX], + dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex) +{ + if (nub >= n) + { + dxSolveLCP_AllUnbounded (memarena, n, A, pairsbx); + } + else + { + dxSolveLCP_Generic (memarena, n, A, pairsbx, outer_w, nub, pairslh, findex); + } +} + +//*************************************************************************** +// if all the variables are unbounded then we can just factor, solve, and return + +static +void dxSolveLCP_AllUnbounded (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX]) +{ + dAASSERT(A != NULL); + dAASSERT(pairsbx != NULL); + dAASSERT(n != 0); + + transfer_b_to_x(pairsbx, n); + + unsigned nskip = dPAD(n); + factorMatrixAsLDLT (A, pairsbx + PBX_B, n, nskip); + solveEquationSystemWithLDLT (A, pairsbx + PBX_B, pairsbx + PBX_X, n, nskip); +} + +//*************************************************************************** +// an optimized Dantzig LCP driver routine for the lo-hi LCP problem. + +static +void dxSolveLCP_Generic (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX], + dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex) +{ + dAASSERT (n > 0 && A && pairsbx && pairslh && nub >= 0 && nub < n); +# ifndef dNODEBUG + { + // check restrictions on lo and hi + dReal *endlh = pairslh + (sizeint)n * PLH__MAX; + for (dReal *currlh = pairslh; currlh != endlh; currlh += PLH__MAX) dIASSERT (currlh[PLH_LO] <= 0 && currlh[PLH_HI] >= 0); + } +# endif + + const unsigned nskip = dPAD(n); + dReal *L = memarena->AllocateOveralignedArray ((sizeint)nskip * n, LMATRIX_ALIGNMENT); + dReal *d = memarena->AllocateArray (n); + dReal *w = outer_w != NULL ? outer_w : memarena->AllocateArray (n); + dReal *delta_w = memarena->AllocateArray (n); + dReal *delta_x = memarena->AllocateArray (n); + dReal *Dell = memarena->AllocateArray (n); + dReal *ell = memarena->AllocateArray (n); +#ifdef ROWPTRS + dReal **Arows = memarena->AllocateArray (n); +#else + dReal **Arows = NULL; +#endif + unsigned *p = memarena->AllocateArray (n); + unsigned *C = memarena->AllocateArray (n); + + // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i) + bool *state = memarena->AllocateArray (n); + + // create LCP object. note that tmp is set to delta_w to save space, this + // optimization relies on knowledge of how tmp is used, so be careful! + dLCP lcp(n, nskip, nub, A, pairsbx, w, pairslh, L, d, Dell, ell, delta_w, state, findex, p, C, Arows); + unsigned adj_nub = lcp.getNub(); + + // loop over all indexes adj_nub..n-1. for index i, if x(i),w(i) satisfy the + // LCP conditions then i is added to the appropriate index set. otherwise + // x(i),w(i) is driven either +ve or -ve to force it to the valid region. + // as we drive x(i), x(C) is also adjusted to keep w(C) at zero. + // while driving x(i) we maintain the LCP conditions on the other variables + // 0..i-1. we do this by watching out for other x(i),w(i) values going + // outside the valid region, and then switching them between index sets + // when that happens. + + bool hit_first_friction_index = false; + for (unsigned i = adj_nub; i < n; ++i) { + bool s_error = false; + // the index i is the driving index and indexes i+1..n-1 are "dont care", + // i.e. when we make changes to the system those x's will be zero and we + // don't care what happens to those w's. in other words, we only consider + // an (i+1)*(i+1) sub-problem of A*x=b+w. + + // if we've hit the first friction index, we have to compute the lo and + // hi values based on the values of x already computed. we have been + // permuting the indexes, so the values stored in the findex vector are + // no longer valid. thus we have to temporarily unpermute the x vector. + // for the purposes of this computation, 0*infinity = 0 ... so if the + // contact constraint's normal force is 0, there should be no tangential + // force applied. + + if (!hit_first_friction_index && findex && findex[i] >= 0) { + // un-permute x into delta_w, which is not being used at the moment + for (unsigned j = 0; j < n; ++j) delta_w[p[j]] = (pairsbx + (sizeint)j * PBX__MAX)[PBX_X]; + + // set lo and hi values + for (unsigned k = i; k < n; ++k) { + dReal *currlh = pairslh + (sizeint)k * PLH__MAX; + dReal wfk = delta_w[findex[k]]; + if (wfk == 0) { + currlh[PLH_HI] = 0; + currlh[PLH_LO] = 0; + } + else { + currlh[PLH_HI] = dFabs (currlh[PLH_HI] * wfk); + currlh[PLH_LO] = -currlh[PLH_HI]; + } + } + hit_first_friction_index = true; + } + + // thus far we have not even been computing the w values for indexes + // greater than i, so compute w[i] now. + dReal wPrep = lcp.AiC_times_qC (i, pairsbx + PBX_X) + lcp.AiN_times_qN (i, pairsbx + PBX_X); + + dReal *currbx = pairsbx + (sizeint)i * PBX__MAX; + + w[i] = wPrep - currbx[PBX_B]; + + // if lo=hi=0 (which can happen for tangential friction when normals are + // 0) then the index will be assigned to set N with some state. however, + // set C's line has zero size, so the index will always remain in set N. + // with the "normal" switching logic, if w changed sign then the index + // would have to switch to set C and then back to set N with an inverted + // state. this is pointless, and also computationally expensive. to + // prevent this from happening, we use the rule that indexes with lo=hi=0 + // will never be checked for set changes. this means that the state for + // these indexes may be incorrect, but that doesn't matter. + + dReal *currlh = pairslh + (sizeint)i * PLH__MAX; + + // see if x(i),w(i) is in a valid region + if (currlh[PLH_LO] == 0 && w[i] >= 0) { + lcp.transfer_i_to_N (i); + state[i] = false; + } + else if (currlh[PLH_HI] == 0 && w[i] <= 0) { + lcp.transfer_i_to_N (i); + state[i] = true; + } + else if (w[i] == 0) { + // this is a degenerate case. by the time we get to this test we know + // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve, + // and similarly that hi > 0. this means that the line segment + // corresponding to set C is at least finite in extent, and we are on it. + // NOTE: we must call lcp.solve1() before lcp.transfer_i_to_C() + lcp.solve1 (delta_x, i, false, 1); + + lcp.transfer_i_to_C (i); + } + else { + // we must push x(i) and w(i) + for (;;) { + // find direction to push on x(i) + bool dir_positive = (w[i] <= 0); + + // compute: delta_x(C) = -dir*A(C,C)\A(C,i) + lcp.solve1 (delta_x, i, dir_positive); + + // note that delta_x[i] = (dir_positive ? 1 : -1), but we wont bother to set it + + // compute: delta_w = A*delta_x ... note we only care about + // delta_w(N) and delta_w(i), the rest is ignored + lcp.pN_equals_ANC_times_qC (delta_w, delta_x); + lcp.pN_plusequals_ANi (delta_w, i, dir_positive); + delta_w[i] = dir_positive + ? lcp.AiC_times_qC<1> (i, delta_x) + lcp.Aii(i) + : lcp.AiC_times_qC<1> (i, delta_x) - lcp.Aii(i); + + // find largest step we can take (size=s), either to drive x(i),w(i) + // to the valid LCP region or to drive an already-valid variable + // outside the valid region. + + int cmd = 1; // index switching command + unsigned si = 0; // si = index to switch if cmd>3 + + dReal s = delta_w[i] != REAL(0.0) + ? -w[i] / delta_w[i] + : (w[i] != REAL(0.0) ? dCopySign(dInfinity, -w[i]) : REAL(0.0)); + + if (dir_positive) { + if (currlh[PLH_HI] < dInfinity) { + dReal s2 = (currlh[PLH_HI] - currbx[PBX_X]); // was (hi[i]-x[i])/dirf // step to x(i)=hi(i) + if (s2 < s) { + s = s2; + cmd = 3; + } + } + } + else { + if (currlh[PLH_LO] > -dInfinity) { + dReal s2 = (currbx[PBX_X] - currlh[PLH_LO]); // was (lo[i]-x[i])/dirf // step to x(i)=lo(i) + if (s2 < s) { + s = s2; + cmd = 2; + } + } + } + + { + const unsigned numN = lcp.numN(); + for (unsigned k = 0; k < numN; ++k) { + const unsigned indexN_k = lcp.indexN(k); + if (!state[indexN_k] ? delta_w[indexN_k] < 0 : delta_w[indexN_k] > 0) { + // don't bother checking if lo=hi=0 + dReal *indexlh = pairslh + (sizeint)indexN_k * PLH__MAX; + if (indexlh[PLH_LO] == 0 && indexlh[PLH_HI] == 0) continue; + dReal s2 = -w[indexN_k] / delta_w[indexN_k]; + if (s2 < s) { + s = s2; + cmd = 4; + si = indexN_k; + } + } + } + } + + { + const unsigned numC = lcp.numC(); + for (unsigned k = adj_nub; k < numC; ++k) { + const unsigned indexC_k = lcp.indexC(k); + dReal *indexlh = pairslh + (sizeint)indexC_k * PLH__MAX; + if (delta_x[indexC_k] < 0 && indexlh[PLH_LO] > -dInfinity) { + dReal s2 = (indexlh[PLH_LO] - (pairsbx + (sizeint)indexC_k * PBX__MAX)[PBX_X]) / delta_x[indexC_k]; + if (s2 < s) { + s = s2; + cmd = 5; + si = indexC_k; + } + } + if (delta_x[indexC_k] > 0 && indexlh[PLH_HI] < dInfinity) { + dReal s2 = (indexlh[PLH_HI] - (pairsbx + (sizeint)indexC_k * PBX__MAX)[PBX_X]) / delta_x[indexC_k]; + if (s2 < s) { + s = s2; + cmd = 6; + si = indexC_k; + } + } + } + } + + //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C", + // "C->NL","C->NH"}; + //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i); + + // if s <= 0 then we've got a problem. if we just keep going then + // we're going to get stuck in an infinite loop. instead, just cross + // our fingers and exit with the current solution. + if (s <= REAL(0.0)) { + dMessage (d_ERR_LCP, "LCP internal error, s <= 0 (s=%.4e)",(double)s); + if (i < n) { + dxtSetZero(currbx + PBX_X, n - i); + dxSetZero (w + i, n - i); + } + s_error = true; + break; + } + + // apply x = x + s * delta_x + lcp.pC_plusequals_s_times_qC (pairsbx + PBX_X, s, delta_x); + currbx[PBX_X] = dir_positive + ? currbx[PBX_X] + s + : currbx[PBX_X] - s; + + // apply w = w + s * delta_w + lcp.pN_plusequals_s_times_qN (w, s, delta_w); + w[i] += s * delta_w[i]; + + void *tmpbuf; + // switch indexes between sets if necessary + switch (cmd) { + case 1: // done + w[i] = 0; + lcp.transfer_i_to_C (i); + break; + case 2: // done + currbx[PBX_X] = currlh[PLH_LO]; + state[i] = false; + lcp.transfer_i_to_N (i); + break; + case 3: // done + currbx[PBX_X] = currlh[PLH_HI]; + state[i] = true; + lcp.transfer_i_to_N (i); + break; + case 4: // keep going + w[si] = 0; + lcp.transfer_i_from_N_to_C (si); + break; + case 5: // keep going + (pairsbx + (sizeint)si * PBX__MAX)[PBX_X] = (pairslh + (sizeint)si * PLH__MAX)[PLH_LO]; + state[si] = false; + tmpbuf = memarena->PeekBufferRemainder(); + lcp.transfer_i_from_C_to_N (si, tmpbuf); + break; + case 6: // keep going + (pairsbx + (sizeint)si * PBX__MAX)[PBX_X] = (pairslh + (sizeint)si * PLH__MAX)[PLH_HI]; + state[si] = true; + tmpbuf = memarena->PeekBufferRemainder(); + lcp.transfer_i_from_C_to_N (si, tmpbuf); + break; + } + + if (cmd <= 3) break; + } // for (;;) + } // else + + if (s_error) { + break; + } + } // for (unsigned i = adj_nub; i < n; ++i) + + // now we have to un-permute x and w + if (outer_w != NULL) { + lcp.unpermute_W(); + } + lcp.unpermute_X(); // This destroys p[] and must be done last +} + +sizeint dxEstimateSolveLCPMemoryReq(unsigned n, bool outer_w_avail) +{ + const unsigned nskip = dPAD(n); + + sizeint res = 0; + + res += dOVERALIGNED_SIZE(sizeof(dReal) * ((sizeint)n * nskip), LMATRIX_ALIGNMENT); // for L + res += 5 * dEFFICIENT_SIZE(sizeof(dReal) * n); // for d, delta_w, delta_x, Dell, ell + if (!outer_w_avail) { + res += dEFFICIENT_SIZE(sizeof(dReal) * n); // for w + } +#ifdef ROWPTRS + res += dEFFICIENT_SIZE(sizeof(dReal *) * n); // for Arows +#endif + res += 2 * dEFFICIENT_SIZE(sizeof(unsigned) * n); // for p, C + res += dEFFICIENT_SIZE(sizeof(bool) * n); // for state + + // Use n instead of nC as nC varies at runtime while n is greater or equal to nC + sizeint lcp_transfer_req = dLCP::estimate_transfer_i_from_C_to_N_mem_req(n, nskip); + res += dEFFICIENT_SIZE(lcp_transfer_req); // for dLCP::transfer_i_from_C_to_N + + return res; +} + + +//*************************************************************************** +// accuracy and timing test + +static sizeint EstimateTestSolveLCPMemoryReq(unsigned n) +{ + const unsigned nskip = dPAD(n); + + sizeint res = 0; + + res += 2 * dEFFICIENT_SIZE(sizeof(dReal) * ((sizeint)n * nskip)); // for A, A2 + res += 7 * dEFFICIENT_SIZE(sizeof(dReal) * n); // for x, b, w, lo, hi, tmp1, tmp2 + res += dEFFICIENT_SIZE(sizeof(dReal) * PBX__MAX * n); // for pairsbx, + res += dEFFICIENT_SIZE(sizeof(dReal) * PLH__MAX * n); // for pairslh + + res += dxEstimateSolveLCPMemoryReq(n, true); + + return res; +} + +extern "C" ODE_API int dTestSolveLCP() +{ + const unsigned n = 100; + + sizeint memreq = EstimateTestSolveLCPMemoryReq(n); + dxWorldProcessMemArena *arena = dxAllocateTemporaryWorldProcessMemArena(memreq, NULL, NULL); + if (arena == NULL) { + return 0; + } + arena->ResetState(); + + unsigned i,nskip = dPAD(n); +#ifdef dDOUBLE + const dReal tol = REAL(1e-9); +#endif +#ifdef dSINGLE + const dReal tol = REAL(1e-4); +#endif + printf ("dTestSolveLCP()\n"); + + dReal *A = arena->AllocateArray (n*nskip); + dReal *x = arena->AllocateArray (n); + dReal *b = arena->AllocateArray (n); + dReal *w = arena->AllocateArray (n); + dReal *lo = arena->AllocateArray (n); + dReal *hi = arena->AllocateArray (n); + + dReal *A2 = arena->AllocateArray (n*nskip); + dReal *pairsbx = arena->AllocateArray (n * PBX__MAX); + dReal *pairslh = arena->AllocateArray (n * PLH__MAX); + + dReal *tmp1 = arena->AllocateArray (n); + dReal *tmp2 = arena->AllocateArray (n); + + double total_time = 0; + for (unsigned count=0; count < 1000; count++) { + BEGIN_STATE_SAVE(arena, saveInner) { + + // form (A,b) = a random positive definite LCP problem + dMakeRandomMatrix (A2,n,n,1.0); + dMultiply2 (A,A2,A2,n,n,n); + dMakeRandomMatrix (x,n,1,1.0); + dMultiply0 (b,A,x,n,n,1); + for (i=0; i tol ? "FAILED" : "passed"); + if (diff > tol) dDebug (0,"A*x = b+w, maximum difference = %.6e",diff); + unsigned n1=0,n2=0,n3=0; + for (i=0; i= 0) { + n1++; // ok + } + else if (x[i]==hi[i] && w[i] <= 0) { + n2++; // ok + } + else if (x[i] >= lo[i] && x[i] <= hi[i] && w[i] == 0) { + n3++; // ok + } + else { + dDebug (0,"FAILED: i=%d x=%.4e w=%.4e lo=%.4e hi=%.4e",i, + x[i],w[i],lo[i],hi[i]); + } + } + + // pacifier + printf ("passed: NL=%3d NH=%3d C=%3d ",n1,n2,n3); + printf ("time=%10.3f ms avg=%10.4f\n",time * 1000.0,average); + + } END_STATE_SAVE(arena, saveInner); + } + + dxFreeTemporaryWorldProcessMemArena(arena); + return 1; +} -- cgit v1.2.1