From d52e57307cd728c59b913fe9b515d7ea83d78d95 Mon Sep 17 00:00:00 2001 From: Stefan Povolny Date: Tue, 24 Jan 2023 10:55:14 -0800 Subject: [PATCH 01/42] initial commit, added an algorithm to fixedSizeSquareMatrixOps that calculates the polar decomposition of a 3x3 square matrix --- src/fixedSizeSquareMatrixOpsImpl.hpp | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index f006d866..06a591d1 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -1253,6 +1253,50 @@ struct SquareMatrixOps< 3 > return row; } + + /** + * @brief Determine the polar decomposition using the Higham algorithm such that @p R . U = V . @p R = @p matrix + * @tparam DST_MATRIX The type of @p R. + * @tparam MATRIX The type of @p matrix. + * @param R The resultant rotation matrix, of size ISIZE x ISIZE. + * @param matrix The matrix to be decomposed, of size ISIZE x ISIZE. + */ + template< typename DST_MATRIX, typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, + MATRIX const & LVARRAY_RESTRICT_REF matrix ) + { + checkSizes< 3, 3 >( R ); + checkSizes< 3, 3 >( matrix ); + + using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >; + + // Initialize + copy< 3, 3>( R, matrix ); + realType RInverse[3][3], RInverseTranspose[3][3], RRTMinusI[3][3]; + + // Higham Algorithm + realType error = 1.0; + int iter = 0; + while( error > 1.0e-16 ) + { + iter++; + invert( RInverse, R ); + transpose< 3, 3 >( RInverseTranspose, RInverse ); + add< 3, 3 >( R, RInverseTranspose ); + scale< 3, 3 >( R, 0.5 ); + Rij_eq_AikBjk< 3, 3 >( RRTMinusI, R, R ); + addIdentity< 3, 3 >( RRTMinusI, -1.0 ); + for( std::ptrdiff_t i = 0 ; i < 3 ; i++ ) + { + for( std::ptrdiff_t j = 0 ; j < 3 ; j++ ) + { + error += RRTMinusI[i][j] * RRTMinusI[i][j]; + } + } + LVARRAY_ERROR_IF( iter >= 100, "Polar decomposition failed to converge!"); + } + } }; } // namespace internal From 6d4875ec34e6c65e162a590c42c7ec9f4d7931d4 Mon Sep 17 00:00:00 2001 From: Stefan Povolny Date: Wed, 25 Jan 2023 08:38:03 -0800 Subject: [PATCH 02/42] implementation of polar decomposition complete, still need to test --- src/fixedSizeSquareMatrixOps.hpp | 18 ++++ src/fixedSizeSquareMatrixOpsImpl.hpp | 144 +++++++++++++++++++-------- 2 files changed, 118 insertions(+), 44 deletions(-) diff --git a/src/fixedSizeSquareMatrixOps.hpp b/src/fixedSizeSquareMatrixOps.hpp index c61e4a9e..9fb82ec3 100644 --- a/src/fixedSizeSquareMatrixOps.hpp +++ b/src/fixedSizeSquareMatrixOps.hpp @@ -354,6 +354,24 @@ void symmetricToDense( DST_MATRIX && dstMatrix, SRC_SYM_MATRIX const & srcSymMat srcSymMatrix ); } +/** + * @brief Determine the polar decomposition of the matrix @p srcMatrix + * @tparam M The size of @p R and @p srcMatrix. + * @tparam DST_MATRIX The type of @p R. + * @tparam MATRIX The type of @p srcMatrix. + * @param R The resultant rotation matrix. + * @param matrix The matrix to be decomposed. + * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p srcMatrix. + * This is done using Higham's iterative algorithm. + */ +template< std::ptrdiff_t M, typename DST_MATRIX, typename MATRIX > +LVARRAY_HOST_DEVICE constexpr inline +void polarDecomposition( DST_MATRIX && R, MATRIX const & srcMatrix ) +{ + return internal::SquareMatrixOps< M >::polarDecomposition( std::forward< DST_MATRIX >( R ), + srcMatrix ); +} + ///@} } // namespace tensorOps diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 06a591d1..3733fae4 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -529,6 +529,56 @@ struct SquareMatrixOps< 2 > dstMatrix[ 1 ][ 0 ] = srcSymMatrix[ 2 ]; } + /** + * @brief Determine the polar decomposition of the 2x2 matrix @p matrix + * @tparam DST_MATRIX The type of @p R. + * @tparam MATRIX The type of @p matrix. + * @param R The resultant rotation matrix. + * @param matrix The matrix to be decomposed. + * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix. + * This is done using Higham's iterative algorithm. + */ + template< typename DST_MATRIX, typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, + MATRIX const & LVARRAY_RESTRICT_REF matrix ) + { + checkSizes< 2, 2 >( R ); + checkSizes< 2, 2 >( matrix ); + + using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >; + + // Initialize + copy< 2, 2 >( R, matrix ); + FloatingPoint RInverse[2][2] = { {0} }, + RInverseTranspose[2][2] = { {0} }, + RRTMinusI[2][2] = { {0} }; + + // Higham Algorithm + FloatingPoint error = 1.0; + int iter = 0; + while( error > 1.0e-16 && iter < 100 ) + { + iter++; + error = 0.0; + + invert( RInverse, R ); + transpose< 2, 2 >( RInverseTranspose, RInverse ); + add< 2, 2 >( R, RInverseTranspose ); + scale< 2, 2 >( R, 0.5 ); + Rij_eq_AikBjk< 2, 2, 2 >( RRTMinusI, R, R ); + addIdentity< 2 >( RRTMinusI, -1.0 ); + + for( std::ptrdiff_t i = 0 ; i < 2 ; i++ ) + { + for( std::ptrdiff_t j = 0 ; j < 2 ; j++ ) + { + error += RRTMinusI[i][j] * RRTMinusI[i][j]; + } + } + } + } + private: /** @@ -1162,6 +1212,56 @@ struct SquareMatrixOps< 3 > dstMatrix[ 2 ][ 1 ] = srcSymMatrix[ 3 ]; } + /** + * @brief Determine the polar decomposition of the 3x3 matrix @p matrix + * @tparam DST_MATRIX The type of @p R. + * @tparam MATRIX The type of @p matrix. + * @param R The resultant rotation matrix. + * @param matrix The matrix to be decomposed. + * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix. + * This is done using Higham's iterative algorithm. + */ + template< typename DST_MATRIX, typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, + MATRIX const & LVARRAY_RESTRICT_REF matrix ) + { + checkSizes< 3, 3 >( R ); + checkSizes< 3, 3 >( matrix ); + + using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >; + + // Initialize + copy< 3, 3 >( R, matrix ); + FloatingPoint RInverse[3][3] = { {0} }, + RInverseTranspose[3][3] = { {0} }, + RRTMinusI[3][3] = { {0} }; + + // Higham Algorithm + FloatingPoint error = 1.0; + int iter = 0; + while( error > 1.0e-16 && iter < 100 ) + { + iter++; + error = 0.0; + + invert( RInverse, R ); + transpose< 3, 3 >( RInverseTranspose, RInverse ); + add< 3, 3 >( R, RInverseTranspose ); + scale< 3, 3 >( R, 0.5 ); + Rij_eq_AikBjk< 3, 3, 3 >( RRTMinusI, R, R ); + addIdentity< 3 >( RRTMinusI, -1.0 ); + + for( std::ptrdiff_t i = 0 ; i < 3 ; i++ ) + { + for( std::ptrdiff_t j = 0 ; j < 3 ; j++ ) + { + error += RRTMinusI[i][j] * RRTMinusI[i][j]; + } + } + } + } + private: /** * @brief Compute the eigenvalues of the 3x3 symmetric matrix @p matrix. @@ -1253,50 +1353,6 @@ struct SquareMatrixOps< 3 > return row; } - - /** - * @brief Determine the polar decomposition using the Higham algorithm such that @p R . U = V . @p R = @p matrix - * @tparam DST_MATRIX The type of @p R. - * @tparam MATRIX The type of @p matrix. - * @param R The resultant rotation matrix, of size ISIZE x ISIZE. - * @param matrix The matrix to be decomposed, of size ISIZE x ISIZE. - */ - template< typename DST_MATRIX, typename MATRIX > - LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline - static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, - MATRIX const & LVARRAY_RESTRICT_REF matrix ) - { - checkSizes< 3, 3 >( R ); - checkSizes< 3, 3 >( matrix ); - - using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >; - - // Initialize - copy< 3, 3>( R, matrix ); - realType RInverse[3][3], RInverseTranspose[3][3], RRTMinusI[3][3]; - - // Higham Algorithm - realType error = 1.0; - int iter = 0; - while( error > 1.0e-16 ) - { - iter++; - invert( RInverse, R ); - transpose< 3, 3 >( RInverseTranspose, RInverse ); - add< 3, 3 >( R, RInverseTranspose ); - scale< 3, 3 >( R, 0.5 ); - Rij_eq_AikBjk< 3, 3 >( RRTMinusI, R, R ); - addIdentity< 3, 3 >( RRTMinusI, -1.0 ); - for( std::ptrdiff_t i = 0 ; i < 3 ; i++ ) - { - for( std::ptrdiff_t j = 0 ; j < 3 ; j++ ) - { - error += RRTMinusI[i][j] * RRTMinusI[i][j]; - } - } - LVARRAY_ERROR_IF( iter >= 100, "Polar decomposition failed to converge!"); - } - } }; } // namespace internal From 02e0e1c050e4cf97893e20f86bf6f4cdfa0034dd Mon Sep 17 00:00:00 2001 From: Benjamin Curtice Corbett Date: Thu, 26 Jan 2023 10:58:05 -0800 Subject: [PATCH 03/42] Fixed cmake so that GEOSX TPLs work and also updated spack. --- cmake/SetupTPL.cmake | 51 +++---- host-configs/LLNL/lassen-base.cmake | 3 - host-configs/LLNL/quartz-base.cmake | 3 - scripts/uberenv/packages/lvarray/package.py | 26 ++-- scripts/uberenv/project.json | 4 +- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 132 +++++++++++++++--- .../toss_3_x86_64_ib/compilers.yaml | 83 ++++++++++- .../toss_3_x86_64_ib_python/packages.yaml | 7 + 8 files changed, 240 insertions(+), 69 deletions(-) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index bff94834..29a60128 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,19 +1,22 @@ set(thirdPartyLibs "") -################################ +############################### # CAMP -################################ -if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") -endif() +############################### +if(CAMP_DIR STREQUAL RAJA_DIR) + message(STATUS "LvArray using CAMP from RAJA.") +else() + if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") + endif() -message(STATUS "Using CAMP from ${CAMP_DIR}") + message(STATUS "LvArray using CAMP from ${CAMP_DIR}") -find_package(camp REQUIRED PATHS ${CAMP_DIR}) + find_package(camp REQUIRED PATHS ${CAMP_DIR}) -set(ENABLE_CAMP ON CACHE BOOL "") + set(thirdPartyLibs ${thirdPartyLibs} camp) +endif() -set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -22,7 +25,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "Using RAJA from ${RAJA_DIR}") +message(STATUS "LvArray using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -39,14 +42,14 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "Using Umpire from ${UMPIRE_DIR}") + message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "Not using Umpire.") + message(STATUS "LvArray not using Umpire.") endif() ################################ @@ -65,32 +68,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "Using CHAI from ${CHAI_DIR}") + message(STATUS "LvArray using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - set_target_properties(chai - PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. + # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + # set_target_properties(chai + # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "Not using CHAI.") + message(STATUS "LvArray not using CHAI.") endif() -################################ +############################### # CALIPER -################################ +############################### if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "Using caliper from ${CALIPER_DIR}") + message(STATUS "LvArray using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -102,7 +105,7 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "Not using caliper.") + message(STATUS "LvArray not using caliper.") endif() ################################ diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index d544d936..cb33c4a0 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ b/host-configs/LLNL/lassen-base.cmake @@ -11,9 +11,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - set(ENABLE_ADDR2LINE ON CACHE BOOL "") # Cuda options diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake index 397324e9..e203ffe6 100644 --- a/host-configs/LLNL/quartz-base.cmake +++ b/host-configs/LLNL/quartz-base.cmake @@ -12,9 +12,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") set(ENABLE_CHAI ON CACHE BOOL "") set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "") -set(ENABLE_CALIPER ON CACHE BOOL "") -set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "") - # set(ENABLE_PYLVARRAY ON CACHE BOOL "") # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "") diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 9c4b47d9..cf9d5548 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,32 +56,36 @@ class Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') - + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') - depends_on('camp+cuda', when='+cuda') depends_on('raja') - depends_on('raja+cuda', when='+cuda') - # At the moment Umpire doesn't support shared when building with CUDA. depends_on('umpire', when='+umpire') - depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') - depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') - depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') + depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') + with when('+cuda'): + for sm_ in CudaPackage.cuda_arch_values: + depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + + phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -285,10 +289,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n\n".format("-" * 80)) if "+caliper" in spec: - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) else: @@ -297,6 +297,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write('#{0}\n'.format('-' * 80)) cfg.write('# Python\n') cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) @@ -306,6 +307,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: cfg.write(cmake_cache_option("ENABLE_DOCS", True)) sphinx_dir = spec['py-sphinx'].prefix diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 9822f975..703db7a4 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/corbett5/spack", - "spack_branch": "package/corbett/lvarray-update", + "spack_url": "https://github.com/spack/spack", + "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b1bf26cb..b8353dd0 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -2,30 +2,90 @@ compilers: - compiler: spec: clang@10.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: spec: clang@11.0.1 paths: - cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 + paths: + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 modules: [] environment: {} extra_rpaths: [] @@ -37,25 +97,55 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: xl@16.1.1 + spec: intel@19.1.2 paths: - cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc - cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC - f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf - fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort flags: - cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 - cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native + cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml index 3d9648a7..b8353dd0 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml @@ -7,8 +7,8 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 target: x86_64 modules: [] @@ -22,8 +22,53 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 + paths: + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 target: x86_64 modules: [] @@ -59,6 +104,36 @@ compilers: modules: [] environment: {} extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] - compiler: spec: intel@19.1.2 paths: diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 0c6b833b..a6fbda09 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,3 +107,10 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ + + ninja: + buildable: False + externals: + - spec: ninja@kitware + prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ + From 943b300e8b9f8620ccb4ada3a9f914c2dae24897 Mon Sep 17 00:00:00 2001 From: Stefan Povolny Date: Thu, 9 Mar 2023 12:21:35 -0800 Subject: [PATCH 04/42] implemented unit test for polar decomposition --- src/fixedSizeSquareMatrixOpsImpl.hpp | 134 ++++++++---------- unitTests/CMakeLists.txt | 1 + unitTests/testTensorOpsInverse.hpp | 93 ++++++++++++ unitTests/testTensorOpsPolarDecomposition.cpp | 29 ++++ 4 files changed, 184 insertions(+), 73 deletions(-) create mode 100644 unitTests/testTensorOpsPolarDecomposition.cpp diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 3733fae4..d135cf7a 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -13,6 +13,7 @@ #pragma once #include "genericTensorOps.hpp" +#include "limits.hpp" namespace LvArray { @@ -71,6 +72,60 @@ template< std::ptrdiff_t M > struct SquareMatrixOps {}; +/** + * @brief Determine the polar decomposition of @p matrix + * @tparam DST_MATRIX The type of @p R. + * @tparam MATRIX The type of @p matrix. + * @param R The resultant orthogonal matrix. + * @param matrix The matrix to be decomposed. + * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix. + * This is done using Higham's iterative algorithm. + */ +template< std::ptrdiff_t M, typename DST_MATRIX, typename MATRIX > +LVARRAY_HOST_DEVICE inline +static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, + MATRIX const & LVARRAY_RESTRICT_REF matrix ) +{ + checkSizes< M, M >( R ); + checkSizes< M, M >( matrix ); + + using FloatingPoint = std::decay_t< decltype( R[0][0] ) >; + + // Initialize + copy< M, M >( R, matrix ); + FloatingPoint RInverse[M][M] = { {0} }, + RInverseTranspose[M][M] = { {0} }, + RRTMinusI[M][M] = { {0} }; + + // Higham Algorithm + FloatingPoint errorSquared = 1.0; + FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon; + int iter = 0; + while( errorSquared > tolerance * tolerance && iter < 100 ) + { + iter++; + errorSquared = 0.0; + + // Average the current R with its inverse tranpose + SquareMatrixOps< M >::invert( RInverse, R ); + transpose< M, M >( RInverseTranspose, RInverse ); + add< M, M >( R, RInverseTranspose ); + scale< M, M >( R, 0.5 ); + + // Determine how close R is to being orthogonal using L2Norm(R.R^T-I) + Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, R ); + addIdentity< M >( RRTMinusI, -1.0 ); + for( std::ptrdiff_t i = 0 ; i < M ; i++ ) + { + for( std::ptrdiff_t j = 0 ; j < M ; j++ ) + { + errorSquared += RRTMinusI[i][j] * RRTMinusI[i][j]; + } + } + } + LVARRAY_WARNING_IF( iter == 100, "Polar decomposition did not converge in 100 iterations!"); +} + /** * @struct SquareMatrixOps< 2 > * @brief Performs operations on 2x2 square matrices. @@ -533,9 +588,9 @@ struct SquareMatrixOps< 2 > * @brief Determine the polar decomposition of the 2x2 matrix @p matrix * @tparam DST_MATRIX The type of @p R. * @tparam MATRIX The type of @p matrix. - * @param R The resultant rotation matrix. + * @param R The resultant orthogonal matrix. * @param matrix The matrix to be decomposed. - * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix. + * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix. * This is done using Higham's iterative algorithm. */ template< typename DST_MATRIX, typename MATRIX > @@ -543,44 +598,10 @@ struct SquareMatrixOps< 2 > static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, MATRIX const & LVARRAY_RESTRICT_REF matrix ) { - checkSizes< 2, 2 >( R ); - checkSizes< 2, 2 >( matrix ); - - using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >; - - // Initialize - copy< 2, 2 >( R, matrix ); - FloatingPoint RInverse[2][2] = { {0} }, - RInverseTranspose[2][2] = { {0} }, - RRTMinusI[2][2] = { {0} }; - - // Higham Algorithm - FloatingPoint error = 1.0; - int iter = 0; - while( error > 1.0e-16 && iter < 100 ) - { - iter++; - error = 0.0; - - invert( RInverse, R ); - transpose< 2, 2 >( RInverseTranspose, RInverse ); - add< 2, 2 >( R, RInverseTranspose ); - scale< 2, 2 >( R, 0.5 ); - Rij_eq_AikBjk< 2, 2, 2 >( RRTMinusI, R, R ); - addIdentity< 2 >( RRTMinusI, -1.0 ); - - for( std::ptrdiff_t i = 0 ; i < 2 ; i++ ) - { - for( std::ptrdiff_t j = 0 ; j < 2 ; j++ ) - { - error += RRTMinusI[i][j] * RRTMinusI[i][j]; - } - } - } + polarDecompositionBase< 2 >( R, matrix ); } private: - /** * @brief Compute the eigenvalues of the 2x2 symmetric matrix @p matrix. * @tparam FloatingPoint A floating point type. @@ -1216,9 +1237,9 @@ struct SquareMatrixOps< 3 > * @brief Determine the polar decomposition of the 3x3 matrix @p matrix * @tparam DST_MATRIX The type of @p R. * @tparam MATRIX The type of @p matrix. - * @param R The resultant rotation matrix. + * @param R The resultant orthogonal matrix. * @param matrix The matrix to be decomposed. - * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix. + * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix. * This is done using Higham's iterative algorithm. */ template< typename DST_MATRIX, typename MATRIX > @@ -1226,40 +1247,7 @@ struct SquareMatrixOps< 3 > static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R, MATRIX const & LVARRAY_RESTRICT_REF matrix ) { - checkSizes< 3, 3 >( R ); - checkSizes< 3, 3 >( matrix ); - - using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >; - - // Initialize - copy< 3, 3 >( R, matrix ); - FloatingPoint RInverse[3][3] = { {0} }, - RInverseTranspose[3][3] = { {0} }, - RRTMinusI[3][3] = { {0} }; - - // Higham Algorithm - FloatingPoint error = 1.0; - int iter = 0; - while( error > 1.0e-16 && iter < 100 ) - { - iter++; - error = 0.0; - - invert( RInverse, R ); - transpose< 3, 3 >( RInverseTranspose, RInverse ); - add< 3, 3 >( R, RInverseTranspose ); - scale< 3, 3 >( R, 0.5 ); - Rij_eq_AikBjk< 3, 3, 3 >( RRTMinusI, R, R ); - addIdentity< 3 >( RRTMinusI, -1.0 ); - - for( std::ptrdiff_t i = 0 ; i < 3 ; i++ ) - { - for( std::ptrdiff_t j = 0 ; j < 3 ; j++ ) - { - error += RRTMinusI[i][j] * RRTMinusI[i][j]; - } - } - } + polarDecompositionBase< 3 >( R, matrix ); } private: diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 4d91681e..cda1b58b 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -70,6 +70,7 @@ set( testSources testTensorOpsEigen.cpp testTensorOpsInverseOneArg.cpp testTensorOpsInverseTwoArgs.cpp + testTensorOpsPolarDecomposition.cpp testTensorOpsSymDeterminant.cpp testTensorOpsSymInverseOneArg.cpp testTensorOpsSymInverseTwoArgs.cpp diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 4909a686..c4665a01 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -244,6 +244,52 @@ class InverseTest : public ::testing::Test } ); } + void polarDecomposition() + { + ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView(); + ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView(); + ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView(); + + ArrayViewT< FLOAT, 3, 2 > const dstMatrix_IJK = m_dstMatrix_IJK.toView(); + ArrayViewT< FLOAT, 3, 1 > const dstMatrix_IKJ = m_dstMatrix_IKJ.toView(); + ArrayViewT< FLOAT, 3, 0 > const dstMatrix_KJI = m_dstMatrix_KJI.toView(); + + ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M ); + + T scale = 100; + for( T & value : arrayOfMatrices ) + { + value = randomValue( scale, m_gen ); + } + + ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst(); + + forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i ) + { + T srcLocal[ M ][ M ]; + FLOAT dstLocal[ M ][ M ]; + + #define _TEST( output, input ) \ + tensorOps::copy< M, M >( input, matrices[ i ] ); \ + tensorOps::polarDecomposition< M >( output, input ); \ + checkPolarDecomposition( output, input ) + + #define _TEST_PERMS( input, output0, output1, output2, output3 ) \ + _TEST( output0, input ); \ + _TEST( output1, input ); \ + _TEST( output2, input ); \ + _TEST( output3, input ) + + _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + private: @@ -347,6 +393,53 @@ class InverseTest : public ::testing::Test } } + template< typename MATRIX_A, typename MATRIX_B > + static void LVARRAY_HOST_DEVICE checkPolarDecomposition( MATRIX_A const & orthogonalMatrix, MATRIX_B const & sourceMatrix ) + { + // Error tolerance (depends on the implementation!) + using FloatingPoint = std::decay_t< decltype( orthogonalMatrix[0][0] ) >; + FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon; + FloatingPoint sqrtTolerance = math::sqrt( tolerance ); + + if( M == 2 ) + { + // Check if orthogonalMatrix is actually orthogonal + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][0] + orthogonalMatrix[0][1] * orthogonalMatrix[0][1], 1.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][1] + orthogonalMatrix[1][0] * orthogonalMatrix[1][1], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[1][0] + orthogonalMatrix[1][1] * orthogonalMatrix[1][1], 1.0, tolerance ); + + // Check if sourceMatrix.(orthogonalMatrix^T) yields a symmetric matrix + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * sourceMatrix[0][0] + orthogonalMatrix[1][1] * sourceMatrix[0][1], + orthogonalMatrix[0][0] * sourceMatrix[1][0] + orthogonalMatrix[0][1] * sourceMatrix[1][1], + sqrtTolerance ); + } + else if( M == 3 ) + { + // Check if orthogonalMatrix is actually orthogonal + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][0] + orthogonalMatrix[0][1] * orthogonalMatrix[0][1] + orthogonalMatrix[0][2] * orthogonalMatrix[0][2], 1.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1] + orthogonalMatrix[0][2] * orthogonalMatrix[1][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[2][0] + orthogonalMatrix[0][1] * orthogonalMatrix[2][1] + orthogonalMatrix[0][2] * orthogonalMatrix[2][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1] + orthogonalMatrix[0][2] * orthogonalMatrix[1][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[1][0] + orthogonalMatrix[1][1] * orthogonalMatrix[1][1] + orthogonalMatrix[1][2] * orthogonalMatrix[1][2], 1.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[2][0] + orthogonalMatrix[1][1] * orthogonalMatrix[2][1] + orthogonalMatrix[1][2] * orthogonalMatrix[2][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[2][0] + orthogonalMatrix[0][1] * orthogonalMatrix[2][1] + orthogonalMatrix[0][2] * orthogonalMatrix[2][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[2][0] + orthogonalMatrix[1][1] * orthogonalMatrix[2][1] + orthogonalMatrix[1][2] * orthogonalMatrix[2][2], 0.0, tolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * orthogonalMatrix[2][0] + orthogonalMatrix[2][1] * orthogonalMatrix[2][1] + orthogonalMatrix[2][2] * orthogonalMatrix[2][2], 1.0, tolerance ); + + // Check if sourceMatrix.(orthogonalMatrix^T) yields a symmetric matrix + PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * sourceMatrix[0][0] + orthogonalMatrix[1][1] * sourceMatrix[0][1] + orthogonalMatrix[1][2] * sourceMatrix[0][2], + orthogonalMatrix[0][0] * sourceMatrix[1][0] + orthogonalMatrix[0][1] * sourceMatrix[1][1] + orthogonalMatrix[0][2] * sourceMatrix[1][2], + sqrtTolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * sourceMatrix[0][0] + orthogonalMatrix[2][1] * sourceMatrix[0][1] + orthogonalMatrix[2][2] * sourceMatrix[0][2], + orthogonalMatrix[0][0] * sourceMatrix[2][0] + orthogonalMatrix[0][1] * sourceMatrix[2][1] + orthogonalMatrix[0][2] * sourceMatrix[2][2], + sqrtTolerance ); + PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * sourceMatrix[1][0] + orthogonalMatrix[2][1] * sourceMatrix[1][1] + orthogonalMatrix[2][2] * sourceMatrix[1][2], + orthogonalMatrix[1][0] * sourceMatrix[2][0] + orthogonalMatrix[1][1] * sourceMatrix[2][1] + orthogonalMatrix[1][2] * sourceMatrix[2][2], + sqrtTolerance ); + } + } + std::mt19937_64 m_gen; diff --git a/unitTests/testTensorOpsPolarDecomposition.cpp b/unitTests/testTensorOpsPolarDecomposition.cpp new file mode 100644 index 00000000..29c48dc0 --- /dev/null +++ b/unitTests/testTensorOpsPolarDecomposition.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +#include "testTensorOpsInverse.hpp" + +namespace LvArray +{ +namespace testing +{ + +TYPED_TEST( InverseTest, polarDecomposition ) +{ + this->polarDecomposition(); +} + +} // namespace testing +} // namespace LvArray + +// This is the default gtest main method. It is included for ease of debugging. +int main( int argc, char * * argv ) +{ + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + return result; +} From ddcd7b418a58c72307d95fbbb5613b19f42aaedc Mon Sep 17 00:00:00 2001 From: Stefan Povolny Date: Thu, 4 May 2023 14:15:37 -0700 Subject: [PATCH 05/42] made error message in polar decomposition GPU-compatible --- src/fixedSizeSquareMatrixOpsImpl.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index d135cf7a..d4c3f1b7 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -123,7 +123,10 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, } } } - LVARRAY_WARNING_IF( iter == 100, "Polar decomposition did not converge in 100 iterations!"); + if( iter == 100 ) + { + printf("Polar decomposition did not converge in 100 iterations!"); + } } /** From 1339ea72414dbbcc0efdc3e8e5ecc5aabd13206d Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Wed, 30 Oct 2024 11:51:14 -0700 Subject: [PATCH 06/42] Added device math functions for ceil, floor, and power --- src/math.hpp | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/src/math.hpp b/src/math.hpp index d45f68f3..d4cd408d 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -440,6 +440,113 @@ __half2 abs( __half2 const x ) #endif + +/** + * @return The ceiling value of @p x. + * @param x The number to get the ceiling value of. + * @note This set of overloads is valid for any numeric type. + */ +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +float ceil( float const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::ceilf( x ); +#else + return std::ceil( x ); +#endif +} + +template< typename T > +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr +double ceil( T const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::ceil( double ( x ) ); +#else + return std::ceil( x ); +#endif +} + +#if defined( LVARRAY_USE_DEVICE ) + +/// @copydoc ceil( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half ceil( __half const x ) +{ +#if CUDART_VERSION > 11000 + return hceil( x ); +#else + return x > __half( 0 ) ? x : -x; +#endif +} + +/// @copydoc ceil( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half2 ceil( __half2 const x ) +{ +#if CUDART_VERSION > 11000 + return h2ceil( x ); +#else + return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to mimic +#endif +} + +#endif + + +/** + * @return The floor value of @p x. + * @param x The number to get the floor value of. + * @note This set of overloads is valid for any numeric type. + */ +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +float floor( float const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::floorf( x ); +#else + return std::floor( x ); +#endif +} + +template< typename T > +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr +double floor( T const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::floor( double ( x ) ); +#else + return std::floor( x ); +#endif +} + +#if defined( LVARRAY_USE_DEVICE ) + +/// @copydoc floor( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half floor( __half const x ) +{ +#if CUDART_VERSION > 11000 + return hfloor( x ); +#else + return x > __half( 0 ) ? x : -x; +#endif +} + +/// @copydoc floor( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half2 floor( __half2 const x ) +{ +#if CUDART_VERSION > 11000 + return h2floor( x ); +#else + return LVARRAY_THROW( "h2floor is not implemented for host", std::runtime_error ); +#endif +} + +#endif + + /** * @return @code x * x @endcode. * @tparam T The typeof @p x. @@ -452,6 +559,44 @@ T square( T const x ) ///@} + +/** + * @name Power. + */ +///@{ + +/** + * @return The power of @p x. + * @param x The number to get the power of. + * @param n The exponent. + * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double + * and the return type is @c double. + */ +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +float pow( float const x, float const n ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::powf( x, n ); +#else + return std::pow( x, n ); +#endif +} + +/// @copydoc pow( float ) +template< typename T > +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +double pow( T const x, T const n ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::pow( double( x ), double( n ) ); +#else + return std::pow( x, n ); +#endif +} + +///@} + + /** * @name Square root and inverse square root. */ From 1e0127af1d596359878efe3453d5b34e1bf17370 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Thu, 5 Dec 2024 09:07:51 -0800 Subject: [PATCH 07/42] Corrected host only function inside of checkIndices preventing device compilation --- src/indexing.hpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 2dca4597..0665d68e 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -206,7 +206,27 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con template< typename INDEX_TYPE, typename ... INDICES > LVARRAY_HOST_DEVICE inline void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices ) -{ LVARRAY_ERROR_IF( invalidIndices( dims, indices ... ), "Invalid indices. " << printDimsAndIndices( dims, indices ... ) ); } +{ + if( invalidIndices( dims, indices ... ) ) + { + constexpr int NDIM = sizeof ... (INDICES); + printf( "Invalid indices. \n"); + + + printf( " ( indices ) = { "); + (printf( " %d", indices),...); + printf( "\n"); + + printf( " ( dims ) = { "); + for( INDEX_TYPE dim=0; dim Date: Tue, 14 Jan 2025 09:38:08 -0800 Subject: [PATCH 08/42] Added debugging flags to host configs --- host-configs/LLNL/lassen-cuda-11-base.cmake | 2 +- host-configs/LLNL/lassen-cuda-12-base.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/host-configs/LLNL/lassen-cuda-11-base.cmake b/host-configs/LLNL/lassen-cuda-11-base.cmake index 3c1ea972..73b7d51c 100644 --- a/host-configs/LLNL/lassen-cuda-11-base.cmake +++ b/host-configs/LLNL/lassen-cuda-11-base.cmake @@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 17 CACHE STRING "") set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo --source-in-ptx ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) diff --git a/host-configs/LLNL/lassen-cuda-12-base.cmake b/host-configs/LLNL/lassen-cuda-12-base.cmake index 3b8f4125..174ba7fb 100644 --- a/host-configs/LLNL/lassen-cuda-12-base.cmake +++ b/host-configs/LLNL/lassen-cuda-12-base.cmake @@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 17 CACHE STRING "") set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -lineinfo --source-in-ptx -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) From 8e8670602c678bf053b28cf752b7a76a2fd32f63 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 16 Jan 2025 18:05:54 -0800 Subject: [PATCH 09/42] wip: trying to fix build error on Lassen. --- src/indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 2dca4597..1203aa1d 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -206,7 +206,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con template< typename INDEX_TYPE, typename ... INDICES > LVARRAY_HOST_DEVICE inline void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices ) -{ LVARRAY_ERROR_IF( invalidIndices( dims, indices ... ), "Invalid indices. " << printDimsAndIndices( dims, indices ... ) ); } +{ LVARRAY_ERROR_IF( false , "Invalid indices. " ); } /** * @brief Calculate the strides given the dimensions and permutation. From c021cc526ce002be245480db1f79c1549a92d813 Mon Sep 17 00:00:00 2001 From: "Randolph R. Settgast" Date: Wed, 22 Jan 2025 14:42:25 -0800 Subject: [PATCH 10/42] use fold expression to do index checks --- src/indexing.hpp | 50 +++++++++++++++++++++++++++++----------- src/typeManipulation.hpp | 8 ++++--- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 1203aa1d..fc90af57 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -159,21 +159,41 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic * @param indices A variadic pack of indices. */ template< typename INDEX_TYPE, typename ... INDICES > -std::string printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices ) +LVARRAY_HOST_DEVICE +void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices ) { constexpr int NDIM = sizeof ... (INDICES); - std::ostringstream oss; - oss << "dimensions = { " << dims[ 0 ]; + printf( "dimensions = { %d", dims[ 0 ] ); for( int i = 1; i < NDIM; ++i ) { - oss << ", " << dims[ i ]; + printf( ", %d", dims[ i ] ); } + printf( "}\n"); - oss << " } indices = " << getIndexString( indices ... ); + printf( " indices = { " ); + (printf(" %d, ", indices ),...); + printf( "}\n"); +} - return oss.str(); + +/** + * @brief Function to check if an index is invalid + * @tparam DIMS_TYPE The integral type used for the dimensions of the space + * @tparam INDEX_TYPE The Integral types of the index to check against + * @param dims A pointer to the dimensions of the space. + * @param indices the index to check against. + * @return whether the index is invalid + */ +template< typename DIMS_TYPE, typename INDEX_TYPE > +LVARRAY_HOST_DEVICE inline constexpr +bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims, + int const dimsIndex, + INDEX_TYPE const index ) +{ + return (index < 0) || (index >= dims[dimsIndex]); } + /** * @tparam INDEX_TYPE The integral type used for the dimensions of the space. * @tparam INDICES A variadic pack of the integral types of the indices. @@ -185,14 +205,9 @@ template< typename INDEX_TYPE, typename ... INDICES > LVARRAY_HOST_DEVICE inline constexpr bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices ) { - int curDim = 0; bool invalid = false; - typeManipulation::forEachArg( [dims, &curDim, &invalid]( auto const index ) - { - invalid = invalid || ( index < 0 ) || ( index >= dims[ curDim ] ); - ++curDim; - }, indices ... ); - + int curDim = 0; + (invalidIndex(dims, curDim++, indices),...); return invalid; } @@ -206,7 +221,14 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con template< typename INDEX_TYPE, typename ... INDICES > LVARRAY_HOST_DEVICE inline void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices ) -{ LVARRAY_ERROR_IF( false , "Invalid indices. " ); } +{ + bool const invalid = invalidIndices( dims, indices ... ); + if( invalid ) + { + printDimsAndIndices( dims, indices ... ); + LVARRAY_ERROR( "Invalid indices. Info precedes this line." ); + } +} /** * @brief Calculate the strides given the dimensions and permutation. diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp index 33ca95c6..90f3df31 100644 --- a/src/typeManipulation.hpp +++ b/src/typeManipulation.hpp @@ -124,10 +124,12 @@ namespace typeManipulation * @param f The function to not call. */ DISABLE_HD_WARNING -template< typename F > +template< typename F, typename ARG > inline constexpr LVARRAY_HOST_DEVICE -void forEachArg( F && f ) -{ LVARRAY_UNUSED_VARIABLE( f ); } +void forEachArg( F && f, ARG && arg ) +{ + f(arg); +} /** * @tparam F The type of the function to call. From efc221da0d0969fbd5c75393abe8b167e7c2a3a3 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 14:59:32 -0800 Subject: [PATCH 11/42] revert changes to typeManipulation. --- src/typeManipulation.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp index 90f3df31..33ca95c6 100644 --- a/src/typeManipulation.hpp +++ b/src/typeManipulation.hpp @@ -124,12 +124,10 @@ namespace typeManipulation * @param f The function to not call. */ DISABLE_HD_WARNING -template< typename F, typename ARG > +template< typename F > inline constexpr LVARRAY_HOST_DEVICE -void forEachArg( F && f, ARG && arg ) -{ - f(arg); -} +void forEachArg( F && f ) +{ LVARRAY_UNUSED_VARIABLE( f ); } /** * @tparam F The type of the function to call. From 9275fe76e29991230cf14eea56534ebfec59e4e1 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 15:10:21 -0800 Subject: [PATCH 12/42] use bool return value. --- src/indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index fc90af57..8c07b3e2 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -207,7 +207,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con { bool invalid = false; int curDim = 0; - (invalidIndex(dims, curDim++, indices),...); + ( (invalid = invalid || invalidIndex(dims, curDim++, indices)), ...); return invalid; } From 882b587cec1cd2c46c9adb83c3a91a1b98651c45 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 17:11:09 -0800 Subject: [PATCH 13/42] use fold expression. --- src/indexing.hpp | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 8c07b3e2..6c0348da 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -20,6 +20,8 @@ // TPL includes #include +#include + namespace LvArray { @@ -151,6 +153,29 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic return oss.str(); } + +template< typename INDEX_TYPE > +LVARRAY_HOST_DEVICE +void printIndexValue( INDEX_TYPE const & val ) +{ + if constexpr( std::is_same_v ) + { + printf( "%d", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%ld", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%lld", val ); + } + else + { + static_assert(!sizeof(T*), "Unsupported integral type for print_value"); + } +} + /** * @tparam INDEX_TYPE The integral type of the dimensions. * @tparam INDICES A variadic pack of the integral types of the indices. @@ -163,16 +188,19 @@ LVARRAY_HOST_DEVICE void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices ) { constexpr int NDIM = sizeof ... (INDICES); - printf( "dimensions = { %d", dims[ 0 ] ); + printf( "dimensions = { "); + printIndexValue( dims[0] ); for( int i = 1; i < NDIM; ++i ) { - printf( ", %d", dims[ i ] ); + printf( ", "); + printIndexValue(dims[i]); } printf( "}\n"); - - printf( " indices = { " ); - (printf(" %d, ", indices ),...); - printf( "}\n"); + + printf( "indices = { "); + bool firstIndex = true; + ( (firstIndex ? ( firstIndex = false, printIndexValue(indices) ) : (printf(", "), printIndexValue(indices))), ...); + printf(" }\n"); } From e599472d6255f14110b5ff8bf21f7d0e465ab46f Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 17:11:41 -0800 Subject: [PATCH 14/42] fix function name --- src/indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 6c0348da..0ec37a21 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -172,7 +172,7 @@ void printIndexValue( INDEX_TYPE const & val ) } else { - static_assert(!sizeof(T*), "Unsupported integral type for print_value"); + static_assert(!sizeof(T*), "Unsupported integral type for printIndexValue"); } } From 00e117d3f449d350ff8b6fd4be491ff846379533 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 17:17:02 -0800 Subject: [PATCH 15/42] doxygen. --- src/indexing.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 0ec37a21..cfd3def6 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -153,7 +153,10 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic return oss.str(); } - +/** + * @tparam INDEX_TYPE The integral type to be printed. + * @param val the value to be printed + */ template< typename INDEX_TYPE > LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) From 1ccdce54352d92940aaa42caff852d339549d3ca Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Wed, 22 Jan 2025 17:28:19 -0800 Subject: [PATCH 16/42] fix typo. --- src/indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index cfd3def6..0be6ffb5 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -175,7 +175,7 @@ void printIndexValue( INDEX_TYPE const & val ) } else { - static_assert(!sizeof(T*), "Unsupported integral type for printIndexValue"); + static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue"); } } From 956aa7ce91467a0ff891637ed94b5588310de0d5 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 11:57:02 -0800 Subject: [PATCH 17/42] add other integral types. --- src/indexing.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/indexing.hpp b/src/indexing.hpp index 0be6ffb5..7ba6dfd8 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -161,18 +161,32 @@ template< typename INDEX_TYPE > LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) { + static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." ); + if constexpr( std::is_same_v ) { printf( "%d", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%u", val ); + } else if constexpr( std::is_same_v ) { printf( "%ld", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%lu", val ); + } else if constexpr( std::is_same_v ) { printf( "%lld", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%hd", val ); + } else { static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue"); From 8eb4727d9f02f97aefb934bc7ce24540484f58cb Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 12:28:06 -0800 Subject: [PATCH 18/42] add debug assert. --- src/indexing.hpp | 12 +++++++++--- src/typeManipulation.hpp | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 7ba6dfd8..9774397a 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -20,8 +20,6 @@ // TPL includes #include -#include - namespace LvArray { @@ -162,7 +160,15 @@ LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) { static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." ); - + + static_assert( std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v, "Unsupported integral type for printIndexValue" ); + + if constexpr( std::is_same_v ) { printf( "%d", val ); diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp index 33ca95c6..cd67293a 100644 --- a/src/typeManipulation.hpp +++ b/src/typeManipulation.hpp @@ -549,5 +549,9 @@ LVARRAY_HOST_DEVICE inline constexpr CArray< camp::idx_t, sizeof...( INDICES ) > asArray( camp::idx_seq< INDICES... > ) { return { INDICES ... }; } + + + + } // namespace typeManipulation } // namespace LvArray From 2fa2667092f515d8a901553e6070e79022b8f796 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 12:33:51 -0800 Subject: [PATCH 19/42] add all integral types. --- src/indexing.hpp | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 9774397a..74bba31a 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -161,14 +161,6 @@ void printIndexValue( INDEX_TYPE const & val ) { static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." ); - static_assert( std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v, "Unsupported integral type for printIndexValue" ); - - if constexpr( std::is_same_v ) { printf( "%d", val ); @@ -189,10 +181,34 @@ void printIndexValue( INDEX_TYPE const & val ) { printf( "%lld", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%llu", val ); + } else if constexpr( std::is_same_v ) { printf( "%hd", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%hu", val ); + } + // else if constexpr( std::is_same_v ) + // { + // printf( "%c", val ); + // } + // else if constexpr( std::is_same_v ) + // { + // printf( "%hhd", val ); + // } + // else if constexpr( std::is_same_v ) + // { + // printf( "%hhu", val ); + // } + // else if constexpr( std::is_same_v ) + // { + // printf( "%d", val ); + // } else { static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue"); From 2343b7dbcc67fd8c35de07d0e63a336530cc821d Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 13:39:42 -0800 Subject: [PATCH 20/42] add char types --- src/indexing.hpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 74bba31a..9a027dd6 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -193,22 +193,22 @@ void printIndexValue( INDEX_TYPE const & val ) { printf( "%hu", val ); } - // else if constexpr( std::is_same_v ) - // { - // printf( "%c", val ); - // } - // else if constexpr( std::is_same_v ) - // { - // printf( "%hhd", val ); - // } - // else if constexpr( std::is_same_v ) - // { - // printf( "%hhu", val ); - // } - // else if constexpr( std::is_same_v ) - // { - // printf( "%d", val ); - // } + else if constexpr( std::is_same_v ) + { + printf( "%c", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%hhd", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%hhu", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%d", val ); + } else { static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue"); From ddc611e9e020d200ef39f44dd8d40cbc41959b62 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 13:48:29 -0800 Subject: [PATCH 21/42] add decay type. --- src/indexing.hpp | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 9a027dd6..c59d4117 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -159,59 +159,44 @@ template< typename INDEX_TYPE > LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) { - static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." ); + using DecayedType = std::decay_t; + static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." ); - if constexpr( std::is_same_v ) + if constexpr( std::is_same_v ) { printf( "%d", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%u", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%ld", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%lu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%lld", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%llu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hd", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hu", val ); } - else if constexpr( std::is_same_v ) - { - printf( "%c", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hhd", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hhu", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%d", val ); - } else { - static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue"); + static_assert(!sizeof(DecayedType*), "Unsupported integral type for printIndexValue"); } } From 52c1d5e55ab01f151510f8a238c22136d203eec6 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 14:03:25 -0800 Subject: [PATCH 22/42] add a static cast option --- src/indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index c59d4117..502ac04b 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -196,7 +196,7 @@ void printIndexValue( INDEX_TYPE const & val ) } else { - static_assert(!sizeof(DecayedType*), "Unsupported integral type for printIndexValue"); + printf( "%llu", static_cast< long long>( val ) ); } } From 58fc69df376813210f03ace9b653cb5cd7d76b4c Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 15:26:22 -0800 Subject: [PATCH 23/42] remove assert. --- src/indexing.hpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 502ac04b..d5002efc 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -160,7 +160,7 @@ LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) { using DecayedType = std::decay_t; - static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." ); + // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." ); if constexpr( std::is_same_v ) { @@ -194,9 +194,25 @@ void printIndexValue( INDEX_TYPE const & val ) { printf( "%hu", val ); } + else if constexpr( std::is_same_v ) + { + printf( "%c", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%hhd", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%hhu", val ); + } + else if constexpr( std::is_same_v ) + { + printf( "%d", val ); + } else { - printf( "%llu", static_cast< long long>( val ) ); + printf( "%lld", static_cast< long long >( val ) ); } } From d89ed76b236fb709e4c287ecc2ac5334c6a778d4 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Thu, 23 Jan 2025 15:50:41 -0800 Subject: [PATCH 24/42] clean up. --- src/indexing.hpp | 27 ++++++++++++++------------- src/typeManipulation.hpp | 4 ---- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index d5002efc..b25e3e74 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -159,54 +159,55 @@ template< typename INDEX_TYPE > LVARRAY_HOST_DEVICE void printIndexValue( INDEX_TYPE const & val ) { - using DecayedType = std::decay_t; + /// NOTE: the assert is removed because of an error on one of the GEOS CI jobs. + // using DecayedType = std::decay_t; // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." ); - if constexpr( std::is_same_v ) + if constexpr( std::is_same_v ) { printf( "%d", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%u", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%ld", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%lu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%lld", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%llu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hd", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%c", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hhd", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%hhu", val ); } - else if constexpr( std::is_same_v ) + else if constexpr( std::is_same_v ) { printf( "%d", val ); } diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp index cd67293a..33ca95c6 100644 --- a/src/typeManipulation.hpp +++ b/src/typeManipulation.hpp @@ -549,9 +549,5 @@ LVARRAY_HOST_DEVICE inline constexpr CArray< camp::idx_t, sizeof...( INDICES ) > asArray( camp::idx_seq< INDICES... > ) { return { INDICES ... }; } - - - - } // namespace typeManipulation } // namespace LvArray From ac6c0fdfef95cf9b4813660bd00246dc512cb0ba Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Fri, 24 Jan 2025 10:31:48 -0800 Subject: [PATCH 25/42] remove chars. --- src/indexing.hpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index b25e3e74..16fd332e 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -195,22 +195,6 @@ void printIndexValue( INDEX_TYPE const & val ) { printf( "%hu", val ); } - else if constexpr( std::is_same_v ) - { - printf( "%c", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hhd", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hhu", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%d", val ); - } else { printf( "%lld", static_cast< long long >( val ) ); From c2efd00ca5e92d450daa175c1790a05c564d1048 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Fri, 24 Jan 2025 10:41:07 -0800 Subject: [PATCH 26/42] just always cast to long long. --- src/indexing.hpp | 76 ++++++++---------------------------------------- 1 file changed, 12 insertions(+), 64 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index 16fd332e..c6333bd5 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -151,56 +151,6 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic return oss.str(); } -/** - * @tparam INDEX_TYPE The integral type to be printed. - * @param val the value to be printed - */ -template< typename INDEX_TYPE > -LVARRAY_HOST_DEVICE -void printIndexValue( INDEX_TYPE const & val ) -{ - /// NOTE: the assert is removed because of an error on one of the GEOS CI jobs. - // using DecayedType = std::decay_t; - // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." ); - - if constexpr( std::is_same_v ) - { - printf( "%d", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%u", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%ld", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%lu", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%lld", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%llu", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hd", val ); - } - else if constexpr( std::is_same_v ) - { - printf( "%hu", val ); - } - else - { - printf( "%lld", static_cast< long long >( val ) ); - } -} - /** * @tparam INDEX_TYPE The integral type of the dimensions. * @tparam INDICES A variadic pack of the integral types of the indices. @@ -213,33 +163,31 @@ LVARRAY_HOST_DEVICE void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices ) { constexpr int NDIM = sizeof ... (INDICES); - printf( "dimensions = { "); + printf( "dimensions = { " ); printIndexValue( dims[0] ); for( int i = 1; i < NDIM; ++i ) { - printf( ", "); - printIndexValue(dims[i]); + printf( ", %lld", static_cast< long long >( dims[ i ] )); } - printf( "}\n"); - - printf( "indices = { "); - bool firstIndex = true; - ( (firstIndex ? ( firstIndex = false, printIndexValue(indices) ) : (printf(", "), printIndexValue(indices))), ...); - printf(" }\n"); + printf( "}\n" ); + + printf( "indices = { " ); + (printf( ", %lld", static_cast< long long >(indices)), ...); + printf( "}\n" ); } /** * @brief Function to check if an index is invalid * @tparam DIMS_TYPE The integral type used for the dimensions of the space - * @tparam INDEX_TYPE The Integral types of the index to check against + * @tparam INDEX_TYPE The Integral types of the index to check against * @param dims A pointer to the dimensions of the space. * @param indices the index to check against. * @return whether the index is invalid */ template< typename DIMS_TYPE, typename INDEX_TYPE > LVARRAY_HOST_DEVICE inline constexpr -bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims, +bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims, int const dimsIndex, INDEX_TYPE const index ) { @@ -260,7 +208,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con { bool invalid = false; int curDim = 0; - ( (invalid = invalid || invalidIndex(dims, curDim++, indices)), ...); + ( (invalid = invalid || invalidIndex( dims, curDim++, indices )), ...); return invalid; } @@ -274,12 +222,12 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con template< typename INDEX_TYPE, typename ... INDICES > LVARRAY_HOST_DEVICE inline void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices ) -{ +{ bool const invalid = invalidIndices( dims, indices ... ); if( invalid ) { printDimsAndIndices( dims, indices ... ); - LVARRAY_ERROR( "Invalid indices. Info precedes this line." ); + LVARRAY_ERROR( "Invalid indices. Info precedes this line." ); } } From e60fa8fd82a9c6cc05d3d616aa1ad8ce88cf8992 Mon Sep 17 00:00:00 2001 From: Matteo Cusini Date: Fri, 24 Jan 2025 15:49:11 -0800 Subject: [PATCH 27/42] forgotten one. --- src/indexing.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/indexing.hpp b/src/indexing.hpp index c6333bd5..d5ea6fe9 100644 --- a/src/indexing.hpp +++ b/src/indexing.hpp @@ -163,8 +163,7 @@ LVARRAY_HOST_DEVICE void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices ) { constexpr int NDIM = sizeof ... (INDICES); - printf( "dimensions = { " ); - printIndexValue( dims[0] ); + printf( "dimensions = { %lld", static_cast< long long >( dims[ 0 ] ) ); for( int i = 1; i < NDIM; ++i ) { printf( ", %lld", static_cast< long long >( dims[ i ] )); From 65c00eca9e2afdab71e926286d66b1d81e95d5ef Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 8 Apr 2025 09:04:43 -0700 Subject: [PATCH 28/42] aoa and aoav traits --- src/ArrayOfArrays.hpp | 17 +++++++++++++++++ src/ArrayOfArraysView.hpp | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/src/ArrayOfArrays.hpp b/src/ArrayOfArrays.hpp index 61facade..8dfbed2a 100644 --- a/src/ArrayOfArrays.hpp +++ b/src/ArrayOfArrays.hpp @@ -484,4 +484,21 @@ class ArrayOfArrays : protected ArrayOfArraysView< T, INDEX_TYPE, false, BUFFER_ } }; +/** + * @brief True if the template type is an ArrayOfArrays. + */ +template< class > +constexpr bool isArrayOfArrays = false; + +/** + * @tparam T The type contained in the ArrayOfArrays. + * @tparam INDEX_TYPE The integral type used as an index. + * @tparam BUFFER_TYPE The type used to manage the underlying allocation. + * @brief Specialization of isArrayOfArrays for the ArrayOfArrays class. + */ +template< typename T, + typename INDEX_TYPE, + template< typename > class BUFFER_TYPE > +constexpr bool isArrayOfArrays< ArrayOfArrays< T, INDEX_TYPE, BUFFER_TYPE > > = true; + } /* namespace LvArray */ diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 3ce26cd1..540c50e9 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -1074,4 +1074,23 @@ class ArrayOfArraysView } }; +/** + * @brief True if the template type is an ArrayOfArraysView. + */ +template< class > +constexpr bool isArrayOfArraysView = false; + +/** + * @tparam T The type contained in the ArrayOfArraysView. + * @tparam INDEX_TYPE The integral type used as an index. + * @tparam CONST_SIZES True iff the size of each array is constant. + * @tparam BUFFER_TYPE The type used to manager the underlying allocation. + * @brief Specialization of isArrayOfArraysView for the ArrayOfArraysView class. + */ +template< typename T, + typename INDEX_TYPE, + bool CONST_SIZES, + template< typename > class BUFFER_TYPE > +constexpr bool isArrayOfArraysView< ArrayOfArraysView< T, INDEX_TYPE, CONST_SIZES, BUFFER_TYPE > > = true; + } /* namespace LvArray */ From cff7f1b42ee107e0a19fad5237928b4c9f81b023 Mon Sep 17 00:00:00 2001 From: wrtobin Date: Tue, 8 Apr 2025 12:42:41 -0700 Subject: [PATCH 29/42] just a typo fix --- src/ArrayOfArraysView.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 540c50e9..6db5946d 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -249,7 +249,7 @@ class ArrayOfArraysView /** * @brief Move assignment operator.. - * @param src the SparsityPatternView to be moved from. + * @param src the ArrayOfArraysView to be moved from. * @return *this. */ LVARRAY_HOST_DEVICE From fead905dd4cbea9c763d0c31cbe919071e9b0150 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Tue, 3 Jun 2025 09:10:11 -0700 Subject: [PATCH 30/42] Updated branch with current versions from develop that were overriden during merge --- cmake/SetupTPL.cmake | 51 ++++--- host-configs/LLNL/lassen-cuda-11-base.cmake | 2 +- host-configs/LLNL/lassen-cuda-12-base.cmake | 2 +- scripts/uberenv/packages/lvarray/package.py | 26 ++-- scripts/uberenv/project.json | 4 +- .../blueos_3_ppc64le_ib_p9/compilers.yaml | 132 +++--------------- .../toss_3_x86_64_ib/compilers.yaml | 83 +---------- .../toss_3_x86_64_ib_python/packages.yaml | 7 - 8 files changed, 65 insertions(+), 242 deletions(-) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index 29a60128..bff94834 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,22 +1,19 @@ set(thirdPartyLibs "") -############################### +################################ # CAMP -############################### -if(CAMP_DIR STREQUAL RAJA_DIR) - message(STATUS "LvArray using CAMP from RAJA.") -else() - if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") - endif() +################################ +if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") +endif() - message(STATUS "LvArray using CAMP from ${CAMP_DIR}") +message(STATUS "Using CAMP from ${CAMP_DIR}") - find_package(camp REQUIRED PATHS ${CAMP_DIR}) +find_package(camp REQUIRED PATHS ${CAMP_DIR}) - set(thirdPartyLibs ${thirdPartyLibs} camp) -endif() +set(ENABLE_CAMP ON CACHE BOOL "") +set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -25,7 +22,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "LvArray using RAJA from ${RAJA_DIR}") +message(STATUS "Using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -42,14 +39,14 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") + message(STATUS "Using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "LvArray not using Umpire.") + message(STATUS "Not using Umpire.") endif() ################################ @@ -68,32 +65,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "LvArray using CHAI from ${CHAI_DIR}") + message(STATUS "Using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - # set_target_properties(chai - # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. + get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + set_target_properties(chai + PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "LvArray not using CHAI.") + message(STATUS "Not using CHAI.") endif() -############################### +################################ # CALIPER -############################### +################################ if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "LvArray using caliper from ${CALIPER_DIR}") + message(STATUS "Using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -105,7 +102,7 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "LvArray not using caliper.") + message(STATUS "Not using caliper.") endif() ################################ diff --git a/host-configs/LLNL/lassen-cuda-11-base.cmake b/host-configs/LLNL/lassen-cuda-11-base.cmake index 73b7d51c..3c1ea972 100644 --- a/host-configs/LLNL/lassen-cuda-11-base.cmake +++ b/host-configs/LLNL/lassen-cuda-11-base.cmake @@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 17 CACHE STRING "") set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo --source-in-ptx ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) diff --git a/host-configs/LLNL/lassen-cuda-12-base.cmake b/host-configs/LLNL/lassen-cuda-12-base.cmake index 174ba7fb..3b8f4125 100644 --- a/host-configs/LLNL/lassen-cuda-12-base.cmake +++ b/host-configs/LLNL/lassen-cuda-12-base.cmake @@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 17 CACHE STRING "") set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -lineinfo --source-in-ptx -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index cf9d5548..9c4b47d9 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -56,36 +56,32 @@ class Lvarray(CMakePackage, CudaPackage): variant('docs', default=False, description='Build docs') variant('addr2line', default=True, description='Build support for addr2line.') - + depends_on('blt', when='@0.2.0:', type='build') depends_on('camp') + depends_on('camp+cuda', when='+cuda') depends_on('raja') + depends_on('raja+cuda', when='+cuda') + # At the moment Umpire doesn't support shared when building with CUDA. depends_on('umpire', when='+umpire') + depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') + depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') - depends_on('py-scipy@1.5.2:', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') + depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') depends_on('py-pip', when='+pylvarray') depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') - with when('+cuda'): - for sm_ in CudaPackage.cuda_arch_values: - depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) - - phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -289,6 +285,10 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n\n".format("-" * 80)) if "+caliper" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) else: @@ -297,7 +297,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write('#{0}\n'.format('-' * 80)) cfg.write('# Python\n') cfg.write('#{0}\n\n'.format('-' * 80)) - if '+pylvarray' in spec: cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) @@ -307,7 +306,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Documentation\n") cfg.write("#{0}\n\n".format("-" * 80)) - if "+docs" in spec: cfg.write(cmake_cache_option("ENABLE_DOCS", True)) sphinx_dir = spec['py-sphinx'].prefix diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 703db7a4..9822f975 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/spack/spack", - "spack_branch": "v0.19.0", + "spack_url": "https://github.com/corbett5/spack", + "spack_branch": "package/corbett/lvarray-update", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b8353dd0..b1bf26cb 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -2,90 +2,30 @@ compilers: - compiler: spec: clang@10.0.1 paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cflags: -mcpu=native -mtune=native + cxxflags: -mcpu=native -mtune=native operating_system: rhel7 - target: x86_64 + target: ppc64le modules: [] environment: {} extra_rpaths: [] - compiler: spec: clang@11.0.1 paths: - cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cflags: -mcpu=native -mtune=native + cxxflags: -mcpu=native -mtune=native operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@12.0.1 - paths: - cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@13.0.1 - paths: - cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@14.0.4 - paths: - cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang - cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 + target: ppc64le modules: [] environment: {} extra_rpaths: [] @@ -97,55 +37,25 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@9.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@10.2.1 - paths: - cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native + cflags: -mcpu=native -mtune=native + cxxflags: -mcpu=native -mtune=native operating_system: rhel7 - target: x86_64 + target: ppc64le modules: [] environment: {} extra_rpaths: [] - compiler: - spec: intel@19.1.2 + spec: xl@16.1.1 paths: - cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc + cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC + f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native - cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native + cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 operating_system: rhel7 - target: x86_64 + target: ppc64le modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml index b8353dd0..3d9648a7 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml @@ -7,8 +7,8 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 target: x86_64 modules: [] @@ -22,53 +22,8 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@12.0.1 - paths: - cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@13.0.1 - paths: - cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@14.0.4 - paths: - cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang - cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 - cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 target: x86_64 modules: [] @@ -104,36 +59,6 @@ compilers: modules: [] environment: {} extra_rpaths: [] -- compiler: - spec: gcc@9.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@10.2.1 - paths: - cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] - compiler: spec: intel@19.1.2 paths: diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index a6fbda09..0c6b833b 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,10 +107,3 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ - - ninja: - buildable: False - externals: - - spec: ninja@kitware - prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/ - From 00327f812f9fe0f2d9d3ca9c920f41a9bc89bee0 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Tue, 17 Jun 2025 14:59:24 -0700 Subject: [PATCH 31/42] Fixed compilation error related to polarDecomposition --- src/fixedSizeSquareMatrixOpsImpl.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index d4c3f1b7..f0f33bd5 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -113,7 +113,9 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, scale< M, M >( R, 0.5 ); // Determine how close R is to being orthogonal using L2Norm(R.R^T-I) - Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, R ); + real64 copyR[M][M] = { { 0.0 } }; + LvArray::tensorOps::copy< M, M >( copyR, R); + Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR ); addIdentity< M >( RRTMinusI, -1.0 ); for( std::ptrdiff_t i = 0 ; i < M ; i++ ) { From 95aa41e87da4514c16aed76b9a94ccfc60a9b183 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Tue, 17 Jun 2025 15:06:03 -0700 Subject: [PATCH 32/42] Removed wrong namespace --- src/fixedSizeSquareMatrixOpsImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index f0f33bd5..cce92ff5 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -114,7 +114,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, // Determine how close R is to being orthogonal using L2Norm(R.R^T-I) real64 copyR[M][M] = { { 0.0 } }; - LvArray::tensorOps::copy< M, M >( copyR, R); + copy< M, M >( copyR, R); Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR ); addIdentity< M >( RRTMinusI, -1.0 ); for( std::ptrdiff_t i = 0 ; i < M ; i++ ) From 85136bec9eba8ed35716196595b762ee6493fd5b Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Tue, 17 Jun 2025 15:08:28 -0700 Subject: [PATCH 33/42] Fixed incorrect type --- src/fixedSizeSquareMatrixOpsImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index cce92ff5..135f645f 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -113,7 +113,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, scale< M, M >( R, 0.5 ); // Determine how close R is to being orthogonal using L2Norm(R.R^T-I) - real64 copyR[M][M] = { { 0.0 } }; + FloatingPoint copyR[M][M] = { { 0.0 } }; copy< M, M >( copyR, R); Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR ); addIdentity< M >( RRTMinusI, -1.0 ); From 34a4ea5759284ad4fd0335c20e12c8af8c044c7e Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Sun, 6 Jul 2025 12:15:40 -0700 Subject: [PATCH 34/42] Added round to math and cofactor to square tensor ops for 2x2 and 3x3 matrices --- cmake/Config.cmake | 2 +- src/fixedSizeSquareMatrixOps.hpp | 36 ++++++++ src/fixedSizeSquareMatrixOpsImpl.hpp | 79 ++++++++++++++++++ src/math.hpp | 45 ++++++++++ unitTests/CMakeLists.txt | 2 + unitTests/testTensorOpsCofactorOneArg.cpp | 29 +++++++ unitTests/testTensorOpsCofactorTwoArgs.cpp | 29 +++++++ unitTests/testTensorOpsInverse.hpp | 97 ++++++++++++++++++++++ 8 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 unitTests/testTensorOpsCofactorOneArg.cpp create mode 100644 unitTests/testTensorOpsCofactorTwoArgs.cpp diff --git a/cmake/Config.cmake b/cmake/Config.cmake index cf8ff35b..49c5ae12 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,7 +2,7 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA - HIP + HIP TOTALVIEW_OUTPUT CALIPER ) diff --git a/src/fixedSizeSquareMatrixOps.hpp b/src/fixedSizeSquareMatrixOps.hpp index 9fb82ec3..f39c3f15 100644 --- a/src/fixedSizeSquareMatrixOps.hpp +++ b/src/fixedSizeSquareMatrixOps.hpp @@ -38,6 +38,42 @@ LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline auto determinant( MATRIX const & matrix ) { return internal::SquareMatrixOps< M >::determinant( matrix ); } +/** + * @brief Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix. + * @tparam M The size of the matrices @p dstMatrix and @p srcMatrix. + * @tparam DST_MATRIX The type of @p dstMatrix. + * @tparam SRC_MATRIX The type of @p srcMatrix. + * @param dstMatrix The M x M matrix to write the cofactor to. + * @param srcMatrix The M x M matrix to take the cofactor of. + * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values. + */ +template< std::ptrdiff_t M, typename DST_MATRIX, typename SRC_MATRIX > +LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline +auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) +{ + static_assert( std::is_floating_point< std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) > >::value, + "The destination matrix must be contain floating point values." ); + internal::SquareMatrixOps< M >::cofactor( std::forward< DST_MATRIX >( dstMatrix ), srcMatrix ); +} + +/** + * @brief Compute the cofactor of the matrix @p matrix overwritting it. + * @tparam M The size of the matrix @p matrix. + * @tparam MATRIX The type of @p matrix. + * @param matrix The M x M matrix to take the cofactor of and overwrite. + * @return The determinant. + * @note @p matrix must contain floating point values. + */ +template< std::ptrdiff_t M, typename MATRIX > +LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline +auto cofactor( MATRIX && matrix ) +{ + static_assert( std::is_floating_point< std::decay_t< decltype( matrix[ 0 ][ 0 ] ) > >::value, + "The matrix must be contain floating point values." ); + internal::SquareMatrixOps< M >::cofactor( std::forward< MATRIX >( matrix ) ); +} + /** * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. * @tparam M The size of the matrices @p dstMatrix and @p srcMatrix. diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 135f645f..19cd2249 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -151,6 +151,26 @@ struct SquareMatrixOps< 2 > return matrix[ 0 ][ 0 ] * matrix[ 1 ][ 1 ] - matrix[ 0 ][ 1 ] * matrix[ 1 ][ 0 ]; } + /** + * @return Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix + * @tparam DST_MATRIX The type of @p dstMatrix. + * @tparam SRC_MATRIX The type of @p srcMatrix. + * @param dstMatrix The 2x2 matrix to write the cofactor to. + * @param srcMatrix The 2x2 matrix to take the cofactor of. + */ + template< typename DST_MATRIX, typename SRC_MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) + { + checkSizes< 2, 2 >( dstMatrix ); + checkSizes< 2, 2 >( srcMatrix ); + dstMatrix[0][0] = srcMatrix[1][1]; + dstMatrix[1][1] = srcMatrix[0][0]; + dstMatrix[0][1] = -srcMatrix[1][0]; + dstMatrix[1][0] = -srcMatrix[0][1]; + } + /** * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. * @tparam DST_MATRIX The type of @p dstMatrix. @@ -181,6 +201,24 @@ struct SquareMatrixOps< 2 > return det; } + /** + * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it. + * @tparam MATRIX The type of @p matrix. + * @param matrix The 2x2 matrix to take the cofactor of and overwrite. + * @note @p matrix must contain floating point values. + */ + template< typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto cofactor( MATRIX && matrix ) + { + checkSizes< 2, 2 >( matrix ); + + using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >; + realType temp[ 2 ][ 2 ]; + copy< 2, 2 >( temp, matrix ); + return cofactor( matrix, temp ); + } + /** * @brief Invert the matrix @p srcMatrix overwritting it. * @tparam MATRIX The type of @p matrix. @@ -655,6 +693,31 @@ struct SquareMatrixOps< 3 > matrix[ 2 ][ 0 ] * ( matrix[ 0 ][ 1 ] * matrix[ 1 ][ 2 ] - matrix[ 0 ][ 2 ] * matrix[ 1 ][ 1 ] ); } + /** + * @return Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix + * @tparam DST_MATRIX The type of @p dstMatrix. + * @tparam SRC_MATRIX The type of @p srcMatrix. + * @param dstMatrix The 3x3 matrix to write the cofactor to. + * @param srcMatrix The 3x3 matrix to take the cofactor of. + */ + template< typename DST_MATRIX, typename SRC_MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) + { + checkSizes< 3, 3 >( dstMatrix ); + checkSizes< 3, 3 >( srcMatrix ); + dstMatrix[0][0] = srcMatrix[1][1] * srcMatrix[2][2] - srcMatrix[1][2] * srcMatrix[2][1]; + dstMatrix[0][1] = srcMatrix[1][2] * srcMatrix[2][0] - srcMatrix[1][0] * srcMatrix[2][2]; + dstMatrix[0][2] = srcMatrix[1][0] * srcMatrix[2][1] - srcMatrix[1][1] * srcMatrix[2][0]; + dstMatrix[1][0] = srcMatrix[0][2] * srcMatrix[2][1] - srcMatrix[0][1] * srcMatrix[2][2]; + dstMatrix[1][1] = srcMatrix[0][0] * srcMatrix[2][2] - srcMatrix[0][2] * srcMatrix[2][0]; + dstMatrix[1][2] = srcMatrix[0][1] * srcMatrix[2][0] - srcMatrix[0][0] * srcMatrix[2][1]; + dstMatrix[2][0] = srcMatrix[0][1] * srcMatrix[1][2] - srcMatrix[0][2] * srcMatrix[1][1]; + dstMatrix[2][1] = srcMatrix[0][2] * srcMatrix[1][0] - srcMatrix[0][0] * srcMatrix[1][2]; + dstMatrix[2][2] = srcMatrix[0][0] * srcMatrix[1][1] - srcMatrix[0][1] * srcMatrix[1][0]; + } + /** * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. * @tparam DST_MATRIX The type of @p dstMatrix. @@ -696,6 +759,22 @@ struct SquareMatrixOps< 3 > return det; } + /** + * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it. + * @tparam MATRIX The type of @p matrix. + * @param matrix The 3x3 matrix to take the cofactor of and overwrite. + * @note @p srcMatrix must contain floating point values. + */ + template< typename MATRIX > + LVARRAY_HOST_DEVICE constexpr inline + static auto cofactor( MATRIX && matrix ) + { + using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >; + realType temp[ 3 ][ 3 ]; + copy< 3, 3 >( temp, matrix ); + return cofactor( matrix, temp ); + } + /** * @brief Invert the matrix @p srcMatrix overwritting it. * @tparam MATRIX The type of @p matrix. diff --git a/src/math.hpp b/src/math.hpp index 4dde2358..601fcfc3 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -441,6 +441,51 @@ __half2 abs( __half2 const x ) #endif +/** + * @return The round value of @p x. + * @param x The number to get the round value of. + * @note This set of overloads is valid for any numeric type. + */ +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE +float round( float const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::roundf( x ); +#else + return std::round( x ); +#endif +} + +template< typename T > +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr +double round( T const x ) +{ +#if defined(LVARRAY_DEVICE_COMPILE) + return ::round( double ( x ) ); +#else + return std::round( x ); +#endif +} + +#if defined( LVARRAY_USE_DEVICE ) + +/// @copydoc round( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half round( __half const x ) +{ + return round( x ); +} + +/// @copydoc round( T ) +LVARRAY_DEVICE LVARRAY_FORCE_INLINE +__half2 round( __half2 const x ) +{ + return round( x ); +} + +#endif + + /** * @return The ceiling value of @p x. * @param x The number to get the ceiling value of. diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 18eb75d6..625a1209 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -70,6 +70,8 @@ set( testSources testStackTrace.cpp testTensorOpsDeterminant.cpp testTensorOpsEigen.cpp + testTensorOpsCofactorOneArg.cpp + testTensorOpsCofactorTwoArgs.cpp testTensorOpsInverseOneArg.cpp testTensorOpsInverseTwoArgs.cpp testTensorOpsPolarDecomposition.cpp diff --git a/unitTests/testTensorOpsCofactorOneArg.cpp b/unitTests/testTensorOpsCofactorOneArg.cpp new file mode 100644 index 00000000..82509dd8 --- /dev/null +++ b/unitTests/testTensorOpsCofactorOneArg.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +#include "testTensorOpsInverse.hpp" + +namespace LvArray +{ +namespace testing +{ + +TYPED_TEST( CofactorFloatOnlyTest, cofactorOneArg ) +{ + this->cofactorOneArg(); +} + +} // namespace testing +} // namespace LvArray + +// This is the default gtest main method. It is included for ease of debugging. +int main( int argc, char * * argv ) +{ + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + return result; +} diff --git a/unitTests/testTensorOpsCofactorTwoArgs.cpp b/unitTests/testTensorOpsCofactorTwoArgs.cpp new file mode 100644 index 00000000..3d8ebe3e --- /dev/null +++ b/unitTests/testTensorOpsCofactorTwoArgs.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. + * All rights reserved. + * See the LICENSE file for details. + * SPDX-License-Identifier: (BSD-3-Clause) + */ + +#include "testTensorOpsInverse.hpp" + +namespace LvArray +{ +namespace testing +{ + +TYPED_TEST( CofactorTest, cofactorTwoArgs ) +{ + this->cofactorTwoArgs(); +} + +} // namespace testing +} // namespace LvArray + +// This is the default gtest main method. It is included for ease of debugging. +int main( int argc, char * * argv ) +{ + ::testing::InitGoogleTest( &argc, argv ); + int const result = RUN_ALL_TESTS(); + return result; +} diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 5603f3e1..4ead0e3b 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -90,6 +90,85 @@ class InverseTest : public ::testing::Test } ); } + void cofactorTwoArgs() + { + ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView(); + ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView(); + ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView(); + + ArrayViewT< FLOAT, 3, 2 > const dstMatrix_IJK = m_dstMatrix_IJK.toView(); + ArrayViewT< FLOAT, 3, 1 > const dstMatrix_IKJ = m_dstMatrix_IKJ.toView(); + ArrayViewT< FLOAT, 3, 0 > const dstMatrix_KJI = m_dstMatrix_KJI.toView(); + + ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M ); + + T scale = 100; + for( T & value : arrayOfMatrices ) + { value = randomValue( scale, m_gen ); } + + ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst(); + + forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i ) + { + T srcLocal[ M ][ M ]; + FLOAT dstLocal[ M ][ M ]; + + #define _TEST( output, input ) \ + tensorOps::copy< M, M >( input, matrices[ i ] ); \ + tensorOps::cofactor< M >( output, input ); \ + checkCofactor( scale, output, input ) + + #define _TEST_PERMS( input, output0, output1, output2, output3 ) \ + _TEST( output0, input ); \ + _TEST( output1, input ); \ + _TEST( output2, input ); \ + _TEST( output3, input ) + + _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void cofactorOneArg() + { + ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView(); + ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView(); + ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView(); + + ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M ); + + T scale = 100; + for( T & value : arrayOfMatrices ) + { value = randomValue( scale, m_gen ); } + + ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst(); + + forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i ) + { + T initialMatrix[ M ][ M ]; + T srcLocal[ M ][ M ]; + T det; + + #define _TEST( matrix ) \ + tensorOps::copy< M, M >( initialMatrix, matrices[ i ] ); \ + tensorOps::copy< M, M >( matrix, initialMatrix ); \ + tensorOps::cofactor< M >( matrix ); \ + checkCofactor( scale, matrix, initialMatrix ) + + _TEST( srcMatrix_IJK[ i ] ); + _TEST( srcMatrix_IKJ[ i ] ); + _TEST( srcMatrix_KJI[ i ] ); + _TEST( srcLocal ); + + #undef _TEST + } ); + } + void inverseTwoArgs() { ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView(); @@ -340,6 +419,24 @@ class InverseTest : public ::testing::Test 3 * scale * scale * scale * epsilon ) } + + template< typename MATRIX_A, typename MATRIX_B > + static void LVARRAY_HOST_DEVICE checkCofactor( T const scale, MATRIX_A && cofactor, MATRIX_B && source ) + { + FLOAT const epsilon = NumericLimitsNC< FLOAT >{}.epsilon; + + FLOAT inverse[ M ][ M ]; + T det = tensorOps::invert< M >( inverse, src ); + tensorOps::transpose< M >( inverse ); + tensorOps::scale< M, M >( inverse, det ); + + for( int i = 0; i < M; ++i ) + { + for( int j = 0; j < M; ++j ) + { PORTABLE_EXPECT_NEAR( cofactor[ i ][ j ], inverse[ i ][ j ], 5 * scale * epsilon ); } + } + } + template< typename MATRIX_A, typename MATRIX_B > static void LVARRAY_HOST_DEVICE checkInverse( T const scale, T const det, From fa06984d55cb2da24a2b7a25f63c28707edde8f2 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Sun, 6 Jul 2025 12:19:19 -0700 Subject: [PATCH 35/42] Updated formatting --- src/ArrayOfArrays.hpp | 4 ++-- src/ArrayOfArraysView.hpp | 10 ++++---- src/fixedSizeSquareMatrixOpsImpl.hpp | 8 +++---- src/math.hpp | 3 ++- unitTests/testTensorOpsInverse.hpp | 36 ++++++++++++++-------------- 5 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/ArrayOfArrays.hpp b/src/ArrayOfArrays.hpp index 8dfbed2a..c78109c9 100644 --- a/src/ArrayOfArrays.hpp +++ b/src/ArrayOfArrays.hpp @@ -497,8 +497,8 @@ constexpr bool isArrayOfArrays = false; * @brief Specialization of isArrayOfArrays for the ArrayOfArrays class. */ template< typename T, - typename INDEX_TYPE, - template< typename > class BUFFER_TYPE > + typename INDEX_TYPE, + template< typename > class BUFFER_TYPE > constexpr bool isArrayOfArrays< ArrayOfArrays< T, INDEX_TYPE, BUFFER_TYPE > > = true; } /* namespace LvArray */ diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 757dbec8..85ecfd13 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -858,10 +858,10 @@ class ArrayOfArraysView destroyValues( 0, m_numArrays, pairs.first ... ); INDEX_TYPE const offsetsSize = ( m_numArrays == 0 ) ? 0 : m_numArrays + 1; - + bufferManipulation::copyInto( m_offsets, offsetsSize, srcOffsets, srcNumArrays + 1 ); bufferManipulation::copyInto( m_sizes, m_numArrays, srcSizes, srcNumArrays ); - + INDEX_TYPE const maxOffset = m_offsets[ m_numArrays ]; typeManipulation::forEachArg( [maxOffset, srcMaxOffset]( auto & dstBuffer ) { @@ -1101,9 +1101,9 @@ constexpr bool isArrayOfArraysView = false; * @brief Specialization of isArrayOfArraysView for the ArrayOfArraysView class. */ template< typename T, - typename INDEX_TYPE, - bool CONST_SIZES, - template< typename > class BUFFER_TYPE > + typename INDEX_TYPE, + bool CONST_SIZES, + template< typename > class BUFFER_TYPE > constexpr bool isArrayOfArraysView< ArrayOfArraysView< T, INDEX_TYPE, CONST_SIZES, BUFFER_TYPE > > = true; } /* namespace LvArray */ diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 135f645f..f6c33d2c 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -114,12 +114,12 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, // Determine how close R is to being orthogonal using L2Norm(R.R^T-I) FloatingPoint copyR[M][M] = { { 0.0 } }; - copy< M, M >( copyR, R); + copy< M, M >( copyR, R ); Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR ); addIdentity< M >( RRTMinusI, -1.0 ); - for( std::ptrdiff_t i = 0 ; i < M ; i++ ) + for( std::ptrdiff_t i = 0; i < M; i++ ) { - for( std::ptrdiff_t j = 0 ; j < M ; j++ ) + for( std::ptrdiff_t j = 0; j < M; j++ ) { errorSquared += RRTMinusI[i][j] * RRTMinusI[i][j]; } @@ -127,7 +127,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R, } if( iter == 100 ) { - printf("Polar decomposition did not converge in 100 iterations!"); + printf( "Polar decomposition did not converge in 100 iterations!" ); } } diff --git a/src/math.hpp b/src/math.hpp index 4dde2358..2a8e0c4d 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -487,7 +487,8 @@ __half2 ceil( __half2 const x ) #if CUDART_VERSION > 11000 return h2ceil( x ); #else - return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to mimic + return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to + // mimic #endif } diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 5603f3e1..fa701742 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -265,29 +265,29 @@ class InverseTest : public ::testing::Test ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst(); forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i ) - { - T srcLocal[ M ][ M ]; - FLOAT dstLocal[ M ][ M ]; + { + T srcLocal[ M ][ M ]; + FLOAT dstLocal[ M ][ M ]; - #define _TEST( output, input ) \ - tensorOps::copy< M, M >( input, matrices[ i ] ); \ - tensorOps::polarDecomposition< M >( output, input ); \ - checkPolarDecomposition( output, input ) + #define _TEST( output, input ) \ + tensorOps::copy< M, M >( input, matrices[ i ] ); \ + tensorOps::polarDecomposition< M >( output, input ); \ + checkPolarDecomposition( output, input ) - #define _TEST_PERMS( input, output0, output1, output2, output3 ) \ - _TEST( output0, input ); \ - _TEST( output1, input ); \ - _TEST( output2, input ); \ - _TEST( output3, input ) + #define _TEST_PERMS( input, output0, output1, output2, output3 ) \ + _TEST( output0, input ); \ + _TEST( output1, input ); \ + _TEST( output2, input ); \ + _TEST( output3, input ) - _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); - _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); - _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); - _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); + _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal ); #undef _TEST_PERMS #undef _TEST - } ); + } ); } @@ -400,7 +400,7 @@ class InverseTest : public ::testing::Test using FloatingPoint = std::decay_t< decltype( orthogonalMatrix[0][0] ) >; FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon; FloatingPoint sqrtTolerance = math::sqrt( tolerance ); - + if( M == 2 ) { // Check if orthogonalMatrix is actually orthogonal From 7d4a46baa539e1237616da728ede97f8f3e4431d Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Sun, 6 Jul 2025 12:42:15 -0700 Subject: [PATCH 36/42] Fixed bug in unit tests for cofactors --- unitTests/testTensorOpsCofactorOneArg.cpp | 2 +- unitTests/testTensorOpsCofactorTwoArgs.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unitTests/testTensorOpsCofactorOneArg.cpp b/unitTests/testTensorOpsCofactorOneArg.cpp index 82509dd8..7af3c287 100644 --- a/unitTests/testTensorOpsCofactorOneArg.cpp +++ b/unitTests/testTensorOpsCofactorOneArg.cpp @@ -12,7 +12,7 @@ namespace LvArray namespace testing { -TYPED_TEST( CofactorFloatOnlyTest, cofactorOneArg ) +TYPED_TEST( InverseFloatOnlyTest, cofactorOneArg ) { this->cofactorOneArg(); } diff --git a/unitTests/testTensorOpsCofactorTwoArgs.cpp b/unitTests/testTensorOpsCofactorTwoArgs.cpp index 3d8ebe3e..21f9992a 100644 --- a/unitTests/testTensorOpsCofactorTwoArgs.cpp +++ b/unitTests/testTensorOpsCofactorTwoArgs.cpp @@ -12,7 +12,7 @@ namespace LvArray namespace testing { -TYPED_TEST( CofactorTest, cofactorTwoArgs ) +TYPED_TEST( InverseTest, cofactorTwoArgs ) { this->cofactorTwoArgs(); } From a1f3c560e21d7539cd04143f7771a80f9c1e1f52 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Sun, 6 Jul 2025 12:48:59 -0700 Subject: [PATCH 37/42] Fixed typo in unittests --- unitTests/testTensorOpsInverse.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index e7aeefa3..0411c459 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -426,7 +426,7 @@ class InverseTest : public ::testing::Test FLOAT const epsilon = NumericLimitsNC< FLOAT >{}.epsilon; FLOAT inverse[ M ][ M ]; - T det = tensorOps::invert< M >( inverse, src ); + T det = tensorOps::invert< M >( inverse, source ); tensorOps::transpose< M >( inverse ); tensorOps::scale< M, M >( inverse, det ); From bed5b66efcd9e96187de91e642ff4e35b4d7de25 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Sun, 6 Jul 2025 12:53:16 -0700 Subject: [PATCH 38/42] Removed unused variable --- unitTests/testTensorOpsInverse.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 0411c459..1e1461f6 100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -152,7 +152,6 @@ class InverseTest : public ::testing::Test { T initialMatrix[ M ][ M ]; T srcLocal[ M ][ M ]; - T det; #define _TEST( matrix ) \ tensorOps::copy< M, M >( initialMatrix, matrices[ i ] ); \ From 8aebaa559f09c2ccc45d6c0da64a6e905a310842 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Thu, 21 Aug 2025 17:16:00 -0700 Subject: [PATCH 39/42] Added invert for 4x4 matrices --- host-configs/LLNL/dane-clang-14-mpm.cmake | 5 ++ src/fixedSizeSquareMatrixOpsImpl.hpp | 96 +++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 host-configs/LLNL/dane-clang-14-mpm.cmake diff --git a/host-configs/LLNL/dane-clang-14-mpm.cmake b/host-configs/LLNL/dane-clang-14-mpm.cmake new file mode 100644 index 00000000..2c17649d --- /dev/null +++ b/host-configs/LLNL/dane-clang-14-mpm.cmake @@ -0,0 +1,5 @@ +set(CONFIG_NAME "dane-clang-14-mpm" CACHE PATH "") + +set(GEOS_TPL_DIR /usr/WS1/crook5/thirdPartyLibs/install-dane-clang-14-mpm-release CACHE PATH "") + +include(${CMAKE_CURRENT_LIST_DIR}/llnl-cpu-clang-14.cmake) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 5c0a1df2..ba2c21c3 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -1427,6 +1427,102 @@ struct SquareMatrixOps< 3 > } }; +/** + * @struct SquareMatrixOps< 4 > + * @brief Performs operations on 4x4 square matrices. + */ +template<> +struct SquareMatrixOps< 4 > +{ + /** + * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. + * @tparam DST_MATRIX The type of @p dstMatrix. + * @tparam SRC_MATRIX The type of @p srcMatrix. + * @param dstMatrix The 4x4 matrix to write the inverse to. + * @param srcMatrix The 4x4 matrix to take the inverse of. + * @return The determinant. + * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values. + */ + template< typename DST_MATRIX, typename SRC_MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) + { + checkSizes< 4, 4 >( dstMatrix ); + checkSizes< 4, 4 >( srcMatrix ); + + using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >; + + auto const det = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - + srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - + srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + + srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + + srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - + srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1] - + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] + + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0] + + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0] - + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] + + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0]; + FloatingPoint const invDet = FloatingPoint( 1 ) / det; + + dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1]; + dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1]; + dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1]; + dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]; + + dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0]; + dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0]; + dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0]; + dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]; + + dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0]; + dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]; + + dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0]; + dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]; + + tensorOps::scale< 4, 4 >( dstMatrix, invDet ); + + return det; + } + + /** + * @brief Invert the matrix @p srcMatrix overwritting it. + * @tparam MATRIX The type of @p matrix. + * @param matrix The 4x4 matrix to take the inverse of and overwrite. + * @return The determinant. + * @note @p srcMatrix must contain floating point values. + */ + template< typename MATRIX > + LVARRAY_HOST_DEVICE constexpr inline + static auto invert( MATRIX && matrix ) + { + using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >; + + realType temp[ 4 ][ 4 ]; + copy< 4, 4 >( temp, matrix ); + return invert( matrix, temp ); + } +}; + } // namespace internal } // namespace tensorOps } // namespace LvArray From d45263773647d5e698cf402a0bfb7329af2f7926 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Wed, 17 Sep 2025 11:01:18 -0700 Subject: [PATCH 40/42] Added determinant and inverse operations for square 4x4 matrices --- src/fixedSizeSquareMatrixOpsImpl.hpp | 84 +++++++++++++++++++--------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index ba2c21c3..9ac2281e 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -1434,6 +1434,63 @@ struct SquareMatrixOps< 3 > template<> struct SquareMatrixOps< 4 > { + /** + * @return Return the determinant of the matrix @p matrix. + * @tparam MATRIX The type of @p matrix. + * @param matrix The 4x4 matrix to get the determinant of. + */ + template< typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto determinant( MATRIX const & matrix ) + { + checkSizes< 4, 4 >( matrix ); + + return matrix[0][0]*( + matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2] + - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3] + ) + - matrix[0][1]*( + matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2] + - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3] + ) + + matrix[0][2]*( + matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1] + - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3] + ) + - matrix[0][3]*( + matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1] + - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2] + ); + + // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] - + // matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] - + // matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] + + // matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] + + // matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] - + // matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] - + + // matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] + + // matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] + + // matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] - + // matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] - + // matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] + + // matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] + + + // matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] - + // matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] - + // matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] + + // matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] + + // matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] - + // matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] - + + // matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] + + // matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] + + // matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] - + // matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] - + // matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] + + // matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0]; + } + /** * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. * @tparam DST_MATRIX The type of @p dstMatrix. @@ -1453,30 +1510,7 @@ struct SquareMatrixOps< 4 > using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >; - auto const det = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1] - - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] + - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0] + - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0] - - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] + - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0]; + FloatingPoint const det = determinant( srcMatrix ); FloatingPoint const invDet = FloatingPoint( 1 ) / det; dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1]; @@ -1499,7 +1533,7 @@ struct SquareMatrixOps< 4 > dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0]; dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]; - tensorOps::scale< 4, 4 >( dstMatrix, invDet ); + scale< 4, 4 >( dstMatrix, invDet ); return det; } From cd4a94d90315d8155300078215fb4804b27efbeb Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Mon, 10 Nov 2025 21:53:04 -0800 Subject: [PATCH 41/42] Added host config for MPM solver --- host-configs/LLNL/dane-gcc-12-mpm.cmake | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 host-configs/LLNL/dane-gcc-12-mpm.cmake diff --git a/host-configs/LLNL/dane-gcc-12-mpm.cmake b/host-configs/LLNL/dane-gcc-12-mpm.cmake new file mode 100644 index 00000000..86adf6be --- /dev/null +++ b/host-configs/LLNL/dane-gcc-12-mpm.cmake @@ -0,0 +1,5 @@ +set(CONFIG_NAME "dane-gcc-12-mpm" CACHE PATH "") + +set(GEOS_TPL_DIR /usr/WS1/crook5/thirdPartyLibs/install-dane-gcc-12-mpm-release CACHE PATH "") + +include(${CMAKE_CURRENT_LIST_DIR}/llnl-cpu-gcc-12.cmake) From c4853542618a14edaf02c87a97c7fb5b96b8e0c9 Mon Sep 17 00:00:00 2001 From: Cameron Mikel Crook Date: Mon, 1 Dec 2025 10:11:37 -0800 Subject: [PATCH 42/42] Minor formatting changes --- src/fixedSizeSquareMatrixOpsImpl.hpp | 212 ++++++++++++++------------- 1 file changed, 114 insertions(+), 98 deletions(-) diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp index 9ac2281e..9c2533d2 100644 --- a/src/fixedSizeSquareMatrixOpsImpl.hpp +++ b/src/fixedSizeSquareMatrixOpsImpl.hpp @@ -160,7 +160,7 @@ struct SquareMatrixOps< 2 > */ template< typename DST_MATRIX, typename SRC_MATRIX > LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline - static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) { checkSizes< 2, 2 >( dstMatrix ); @@ -702,7 +702,7 @@ struct SquareMatrixOps< 3 > */ template< typename DST_MATRIX, typename SRC_MATRIX > LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline - static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) { checkSizes< 3, 3 >( dstMatrix ); @@ -759,7 +759,7 @@ struct SquareMatrixOps< 3 > return det; } - /** + /** * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it. * @tparam MATRIX The type of @p matrix. * @param matrix The 3x3 matrix to take the cofactor of and overwrite. @@ -1434,109 +1434,125 @@ struct SquareMatrixOps< 3 > template<> struct SquareMatrixOps< 4 > { - /** - * @return Return the determinant of the matrix @p matrix. - * @tparam MATRIX The type of @p matrix. - * @param matrix The 4x4 matrix to get the determinant of. - */ - template< typename MATRIX > - LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline - static auto determinant( MATRIX const & matrix ) - { - checkSizes< 4, 4 >( matrix ); + /** + * @return Return the determinant of the matrix @p matrix. + * @tparam MATRIX The type of @p matrix. + * @param matrix The 4x4 matrix to get the determinant of. + */ + template< typename MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto determinant( MATRIX const & matrix ) + { + checkSizes< 4, 4 >( matrix ); - return matrix[0][0]*( - matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2] - - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3] - ) + return matrix[0][0]*( + matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2] + - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3] + ) - matrix[0][1]*( - matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2] - - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3] - ) + matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2] + - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3] + ) + matrix[0][2]*( - matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1] - - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3] - ) - - matrix[0][3]*( - matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1] - - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2] - ); - - // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] - - // matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] - - // matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] + - // matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] + - // matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] - - // matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] - - - // matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] + - // matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] + - // matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] - - // matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] - - // matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] + - // matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] + - - // matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] - - // matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] - - // matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] + - // matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] + - // matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] - - // matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] - - - // matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] + - // matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] + - // matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] - - // matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] - - // matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] + - // matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0]; - } - - /** - * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. - * @tparam DST_MATRIX The type of @p dstMatrix. - * @tparam SRC_MATRIX The type of @p srcMatrix. - * @param dstMatrix The 4x4 matrix to write the inverse to. - * @param srcMatrix The 4x4 matrix to take the inverse of. - * @return The determinant. - * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values. - */ - template< typename DST_MATRIX, typename SRC_MATRIX > - LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline - static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, - SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) - { - checkSizes< 4, 4 >( dstMatrix ); - checkSizes< 4, 4 >( srcMatrix ); - - using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >; - - FloatingPoint const det = determinant( srcMatrix ); - FloatingPoint const invDet = FloatingPoint( 1 ) / det; + matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1] + - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3] + ) + - matrix[0][3]*( + matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1] + - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2] + ); + + // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] - + // matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] - + // matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] + + // matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] + + // matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] - + // matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] - + + // matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] + + // matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] + + // matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] - + // matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] - + // matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] + + // matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] + + + // matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] - + // matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] - + // matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] + + // matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] + + // matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] - + // matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] - + + // matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] + + // matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] + + // matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] - + // matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] - + // matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] + + // matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0]; + } - dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1]; - dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1]; - dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1]; - dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]; + /** + * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix. + * @tparam DST_MATRIX The type of @p dstMatrix. + * @tparam SRC_MATRIX The type of @p srcMatrix. + * @param dstMatrix The 4x4 matrix to write the inverse to. + * @param srcMatrix The 4x4 matrix to take the inverse of. + * @return The determinant. + * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values. + */ + template< typename DST_MATRIX, typename SRC_MATRIX > + LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline + static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, + SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix ) + { + checkSizes< 4, 4 >( dstMatrix ); + checkSizes< 4, 4 >( srcMatrix ); - dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0]; - dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0]; - dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0]; - dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]; + using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >; - dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0]; - dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0]; - dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0]; - dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]; - - dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0]; - dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0]; - dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0]; - dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]; + FloatingPoint const det = determinant( srcMatrix ); + FloatingPoint const invDet = FloatingPoint( 1 ) / det; - scale< 4, 4 >( dstMatrix, invDet ); + dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]* + srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1]; + dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]* + srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1]; + dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]* + srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1]; + dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]* + srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]; + + dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]* + srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0]; + dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]* + srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0]; + dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]* + srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0]; + dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]* + srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]; + + dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]* + srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]* + srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]* + srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0]; + dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]* + srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]; + + dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]* + srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]* + srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0]; + dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]* + srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0]; + dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]* + srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]; + + scale< 4, 4 >( dstMatrix, invDet ); - return det; - } + return det; + } /** * @brief Invert the matrix @p srcMatrix overwritting it.