From d52e57307cd728c59b913fe9b515d7ea83d78d95 Mon Sep 17 00:00:00 2001
From: Stefan Povolny <povolny1@llnl.gov>
Date: Tue, 24 Jan 2023 10:55:14 -0800
Subject: [PATCH 01/42] initial commit, added an algorithm to
 fixedSizeSquareMatrixOps that calculates the polar decomposition of a 3x3
 square matrix

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 44 ++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index f006d866..06a591d1 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -1253,6 +1253,50 @@ struct SquareMatrixOps< 3 >
 
     return row;
   }
+
+  /**
+   * @brief Determine the polar decomposition using the Higham algorithm such that @p R . U = V . @p R = @p matrix
+   * @tparam DST_MATRIX The type of @p R.
+   * @tparam MATRIX The type of @p matrix.
+   * @param R The resultant rotation matrix, of size ISIZE x ISIZE.
+   * @param matrix The matrix to be decomposed, of size ISIZE x ISIZE.
+   */
+  template< typename DST_MATRIX, typename MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
+                                  MATRIX const & LVARRAY_RESTRICT_REF matrix )
+  {
+    checkSizes< 3, 3 >( R );
+    checkSizes< 3, 3 >( matrix );
+
+    using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >;
+
+    // Initialize
+    copy< 3, 3>( R, matrix );
+    realType RInverse[3][3], RInverseTranspose[3][3], RRTMinusI[3][3];
+
+    // Higham Algorithm
+    realType error = 1.0;
+    int iter = 0;
+    while( error > 1.0e-16 )
+    {
+      iter++;
+      invert( RInverse, R );
+      transpose< 3, 3 >( RInverseTranspose, RInverse );
+      add< 3, 3 >( R, RInverseTranspose );
+      scale< 3, 3 >( R, 0.5 );
+      Rij_eq_AikBjk< 3, 3 >( RRTMinusI, R, R );
+      addIdentity< 3, 3 >( RRTMinusI, -1.0 );
+      for( std::ptrdiff_t i = 0 ; i < 3 ; i++ )
+      {
+        for( std::ptrdiff_t j = 0 ; j < 3 ; j++ )
+        {
+          error += RRTMinusI[i][j] * RRTMinusI[i][j];
+        }
+      }
+      LVARRAY_ERROR_IF( iter >= 100, "Polar decomposition failed to converge!");
+    }
+  }
 };
 
 } // namespace internal

From 6d4875ec34e6c65e162a590c42c7ec9f4d7931d4 Mon Sep 17 00:00:00 2001
From: Stefan Povolny <povolny1@llnl.gov>
Date: Wed, 25 Jan 2023 08:38:03 -0800
Subject: [PATCH 02/42] implementation of polar decomposition complete, still
 need to test

---
 src/fixedSizeSquareMatrixOps.hpp     |  18 ++++
 src/fixedSizeSquareMatrixOpsImpl.hpp | 144 +++++++++++++++++++--------
 2 files changed, 118 insertions(+), 44 deletions(-)

diff --git a/src/fixedSizeSquareMatrixOps.hpp b/src/fixedSizeSquareMatrixOps.hpp
index c61e4a9e..9fb82ec3 100644
--- a/src/fixedSizeSquareMatrixOps.hpp
+++ b/src/fixedSizeSquareMatrixOps.hpp
@@ -354,6 +354,24 @@ void symmetricToDense( DST_MATRIX && dstMatrix, SRC_SYM_MATRIX const & srcSymMat
                                                            srcSymMatrix );
 }
 
+/**
+ * @brief Determine the polar decomposition of the matrix @p srcMatrix
+ * @tparam M The size of @p R and @p srcMatrix.
+ * @tparam DST_MATRIX The type of @p R.
+ * @tparam MATRIX The type of @p srcMatrix.
+ * @param R The resultant rotation matrix.
+ * @param matrix The matrix to be decomposed.
+ * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p srcMatrix.
+ *   This is done using Higham's iterative algorithm.
+ */
+template< std::ptrdiff_t M, typename DST_MATRIX, typename MATRIX >
+LVARRAY_HOST_DEVICE constexpr inline
+void polarDecomposition( DST_MATRIX && R, MATRIX const & srcMatrix )
+{
+  return internal::SquareMatrixOps< M >::polarDecomposition( std::forward< DST_MATRIX >( R ),
+                                                             srcMatrix );
+}
+
 ///@}
 
 } // namespace tensorOps
diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 06a591d1..3733fae4 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -529,6 +529,56 @@ struct SquareMatrixOps< 2 >
     dstMatrix[ 1 ][ 0 ] = srcSymMatrix[ 2 ];
   }
 
+  /**
+   * @brief Determine the polar decomposition of the 2x2 matrix @p matrix
+   * @tparam DST_MATRIX The type of @p R.
+   * @tparam MATRIX The type of @p matrix.
+   * @param R The resultant rotation matrix.
+   * @param matrix The matrix to be decomposed.
+   * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix.
+   *   This is done using Higham's iterative algorithm.
+   */
+  template< typename DST_MATRIX, typename MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
+                                  MATRIX const & LVARRAY_RESTRICT_REF matrix )
+  {
+    checkSizes< 2, 2 >( R );
+    checkSizes< 2, 2 >( matrix );
+
+    using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >;
+
+    // Initialize
+    copy< 2, 2 >( R, matrix );
+    FloatingPoint RInverse[2][2] = { {0} },
+                  RInverseTranspose[2][2] = { {0} },
+                  RRTMinusI[2][2] = { {0} };
+
+    // Higham Algorithm
+    FloatingPoint error = 1.0;
+    int iter = 0;
+    while( error > 1.0e-16 && iter < 100 )
+    {
+      iter++;
+      error = 0.0;
+
+      invert( RInverse, R );
+      transpose< 2, 2 >( RInverseTranspose, RInverse );
+      add< 2, 2 >( R, RInverseTranspose );
+      scale< 2, 2 >( R, 0.5 );
+      Rij_eq_AikBjk< 2, 2, 2 >( RRTMinusI, R, R );
+      addIdentity< 2 >( RRTMinusI, -1.0 );
+
+      for( std::ptrdiff_t i = 0 ; i < 2 ; i++ )
+      {
+        for( std::ptrdiff_t j = 0 ; j < 2 ; j++ )
+        {
+          error += RRTMinusI[i][j] * RRTMinusI[i][j];
+        }
+      }
+    }
+  }
+
 private:
 
   /**
@@ -1162,6 +1212,56 @@ struct SquareMatrixOps< 3 >
     dstMatrix[ 2 ][ 1 ] = srcSymMatrix[ 3 ];
   }
 
+  /**
+   * @brief Determine the polar decomposition of the 3x3 matrix @p matrix
+   * @tparam DST_MATRIX The type of @p R.
+   * @tparam MATRIX The type of @p matrix.
+   * @param R The resultant rotation matrix.
+   * @param matrix The matrix to be decomposed.
+   * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix.
+   *   This is done using Higham's iterative algorithm.
+   */
+  template< typename DST_MATRIX, typename MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
+                                  MATRIX const & LVARRAY_RESTRICT_REF matrix )
+  {
+    checkSizes< 3, 3 >( R );
+    checkSizes< 3, 3 >( matrix );
+
+    using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >;
+
+    // Initialize
+    copy< 3, 3 >( R, matrix );
+    FloatingPoint RInverse[3][3] = { {0} },
+                  RInverseTranspose[3][3] = { {0} },
+                  RRTMinusI[3][3] = { {0} };
+
+    // Higham Algorithm
+    FloatingPoint error = 1.0;
+    int iter = 0;
+    while( error > 1.0e-16 && iter < 100 )
+    {
+      iter++;
+      error = 0.0;
+
+      invert( RInverse, R );
+      transpose< 3, 3 >( RInverseTranspose, RInverse );
+      add< 3, 3 >( R, RInverseTranspose );
+      scale< 3, 3 >( R, 0.5 );
+      Rij_eq_AikBjk< 3, 3, 3 >( RRTMinusI, R, R );
+      addIdentity< 3 >( RRTMinusI, -1.0 );
+
+      for( std::ptrdiff_t i = 0 ; i < 3 ; i++ )
+      {
+        for( std::ptrdiff_t j = 0 ; j < 3 ; j++ )
+        {
+          error += RRTMinusI[i][j] * RRTMinusI[i][j];
+        }
+      }
+    }
+  }
+
 private:
   /**
    * @brief Compute the eigenvalues of the 3x3 symmetric matrix @p matrix.
@@ -1253,50 +1353,6 @@ struct SquareMatrixOps< 3 >
 
     return row;
   }
-
-  /**
-   * @brief Determine the polar decomposition using the Higham algorithm such that @p R . U = V . @p R = @p matrix
-   * @tparam DST_MATRIX The type of @p R.
-   * @tparam MATRIX The type of @p matrix.
-   * @param R The resultant rotation matrix, of size ISIZE x ISIZE.
-   * @param matrix The matrix to be decomposed, of size ISIZE x ISIZE.
-   */
-  template< typename DST_MATRIX, typename MATRIX >
-  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
-  static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
-                                  MATRIX const & LVARRAY_RESTRICT_REF matrix )
-  {
-    checkSizes< 3, 3 >( R );
-    checkSizes< 3, 3 >( matrix );
-
-    using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >;
-
-    // Initialize
-    copy< 3, 3>( R, matrix );
-    realType RInverse[3][3], RInverseTranspose[3][3], RRTMinusI[3][3];
-
-    // Higham Algorithm
-    realType error = 1.0;
-    int iter = 0;
-    while( error > 1.0e-16 )
-    {
-      iter++;
-      invert( RInverse, R );
-      transpose< 3, 3 >( RInverseTranspose, RInverse );
-      add< 3, 3 >( R, RInverseTranspose );
-      scale< 3, 3 >( R, 0.5 );
-      Rij_eq_AikBjk< 3, 3 >( RRTMinusI, R, R );
-      addIdentity< 3, 3 >( RRTMinusI, -1.0 );
-      for( std::ptrdiff_t i = 0 ; i < 3 ; i++ )
-      {
-        for( std::ptrdiff_t j = 0 ; j < 3 ; j++ )
-        {
-          error += RRTMinusI[i][j] * RRTMinusI[i][j];
-        }
-      }
-      LVARRAY_ERROR_IF( iter >= 100, "Polar decomposition failed to converge!");
-    }
-  }
 };
 
 } // namespace internal

From 02e0e1c050e4cf97893e20f86bf6f4cdfa0034dd Mon Sep 17 00:00:00 2001
From: Benjamin Curtice Corbett <corbett5@llnl.gov>
Date: Thu, 26 Jan 2023 10:58:05 -0800
Subject: [PATCH 03/42] Fixed cmake so that GEOSX TPLs work and also updated
 spack.

---
 cmake/SetupTPL.cmake                          |  51 +++----
 host-configs/LLNL/lassen-base.cmake           |   3 -
 host-configs/LLNL/quartz-base.cmake           |   3 -
 scripts/uberenv/packages/lvarray/package.py   |  26 ++--
 scripts/uberenv/project.json                  |   4 +-
 .../blueos_3_ppc64le_ib_p9/compilers.yaml     | 132 +++++++++++++++---
 .../toss_3_x86_64_ib/compilers.yaml           |  83 ++++++++++-
 .../toss_3_x86_64_ib_python/packages.yaml     |   7 +
 8 files changed, 240 insertions(+), 69 deletions(-)

diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake
index bff94834..29a60128 100644
--- a/cmake/SetupTPL.cmake
+++ b/cmake/SetupTPL.cmake
@@ -1,19 +1,22 @@
 set(thirdPartyLibs "")
 
-################################
+###############################
 # CAMP
-################################
-if(NOT EXISTS ${CAMP_DIR})
-    message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.")
-endif()
+###############################
+if(CAMP_DIR STREQUAL RAJA_DIR)
+    message(STATUS "LvArray using CAMP from RAJA.")
+else()
+    if(NOT EXISTS ${CAMP_DIR})
+        message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.")
+    endif()
 
-message(STATUS "Using CAMP from ${CAMP_DIR}")
+    message(STATUS "LvArray using CAMP from ${CAMP_DIR}")
 
-find_package(camp REQUIRED PATHS ${CAMP_DIR})
+    find_package(camp REQUIRED PATHS ${CAMP_DIR})
 
-set(ENABLE_CAMP ON CACHE BOOL "")
+    set(thirdPartyLibs ${thirdPartyLibs} camp)
+endif()
 
-set(thirdPartyLibs ${thirdPartyLibs} camp)
 
 ################################
 # RAJA
@@ -22,7 +25,7 @@ if(NOT EXISTS ${RAJA_DIR})
     message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.")
 endif()
 
-message(STATUS "Using RAJA from ${RAJA_DIR}")
+message(STATUS "LvArray using RAJA from ${RAJA_DIR}")
 
 find_package(RAJA REQUIRED PATHS ${RAJA_DIR})
 
@@ -39,14 +42,14 @@ if(ENABLE_UMPIRE)
         message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.")
     endif()
 
-    message(STATUS "Using Umpire from ${UMPIRE_DIR}")
+    message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}")
 
     find_package(umpire REQUIRED
                  PATHS ${UMPIRE_DIR})
     
     set(thirdPartyLibs ${thirdPartyLibs} umpire)
 else()
-    message(STATUS "Not using Umpire.")
+    message(STATUS "LvArray not using Umpire.")
 endif()
 
 ################################
@@ -65,32 +68,32 @@ if(ENABLE_CHAI)
         message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.")
     endif()
 
-    message(STATUS "Using CHAI from ${CHAI_DIR}")
+    message(STATUS "LvArray using CHAI from ${CHAI_DIR}")
 
     find_package(chai REQUIRED
                  PATHS ${CHAI_DIR})
-
-    # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that.
-    get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES)
-    list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA)
-    set_target_properties(chai
-                          PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}")
+    
+    # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that.
+    # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES)
+    # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA)
+    # set_target_properties(chai
+    #                       PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}")
 
     set(thirdPartyLibs ${thirdPartyLibs} chai)
 else()
-    message(STATUS "Not using CHAI.")
+    message(STATUS "LvArray not using CHAI.")
 endif()
 
 
-################################
+###############################
 # CALIPER
-################################
+###############################
 if(ENABLE_CALIPER)
     if(NOT EXISTS ${CALIPER_DIR})
         message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.")
     endif()
 
-    message(STATUS "Using caliper from ${CALIPER_DIR}")
+    message(STATUS "LvArray using caliper from ${CALIPER_DIR}")
 
     find_package(caliper REQUIRED
                  PATHS ${CALIPER_DIR})
@@ -102,7 +105,7 @@ if(ENABLE_CALIPER)
 
     set(thirdPartyLibs ${thirdPartyLibs} caliper)
 else()
-    message(STATUS "Not using caliper.")
+    message(STATUS "LvArray not using caliper.")
 endif()
 
 ################################
diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake
index d544d936..cb33c4a0 100644
--- a/host-configs/LLNL/lassen-base.cmake
+++ b/host-configs/LLNL/lassen-base.cmake
@@ -11,9 +11,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "")
 set(ENABLE_CHAI ON CACHE BOOL "")
 set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "")
 
-set(ENABLE_CALIPER ON CACHE BOOL "")
-set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "")
-
 set(ENABLE_ADDR2LINE ON CACHE BOOL "")
 
 # Cuda options
diff --git a/host-configs/LLNL/quartz-base.cmake b/host-configs/LLNL/quartz-base.cmake
index 397324e9..e203ffe6 100644
--- a/host-configs/LLNL/quartz-base.cmake
+++ b/host-configs/LLNL/quartz-base.cmake
@@ -12,9 +12,6 @@ set(UMPIRE_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "")
 set(ENABLE_CHAI ON CACHE BOOL "")
 set(CHAI_DIR ${GEOSX_TPL_DIR}/chai CACHE PATH "")
 
-set(ENABLE_CALIPER ON CACHE BOOL "")
-set(CALIPER_DIR ${GEOSX_TPL_DIR}/caliper CACHE PATH "")
-
 # set(ENABLE_PYLVARRAY ON CACHE BOOL "")
 # set(PYTHON_DIR /usr/tce/packages/python/python-3.7.2 CACHE PATH "")
 
diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py
index 9c4b47d9..cf9d5548 100644
--- a/scripts/uberenv/packages/lvarray/package.py
+++ b/scripts/uberenv/packages/lvarray/package.py
@@ -56,32 +56,36 @@ class Lvarray(CMakePackage, CudaPackage):
     variant('docs', default=False, description='Build docs')
     variant('addr2line', default=True,
             description='Build support for addr2line.')
-
+        
     depends_on('blt', when='@0.2.0:', type='build')
 
     depends_on('camp')
-    depends_on('camp+cuda', when='+cuda')
 
     depends_on('raja')
-    depends_on('raja+cuda', when='+cuda')
 
-    # At the moment Umpire doesn't support shared when building with CUDA.
     depends_on('umpire', when='+umpire')
-    depends_on('umpire+cuda~shared', when='+umpire+cuda')
 
     depends_on('chai+raja', when='+chai')
-    depends_on('chai+raja+cuda', when='+chai+cuda')
 
     depends_on('caliper', when='+caliper')
 
     depends_on('python +shared +pic', when='+pylvarray')
-    depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray')
-    depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray')
+    depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray')
+    depends_on('py-scipy@1.5.2:', when='+pylvarray')
     depends_on('py-pip', when='+pylvarray')
 
     depends_on('doxygen@1.8.13:', when='+docs', type='build')
     depends_on('py-sphinx@1.6.3:', when='+docs', type='build')
 
+    with when('+cuda'):
+        for sm_ in CudaPackage.cuda_arch_values:
+            depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
+            depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
+            depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
+            depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
+            depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
+    
+
     phases = ['hostconfig', 'cmake', 'build', 'install']
 
     @run_after('build')
@@ -285,10 +289,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write("#{0}\n\n".format("-" * 80))
 
         if "+caliper" in spec:
-            cfg.write("#{0}\n".format("-" * 80))
-            cfg.write("# Caliper\n")
-            cfg.write("#{0}\n\n".format("-" * 80))
-
             cfg.write(cmake_cache_option("ENABLE_CALIPER", True))
             cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix))
         else:
@@ -297,6 +297,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write('#{0}\n'.format('-' * 80))
         cfg.write('# Python\n')
         cfg.write('#{0}\n\n'.format('-' * 80))
+
         if '+pylvarray' in spec:
             cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True))
             cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3')))
@@ -306,6 +307,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write("#{0}\n".format("-" * 80))
         cfg.write("# Documentation\n")
         cfg.write("#{0}\n\n".format("-" * 80))
+
         if "+docs" in spec:
             cfg.write(cmake_cache_option("ENABLE_DOCS", True))
             sphinx_dir = spec['py-sphinx'].prefix
diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json
index 9822f975..703db7a4 100644
--- a/scripts/uberenv/project.json
+++ b/scripts/uberenv/project.json
@@ -3,8 +3,8 @@
   "package_version" : "develop",
   "package_final_phase" : "hostconfig",
   "package_source_dir" : "../..",
-  "spack_url": "https://github.com/corbett5/spack",
-  "spack_branch": "package/corbett/lvarray-update",
+  "spack_url": "https://github.com/spack/spack",
+  "spack_branch": "v0.19.0",
   "spack_activate" : {},
   "spack_clean_packages": ["lvarray"],
   "build_jobs": 100
diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
index b1bf26cb..b8353dd0 100644
--- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
+++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
@@ -2,30 +2,90 @@ compilers:
 - compiler:
     spec: clang@10.0.1
     paths:
-      cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++
+      cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -mcpu=native -mtune=native
-      cxxflags: -mcpu=native -mtune=native
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
     operating_system: rhel7
-    target: ppc64le
+    target: x86_64
     modules: []
     environment: {}
     extra_rpaths: []
 - compiler:
     spec: clang@11.0.1
     paths:
-      cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++
+      cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -mcpu=native -mtune=native
-      cxxflags: -mcpu=native -mtune=native
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
     operating_system: rhel7
-    target: ppc64le
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@12.0.1
+    paths:
+      cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@13.0.1
+    paths:
+      cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@14.0.4
+    paths:
+      cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang
+      cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: gcc@7.3.0
+    paths:
+      cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc
+      cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++
+      f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
+    operating_system: rhel7
+    target: x86_64
     modules: []
     environment: {}
     extra_rpaths: []
@@ -37,25 +97,55 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -mcpu=native -mtune=native
-      cxxflags: -mcpu=native -mtune=native
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: gcc@9.3.1
+    paths:
+      cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc
+      cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++
+      f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: gcc@10.2.1
+    paths:
+      cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc
+      cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++
+      f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
     operating_system: rhel7
-    target: ppc64le
+    target: x86_64
     modules: []
     environment: {}
     extra_rpaths: []
 - compiler:
-    spec: xl@16.1.1
+    spec: intel@19.1.2
     paths:
-      cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc
-      cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC
-      f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf
-      fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf
+      cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc
+      cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc
+      f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort
+      fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort
     flags:
-      cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036
-      cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036
+      cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native
+      cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native
     operating_system: rhel7
-    target: ppc64le
+    target: x86_64
     modules: []
     environment: {}
     extra_rpaths: []
diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
index 3d9648a7..b8353dd0 100644
--- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
+++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
@@ -7,8 +7,8 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
     operating_system: rhel7
     target: x86_64
     modules: []
@@ -22,8 +22,53 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@12.0.1
+    paths:
+      cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@13.0.1
+    paths:
+      cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: clang@14.0.4
+    paths:
+      cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang
+      cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++
+      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
     operating_system: rhel7
     target: x86_64
     modules: []
@@ -59,6 +104,36 @@ compilers:
     modules: []
     environment: {}
     extra_rpaths: []
+- compiler:
+    spec: gcc@9.3.1
+    paths:
+      cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc
+      cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++
+      f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
+- compiler:
+    spec: gcc@10.2.1
+    paths:
+      cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc
+      cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++
+      f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
+      fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
+    flags:
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
+    operating_system: rhel7
+    target: x86_64
+    modules: []
+    environment: {}
+    extra_rpaths: []
 - compiler:
     spec: intel@19.1.2
     paths:
diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
index 0c6b833b..a6fbda09 100644
--- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
+++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
@@ -107,3 +107,10 @@ packages:
     externals:
     - spec: pkg-config@0.27.1
       prefix: /usr/bin/
+  
+  ninja:
+    buildable: False
+    externals:
+    - spec: ninja@kitware
+      prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/
+

From 943b300e8b9f8620ccb4ada3a9f914c2dae24897 Mon Sep 17 00:00:00 2001
From: Stefan Povolny <povolny1@llnl.gov>
Date: Thu, 9 Mar 2023 12:21:35 -0800
Subject: [PATCH 04/42] implemented unit test for polar decomposition

---
 src/fixedSizeSquareMatrixOpsImpl.hpp          | 134 ++++++++----------
 unitTests/CMakeLists.txt                      |   1 +
 unitTests/testTensorOpsInverse.hpp            |  93 ++++++++++++
 unitTests/testTensorOpsPolarDecomposition.cpp |  29 ++++
 4 files changed, 184 insertions(+), 73 deletions(-)
 create mode 100644 unitTests/testTensorOpsPolarDecomposition.cpp

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 3733fae4..d135cf7a 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "genericTensorOps.hpp"
+#include "limits.hpp"
 
 namespace LvArray
 {
@@ -71,6 +72,60 @@ template< std::ptrdiff_t M >
 struct SquareMatrixOps
 {};
 
+/**
+ * @brief Determine the polar decomposition of @p matrix
+ * @tparam DST_MATRIX The type of @p R.
+ * @tparam MATRIX The type of @p matrix.
+ * @param R The resultant orthogonal matrix.
+ * @param matrix The matrix to be decomposed.
+ * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix.
+ *   This is done using Higham's iterative algorithm.
+ */
+template< std::ptrdiff_t M, typename DST_MATRIX, typename MATRIX >
+LVARRAY_HOST_DEVICE inline
+static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
+                                    MATRIX const & LVARRAY_RESTRICT_REF matrix )
+{
+  checkSizes< M, M >( R );
+  checkSizes< M, M >( matrix );
+
+  using FloatingPoint = std::decay_t< decltype( R[0][0] ) >;
+
+  // Initialize
+  copy< M, M >( R, matrix );
+  FloatingPoint RInverse[M][M] = { {0} },
+                RInverseTranspose[M][M] = { {0} },
+                RRTMinusI[M][M] = { {0} };
+
+  // Higham Algorithm
+  FloatingPoint errorSquared = 1.0;
+  FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon;
+  int iter = 0;
+  while( errorSquared > tolerance * tolerance && iter < 100 )
+  {
+    iter++;
+    errorSquared = 0.0;
+
+    // Average the current R with its inverse tranpose
+    SquareMatrixOps< M >::invert( RInverse, R );
+    transpose< M, M >( RInverseTranspose, RInverse );
+    add< M, M >( R, RInverseTranspose );
+    scale< M, M >( R, 0.5 );
+
+    // Determine how close R is to being orthogonal using L2Norm(R.R^T-I)
+    Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, R );
+    addIdentity< M >( RRTMinusI, -1.0 );
+    for( std::ptrdiff_t i = 0 ; i < M ; i++ )
+    {
+      for( std::ptrdiff_t j = 0 ; j < M ; j++ )
+      {
+        errorSquared += RRTMinusI[i][j] * RRTMinusI[i][j];
+      }
+    }
+  }
+  LVARRAY_WARNING_IF( iter == 100, "Polar decomposition did not converge in 100 iterations!");
+}
+
 /**
  * @struct SquareMatrixOps< 2 >
  * @brief Performs operations on 2x2 square matrices.
@@ -533,9 +588,9 @@ struct SquareMatrixOps< 2 >
    * @brief Determine the polar decomposition of the 2x2 matrix @p matrix
    * @tparam DST_MATRIX The type of @p R.
    * @tparam MATRIX The type of @p matrix.
-   * @param R The resultant rotation matrix.
+   * @param R The resultant orthogonal matrix.
    * @param matrix The matrix to be decomposed.
-   * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix.
+   * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix.
    *   This is done using Higham's iterative algorithm.
    */
   template< typename DST_MATRIX, typename MATRIX >
@@ -543,44 +598,10 @@ struct SquareMatrixOps< 2 >
   static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
                                   MATRIX const & LVARRAY_RESTRICT_REF matrix )
   {
-    checkSizes< 2, 2 >( R );
-    checkSizes< 2, 2 >( matrix );
-
-    using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >;
-
-    // Initialize
-    copy< 2, 2 >( R, matrix );
-    FloatingPoint RInverse[2][2] = { {0} },
-                  RInverseTranspose[2][2] = { {0} },
-                  RRTMinusI[2][2] = { {0} };
-
-    // Higham Algorithm
-    FloatingPoint error = 1.0;
-    int iter = 0;
-    while( error > 1.0e-16 && iter < 100 )
-    {
-      iter++;
-      error = 0.0;
-
-      invert( RInverse, R );
-      transpose< 2, 2 >( RInverseTranspose, RInverse );
-      add< 2, 2 >( R, RInverseTranspose );
-      scale< 2, 2 >( R, 0.5 );
-      Rij_eq_AikBjk< 2, 2, 2 >( RRTMinusI, R, R );
-      addIdentity< 2 >( RRTMinusI, -1.0 );
-
-      for( std::ptrdiff_t i = 0 ; i < 2 ; i++ )
-      {
-        for( std::ptrdiff_t j = 0 ; j < 2 ; j++ )
-        {
-          error += RRTMinusI[i][j] * RRTMinusI[i][j];
-        }
-      }
-    }
+    polarDecompositionBase< 2 >( R, matrix );
   }
 
 private:
-
   /**
    * @brief Compute the eigenvalues of the 2x2 symmetric matrix @p matrix.
    * @tparam FloatingPoint A floating point type.
@@ -1216,9 +1237,9 @@ struct SquareMatrixOps< 3 >
    * @brief Determine the polar decomposition of the 3x3 matrix @p matrix
    * @tparam DST_MATRIX The type of @p R.
    * @tparam MATRIX The type of @p matrix.
-   * @param R The resultant rotation matrix.
+   * @param R The resultant orthogonal matrix.
    * @param matrix The matrix to be decomposed.
-   * @details The polar decomposition returns a rotation matrix such that @p R . U = V . @p R = @p matrix.
+   * @details The polar decomposition returns an orthogonal matrix such that @p R . U = V . @p R = @p matrix.
    *   This is done using Higham's iterative algorithm.
    */
   template< typename DST_MATRIX, typename MATRIX >
@@ -1226,40 +1247,7 @@ struct SquareMatrixOps< 3 >
   static void polarDecomposition( DST_MATRIX && LVARRAY_RESTRICT_REF R,
                                   MATRIX const & LVARRAY_RESTRICT_REF matrix )
   {
-    checkSizes< 3, 3 >( R );
-    checkSizes< 3, 3 >( matrix );
-
-    using FloatingPoint = std::decay_t< decltype( matrix[ 0 ][ 0 ] ) >;
-
-    // Initialize
-    copy< 3, 3 >( R, matrix );
-    FloatingPoint RInverse[3][3] = { {0} },
-                  RInverseTranspose[3][3] = { {0} },
-                  RRTMinusI[3][3] = { {0} };
-
-    // Higham Algorithm
-    FloatingPoint error = 1.0;
-    int iter = 0;
-    while( error > 1.0e-16 && iter < 100 )
-    {
-      iter++;
-      error = 0.0;
-
-      invert( RInverse, R );
-      transpose< 3, 3 >( RInverseTranspose, RInverse );
-      add< 3, 3 >( R, RInverseTranspose );
-      scale< 3, 3 >( R, 0.5 );
-      Rij_eq_AikBjk< 3, 3, 3 >( RRTMinusI, R, R );
-      addIdentity< 3 >( RRTMinusI, -1.0 );
-
-      for( std::ptrdiff_t i = 0 ; i < 3 ; i++ )
-      {
-        for( std::ptrdiff_t j = 0 ; j < 3 ; j++ )
-        {
-          error += RRTMinusI[i][j] * RRTMinusI[i][j];
-        }
-      }
-    }
+    polarDecompositionBase< 3 >( R, matrix );
   }
 
 private:
diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt
index 4d91681e..cda1b58b 100644
--- a/unitTests/CMakeLists.txt
+++ b/unitTests/CMakeLists.txt
@@ -70,6 +70,7 @@ set( testSources
      testTensorOpsEigen.cpp
      testTensorOpsInverseOneArg.cpp
      testTensorOpsInverseTwoArgs.cpp
+     testTensorOpsPolarDecomposition.cpp
      testTensorOpsSymDeterminant.cpp
      testTensorOpsSymInverseOneArg.cpp
      testTensorOpsSymInverseTwoArgs.cpp
diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp
index 4909a686..c4665a01 100644
--- a/unitTests/testTensorOpsInverse.hpp
+++ b/unitTests/testTensorOpsInverse.hpp
@@ -244,6 +244,52 @@ class InverseTest : public ::testing::Test
         } );
   }
 
+  void polarDecomposition()
+  {
+    ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView();
+    ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView();
+
+    ArrayViewT< FLOAT, 3, 2 > const dstMatrix_IJK = m_dstMatrix_IJK.toView();
+    ArrayViewT< FLOAT, 3, 1 > const dstMatrix_IKJ = m_dstMatrix_IKJ.toView();
+    ArrayViewT< FLOAT, 3, 0 > const dstMatrix_KJI = m_dstMatrix_KJI.toView();
+
+    ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M );
+
+    T scale = 100;
+    for( T & value : arrayOfMatrices )
+    {
+      value = randomValue( scale, m_gen );
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst();
+
+    forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i )
+    {
+      T srcLocal[ M ][ M ];
+      FLOAT dstLocal[ M ][ M ];
+
+      #define _TEST( output, input ) \
+        tensorOps::copy< M, M >( input, matrices[ i ] ); \
+        tensorOps::polarDecomposition< M >( output, input ); \
+        checkPolarDecomposition( output, input )
+
+      #define _TEST_PERMS( input, output0, output1, output2, output3 ) \
+        _TEST( output0, input ); \
+        _TEST( output1, input ); \
+        _TEST( output2, input ); \
+        _TEST( output3, input )
+
+      _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+      _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+      _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+      _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
 
 private:
 
@@ -347,6 +393,53 @@ class InverseTest : public ::testing::Test
     }
   }
 
+  template< typename MATRIX_A, typename MATRIX_B >
+  static void LVARRAY_HOST_DEVICE checkPolarDecomposition( MATRIX_A const & orthogonalMatrix, MATRIX_B const & sourceMatrix )
+  {
+    // Error tolerance (depends on the implementation!)
+    using FloatingPoint = std::decay_t< decltype( orthogonalMatrix[0][0] ) >;
+    FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon;
+    FloatingPoint sqrtTolerance = math::sqrt( tolerance );
+    
+    if( M == 2 )
+    {
+      // Check if orthogonalMatrix is actually orthogonal
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][0] + orthogonalMatrix[0][1] * orthogonalMatrix[0][1], 1.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][1] + orthogonalMatrix[1][0] * orthogonalMatrix[1][1], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[1][0] + orthogonalMatrix[1][1] * orthogonalMatrix[1][1], 1.0, tolerance );
+
+      // Check if sourceMatrix.(orthogonalMatrix^T) yields a symmetric matrix
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * sourceMatrix[0][0] + orthogonalMatrix[1][1] * sourceMatrix[0][1],
+                            orthogonalMatrix[0][0] * sourceMatrix[1][0] + orthogonalMatrix[0][1] * sourceMatrix[1][1],
+                            sqrtTolerance );
+    }
+    else if( M == 3 )
+    {
+      // Check if orthogonalMatrix is actually orthogonal
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[0][0] + orthogonalMatrix[0][1] * orthogonalMatrix[0][1] + orthogonalMatrix[0][2] * orthogonalMatrix[0][2], 1.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1] + orthogonalMatrix[0][2] * orthogonalMatrix[1][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[2][0] + orthogonalMatrix[0][1] * orthogonalMatrix[2][1] + orthogonalMatrix[0][2] * orthogonalMatrix[2][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[1][0] + orthogonalMatrix[0][1] * orthogonalMatrix[1][1] + orthogonalMatrix[0][2] * orthogonalMatrix[1][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[1][0] + orthogonalMatrix[1][1] * orthogonalMatrix[1][1] + orthogonalMatrix[1][2] * orthogonalMatrix[1][2], 1.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[2][0] + orthogonalMatrix[1][1] * orthogonalMatrix[2][1] + orthogonalMatrix[1][2] * orthogonalMatrix[2][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[0][0] * orthogonalMatrix[2][0] + orthogonalMatrix[0][1] * orthogonalMatrix[2][1] + orthogonalMatrix[0][2] * orthogonalMatrix[2][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * orthogonalMatrix[2][0] + orthogonalMatrix[1][1] * orthogonalMatrix[2][1] + orthogonalMatrix[1][2] * orthogonalMatrix[2][2], 0.0, tolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * orthogonalMatrix[2][0] + orthogonalMatrix[2][1] * orthogonalMatrix[2][1] + orthogonalMatrix[2][2] * orthogonalMatrix[2][2], 1.0, tolerance );
+
+      // Check if sourceMatrix.(orthogonalMatrix^T) yields a symmetric matrix
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[1][0] * sourceMatrix[0][0] + orthogonalMatrix[1][1] * sourceMatrix[0][1] + orthogonalMatrix[1][2] * sourceMatrix[0][2],
+                            orthogonalMatrix[0][0] * sourceMatrix[1][0] + orthogonalMatrix[0][1] * sourceMatrix[1][1] + orthogonalMatrix[0][2] * sourceMatrix[1][2],
+                            sqrtTolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * sourceMatrix[0][0] + orthogonalMatrix[2][1] * sourceMatrix[0][1] + orthogonalMatrix[2][2] * sourceMatrix[0][2],
+                            orthogonalMatrix[0][0] * sourceMatrix[2][0] + orthogonalMatrix[0][1] * sourceMatrix[2][1] + orthogonalMatrix[0][2] * sourceMatrix[2][2],
+                            sqrtTolerance );
+      PORTABLE_EXPECT_NEAR( orthogonalMatrix[2][0] * sourceMatrix[1][0] + orthogonalMatrix[2][1] * sourceMatrix[1][1] + orthogonalMatrix[2][2] * sourceMatrix[1][2],
+                            orthogonalMatrix[1][0] * sourceMatrix[2][0] + orthogonalMatrix[1][1] * sourceMatrix[2][1] + orthogonalMatrix[1][2] * sourceMatrix[2][2],
+                            sqrtTolerance );
+    }
+  }
+
 
   std::mt19937_64 m_gen;
 
diff --git a/unitTests/testTensorOpsPolarDecomposition.cpp b/unitTests/testTensorOpsPolarDecomposition.cpp
new file mode 100644
index 00000000..29c48dc0
--- /dev/null
+++ b/unitTests/testTensorOpsPolarDecomposition.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+#include "testTensorOpsInverse.hpp"
+
+namespace LvArray
+{
+namespace testing
+{
+
+TYPED_TEST( InverseTest, polarDecomposition )
+{
+  this->polarDecomposition();
+}
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+  return result;
+}

From ddcd7b418a58c72307d95fbbb5613b19f42aaedc Mon Sep 17 00:00:00 2001
From: Stefan Povolny <povolny1@llnl.gov>
Date: Thu, 4 May 2023 14:15:37 -0700
Subject: [PATCH 05/42] made error message in polar decomposition
 GPU-compatible

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index d135cf7a..d4c3f1b7 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -123,7 +123,10 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
       }
     }
   }
-  LVARRAY_WARNING_IF( iter == 100, "Polar decomposition did not converge in 100 iterations!");
+  if( iter == 100 )
+  {
+    printf("Polar decomposition did not converge in 100 iterations!");
+  }
 }
 
 /**

From 1339ea72414dbbcc0efdc3e8e5ecc5aabd13206d Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@lassen708.coral.llnl.gov>
Date: Wed, 30 Oct 2024 11:51:14 -0700
Subject: [PATCH 06/42] Added device math functions for ceil, floor, and power

---
 src/math.hpp | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/src/math.hpp b/src/math.hpp
index d45f68f3..d4cd408d 100644
--- a/src/math.hpp
+++ b/src/math.hpp
@@ -440,6 +440,113 @@ __half2 abs( __half2 const x )
 
 #endif
 
+
+/**
+ * @return The ceiling value of @p x.
+ * @param x The number to get the ceiling value of.
+ * @note This set of overloads is valid for any numeric type.
+ */
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE
+float ceil( float const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::ceilf( x );
+#else
+  return std::ceil( x );
+#endif
+}
+
+template< typename T >
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr
+double ceil( T const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::ceil( double ( x ) );
+#else
+  return std::ceil( x );
+#endif
+}
+
+#if defined( LVARRAY_USE_DEVICE )
+
+/// @copydoc ceil( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half ceil( __half const x )
+{
+#if CUDART_VERSION > 11000
+  return hceil( x );
+#else
+  return x > __half( 0 ) ? x : -x;
+#endif
+}
+
+/// @copydoc ceil( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half2 ceil( __half2 const x )
+{
+#if CUDART_VERSION > 11000
+  return h2ceil( x );
+#else
+  return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to mimic
+#endif
+}
+
+#endif
+
+
+/**
+ * @return The floor value of @p x.
+ * @param x The number to get the floor value of.
+ * @note This set of overloads is valid for any numeric type.
+ */
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE
+float floor( float const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::floorf( x );
+#else
+  return std::floor( x );
+#endif
+}
+
+template< typename T >
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr
+double floor( T const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::floor( double ( x ) );
+#else
+  return std::floor( x );
+#endif
+}
+
+#if defined( LVARRAY_USE_DEVICE )
+
+/// @copydoc floor( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half floor( __half const x )
+{
+#if CUDART_VERSION > 11000
+  return hfloor( x );
+#else
+  return x > __half( 0 ) ? x : -x;
+#endif
+}
+
+/// @copydoc floor( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half2 floor( __half2 const x )
+{
+#if CUDART_VERSION > 11000
+  return h2floor( x );
+#else
+  return LVARRAY_THROW( "h2floor is not implemented for host", std::runtime_error );
+#endif
+}
+
+#endif
+
+
 /**
  * @return @code x * x @endcode.
  * @tparam T The typeof @p x.
@@ -452,6 +559,44 @@ T square( T const x )
 
 ///@}
 
+
+/**
+ * @name Power.
+ */
+///@{
+
+/**
+ * @return The power of @p x.
+ * @param x The number to get the power of.
+ * @param n The exponent.
+ * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double
+ *   and the return type is @c double.
+ */
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE
+float pow( float const x, float const n )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::powf( x, n );
+#else
+  return std::pow( x, n );
+#endif
+}
+
+/// @copydoc pow( float )
+template< typename T >
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE
+double pow( T const x, T const n )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::pow( double( x ), double( n ) );
+#else
+  return std::pow( x, n );
+#endif
+}
+
+///@}
+
+
 /**
  * @name Square root and inverse square root.
  */

From 1e0127af1d596359878efe3453d5b34e1bf17370 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@lassen709.coral.llnl.gov>
Date: Thu, 5 Dec 2024 09:07:51 -0800
Subject: [PATCH 07/42] Corrected host only function inside of checkIndices
 preventing device compilation

---
 src/indexing.hpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 2dca4597..0665d68e 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -206,7 +206,27 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 template< typename INDEX_TYPE, typename ... INDICES >
 LVARRAY_HOST_DEVICE inline
 void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices )
-{ LVARRAY_ERROR_IF( invalidIndices( dims, indices ... ), "Invalid indices. " << printDimsAndIndices( dims, indices ... ) ); }
+{ 
+  if( invalidIndices( dims, indices ... ) )
+  {
+    constexpr int NDIM = sizeof ... (INDICES);
+    printf( "Invalid indices. \n");
+
+
+    printf( "  ( indices ) = { ");
+    (printf( " %d", indices),...);
+    printf( "\n");
+
+    printf( "  ( dims ) = { ");
+    for( INDEX_TYPE dim=0; dim<NDIM; ++dim )
+    {
+      printf( "( %d) ", dims[dim] );
+    }
+    printf( "\n");
+    asm ( "trap;" );
+  }
+  //LVARRAY_ERROR_IF( invalidIndices( dims, indices ... ), "Invalid indices. " << printDimsAndIndices( dims, indices ... ) ); 
+}
 
 /**
  * @brief Calculate the strides given the dimensions and permutation.

From bf73593226fe6b1e9e184c6f5c03096c3dda3dbc Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby966.llnl.gov>
Date: Tue, 14 Jan 2025 09:38:08 -0800
Subject: [PATCH 08/42] Added debugging flags to host configs

---
 host-configs/LLNL/lassen-cuda-11-base.cmake | 2 +-
 host-configs/LLNL/lassen-cuda-12-base.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/host-configs/LLNL/lassen-cuda-11-base.cmake b/host-configs/LLNL/lassen-cuda-11-base.cmake
index 3c1ea972..73b7d51c 100644
--- a/host-configs/LLNL/lassen-cuda-11-base.cmake
+++ b/host-configs/LLNL/lassen-cuda-11-base.cmake
@@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "")
 set(CMAKE_CUDA_STANDARD 17 CACHE STRING "")
 set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "")
+set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo --source-in-ptx ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
 
 set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE)
diff --git a/host-configs/LLNL/lassen-cuda-12-base.cmake b/host-configs/LLNL/lassen-cuda-12-base.cmake
index 3b8f4125..174ba7fb 100644
--- a/host-configs/LLNL/lassen-cuda-12-base.cmake
+++ b/host-configs/LLNL/lassen-cuda-12-base.cmake
@@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "")
 set(CMAKE_CUDA_STANDARD 17 CACHE STRING "")
 set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
+set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -lineinfo --source-in-ptx -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
 
 set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE)

From 8e8670602c678bf053b28cf752b7a76a2fd32f63 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 16 Jan 2025 18:05:54 -0800
Subject: [PATCH 09/42] wip: trying to fix build error on Lassen.

---
 src/indexing.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 2dca4597..1203aa1d 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -206,7 +206,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 template< typename INDEX_TYPE, typename ... INDICES >
 LVARRAY_HOST_DEVICE inline
 void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices )
-{ LVARRAY_ERROR_IF( invalidIndices( dims, indices ... ), "Invalid indices. " << printDimsAndIndices( dims, indices ... ) ); }
+{ LVARRAY_ERROR_IF( false , "Invalid indices. " ); }
 
 /**
  * @brief Calculate the strides given the dimensions and permutation.

From c021cc526ce002be245480db1f79c1549a92d813 Mon Sep 17 00:00:00 2001
From: "Randolph R. Settgast" <settgast1@llnl.gov>
Date: Wed, 22 Jan 2025 14:42:25 -0800
Subject: [PATCH 10/42] use fold expression to do index checks

---
 src/indexing.hpp         | 50 +++++++++++++++++++++++++++++-----------
 src/typeManipulation.hpp |  8 ++++---
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 1203aa1d..fc90af57 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -159,21 +159,41 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic
  * @param indices A variadic pack of indices.
  */
 template< typename INDEX_TYPE, typename ... INDICES >
-std::string printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices )
+LVARRAY_HOST_DEVICE
+void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices )
 {
   constexpr int NDIM = sizeof ... (INDICES);
-  std::ostringstream oss;
-  oss << "dimensions = { " << dims[ 0 ];
+  printf( "dimensions = { %d", dims[ 0 ] );
   for( int i = 1; i < NDIM; ++i )
   {
-    oss << ", " << dims[ i ];
+    printf( ", %d", dims[ i ] );
   }
+  printf( "}\n");
 
-  oss << " } indices = " << getIndexString( indices ... );
+  printf( "   indices = { " );
+  (printf(" %d, ", indices ),...);
+  printf( "}\n");
+}
 
-  return oss.str();
+
+/**
+ * @brief Function to check if an index is invalid
+ * @tparam DIMS_TYPE The integral type used for the dimensions of the space
+ * @tparam INDEX_TYPE The Integral types of the index to check against 
+ * @param dims A pointer to the dimensions of the space.
+ * @param indices the index to check against.
+ * @return whether the index is invalid
+ */
+template< typename DIMS_TYPE, typename INDEX_TYPE >
+LVARRAY_HOST_DEVICE inline constexpr
+bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims, 
+                   int const dimsIndex,
+                   INDEX_TYPE const index )
+{
+  return (index < 0) || (index >= dims[dimsIndex]);
 }
 
+
 /**
  * @tparam INDEX_TYPE The integral type used for the dimensions of the space.
  * @tparam INDICES A variadic pack of the integral types of the indices.
@@ -185,14 +205,9 @@ template< typename INDEX_TYPE, typename ... INDICES >
 LVARRAY_HOST_DEVICE inline constexpr
 bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices )
 {
-  int curDim = 0;
   bool invalid = false;
-  typeManipulation::forEachArg( [dims, &curDim, &invalid]( auto const index )
-  {
-    invalid = invalid || ( index < 0 ) || ( index >= dims[ curDim ] );
-    ++curDim;
-  }, indices ... );
-
+  int curDim = 0;
+  (invalidIndex(dims, curDim++, indices),...);
   return invalid;
 }
 
@@ -206,7 +221,14 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 template< typename INDEX_TYPE, typename ... INDICES >
 LVARRAY_HOST_DEVICE inline
 void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices )
-{ LVARRAY_ERROR_IF( false , "Invalid indices. " ); }
+{ 
+  bool const invalid = invalidIndices( dims, indices ... );
+  if( invalid )
+  {
+    printDimsAndIndices( dims, indices ... );
+    LVARRAY_ERROR( "Invalid indices. Info precedes this line." ); 
+  }
+}
 
 /**
  * @brief Calculate the strides given the dimensions and permutation.
diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp
index 33ca95c6..90f3df31 100644
--- a/src/typeManipulation.hpp
+++ b/src/typeManipulation.hpp
@@ -124,10 +124,12 @@ namespace typeManipulation
  * @param f The function to not call.
  */
 DISABLE_HD_WARNING
-template< typename F >
+template< typename F, typename ARG >
 inline constexpr LVARRAY_HOST_DEVICE
-void forEachArg( F && f )
-{ LVARRAY_UNUSED_VARIABLE( f ); }
+void forEachArg( F && f, ARG && arg )
+{ 
+  f(arg); 
+}
 
 /**
  * @tparam F The type of the function to call.

From efc221da0d0969fbd5c75393abe8b167e7c2a3a3 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 14:59:32 -0800
Subject: [PATCH 11/42] revert changes to typeManipulation.

---
 src/typeManipulation.hpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp
index 90f3df31..33ca95c6 100644
--- a/src/typeManipulation.hpp
+++ b/src/typeManipulation.hpp
@@ -124,12 +124,10 @@ namespace typeManipulation
  * @param f The function to not call.
  */
 DISABLE_HD_WARNING
-template< typename F, typename ARG >
+template< typename F >
 inline constexpr LVARRAY_HOST_DEVICE
-void forEachArg( F && f, ARG && arg )
-{ 
-  f(arg); 
-}
+void forEachArg( F && f )
+{ LVARRAY_UNUSED_VARIABLE( f ); }
 
 /**
  * @tparam F The type of the function to call.

From 9275fe76e29991230cf14eea56534ebfec59e4e1 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 15:10:21 -0800
Subject: [PATCH 12/42] use bool return value.

---
 src/indexing.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index fc90af57..8c07b3e2 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -207,7 +207,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 {
   bool invalid = false;
   int curDim = 0;
-  (invalidIndex(dims, curDim++, indices),...);
+  ( (invalid = invalid || invalidIndex(dims, curDim++, indices)), ...);
   return invalid;
 }
 

From 882b587cec1cd2c46c9adb83c3a91a1b98651c45 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 17:11:09 -0800
Subject: [PATCH 13/42] use fold expression.

---
 src/indexing.hpp | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 8c07b3e2..6c0348da 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -20,6 +20,8 @@
 // TPL includes
 #include <RAJA/RAJA.hpp>
 
+#include <inttypes.h>
+
 namespace LvArray
 {
 
@@ -151,6 +153,29 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic
   return oss.str();
 }
 
+
+template< typename INDEX_TYPE >
+LVARRAY_HOST_DEVICE
+void printIndexValue( INDEX_TYPE const & val )
+{
+  if constexpr( std::is_same_v<INDEX_TYPE, int> )
+  {
+    printf( "%d", val );
+  }
+  else if constexpr( std::is_same_v<INDEX_TYPE, long> )
+  {
+    printf( "%ld", val );
+  }
+  else if constexpr( std::is_same_v<INDEX_TYPE, long long> )
+  {
+    printf( "%lld", val );
+  }
+  else
+  {
+    static_assert(!sizeof(T*), "Unsupported integral type for print_value");
+  }
+}
+
 /**
  * @tparam INDEX_TYPE The integral type of the dimensions.
  * @tparam INDICES A variadic pack of the integral types of the indices.
@@ -163,16 +188,19 @@ LVARRAY_HOST_DEVICE
 void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices )
 {
   constexpr int NDIM = sizeof ... (INDICES);
-  printf( "dimensions = { %d", dims[ 0 ] );
+  printf( "dimensions = { ");
+  printIndexValue( dims[0] );
   for( int i = 1; i < NDIM; ++i )
   {
-    printf( ", %d", dims[ i ] );
+    printf( ", ");
+    printIndexValue(dims[i]);
   }
   printf( "}\n");
-
-  printf( "   indices = { " );
-  (printf(" %d, ", indices ),...);
-  printf( "}\n");
+  
+  printf( "indices = { ");
+  bool firstIndex = true;
+  ( (firstIndex ? ( firstIndex = false, printIndexValue(indices) ) : (printf(", "), printIndexValue(indices))), ...);
+  printf(" }\n");
 }
 
 

From e599472d6255f14110b5ff8bf21f7d0e465ab46f Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 17:11:41 -0800
Subject: [PATCH 14/42] fix function name

---
 src/indexing.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 6c0348da..0ec37a21 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -172,7 +172,7 @@ void printIndexValue( INDEX_TYPE const & val )
   }
   else
   {
-    static_assert(!sizeof(T*), "Unsupported integral type for print_value");
+    static_assert(!sizeof(T*), "Unsupported integral type for printIndexValue");
   }
 }
 

From 00e117d3f449d350ff8b6fd4be491ff846379533 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 17:17:02 -0800
Subject: [PATCH 15/42] doxygen.

---
 src/indexing.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 0ec37a21..cfd3def6 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -153,7 +153,10 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic
   return oss.str();
 }
 
-
+/**
+ * @tparam INDEX_TYPE The integral type to be printed.
+ * @param val the value to be printed
+ */
 template< typename INDEX_TYPE >
 LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )

From 1ccdce54352d92940aaa42caff852d339549d3ca Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Wed, 22 Jan 2025 17:28:19 -0800
Subject: [PATCH 16/42] fix typo.

---
 src/indexing.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index cfd3def6..0be6ffb5 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -175,7 +175,7 @@ void printIndexValue( INDEX_TYPE const & val )
   }
   else
   {
-    static_assert(!sizeof(T*), "Unsupported integral type for printIndexValue");
+    static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue");
   }
 }
 

From 956aa7ce91467a0ff891637ed94b5588310de0d5 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 11:57:02 -0800
Subject: [PATCH 17/42] add other integral types.

---
 src/indexing.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 0be6ffb5..7ba6dfd8 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -161,18 +161,32 @@ template< typename INDEX_TYPE >
 LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )
 {
+  static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." );
+  
   if constexpr( std::is_same_v<INDEX_TYPE, int> )
   {
     printf( "%d", val );
   }
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned int> )
+  {
+    printf( "%u", val );
+  }
   else if constexpr( std::is_same_v<INDEX_TYPE, long> )
   {
     printf( "%ld", val );
   }
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long> )
+  {
+    printf( "%lu", val );
+  }
   else if constexpr( std::is_same_v<INDEX_TYPE, long long> )
   {
     printf( "%lld", val );
   }
+  else if constexpr( std::is_same_v<INDEX_TYPE, short> )
+  {
+    printf( "%hd", val );
+  }
   else
   {
     static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue");

From 8eb4727d9f02f97aefb934bc7ce24540484f58cb Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 12:28:06 -0800
Subject: [PATCH 18/42] add debug assert.

---
 src/indexing.hpp         | 12 +++++++++---
 src/typeManipulation.hpp |  4 ++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 7ba6dfd8..9774397a 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -20,8 +20,6 @@
 // TPL includes
 #include <RAJA/RAJA.hpp>
 
-#include <inttypes.h>
-
 namespace LvArray
 {
 
@@ -162,7 +160,15 @@ LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )
 {
   static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." );
-  
+
+  static_assert( std::is_same_v<INDEX_TYPE, int> ||
+                 std::is_same_v<INDEX_TYPE, unsigned int> ||
+                 std::is_same_v<INDEX_TYPE, long> ||
+                 std::is_same_v<INDEX_TYPE, unsigned long> ||
+                 std::is_same_v<INDEX_TYPE, long long> ||
+                 std::is_same_v<INDEX_TYPE, short>, "Unsupported integral type for printIndexValue" );
+
+
   if constexpr( std::is_same_v<INDEX_TYPE, int> )
   {
     printf( "%d", val );
diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp
index 33ca95c6..cd67293a 100644
--- a/src/typeManipulation.hpp
+++ b/src/typeManipulation.hpp
@@ -549,5 +549,9 @@ LVARRAY_HOST_DEVICE inline constexpr
 CArray< camp::idx_t, sizeof...( INDICES ) > asArray( camp::idx_seq< INDICES... > )
 { return { INDICES ... }; }
 
+
+
+
+
 } // namespace typeManipulation
 } // namespace LvArray

From 2fa2667092f515d8a901553e6070e79022b8f796 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 12:33:51 -0800
Subject: [PATCH 19/42] add all integral types.

---
 src/indexing.hpp | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 9774397a..74bba31a 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -161,14 +161,6 @@ void printIndexValue( INDEX_TYPE const & val )
 {
   static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." );
 
-  static_assert( std::is_same_v<INDEX_TYPE, int> ||
-                 std::is_same_v<INDEX_TYPE, unsigned int> ||
-                 std::is_same_v<INDEX_TYPE, long> ||
-                 std::is_same_v<INDEX_TYPE, unsigned long> ||
-                 std::is_same_v<INDEX_TYPE, long long> ||
-                 std::is_same_v<INDEX_TYPE, short>, "Unsupported integral type for printIndexValue" );
-
-
   if constexpr( std::is_same_v<INDEX_TYPE, int> )
   {
     printf( "%d", val );
@@ -189,10 +181,34 @@ void printIndexValue( INDEX_TYPE const & val )
   {
     printf( "%lld", val );
   }
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long long> )
+  {
+    printf( "%llu", val );
+  }
   else if constexpr( std::is_same_v<INDEX_TYPE, short> )
   {
     printf( "%hd", val );
   }
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned short> )
+  {
+    printf( "%hu", val );
+  }
+  // else if constexpr( std::is_same_v<INDEX_TYPE, char> )
+  // {
+  //   printf( "%c", val );
+  // }
+  // else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
+  // {
+  //   printf( "%hhd", val );
+  // }
+  // else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
+  // {
+  //   printf( "%hhu", val );
+  // }
+  // else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
+  // {
+  //   printf( "%d", val );
+  // }
   else
   {
     static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue");

From 2343b7dbcc67fd8c35de07d0e63a336530cc821d Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 13:39:42 -0800
Subject: [PATCH 20/42] add char types

---
 src/indexing.hpp | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 74bba31a..9a027dd6 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -193,22 +193,22 @@ void printIndexValue( INDEX_TYPE const & val )
   {
     printf( "%hu", val );
   }
-  // else if constexpr( std::is_same_v<INDEX_TYPE, char> )
-  // {
-  //   printf( "%c", val );
-  // }
-  // else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
-  // {
-  //   printf( "%hhd", val );
-  // }
-  // else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
-  // {
-  //   printf( "%hhu", val );
-  // }
-  // else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
-  // {
-  //   printf( "%d", val );
-  // }
+  else if constexpr( std::is_same_v<INDEX_TYPE, char> )
+  {
+    printf( "%c", val );
+  }
+  else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
+  {
+    printf( "%hhd", val );
+  }
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
+  {
+    printf( "%hhu", val );
+  }
+  else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
+  {
+    printf( "%d", val );
+  }
   else
   {
     static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue");

From ddc611e9e020d200ef39f44dd8d40cbc41959b62 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 13:48:29 -0800
Subject: [PATCH 21/42] add decay type.

---
 src/indexing.hpp | 37 +++++++++++--------------------------
 1 file changed, 11 insertions(+), 26 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 9a027dd6..c59d4117 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -159,59 +159,44 @@ template< typename INDEX_TYPE >
 LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )
 {
-  static_assert( std::is_integral_v< INDEX_TYPE >, "INDEX_TYPE must be an integral type." );
+  using DecayedType = std::decay_t<INDEX_TYPE>;
+  static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." );
 
-  if constexpr( std::is_same_v<INDEX_TYPE, int> )
+  if constexpr( std::is_same_v<DecayedType, int> )
   {
     printf( "%d", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned int> )
+  else if constexpr( std::is_same_v<DecayedType, unsigned int> )
   {
     printf( "%u", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, long> )
+  else if constexpr( std::is_same_v<DecayedType, long> )
   {
     printf( "%ld", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long> )
+  else if constexpr( std::is_same_v<DecayedType, unsigned long> )
   {
     printf( "%lu", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, long long> )
+  else if constexpr( std::is_same_v<DecayedType, long long> )
   {
     printf( "%lld", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long long> )
+  else if constexpr( std::is_same_v<DecayedType, unsigned long long> )
   {
     printf( "%llu", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, short> )
+  else if constexpr( std::is_same_v<DecayedType, short> )
   {
     printf( "%hd", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned short> )
+  else if constexpr( std::is_same_v<DecayedType, unsigned short> )
   {
     printf( "%hu", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, char> )
-  {
-    printf( "%c", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
-  {
-    printf( "%hhd", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
-  {
-    printf( "%hhu", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
-  {
-    printf( "%d", val );
-  }
   else
   {
-    static_assert(!sizeof(INDEX_TYPE*), "Unsupported integral type for printIndexValue");
+    static_assert(!sizeof(DecayedType*), "Unsupported integral type for printIndexValue");
   }
 }
 

From 52c1d5e55ab01f151510f8a238c22136d203eec6 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 14:03:25 -0800
Subject: [PATCH 22/42] add a static cast option

---
 src/indexing.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index c59d4117..502ac04b 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -196,7 +196,7 @@ void printIndexValue( INDEX_TYPE const & val )
   }
   else
   {
-    static_assert(!sizeof(DecayedType*), "Unsupported integral type for printIndexValue");
+    printf( "%llu", static_cast< long long>( val ) );
   }
 }
 

From 58fc69df376813210f03ace9b653cb5cd7d76b4c Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 15:26:22 -0800
Subject: [PATCH 23/42] remove assert.

---
 src/indexing.hpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 502ac04b..d5002efc 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -160,7 +160,7 @@ LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )
 {
   using DecayedType = std::decay_t<INDEX_TYPE>;
-  static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." );
+  // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." );
 
   if constexpr( std::is_same_v<DecayedType, int> )
   {
@@ -194,9 +194,25 @@ void printIndexValue( INDEX_TYPE const & val )
   {
     printf( "%hu", val );
   }
+  else if constexpr( std::is_same_v<DecayedType, char> )
+  {
+    printf( "%c", val );
+  }
+  else if constexpr( std::is_same_v<DecayedType, signed char> )
+  {
+    printf( "%hhd", val );
+  }
+  else if constexpr( std::is_same_v<DecayedType, unsigned char> )
+  {
+    printf( "%hhu", val );
+  }
+  else if constexpr( std::is_same_v<DecayedType, bool> )
+  {
+    printf( "%d", val );
+  }
   else
   {
-    printf( "%llu", static_cast< long long>( val ) );
+    printf( "%lld", static_cast< long long >( val ) );
   }
 }
 

From d89ed76b236fb709e4c287ecc2ac5334c6a778d4 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Thu, 23 Jan 2025 15:50:41 -0800
Subject: [PATCH 24/42] clean up.

---
 src/indexing.hpp         | 27 ++++++++++++++-------------
 src/typeManipulation.hpp |  4 ----
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index d5002efc..b25e3e74 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -159,54 +159,55 @@ template< typename INDEX_TYPE >
 LVARRAY_HOST_DEVICE
 void printIndexValue( INDEX_TYPE const & val )
 {
-  using DecayedType = std::decay_t<INDEX_TYPE>;
+  /// NOTE: the assert is removed because of an error on one of the GEOS CI jobs.
+  // using DecayedType = std::decay_t<INDEX_TYPE>;
   // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." );
 
-  if constexpr( std::is_same_v<DecayedType, int> )
+  if constexpr( std::is_same_v<INDEX_TYPE, int> )
   {
     printf( "%d", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, unsigned int> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned int> )
   {
     printf( "%u", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, long> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, long> )
   {
     printf( "%ld", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, unsigned long> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long> )
   {
     printf( "%lu", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, long long> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, long long> )
   {
     printf( "%lld", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, unsigned long long> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long long> )
   {
     printf( "%llu", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, short> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, short> )
   {
     printf( "%hd", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, unsigned short> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned short> )
   {
     printf( "%hu", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, char> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, char> )
   {
     printf( "%c", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, signed char> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
   {
     printf( "%hhd", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, unsigned char> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
   {
     printf( "%hhu", val );
   }
-  else if constexpr( std::is_same_v<DecayedType, bool> )
+  else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
   {
     printf( "%d", val );
   }
diff --git a/src/typeManipulation.hpp b/src/typeManipulation.hpp
index cd67293a..33ca95c6 100644
--- a/src/typeManipulation.hpp
+++ b/src/typeManipulation.hpp
@@ -549,9 +549,5 @@ LVARRAY_HOST_DEVICE inline constexpr
 CArray< camp::idx_t, sizeof...( INDICES ) > asArray( camp::idx_seq< INDICES... > )
 { return { INDICES ... }; }
 
-
-
-
-
 } // namespace typeManipulation
 } // namespace LvArray

From ac6c0fdfef95cf9b4813660bd00246dc512cb0ba Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Fri, 24 Jan 2025 10:31:48 -0800
Subject: [PATCH 25/42] remove chars.

---
 src/indexing.hpp | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index b25e3e74..16fd332e 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -195,22 +195,6 @@ void printIndexValue( INDEX_TYPE const & val )
   {
     printf( "%hu", val );
   }
-  else if constexpr( std::is_same_v<INDEX_TYPE, char> )
-  {
-    printf( "%c", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, signed char> )
-  {
-    printf( "%hhd", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned char> )
-  {
-    printf( "%hhu", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, bool> )
-  {
-    printf( "%d", val );
-  }
   else
   {
     printf( "%lld", static_cast< long long >( val ) );

From c2efd00ca5e92d450daa175c1790a05c564d1048 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Fri, 24 Jan 2025 10:41:07 -0800
Subject: [PATCH 26/42] just always cast to long long.

---
 src/indexing.hpp | 76 ++++++++----------------------------------------
 1 file changed, 12 insertions(+), 64 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index 16fd332e..c6333bd5 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -151,56 +151,6 @@ std::string getIndexString( INDEX const index, REMAINING_INDICES const ... indic
   return oss.str();
 }
 
-/**
- * @tparam INDEX_TYPE The integral type to be printed.
- * @param val the value to be printed
- */
-template< typename INDEX_TYPE >
-LVARRAY_HOST_DEVICE
-void printIndexValue( INDEX_TYPE const & val )
-{
-  /// NOTE: the assert is removed because of an error on one of the GEOS CI jobs.
-  // using DecayedType = std::decay_t<INDEX_TYPE>;
-  // static_assert( std::is_integral_v< DecayedType >, "INDEX_TYPE must be an integral type." );
-
-  if constexpr( std::is_same_v<INDEX_TYPE, int> )
-  {
-    printf( "%d", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned int> )
-  {
-    printf( "%u", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, long> )
-  {
-    printf( "%ld", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long> )
-  {
-    printf( "%lu", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, long long> )
-  {
-    printf( "%lld", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned long long> )
-  {
-    printf( "%llu", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, short> )
-  {
-    printf( "%hd", val );
-  }
-  else if constexpr( std::is_same_v<INDEX_TYPE, unsigned short> )
-  {
-    printf( "%hu", val );
-  }
-  else
-  {
-    printf( "%lld", static_cast< long long >( val ) );
-  }
-}
-
 /**
  * @tparam INDEX_TYPE The integral type of the dimensions.
  * @tparam INDICES A variadic pack of the integral types of the indices.
@@ -213,33 +163,31 @@ LVARRAY_HOST_DEVICE
 void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices )
 {
   constexpr int NDIM = sizeof ... (INDICES);
-  printf( "dimensions = { ");
+  printf( "dimensions = { " );
   printIndexValue( dims[0] );
   for( int i = 1; i < NDIM; ++i )
   {
-    printf( ", ");
-    printIndexValue(dims[i]);
+    printf( ", %lld", static_cast< long long >( dims[ i ] ));
   }
-  printf( "}\n");
-  
-  printf( "indices = { ");
-  bool firstIndex = true;
-  ( (firstIndex ? ( firstIndex = false, printIndexValue(indices) ) : (printf(", "), printIndexValue(indices))), ...);
-  printf(" }\n");
+  printf( "}\n" );
+
+  printf( "indices = { " );
+  (printf( ", %lld", static_cast< long long >(indices)), ...);
+  printf( "}\n" );
 }
 
 
 /**
  * @brief Function to check if an index is invalid
  * @tparam DIMS_TYPE The integral type used for the dimensions of the space
- * @tparam INDEX_TYPE The Integral types of the index to check against 
+ * @tparam INDEX_TYPE The Integral types of the index to check against
  * @param dims A pointer to the dimensions of the space.
  * @param indices the index to check against.
  * @return whether the index is invalid
  */
 template< typename DIMS_TYPE, typename INDEX_TYPE >
 LVARRAY_HOST_DEVICE inline constexpr
-bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims, 
+bool invalidIndex( DIMS_TYPE const * const LVARRAY_RESTRICT dims,
                    int const dimsIndex,
                    INDEX_TYPE const index )
 {
@@ -260,7 +208,7 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 {
   bool invalid = false;
   int curDim = 0;
-  ( (invalid = invalid || invalidIndex(dims, curDim++, indices)), ...);
+  ( (invalid = invalid || invalidIndex( dims, curDim++, indices )), ...);
   return invalid;
 }
 
@@ -274,12 +222,12 @@ bool invalidIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES con
 template< typename INDEX_TYPE, typename ... INDICES >
 LVARRAY_HOST_DEVICE inline
 void checkIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const ... indices )
-{ 
+{
   bool const invalid = invalidIndices( dims, indices ... );
   if( invalid )
   {
     printDimsAndIndices( dims, indices ... );
-    LVARRAY_ERROR( "Invalid indices. Info precedes this line." ); 
+    LVARRAY_ERROR( "Invalid indices. Info precedes this line." );
   }
 }
 

From e60fa8fd82a9c6cc05d3d616aa1ad8ce88cf8992 Mon Sep 17 00:00:00 2001
From: Matteo Cusini <matteo.cusini@gmail.com>
Date: Fri, 24 Jan 2025 15:49:11 -0800
Subject: [PATCH 27/42] forgotten one.

---
 src/indexing.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/indexing.hpp b/src/indexing.hpp
index c6333bd5..d5ea6fe9 100644
--- a/src/indexing.hpp
+++ b/src/indexing.hpp
@@ -163,8 +163,7 @@ LVARRAY_HOST_DEVICE
 void printDimsAndIndices( INDEX_TYPE const * const LVARRAY_RESTRICT dims, INDICES const... indices )
 {
   constexpr int NDIM = sizeof ... (INDICES);
-  printf( "dimensions = { " );
-  printIndexValue( dims[0] );
+  printf( "dimensions = { %lld", static_cast< long long >( dims[ 0 ] ) );
   for( int i = 1; i < NDIM; ++i )
   {
     printf( ", %lld", static_cast< long long >( dims[ i ] ));

From 65c00eca9e2afdab71e926286d66b1d81e95d5ef Mon Sep 17 00:00:00 2001
From: wrtobin <tobin6@llnl.gov>
Date: Tue, 8 Apr 2025 09:04:43 -0700
Subject: [PATCH 28/42] aoa and aoav traits

---
 src/ArrayOfArrays.hpp     | 17 +++++++++++++++++
 src/ArrayOfArraysView.hpp | 19 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/src/ArrayOfArrays.hpp b/src/ArrayOfArrays.hpp
index 61facade..8dfbed2a 100644
--- a/src/ArrayOfArrays.hpp
+++ b/src/ArrayOfArrays.hpp
@@ -484,4 +484,21 @@ class ArrayOfArrays : protected ArrayOfArraysView< T, INDEX_TYPE, false, BUFFER_
   }
 };
 
+/**
+ * @brief True if the template type is an ArrayOfArrays.
+ */
+template< class >
+constexpr bool isArrayOfArrays = false;
+
+/**
+ * @tparam T The type contained in the ArrayOfArrays.
+ * @tparam INDEX_TYPE The integral type used as an index.
+ * @tparam BUFFER_TYPE The type used to manage the underlying allocation.
+ * @brief Specialization of isArrayOfArrays for the ArrayOfArrays class.
+ */
+template< typename T,
+  typename INDEX_TYPE,
+  template< typename > class BUFFER_TYPE >
+constexpr bool isArrayOfArrays< ArrayOfArrays< T, INDEX_TYPE, BUFFER_TYPE > > = true;
+
 } /* namespace LvArray */
diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp
index 3ce26cd1..540c50e9 100644
--- a/src/ArrayOfArraysView.hpp
+++ b/src/ArrayOfArraysView.hpp
@@ -1074,4 +1074,23 @@ class ArrayOfArraysView
   }
 };
 
+/**
+ * @brief True if the template type is an ArrayOfArraysView.
+ */
+template< class >
+constexpr bool isArrayOfArraysView = false;
+
+/**
+ * @tparam T The type contained in the ArrayOfArraysView.
+ * @tparam INDEX_TYPE The integral type used as an index.
+ * @tparam CONST_SIZES True iff the size of each array is constant.
+ * @tparam BUFFER_TYPE The type used to manager the underlying allocation.
+ * @brief Specialization of isArrayOfArraysView for the ArrayOfArraysView class.
+ */
+template< typename T,
+      typename INDEX_TYPE,
+      bool CONST_SIZES,
+      template< typename > class BUFFER_TYPE >
+constexpr bool isArrayOfArraysView< ArrayOfArraysView< T, INDEX_TYPE, CONST_SIZES, BUFFER_TYPE > > = true;
+
 } /* namespace LvArray */

From cff7f1b42ee107e0a19fad5237928b4c9f81b023 Mon Sep 17 00:00:00 2001
From: wrtobin <tobin6@llnl.gov>
Date: Tue, 8 Apr 2025 12:42:41 -0700
Subject: [PATCH 29/42] just a typo fix

---
 src/ArrayOfArraysView.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp
index 540c50e9..6db5946d 100644
--- a/src/ArrayOfArraysView.hpp
+++ b/src/ArrayOfArraysView.hpp
@@ -249,7 +249,7 @@ class ArrayOfArraysView
 
   /**
    * @brief Move assignment operator..
-   * @param src the SparsityPatternView to be moved from.
+   * @param src the ArrayOfArraysView to be moved from.
    * @return *this.
    */
   LVARRAY_HOST_DEVICE

From fead905dd4cbea9c763d0c31cbe919071e9b0150 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby962.llnl.gov>
Date: Tue, 3 Jun 2025 09:10:11 -0700
Subject: [PATCH 30/42] Updated branch with current versions from develop that
 were overriden during merge

---
 cmake/SetupTPL.cmake                          |  51 ++++---
 host-configs/LLNL/lassen-cuda-11-base.cmake   |   2 +-
 host-configs/LLNL/lassen-cuda-12-base.cmake   |   2 +-
 scripts/uberenv/packages/lvarray/package.py   |  26 ++--
 scripts/uberenv/project.json                  |   4 +-
 .../blueos_3_ppc64le_ib_p9/compilers.yaml     | 132 +++---------------
 .../toss_3_x86_64_ib/compilers.yaml           |  83 +----------
 .../toss_3_x86_64_ib_python/packages.yaml     |   7 -
 8 files changed, 65 insertions(+), 242 deletions(-)

diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake
index 29a60128..bff94834 100644
--- a/cmake/SetupTPL.cmake
+++ b/cmake/SetupTPL.cmake
@@ -1,22 +1,19 @@
 set(thirdPartyLibs "")
 
-###############################
+################################
 # CAMP
-###############################
-if(CAMP_DIR STREQUAL RAJA_DIR)
-    message(STATUS "LvArray using CAMP from RAJA.")
-else()
-    if(NOT EXISTS ${CAMP_DIR})
-        message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.")
-    endif()
+################################
+if(NOT EXISTS ${CAMP_DIR})
+    message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.")
+endif()
 
-    message(STATUS "LvArray using CAMP from ${CAMP_DIR}")
+message(STATUS "Using CAMP from ${CAMP_DIR}")
 
-    find_package(camp REQUIRED PATHS ${CAMP_DIR})
+find_package(camp REQUIRED PATHS ${CAMP_DIR})
 
-    set(thirdPartyLibs ${thirdPartyLibs} camp)
-endif()
+set(ENABLE_CAMP ON CACHE BOOL "")
 
+set(thirdPartyLibs ${thirdPartyLibs} camp)
 
 ################################
 # RAJA
@@ -25,7 +22,7 @@ if(NOT EXISTS ${RAJA_DIR})
     message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.")
 endif()
 
-message(STATUS "LvArray using RAJA from ${RAJA_DIR}")
+message(STATUS "Using RAJA from ${RAJA_DIR}")
 
 find_package(RAJA REQUIRED PATHS ${RAJA_DIR})
 
@@ -42,14 +39,14 @@ if(ENABLE_UMPIRE)
         message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.")
     endif()
 
-    message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}")
+    message(STATUS "Using Umpire from ${UMPIRE_DIR}")
 
     find_package(umpire REQUIRED
                  PATHS ${UMPIRE_DIR})
     
     set(thirdPartyLibs ${thirdPartyLibs} umpire)
 else()
-    message(STATUS "LvArray not using Umpire.")
+    message(STATUS "Not using Umpire.")
 endif()
 
 ################################
@@ -68,32 +65,32 @@ if(ENABLE_CHAI)
         message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.")
     endif()
 
-    message(STATUS "LvArray using CHAI from ${CHAI_DIR}")
+    message(STATUS "Using CHAI from ${CHAI_DIR}")
 
     find_package(chai REQUIRED
                  PATHS ${CHAI_DIR})
-    
-    # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that.
-    # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES)
-    # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA)
-    # set_target_properties(chai
-    #                       PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}")
+
+    # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that.
+    get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES)
+    list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA)
+    set_target_properties(chai
+                          PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}")
 
     set(thirdPartyLibs ${thirdPartyLibs} chai)
 else()
-    message(STATUS "LvArray not using CHAI.")
+    message(STATUS "Not using CHAI.")
 endif()
 
 
-###############################
+################################
 # CALIPER
-###############################
+################################
 if(ENABLE_CALIPER)
     if(NOT EXISTS ${CALIPER_DIR})
         message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.")
     endif()
 
-    message(STATUS "LvArray using caliper from ${CALIPER_DIR}")
+    message(STATUS "Using caliper from ${CALIPER_DIR}")
 
     find_package(caliper REQUIRED
                  PATHS ${CALIPER_DIR})
@@ -105,7 +102,7 @@ if(ENABLE_CALIPER)
 
     set(thirdPartyLibs ${thirdPartyLibs} caliper)
 else()
-    message(STATUS "LvArray not using caliper.")
+    message(STATUS "Not using caliper.")
 endif()
 
 ################################
diff --git a/host-configs/LLNL/lassen-cuda-11-base.cmake b/host-configs/LLNL/lassen-cuda-11-base.cmake
index 73b7d51c..3c1ea972 100644
--- a/host-configs/LLNL/lassen-cuda-11-base.cmake
+++ b/host-configs/LLNL/lassen-cuda-11-base.cmake
@@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "")
 set(CMAKE_CUDA_STANDARD 17 CACHE STRING "")
 set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo --source-in-ptx ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "")
+set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
 
 set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE)
diff --git a/host-configs/LLNL/lassen-cuda-12-base.cmake b/host-configs/LLNL/lassen-cuda-12-base.cmake
index 174ba7fb..3b8f4125 100644
--- a/host-configs/LLNL/lassen-cuda-12-base.cmake
+++ b/host-configs/LLNL/lassen-cuda-12-base.cmake
@@ -10,7 +10,7 @@ set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "")
 set(CMAKE_CUDA_STANDARD 17 CACHE STRING "")
 set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -lineinfo --source-in-ptx -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
+set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -g -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "")
 set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
 
 set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE)
diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py
index cf9d5548..9c4b47d9 100644
--- a/scripts/uberenv/packages/lvarray/package.py
+++ b/scripts/uberenv/packages/lvarray/package.py
@@ -56,36 +56,32 @@ class Lvarray(CMakePackage, CudaPackage):
     variant('docs', default=False, description='Build docs')
     variant('addr2line', default=True,
             description='Build support for addr2line.')
-        
+
     depends_on('blt', when='@0.2.0:', type='build')
 
     depends_on('camp')
+    depends_on('camp+cuda', when='+cuda')
 
     depends_on('raja')
+    depends_on('raja+cuda', when='+cuda')
 
+    # At the moment Umpire doesn't support shared when building with CUDA.
     depends_on('umpire', when='+umpire')
+    depends_on('umpire+cuda~shared', when='+umpire+cuda')
 
     depends_on('chai+raja', when='+chai')
+    depends_on('chai+raja+cuda', when='+chai+cuda')
 
     depends_on('caliper', when='+caliper')
 
     depends_on('python +shared +pic', when='+pylvarray')
-    depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray')
-    depends_on('py-scipy@1.5.2:', when='+pylvarray')
+    depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray')
+    depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray')
     depends_on('py-pip', when='+pylvarray')
 
     depends_on('doxygen@1.8.13:', when='+docs', type='build')
     depends_on('py-sphinx@1.6.3:', when='+docs', type='build')
 
-    with when('+cuda'):
-        for sm_ in CudaPackage.cuda_arch_values:
-            depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
-            depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
-            depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
-            depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
-            depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_))
-    
-
     phases = ['hostconfig', 'cmake', 'build', 'install']
 
     @run_after('build')
@@ -289,6 +285,10 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write("#{0}\n\n".format("-" * 80))
 
         if "+caliper" in spec:
+            cfg.write("#{0}\n".format("-" * 80))
+            cfg.write("# Caliper\n")
+            cfg.write("#{0}\n\n".format("-" * 80))
+
             cfg.write(cmake_cache_option("ENABLE_CALIPER", True))
             cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix))
         else:
@@ -297,7 +297,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write('#{0}\n'.format('-' * 80))
         cfg.write('# Python\n')
         cfg.write('#{0}\n\n'.format('-' * 80))
-
         if '+pylvarray' in spec:
             cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True))
             cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3')))
@@ -307,7 +306,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write("#{0}\n".format("-" * 80))
         cfg.write("# Documentation\n")
         cfg.write("#{0}\n\n".format("-" * 80))
-
         if "+docs" in spec:
             cfg.write(cmake_cache_option("ENABLE_DOCS", True))
             sphinx_dir = spec['py-sphinx'].prefix
diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json
index 703db7a4..9822f975 100644
--- a/scripts/uberenv/project.json
+++ b/scripts/uberenv/project.json
@@ -3,8 +3,8 @@
   "package_version" : "develop",
   "package_final_phase" : "hostconfig",
   "package_source_dir" : "../..",
-  "spack_url": "https://github.com/spack/spack",
-  "spack_branch": "v0.19.0",
+  "spack_url": "https://github.com/corbett5/spack",
+  "spack_branch": "package/corbett/lvarray-update",
   "spack_activate" : {},
   "spack_clean_packages": ["lvarray"],
   "build_jobs": 100
diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
index b8353dd0..b1bf26cb 100644
--- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
+++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml
@@ -2,90 +2,30 @@ compilers:
 - compiler:
     spec: clang@10.0.1
     paths:
-      cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++
+      cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cflags: -mcpu=native -mtune=native
+      cxxflags: -mcpu=native -mtune=native
     operating_system: rhel7
-    target: x86_64
+    target: ppc64le
     modules: []
     environment: {}
     extra_rpaths: []
 - compiler:
     spec: clang@11.0.1
     paths:
-      cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++
+      cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang
+      cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cflags: -mcpu=native -mtune=native
+      cxxflags: -mcpu=native -mtune=native
     operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@12.0.1
-    paths:
-      cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@13.0.1
-    paths:
-      cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@14.0.4
-    paths:
-      cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang
-      cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: gcc@7.3.0
-    paths:
-      cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc
-      cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++
-      f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
-    operating_system: rhel7
-    target: x86_64
+    target: ppc64le
     modules: []
     environment: {}
     extra_rpaths: []
@@ -97,55 +37,25 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: gcc@9.3.1
-    paths:
-      cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc
-      cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++
-      f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: gcc@10.2.1
-    paths:
-      cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc
-      cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++
-      f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
+      cflags: -mcpu=native -mtune=native
+      cxxflags: -mcpu=native -mtune=native
     operating_system: rhel7
-    target: x86_64
+    target: ppc64le
     modules: []
     environment: {}
     extra_rpaths: []
 - compiler:
-    spec: intel@19.1.2
+    spec: xl@16.1.1
     paths:
-      cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc
-      cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc
-      f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort
-      fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort
+      cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc
+      cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC
+      f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf
+      fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf
     flags:
-      cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native
-      cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native
+      cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036
+      cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036
     operating_system: rhel7
-    target: x86_64
+    target: ppc64le
     modules: []
     environment: {}
     extra_rpaths: []
diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
index b8353dd0..3d9648a7 100644
--- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
+++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml
@@ -7,8 +7,8 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
     operating_system: rhel7
     target: x86_64
     modules: []
@@ -22,53 +22,8 @@ compilers:
       f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
       fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
     flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@12.0.1
-    paths:
-      cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@13.0.1
-    paths:
-      cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang
-      cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: clang@14.0.4
-    paths:
-      cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang
-      cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++
-      f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
-      cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0
+      cflags: -march=native -mtune=native
+      cxxflags: -march=native -mtune=native
     operating_system: rhel7
     target: x86_64
     modules: []
@@ -104,36 +59,6 @@ compilers:
     modules: []
     environment: {}
     extra_rpaths: []
-- compiler:
-    spec: gcc@9.3.1
-    paths:
-      cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc
-      cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++
-      f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
-- compiler:
-    spec: gcc@10.2.1
-    paths:
-      cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc
-      cxx: /usr/tce/packages/gcc/gcc-10.2.1/bin/g++
-      f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
-      fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran
-    flags:
-      cflags: -march=native -mtune=native
-      cxxflags: -march=native -mtune=native
-    operating_system: rhel7
-    target: x86_64
-    modules: []
-    environment: {}
-    extra_rpaths: []
 - compiler:
     spec: intel@19.1.2
     paths:
diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
index a6fbda09..0c6b833b 100644
--- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
+++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml
@@ -107,10 +107,3 @@ packages:
     externals:
     - spec: pkg-config@0.27.1
       prefix: /usr/bin/
-  
-  ninja:
-    buildable: False
-    externals:
-    - spec: ninja@kitware
-      prefix: /g/g14/corbett5/Programs/ninja/ninja-1.9.0.g99df1.kitware.dyndep-1.jobserver-1/quartz-build/
-

From 00327f812f9fe0f2d9d3ca9c920f41a9bc89bee0 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby964.llnl.gov>
Date: Tue, 17 Jun 2025 14:59:24 -0700
Subject: [PATCH 31/42] Fixed compilation error related to polarDecomposition

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index d4c3f1b7..f0f33bd5 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -113,7 +113,9 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
     scale< M, M >( R, 0.5 );
 
     // Determine how close R is to being orthogonal using L2Norm(R.R^T-I)
-    Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, R );
+    real64 copyR[M][M] = { { 0.0 } };
+    LvArray::tensorOps::copy< M, M >( copyR, R);
+    Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR );
     addIdentity< M >( RRTMinusI, -1.0 );
     for( std::ptrdiff_t i = 0 ; i < M ; i++ )
     {

From 95aa41e87da4514c16aed76b9a94ccfc60a9b183 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby964.llnl.gov>
Date: Tue, 17 Jun 2025 15:06:03 -0700
Subject: [PATCH 32/42] Removed wrong namespace

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index f0f33bd5..cce92ff5 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -114,7 +114,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
 
     // Determine how close R is to being orthogonal using L2Norm(R.R^T-I)
     real64 copyR[M][M] = { { 0.0 } };
-    LvArray::tensorOps::copy< M, M >( copyR, R);
+    copy< M, M >( copyR, R);
     Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR );
     addIdentity< M >( RRTMinusI, -1.0 );
     for( std::ptrdiff_t i = 0 ; i < M ; i++ )

From 85136bec9eba8ed35716196595b762ee6493fd5b Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby964.llnl.gov>
Date: Tue, 17 Jun 2025 15:08:28 -0700
Subject: [PATCH 33/42] Fixed incorrect type

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index cce92ff5..135f645f 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -113,7 +113,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
     scale< M, M >( R, 0.5 );
 
     // Determine how close R is to being orthogonal using L2Norm(R.R^T-I)
-    real64 copyR[M][M] = { { 0.0 } };
+    FloatingPoint copyR[M][M] = { { 0.0 } };
     copy< M, M >( copyR, R);
     Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR );
     addIdentity< M >( RRTMinusI, -1.0 );

From 34a4ea5759284ad4fd0335c20e12c8af8c044c7e Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby961.llnl.gov>
Date: Sun, 6 Jul 2025 12:15:40 -0700
Subject: [PATCH 34/42] Added round to math and cofactor to square tensor ops
 for 2x2 and 3x3 matrices

---
 cmake/Config.cmake                         |  2 +-
 src/fixedSizeSquareMatrixOps.hpp           | 36 ++++++++
 src/fixedSizeSquareMatrixOpsImpl.hpp       | 79 ++++++++++++++++++
 src/math.hpp                               | 45 ++++++++++
 unitTests/CMakeLists.txt                   |  2 +
 unitTests/testTensorOpsCofactorOneArg.cpp  | 29 +++++++
 unitTests/testTensorOpsCofactorTwoArgs.cpp | 29 +++++++
 unitTests/testTensorOpsInverse.hpp         | 97 ++++++++++++++++++++++
 8 files changed, 318 insertions(+), 1 deletion(-)
 create mode 100644 unitTests/testTensorOpsCofactorOneArg.cpp
 create mode 100644 unitTests/testTensorOpsCofactorTwoArgs.cpp

diff --git a/cmake/Config.cmake b/cmake/Config.cmake
index cf8ff35b..49c5ae12 100644
--- a/cmake/Config.cmake
+++ b/cmake/Config.cmake
@@ -2,7 +2,7 @@
 set( PREPROCESSOR_DEFINES UMPIRE
                           CHAI
                           CUDA
-			  HIP
+                          HIP
                           TOTALVIEW_OUTPUT
                           CALIPER )
 
diff --git a/src/fixedSizeSquareMatrixOps.hpp b/src/fixedSizeSquareMatrixOps.hpp
index 9fb82ec3..f39c3f15 100644
--- a/src/fixedSizeSquareMatrixOps.hpp
+++ b/src/fixedSizeSquareMatrixOps.hpp
@@ -38,6 +38,42 @@ LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
 auto determinant( MATRIX const & matrix )
 { return internal::SquareMatrixOps< M >::determinant( matrix ); }
 
+/**
+ * @brief Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix.
+ * @tparam M The size of the matrices @p dstMatrix and @p srcMatrix.
+ * @tparam DST_MATRIX The type of @p dstMatrix.
+ * @tparam SRC_MATRIX The type of @p srcMatrix.
+ * @param dstMatrix The M x M matrix to write the cofactor to.
+ * @param srcMatrix The M x M matrix to take the cofactor of.
+ * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values.
+ */
+template< std::ptrdiff_t M, typename DST_MATRIX, typename SRC_MATRIX >
+LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
+               SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
+{
+  static_assert( std::is_floating_point< std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) > >::value,
+                 "The destination matrix must be contain floating point values." );
+  internal::SquareMatrixOps< M >::cofactor( std::forward< DST_MATRIX >( dstMatrix ), srcMatrix );
+}
+
+/**
+ * @brief Compute the cofactor of the matrix @p matrix overwritting it.
+ * @tparam M The size of the matrix @p matrix.
+ * @tparam MATRIX The type of @p matrix.
+ * @param matrix The M x M matrix to take the cofactor of and overwrite.
+ * @return The determinant.
+ * @note @p matrix must contain floating point values.
+ */
+template< std::ptrdiff_t M, typename MATRIX >
+LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+auto cofactor( MATRIX && matrix )
+{
+  static_assert( std::is_floating_point< std::decay_t< decltype( matrix[ 0 ][ 0 ] ) > >::value,
+                 "The matrix must be contain floating point values." );
+  internal::SquareMatrixOps< M >::cofactor( std::forward< MATRIX >( matrix ) );
+}
+
 /**
  * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
  * @tparam M The size of the matrices @p dstMatrix and @p srcMatrix.
diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 135f645f..19cd2249 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -151,6 +151,26 @@ struct SquareMatrixOps< 2 >
     return matrix[ 0 ][ 0 ] * matrix[ 1 ][ 1 ] - matrix[ 0 ][ 1 ] * matrix[ 1 ][ 0 ];
   }
 
+  /**
+   * @return Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix
+   * @tparam DST_MATRIX The type of @p dstMatrix.
+   * @tparam SRC_MATRIX The type of @p srcMatrix.
+   * @param dstMatrix The 2x2 matrix to write the cofactor to.
+   * @param srcMatrix The 2x2 matrix to take the cofactor of.
+   */
+  template< typename DST_MATRIX, typename SRC_MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, 
+                        SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
+  {
+    checkSizes< 2, 2 >( dstMatrix );
+    checkSizes< 2, 2 >( srcMatrix );
+    dstMatrix[0][0] = srcMatrix[1][1];
+    dstMatrix[1][1] = srcMatrix[0][0];
+    dstMatrix[0][1] = -srcMatrix[1][0];
+    dstMatrix[1][0] = -srcMatrix[0][1];
+  }
+
   /**
    * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
    * @tparam DST_MATRIX The type of @p dstMatrix.
@@ -181,6 +201,24 @@ struct SquareMatrixOps< 2 >
     return det;
   }
 
+  /**
+   * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it.
+   * @tparam MATRIX The type of @p matrix.
+   * @param matrix The 2x2 matrix to take the cofactor of and overwrite.
+   * @note @p matrix must contain floating point values.
+   */
+  template< typename MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static auto cofactor( MATRIX && matrix )
+  {
+    checkSizes< 2, 2 >( matrix );
+
+    using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >;
+    realType temp[ 2 ][ 2 ];
+    copy< 2, 2 >( temp, matrix );
+    return cofactor( matrix, temp );
+  }
+
   /**
    * @brief Invert the matrix @p srcMatrix overwritting it.
    * @tparam MATRIX The type of @p matrix.
@@ -655,6 +693,31 @@ struct SquareMatrixOps< 3 >
            matrix[ 2 ][ 0 ] * ( matrix[ 0 ][ 1 ] * matrix[ 1 ][ 2 ] - matrix[ 0 ][ 2 ] * matrix[ 1 ][ 1 ] );
   }
 
+  /**
+   * @return Compute the cofactor of the source matrix @p srcMatrix and store the result in @p dstMatrix
+   * @tparam DST_MATRIX The type of @p dstMatrix.
+   * @tparam SRC_MATRIX The type of @p srcMatrix.
+   * @param dstMatrix The 3x3 matrix to write the cofactor to.
+   * @param srcMatrix The 3x3 matrix to take the cofactor of.
+   */
+  template< typename DST_MATRIX, typename SRC_MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, 
+                        SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
+  {
+    checkSizes< 3, 3 >( dstMatrix );
+    checkSizes< 3, 3 >( srcMatrix );
+    dstMatrix[0][0] = srcMatrix[1][1] * srcMatrix[2][2] - srcMatrix[1][2] * srcMatrix[2][1];
+    dstMatrix[0][1] = srcMatrix[1][2] * srcMatrix[2][0] - srcMatrix[1][0] * srcMatrix[2][2];
+    dstMatrix[0][2] = srcMatrix[1][0] * srcMatrix[2][1] - srcMatrix[1][1] * srcMatrix[2][0];
+    dstMatrix[1][0] = srcMatrix[0][2] * srcMatrix[2][1] - srcMatrix[0][1] * srcMatrix[2][2];
+    dstMatrix[1][1] = srcMatrix[0][0] * srcMatrix[2][2] - srcMatrix[0][2] * srcMatrix[2][0];
+    dstMatrix[1][2] = srcMatrix[0][1] * srcMatrix[2][0] - srcMatrix[0][0] * srcMatrix[2][1];
+    dstMatrix[2][0] = srcMatrix[0][1] * srcMatrix[1][2] - srcMatrix[0][2] * srcMatrix[1][1];
+    dstMatrix[2][1] = srcMatrix[0][2] * srcMatrix[1][0] - srcMatrix[0][0] * srcMatrix[1][2];
+    dstMatrix[2][2] = srcMatrix[0][0] * srcMatrix[1][1] - srcMatrix[0][1] * srcMatrix[1][0];
+  }
+
   /**
    * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
    * @tparam DST_MATRIX The type of @p dstMatrix.
@@ -696,6 +759,22 @@ struct SquareMatrixOps< 3 >
     return det;
   }
 
+   /**
+   * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it.
+   * @tparam MATRIX The type of @p matrix.
+   * @param matrix The 3x3 matrix to take the cofactor of and overwrite.
+   * @note @p srcMatrix must contain floating point values.
+   */
+  template< typename MATRIX >
+  LVARRAY_HOST_DEVICE constexpr inline
+  static auto cofactor( MATRIX && matrix )
+  {
+    using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >;
+    realType temp[ 3 ][ 3 ];
+    copy< 3, 3 >( temp, matrix );
+    return cofactor( matrix, temp );
+  }
+
   /**
    * @brief Invert the matrix @p srcMatrix overwritting it.
    * @tparam MATRIX The type of @p matrix.
diff --git a/src/math.hpp b/src/math.hpp
index 4dde2358..601fcfc3 100644
--- a/src/math.hpp
+++ b/src/math.hpp
@@ -441,6 +441,51 @@ __half2 abs( __half2 const x )
 #endif
 
 
+/**
+ * @return The round value of @p x.
+ * @param x The number to get the round value of.
+ * @note This set of overloads is valid for any numeric type.
+ */
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE
+float round( float const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::roundf( x );
+#else
+  return std::round( x );
+#endif
+}
+
+template< typename T >
+LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr
+double round( T const x )
+{
+#if defined(LVARRAY_DEVICE_COMPILE)
+  return ::round( double ( x ) );
+#else
+  return std::round( x );
+#endif
+}
+
+#if defined( LVARRAY_USE_DEVICE )
+
+/// @copydoc round( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half round( __half const x )
+{
+  return round( x );
+}
+
+/// @copydoc round( T )
+LVARRAY_DEVICE LVARRAY_FORCE_INLINE
+__half2 round( __half2 const x )
+{
+  return round( x );
+}
+
+#endif
+
+
 /**
  * @return The ceiling value of @p x.
  * @param x The number to get the ceiling value of.
diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt
index 18eb75d6..625a1209 100644
--- a/unitTests/CMakeLists.txt
+++ b/unitTests/CMakeLists.txt
@@ -70,6 +70,8 @@ set( testSources
      testStackTrace.cpp
      testTensorOpsDeterminant.cpp
      testTensorOpsEigen.cpp
+     testTensorOpsCofactorOneArg.cpp
+     testTensorOpsCofactorTwoArgs.cpp
      testTensorOpsInverseOneArg.cpp
      testTensorOpsInverseTwoArgs.cpp
      testTensorOpsPolarDecomposition.cpp
diff --git a/unitTests/testTensorOpsCofactorOneArg.cpp b/unitTests/testTensorOpsCofactorOneArg.cpp
new file mode 100644
index 00000000..82509dd8
--- /dev/null
+++ b/unitTests/testTensorOpsCofactorOneArg.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+#include "testTensorOpsInverse.hpp"
+
+namespace LvArray
+{
+namespace testing
+{
+
+TYPED_TEST( CofactorFloatOnlyTest, cofactorOneArg )
+{
+  this->cofactorOneArg();
+}
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+  return result;
+}
diff --git a/unitTests/testTensorOpsCofactorTwoArgs.cpp b/unitTests/testTensorOpsCofactorTwoArgs.cpp
new file mode 100644
index 00000000..3d8ebe3e
--- /dev/null
+++ b/unitTests/testTensorOpsCofactorTwoArgs.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+#include "testTensorOpsInverse.hpp"
+
+namespace LvArray
+{
+namespace testing
+{
+
+TYPED_TEST( CofactorTest, cofactorTwoArgs )
+{
+  this->cofactorTwoArgs();
+}
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+  return result;
+}
diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp
index 5603f3e1..4ead0e3b 100644
--- a/unitTests/testTensorOpsInverse.hpp
+++ b/unitTests/testTensorOpsInverse.hpp
@@ -90,6 +90,85 @@ class InverseTest : public ::testing::Test
         } );
   }
 
+  void cofactorTwoArgs()
+  {
+    ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView();
+    ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView();
+
+    ArrayViewT< FLOAT, 3, 2 > const dstMatrix_IJK = m_dstMatrix_IJK.toView();
+    ArrayViewT< FLOAT, 3, 1 > const dstMatrix_IKJ = m_dstMatrix_IKJ.toView();
+    ArrayViewT< FLOAT, 3, 0 > const dstMatrix_KJI = m_dstMatrix_KJI.toView();
+
+    ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M );
+
+    T scale = 100;
+    for( T & value : arrayOfMatrices )
+    { value = randomValue( scale, m_gen ); }
+
+    ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst();
+
+    forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i )
+        {
+          T srcLocal[ M ][ M ];
+          FLOAT dstLocal[ M ][ M ];
+
+          #define _TEST( output, input ) \
+            tensorOps::copy< M, M >( input, matrices[ i ] ); \
+            tensorOps::cofactor< M >( output, input ); \
+            checkCofactor( scale, output, input )
+
+          #define _TEST_PERMS( input, output0, output1, output2, output3 ) \
+            _TEST( output0, input ); \
+            _TEST( output1, input ); \
+            _TEST( output2, input ); \
+            _TEST( output3, input )
+
+          _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+        } );
+  }
+
+  void cofactorOneArg()
+  {
+    ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView();
+    ArrayViewT< T, 3, 1 > const srcMatrix_IKJ = m_srcMatrix_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const srcMatrix_KJI = m_srcMatrix_KJI.toView();
+
+    ArrayT< T, RAJA::PERM_IJK > arrayOfMatrices( numMatrices, M, M );
+
+    T scale = 100;
+    for( T & value : arrayOfMatrices )
+    { value = randomValue( scale, m_gen ); }
+
+    ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst();
+
+    forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i )
+        {
+          T initialMatrix[ M ][ M ];
+          T srcLocal[ M ][ M ];
+          T det;
+
+          #define _TEST( matrix ) \
+            tensorOps::copy< M, M >( initialMatrix, matrices[ i ] ); \
+            tensorOps::copy< M, M >( matrix, initialMatrix ); \
+            tensorOps::cofactor< M >( matrix ); \
+            checkCofactor( scale, matrix, initialMatrix )
+
+          _TEST( srcMatrix_IJK[ i ] );
+          _TEST( srcMatrix_IKJ[ i ] );
+          _TEST( srcMatrix_KJI[ i ] );
+          _TEST( srcLocal );
+
+      #undef _TEST
+        } );
+  }
+
   void inverseTwoArgs()
   {
     ArrayViewT< T, 3, 2 > const srcMatrix_IJK = m_srcMatrix_IJK.toView();
@@ -340,6 +419,24 @@ class InverseTest : public ::testing::Test
                           3 * scale * scale * scale * epsilon )
   }
 
+
+  template< typename MATRIX_A, typename MATRIX_B >
+  static void LVARRAY_HOST_DEVICE checkCofactor( T const scale, MATRIX_A && cofactor, MATRIX_B && source )
+  {
+    FLOAT const epsilon = NumericLimitsNC< FLOAT >{}.epsilon;
+
+    FLOAT inverse[ M ][ M ];
+    T det = tensorOps::invert< M >( inverse, src );
+    tensorOps::transpose< M >( inverse );
+    tensorOps::scale< M, M >( inverse, det );
+
+    for( int i = 0; i < M; ++i )
+    {
+      for( int j = 0; j < M; ++j )
+      { PORTABLE_EXPECT_NEAR( cofactor[ i ][ j ], inverse[ i ][ j ], 5 * scale * epsilon ); }
+    }
+  }
+
   template< typename MATRIX_A, typename MATRIX_B >
   static void LVARRAY_HOST_DEVICE checkInverse( T const scale,
                                                 T const det,

From fa06984d55cb2da24a2b7a25f63c28707edde8f2 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby961.llnl.gov>
Date: Sun, 6 Jul 2025 12:19:19 -0700
Subject: [PATCH 35/42] Updated formatting

---
 src/ArrayOfArrays.hpp                |  4 ++--
 src/ArrayOfArraysView.hpp            | 10 ++++----
 src/fixedSizeSquareMatrixOpsImpl.hpp |  8 +++----
 src/math.hpp                         |  3 ++-
 unitTests/testTensorOpsInverse.hpp   | 36 ++++++++++++++--------------
 5 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/ArrayOfArrays.hpp b/src/ArrayOfArrays.hpp
index 8dfbed2a..c78109c9 100644
--- a/src/ArrayOfArrays.hpp
+++ b/src/ArrayOfArrays.hpp
@@ -497,8 +497,8 @@ constexpr bool isArrayOfArrays = false;
  * @brief Specialization of isArrayOfArrays for the ArrayOfArrays class.
  */
 template< typename T,
-  typename INDEX_TYPE,
-  template< typename > class BUFFER_TYPE >
+          typename INDEX_TYPE,
+          template< typename > class BUFFER_TYPE >
 constexpr bool isArrayOfArrays< ArrayOfArrays< T, INDEX_TYPE, BUFFER_TYPE > > = true;
 
 } /* namespace LvArray */
diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp
index 757dbec8..85ecfd13 100644
--- a/src/ArrayOfArraysView.hpp
+++ b/src/ArrayOfArraysView.hpp
@@ -858,10 +858,10 @@ class ArrayOfArraysView
     destroyValues( 0, m_numArrays, pairs.first ... );
 
     INDEX_TYPE const offsetsSize = ( m_numArrays == 0 ) ? 0 : m_numArrays + 1;
-    
+
     bufferManipulation::copyInto( m_offsets, offsetsSize, srcOffsets, srcNumArrays + 1 );
     bufferManipulation::copyInto( m_sizes, m_numArrays, srcSizes, srcNumArrays );
-    
+
     INDEX_TYPE const maxOffset = m_offsets[ m_numArrays ];
     typeManipulation::forEachArg( [maxOffset, srcMaxOffset]( auto & dstBuffer )
     {
@@ -1101,9 +1101,9 @@ constexpr bool isArrayOfArraysView = false;
  * @brief Specialization of isArrayOfArraysView for the ArrayOfArraysView class.
  */
 template< typename T,
-      typename INDEX_TYPE,
-      bool CONST_SIZES,
-      template< typename > class BUFFER_TYPE >
+          typename INDEX_TYPE,
+          bool CONST_SIZES,
+          template< typename > class BUFFER_TYPE >
 constexpr bool isArrayOfArraysView< ArrayOfArraysView< T, INDEX_TYPE, CONST_SIZES, BUFFER_TYPE > > = true;
 
 } /* namespace LvArray */
diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 135f645f..f6c33d2c 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -114,12 +114,12 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
 
     // Determine how close R is to being orthogonal using L2Norm(R.R^T-I)
     FloatingPoint copyR[M][M] = { { 0.0 } };
-    copy< M, M >( copyR, R);
+    copy< M, M >( copyR, R );
     Rij_eq_AikBjk< M, M, M >( RRTMinusI, R, copyR );
     addIdentity< M >( RRTMinusI, -1.0 );
-    for( std::ptrdiff_t i = 0 ; i < M ; i++ )
+    for( std::ptrdiff_t i = 0; i < M; i++ )
     {
-      for( std::ptrdiff_t j = 0 ; j < M ; j++ )
+      for( std::ptrdiff_t j = 0; j < M; j++ )
       {
         errorSquared += RRTMinusI[i][j] * RRTMinusI[i][j];
       }
@@ -127,7 +127,7 @@ static void polarDecompositionBase( DST_MATRIX && LVARRAY_RESTRICT_REF R,
   }
   if( iter == 100 )
   {
-    printf("Polar decomposition did not converge in 100 iterations!");
+    printf( "Polar decomposition did not converge in 100 iterations!" );
   }
 }
 
diff --git a/src/math.hpp b/src/math.hpp
index 4dde2358..2a8e0c4d 100644
--- a/src/math.hpp
+++ b/src/math.hpp
@@ -487,7 +487,8 @@ __half2 ceil( __half2 const x )
 #if CUDART_VERSION > 11000
   return h2ceil( x );
 #else
-  return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to mimic
+  return LVARRAY_THROW( "h2ceil is not implemented for host", std::runtime_error ); // This is wrong, copied from other function used to
+                                                                                    // mimic
 #endif
 }
 
diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp
index 5603f3e1..fa701742 100644
--- a/unitTests/testTensorOpsInverse.hpp
+++ b/unitTests/testTensorOpsInverse.hpp
@@ -265,29 +265,29 @@ class InverseTest : public ::testing::Test
     ArrayViewT< T const, 3, 2 > const matrices = arrayOfMatrices.toViewConst();
 
     forall< POLICY >( matrices.size( 0 ), [=] LVARRAY_HOST_DEVICE ( INDEX_TYPE const i )
-    {
-      T srcLocal[ M ][ M ];
-      FLOAT dstLocal[ M ][ M ];
+        {
+          T srcLocal[ M ][ M ];
+          FLOAT dstLocal[ M ][ M ];
 
-      #define _TEST( output, input ) \
-        tensorOps::copy< M, M >( input, matrices[ i ] ); \
-        tensorOps::polarDecomposition< M >( output, input ); \
-        checkPolarDecomposition( output, input )
+          #define _TEST( output, input ) \
+            tensorOps::copy< M, M >( input, matrices[ i ] ); \
+            tensorOps::polarDecomposition< M >( output, input ); \
+            checkPolarDecomposition( output, input )
 
-      #define _TEST_PERMS( input, output0, output1, output2, output3 ) \
-        _TEST( output0, input ); \
-        _TEST( output1, input ); \
-        _TEST( output2, input ); \
-        _TEST( output3, input )
+          #define _TEST_PERMS( input, output0, output1, output2, output3 ) \
+            _TEST( output0, input ); \
+            _TEST( output1, input ); \
+            _TEST( output2, input ); \
+            _TEST( output3, input )
 
-      _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
-      _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
-      _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
-      _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcMatrix_IJK[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcMatrix_IKJ[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcMatrix_KJI[ i ], dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
+          _TEST_PERMS( srcLocal, dstMatrix_IJK[ i ], dstMatrix_IKJ[ i ], dstMatrix_KJI[ i ], dstLocal );
 
       #undef _TEST_PERMS
       #undef _TEST
-    } );
+        } );
   }
 
 
@@ -400,7 +400,7 @@ class InverseTest : public ::testing::Test
     using FloatingPoint = std::decay_t< decltype( orthogonalMatrix[0][0] ) >;
     FloatingPoint tolerance = 10 * LvArray::NumericLimits< FloatingPoint >::epsilon;
     FloatingPoint sqrtTolerance = math::sqrt( tolerance );
-    
+
     if( M == 2 )
     {
       // Check if orthogonalMatrix is actually orthogonal

From 7d4a46baa539e1237616da728ede97f8f3e4431d Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby966.llnl.gov>
Date: Sun, 6 Jul 2025 12:42:15 -0700
Subject: [PATCH 36/42] Fixed bug in unit tests for cofactors

---
 unitTests/testTensorOpsCofactorOneArg.cpp  | 2 +-
 unitTests/testTensorOpsCofactorTwoArgs.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/unitTests/testTensorOpsCofactorOneArg.cpp b/unitTests/testTensorOpsCofactorOneArg.cpp
index 82509dd8..7af3c287 100644
--- a/unitTests/testTensorOpsCofactorOneArg.cpp
+++ b/unitTests/testTensorOpsCofactorOneArg.cpp
@@ -12,7 +12,7 @@ namespace LvArray
 namespace testing
 {
 
-TYPED_TEST( CofactorFloatOnlyTest, cofactorOneArg )
+TYPED_TEST( InverseFloatOnlyTest, cofactorOneArg )
 {
   this->cofactorOneArg();
 }
diff --git a/unitTests/testTensorOpsCofactorTwoArgs.cpp b/unitTests/testTensorOpsCofactorTwoArgs.cpp
index 3d8ebe3e..21f9992a 100644
--- a/unitTests/testTensorOpsCofactorTwoArgs.cpp
+++ b/unitTests/testTensorOpsCofactorTwoArgs.cpp
@@ -12,7 +12,7 @@ namespace LvArray
 namespace testing
 {
 
-TYPED_TEST( CofactorTest, cofactorTwoArgs )
+TYPED_TEST( InverseTest, cofactorTwoArgs )
 {
   this->cofactorTwoArgs();
 }

From a1f3c560e21d7539cd04143f7771a80f9c1e1f52 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby966.llnl.gov>
Date: Sun, 6 Jul 2025 12:48:59 -0700
Subject: [PATCH 37/42] Fixed typo in unittests

---
 unitTests/testTensorOpsInverse.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp
index e7aeefa3..0411c459 100644
--- a/unitTests/testTensorOpsInverse.hpp
+++ b/unitTests/testTensorOpsInverse.hpp
@@ -426,7 +426,7 @@ class InverseTest : public ::testing::Test
     FLOAT const epsilon = NumericLimitsNC< FLOAT >{}.epsilon;
 
     FLOAT inverse[ M ][ M ];
-    T det = tensorOps::invert< M >( inverse, src );
+    T det = tensorOps::invert< M >( inverse, source );
     tensorOps::transpose< M >( inverse );
     tensorOps::scale< M, M >( inverse, det );
 

From bed5b66efcd9e96187de91e642ff4e35b4d7de25 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@ruby966.llnl.gov>
Date: Sun, 6 Jul 2025 12:53:16 -0700
Subject: [PATCH 38/42] Removed unused variable

---
 unitTests/testTensorOpsInverse.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp
index 0411c459..1e1461f6 100644
--- a/unitTests/testTensorOpsInverse.hpp
+++ b/unitTests/testTensorOpsInverse.hpp
@@ -152,7 +152,6 @@ class InverseTest : public ::testing::Test
         {
           T initialMatrix[ M ][ M ];
           T srcLocal[ M ][ M ];
-          T det;
 
           #define _TEST( matrix ) \
             tensorOps::copy< M, M >( initialMatrix, matrices[ i ] ); \

From 8aebaa559f09c2ccc45d6c0da64a6e905a310842 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@dane6.llnl.gov>
Date: Thu, 21 Aug 2025 17:16:00 -0700
Subject: [PATCH 39/42] Added invert for 4x4 matrices

---
 host-configs/LLNL/dane-clang-14-mpm.cmake |  5 ++
 src/fixedSizeSquareMatrixOpsImpl.hpp      | 96 +++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 host-configs/LLNL/dane-clang-14-mpm.cmake

diff --git a/host-configs/LLNL/dane-clang-14-mpm.cmake b/host-configs/LLNL/dane-clang-14-mpm.cmake
new file mode 100644
index 00000000..2c17649d
--- /dev/null
+++ b/host-configs/LLNL/dane-clang-14-mpm.cmake
@@ -0,0 +1,5 @@
+set(CONFIG_NAME "dane-clang-14-mpm" CACHE PATH "")
+
+set(GEOS_TPL_DIR /usr/WS1/crook5/thirdPartyLibs/install-dane-clang-14-mpm-release CACHE PATH "")
+
+include(${CMAKE_CURRENT_LIST_DIR}/llnl-cpu-clang-14.cmake)
diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 5c0a1df2..ba2c21c3 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -1427,6 +1427,102 @@ struct SquareMatrixOps< 3 >
   }
 };
 
+/**
+ * @struct SquareMatrixOps< 4 >
+ * @brief Performs operations on 4x4 square matrices.
+ */
+template<>
+struct SquareMatrixOps< 4 >
+{
+    /**
+    * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
+    * @tparam DST_MATRIX The type of @p dstMatrix.
+    * @tparam SRC_MATRIX The type of @p srcMatrix.
+    * @param dstMatrix The 4x4 matrix to write the inverse to.
+    * @param srcMatrix The 4x4 matrix to take the inverse of.
+    * @return The determinant.
+    * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values.
+    */
+    template< typename DST_MATRIX, typename SRC_MATRIX >
+    LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+    static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
+                        SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
+    {
+      checkSizes< 4, 4 >( dstMatrix );
+      checkSizes< 4, 4 >( srcMatrix );
+
+      using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >;
+
+      auto const det = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - 
+                       srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - 
+                       srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + 
+                       srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + 
+                       srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - 
+                       srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1] - 
+                       srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + 
+                       srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] + 
+                       srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - 
+                       srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - 
+                       srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + 
+                       srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0] + 
+                       srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - 
+                       srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - 
+                       srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + 
+                       srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + 
+                       srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - 
+                       srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0] - 
+                       srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + 
+                       srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] + 
+                       srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - 
+                       srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - 
+                       srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + 
+                       srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0];
+      FloatingPoint const invDet = FloatingPoint( 1 ) / det;
+
+      dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1];
+      dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1];
+      dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1];
+      dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1];
+
+      dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0];
+      dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0];
+      dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0];
+      dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0];
+
+      dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0];
+      dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0];
+      dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0];
+      dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0];
+      
+      dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0];
+      dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0];
+      dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0];
+      dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0];
+
+      tensorOps::scale< 4, 4 >( dstMatrix, invDet );
+
+      return det;
+    }
+
+  /**
+   * @brief Invert the matrix @p srcMatrix overwritting it.
+   * @tparam MATRIX The type of @p matrix.
+   * @param matrix The 4x4 matrix to take the inverse of and overwrite.
+   * @return The determinant.
+   * @note @p srcMatrix must contain floating point values.
+   */
+  template< typename MATRIX >
+  LVARRAY_HOST_DEVICE constexpr inline
+  static auto invert( MATRIX && matrix )
+  {
+    using realType = std::remove_reference_t< decltype( matrix[ 0 ][ 0 ] ) >;
+
+    realType temp[ 4 ][ 4 ];
+    copy< 4, 4 >( temp, matrix );
+    return invert( matrix, temp );
+  }
+};
+
 } // namespace internal
 } // namespace tensorOps
 } // namespace LvArray

From d45263773647d5e698cf402a0bfb7329af2f7926 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@dane5.llnl.gov>
Date: Wed, 17 Sep 2025 11:01:18 -0700
Subject: [PATCH 40/42] Added determinant and inverse operations for square 4x4
 matrices

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 84 +++++++++++++++++++---------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index ba2c21c3..9ac2281e 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -1434,6 +1434,63 @@ struct SquareMatrixOps< 3 >
 template<>
 struct SquareMatrixOps< 4 >
 {
+    /**
+    * @return Return the determinant of the matrix @p matrix.
+    * @tparam MATRIX The type of @p matrix.
+    * @param matrix The 4x4 matrix to get the determinant of.
+    */
+    template< typename MATRIX >
+    LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+    static auto determinant( MATRIX const & matrix )
+    {
+      checkSizes< 4, 4 >( matrix );
+
+      return matrix[0][0]*(
+                            matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2]
+                            - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3]
+                          )
+           - matrix[0][1]*(
+                              matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2]
+                              - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3]
+                          )
+           + matrix[0][2]*(
+                              matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1]
+                              - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3]
+                          )
+          - matrix[0][3]*(
+                              matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1]
+                              - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2]
+                          );
+
+      // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] - 
+      //        matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] - 
+      //        matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] + 
+      //        matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] + 
+      //        matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] - 
+      //        matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] - 
+  
+      //        matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] + 
+      //        matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] + 
+      //        matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] - 
+      //        matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] - 
+      //        matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] + 
+      //        matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] + 
+
+      //        matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] - 
+      //        matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] - 
+      //        matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] + 
+      //        matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] + 
+      //        matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] - 
+      //        matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] - 
+         
+      //        matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] + 
+      //        matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] + 
+      //        matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] - 
+      //        matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] - 
+      //        matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] + 
+      //        matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0];
+    }
+
     /**
     * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
     * @tparam DST_MATRIX The type of @p dstMatrix.
@@ -1453,30 +1510,7 @@ struct SquareMatrixOps< 4 >
 
       using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >;
 
-      auto const det = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - 
-                       srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - 
-                       srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + 
-                       srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + 
-                       srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - 
-                       srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1] - 
-                       srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + 
-                       srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] + 
-                       srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - 
-                       srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - 
-                       srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + 
-                       srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0] + 
-                       srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - 
-                       srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - 
-                       srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + 
-                       srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + 
-                       srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - 
-                       srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0] - 
-                       srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + 
-                       srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] + 
-                       srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - 
-                       srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - 
-                       srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + 
-                       srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0];
+      FloatingPoint const det = determinant( srcMatrix );
       FloatingPoint const invDet = FloatingPoint( 1 ) / det;
 
       dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1];
@@ -1499,7 +1533,7 @@ struct SquareMatrixOps< 4 >
       dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0];
       dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0];
 
-      tensorOps::scale< 4, 4 >( dstMatrix, invDet );
+      scale< 4, 4 >( dstMatrix, invDet );
 
       return det;
     }

From cd4a94d90315d8155300078215fb4804b27efbeb Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@dane1.llnl.gov>
Date: Mon, 10 Nov 2025 21:53:04 -0800
Subject: [PATCH 41/42] Added host config for MPM solver

---
 host-configs/LLNL/dane-gcc-12-mpm.cmake | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 host-configs/LLNL/dane-gcc-12-mpm.cmake

diff --git a/host-configs/LLNL/dane-gcc-12-mpm.cmake b/host-configs/LLNL/dane-gcc-12-mpm.cmake
new file mode 100644
index 00000000..86adf6be
--- /dev/null
+++ b/host-configs/LLNL/dane-gcc-12-mpm.cmake
@@ -0,0 +1,5 @@
+set(CONFIG_NAME "dane-gcc-12-mpm" CACHE PATH "")
+
+set(GEOS_TPL_DIR /usr/WS1/crook5/thirdPartyLibs/install-dane-gcc-12-mpm-release CACHE PATH "")
+
+include(${CMAKE_CURRENT_LIST_DIR}/llnl-cpu-gcc-12.cmake)

From c4853542618a14edaf02c87a97c7fb5b96b8e0c9 Mon Sep 17 00:00:00 2001
From: Cameron Mikel Crook <crook5@dane5.llnl.gov>
Date: Mon, 1 Dec 2025 10:11:37 -0800
Subject: [PATCH 42/42] Minor formatting changes

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 212 ++++++++++++++-------------
 1 file changed, 114 insertions(+), 98 deletions(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 9ac2281e..9c2533d2 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -160,7 +160,7 @@ struct SquareMatrixOps< 2 >
    */
   template< typename DST_MATRIX, typename SRC_MATRIX >
   LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
-  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, 
+  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
                         SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
   {
     checkSizes< 2, 2 >( dstMatrix );
@@ -702,7 +702,7 @@ struct SquareMatrixOps< 3 >
    */
   template< typename DST_MATRIX, typename SRC_MATRIX >
   LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
-  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix, 
+  static auto cofactor( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
                         SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
   {
     checkSizes< 3, 3 >( dstMatrix );
@@ -759,7 +759,7 @@ struct SquareMatrixOps< 3 >
     return det;
   }
 
-   /**
+  /**
    * @brief Compute the cofactor of the matrix @p srcMatrix overwritting it.
    * @tparam MATRIX The type of @p matrix.
    * @param matrix The 3x3 matrix to take the cofactor of and overwrite.
@@ -1434,109 +1434,125 @@ struct SquareMatrixOps< 3 >
 template<>
 struct SquareMatrixOps< 4 >
 {
-    /**
-    * @return Return the determinant of the matrix @p matrix.
-    * @tparam MATRIX The type of @p matrix.
-    * @param matrix The 4x4 matrix to get the determinant of.
-    */
-    template< typename MATRIX >
-    LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
-    static auto determinant( MATRIX const & matrix )
-    {
-      checkSizes< 4, 4 >( matrix );
+  /**
+   * @return Return the determinant of the matrix @p matrix.
+   * @tparam MATRIX The type of @p matrix.
+   * @param matrix The 4x4 matrix to get the determinant of.
+   */
+  template< typename MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static auto determinant( MATRIX const & matrix )
+  {
+    checkSizes< 4, 4 >( matrix );
 
-      return matrix[0][0]*(
-                            matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2]
-                            - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3]
-                          )
+    return matrix[0][0]*(
+      matrix[1][1]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][1] + matrix[1][3]*matrix[2][1]*matrix[3][2]
+      - matrix[1][3]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][1]*matrix[3][3]
+      )
            - matrix[0][1]*(
-                              matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2]
-                              - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3]
-                          )
+      matrix[1][0]*matrix[2][2]*matrix[3][3] + matrix[1][2]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][2]
+      - matrix[1][3]*matrix[2][2]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][2] - matrix[1][2]*matrix[2][0]*matrix[3][3]
+      )
            + matrix[0][2]*(
-                              matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1]
-                              - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3]
-                          )
-          - matrix[0][3]*(
-                              matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1]
-                              - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2]
-                          );
-
-      // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] - 
-      //        matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] - 
-      //        matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] + 
-      //        matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] + 
-      //        matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] - 
-      //        matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] - 
-  
-      //        matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] + 
-      //        matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] + 
-      //        matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] - 
-      //        matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] - 
-      //        matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] + 
-      //        matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] + 
-
-      //        matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] - 
-      //        matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] - 
-      //        matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] + 
-      //        matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] + 
-      //        matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] - 
-      //        matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] - 
-         
-      //        matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] + 
-      //        matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] + 
-      //        matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] - 
-      //        matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] - 
-      //        matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] + 
-      //        matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0];
-    }
-
-    /**
-    * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
-    * @tparam DST_MATRIX The type of @p dstMatrix.
-    * @tparam SRC_MATRIX The type of @p srcMatrix.
-    * @param dstMatrix The 4x4 matrix to write the inverse to.
-    * @param srcMatrix The 4x4 matrix to take the inverse of.
-    * @return The determinant.
-    * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values.
-    */
-    template< typename DST_MATRIX, typename SRC_MATRIX >
-    LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
-    static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
-                        SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
-    {
-      checkSizes< 4, 4 >( dstMatrix );
-      checkSizes< 4, 4 >( srcMatrix );
-
-      using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >;
-
-      FloatingPoint const det = determinant( srcMatrix );
-      FloatingPoint const invDet = FloatingPoint( 1 ) / det;
+      matrix[1][0]*matrix[2][1]*matrix[3][3] + matrix[1][1]*matrix[2][3]*matrix[3][0] + matrix[1][3]*matrix[2][0]*matrix[3][1]
+      - matrix[1][3]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][3]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][3]
+      )
+           - matrix[0][3]*(
+      matrix[1][0]*matrix[2][1]*matrix[3][2] + matrix[1][1]*matrix[2][2]*matrix[3][0] + matrix[1][2]*matrix[2][0]*matrix[3][1]
+      - matrix[1][2]*matrix[2][1]*matrix[3][0] - matrix[1][0]*matrix[2][2]*matrix[3][1] - matrix[1][1]*matrix[2][0]*matrix[3][2]
+      );
+
+    // return matrix[0][0] * matrix[1][1] * matrix[2][2] * matrix[3][3] -
+    //        matrix[0][0] * matrix[1][1] * matrix[2][3] * matrix[3][2] -
+    //        matrix[0][0] * matrix[1][2] * matrix[2][1] * matrix[3][3] +
+    //        matrix[0][0] * matrix[1][2] * matrix[2][3] * matrix[3][1] +
+    //        matrix[0][0] * matrix[1][3] * matrix[2][1] * matrix[3][2] -
+    //        matrix[0][0] * matrix[1][3] * matrix[2][2] * matrix[3][1] -
+
+    //        matrix[0][1] * matrix[1][0] * matrix[2][2] * matrix[3][3] +
+    //        matrix[0][1] * matrix[1][0] * matrix[2][3] * matrix[3][2] +
+    //        matrix[0][1] * matrix[1][2] * matrix[2][0] * matrix[3][3] -
+    //        matrix[0][1] * matrix[1][2] * matrix[2][3] * matrix[3][0] -
+    //        matrix[0][1] * matrix[1][3] * matrix[2][0] * matrix[3][2] +
+    //        matrix[0][1] * matrix[1][3] * matrix[2][2] * matrix[3][0] +
+
+    //        matrix[0][2] * matrix[1][0] * matrix[2][1] * matrix[3][3] -
+    //        matrix[0][2] * matrix[1][0] * matrix[2][3] * matrix[3][1] -
+    //        matrix[0][2] * matrix[1][1] * matrix[2][0] * matrix[3][3] +
+    //        matrix[0][2] * matrix[1][1] * matrix[2][3] * matrix[3][0] +
+    //        matrix[0][2] * matrix[1][3] * matrix[2][0] * matrix[3][1] -
+    //        matrix[0][2] * matrix[1][3] * matrix[2][1] * matrix[3][0] -
+
+    //        matrix[0][3] * matrix[1][0] * matrix[2][1] * matrix[3][2] +
+    //        matrix[0][3] * matrix[1][0] * matrix[2][2] * matrix[3][1] +
+    //        matrix[0][3] * matrix[1][1] * matrix[2][0] * matrix[3][2] -
+    //        matrix[0][3] * matrix[1][1] * matrix[2][2] * matrix[3][0] -
+    //        matrix[0][3] * matrix[1][2] * matrix[2][0] * matrix[3][1] +
+    //        matrix[0][3] * matrix[1][2] * matrix[2][1] * matrix[3][0];
+  }
 
-      dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1];
-      dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1];
-      dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1];
-      dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1];
+  /**
+   * @brief Invert the source matrix @p srcMatrix and store the result in @p dstMatrix.
+   * @tparam DST_MATRIX The type of @p dstMatrix.
+   * @tparam SRC_MATRIX The type of @p srcMatrix.
+   * @param dstMatrix The 4x4 matrix to write the inverse to.
+   * @param srcMatrix The 4x4 matrix to take the inverse of.
+   * @return The determinant.
+   * @note @p srcMatrix can contain integers but @p dstMatrix must contain floating point values.
+   */
+  template< typename DST_MATRIX, typename SRC_MATRIX >
+  LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
+  static auto invert( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
+                      SRC_MATRIX const & LVARRAY_RESTRICT_REF srcMatrix )
+  {
+    checkSizes< 4, 4 >( dstMatrix );
+    checkSizes< 4, 4 >( srcMatrix );
 
-      dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0];
-      dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0];
-      dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0];
-      dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0];
+    using FloatingPoint = std::decay_t< decltype( dstMatrix[ 0 ][ 0 ] ) >;
 
-      dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0];
-      dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0];
-      dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0];
-      dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0];
-      
-      dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0];
-      dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0];
-      dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0];
-      dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0];
+    FloatingPoint const det = determinant( srcMatrix );
+    FloatingPoint const invDet = FloatingPoint( 1 ) / det;
 
-      scale< 4, 4 >( dstMatrix, invDet );
+    dstMatrix[0][0] = srcMatrix[1][1]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[1][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[1][2]*
+                      srcMatrix[2][3]*srcMatrix[3][1] + srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][1];
+    dstMatrix[0][1] = srcMatrix[0][1]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][1]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[0][2]*
+                      srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][1];
+    dstMatrix[0][2] = srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[3][3] - srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][3] + srcMatrix[0][2]*
+                      srcMatrix[1][3]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][1];
+    dstMatrix[0][3] = srcMatrix[0][1]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][1]*srcMatrix[1][2]*srcMatrix[2][3] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][3] - srcMatrix[0][2]*
+                      srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][1];
+
+    dstMatrix[1][0] = srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][3] + srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[1][2]*
+                      srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[1][3]*srcMatrix[2][2]*srcMatrix[3][0];
+    dstMatrix[1][1] = srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][2] - srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[0][2]*
+                      srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[0][3]*srcMatrix[2][2]*srcMatrix[3][0];
+    dstMatrix[1][2] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][3] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][3] - srcMatrix[0][2]*
+                      srcMatrix[1][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][2] + srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[3][0];
+    dstMatrix[1][3] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][2] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][3] + srcMatrix[0][2]*
+                      srcMatrix[1][3]*srcMatrix[2][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][2] - srcMatrix[0][3]*srcMatrix[1][2]*srcMatrix[2][0];
+
+    dstMatrix[2][0] = srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][3] - srcMatrix[1][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][3] + srcMatrix[1][1]*
+                      srcMatrix[2][3]*srcMatrix[3][0] + srcMatrix[1][3]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[1][3]*srcMatrix[2][1]*srcMatrix[3][0];
+    dstMatrix[2][1] = srcMatrix[0][0]*srcMatrix[2][3]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][3] + srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][3] - srcMatrix[0][1]*
+                      srcMatrix[2][3]*srcMatrix[3][0] - srcMatrix[0][3]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[0][3]*srcMatrix[2][1]*srcMatrix[3][0];
+    dstMatrix[2][2] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][3] - srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][3] + srcMatrix[0][1]*
+                      srcMatrix[1][3]*srcMatrix[3][0] + srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[3][1] - srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[3][0];
+    dstMatrix[2][3] = srcMatrix[0][0]*srcMatrix[1][3]*srcMatrix[2][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][3] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][3] - srcMatrix[0][1]*
+                      srcMatrix[1][3]*srcMatrix[2][0] - srcMatrix[0][3]*srcMatrix[1][0]*srcMatrix[2][1] + srcMatrix[0][3]*srcMatrix[1][1]*srcMatrix[2][0];
+
+    dstMatrix[3][0] = srcMatrix[1][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[1][0]*srcMatrix[2][1]*srcMatrix[3][2] + srcMatrix[1][1]*srcMatrix[2][0]*srcMatrix[3][2] - srcMatrix[1][1]*
+                      srcMatrix[2][2]*srcMatrix[3][0] - srcMatrix[1][2]*srcMatrix[2][0]*srcMatrix[3][1] + srcMatrix[1][2]*srcMatrix[2][1]*srcMatrix[3][0];
+    dstMatrix[3][1] = srcMatrix[0][0]*srcMatrix[2][1]*srcMatrix[3][2] - srcMatrix[0][0]*srcMatrix[2][2]*srcMatrix[3][1] - srcMatrix[0][1]*srcMatrix[2][0]*srcMatrix[3][2] + srcMatrix[0][1]*
+                      srcMatrix[2][2]*srcMatrix[3][0] + srcMatrix[0][2]*srcMatrix[2][0]*srcMatrix[3][1] - srcMatrix[0][2]*srcMatrix[2][1]*srcMatrix[3][0];
+    dstMatrix[3][2] = srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[3][1] - srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[3][2] + srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[3][2] - srcMatrix[0][1]*
+                      srcMatrix[1][2]*srcMatrix[3][0] - srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[3][1] + srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[3][0];
+    dstMatrix[3][3] = srcMatrix[0][0]*srcMatrix[1][1]*srcMatrix[2][2] - srcMatrix[0][0]*srcMatrix[1][2]*srcMatrix[2][1] - srcMatrix[0][1]*srcMatrix[1][0]*srcMatrix[2][2] + srcMatrix[0][1]*
+                      srcMatrix[1][2]*srcMatrix[2][0] + srcMatrix[0][2]*srcMatrix[1][0]*srcMatrix[2][1] - srcMatrix[0][2]*srcMatrix[1][1]*srcMatrix[2][0];
+
+    scale< 4, 4 >( dstMatrix, invDet );
 
-      return det;
-    }
+    return det;
+  }
 
   /**
    * @brief Invert the matrix @p srcMatrix overwritting it.