NVIDIA · aliceb-nv · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
@@ -32,11 +32,12 @@ class default_get_solution_callback_t : public get_solution_callback_t {
                     void* solution_bound,
                     void* user_data) override
   {
-    PyObject* numpy_matrix = get_numpy_array(data, n_variables);
-    PyObject* numpy_array  = get_numpy_array(objective_value, 1);
-    PyObject* numpy_bound  = get_numpy_array(solution_bound, 1);
-    PyObject* py_user_data = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
-    PyObject* res          = PyObject_CallMethod(this->pyCallbackClass,
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    PyObject* numpy_matrix  = get_numpy_array(data, n_variables);
+    PyObject* numpy_array   = get_numpy_array(objective_value, 1);
+    PyObject* numpy_bound   = get_numpy_array(solution_bound, 1);
+    PyObject* py_user_data  = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
+    PyObject* res           = PyObject_CallMethod(this->pyCallbackClass,
                                         "get_solution",
                                         "(OOOO)",
                                         numpy_matrix,
@@ -47,6 +48,7 @@ class default_get_solution_callback_t : public get_solution_callback_t {
     Py_DECREF(numpy_array);
     Py_DECREF(numpy_bound);
     if (res != nullptr) { Py_DECREF(res); }
+    PyGILState_Release(gstate);
   }
 
   PyObject* pyCallbackClass;
@@ -69,11 +71,12 @@ class default_set_solution_callback_t : public set_solution_callback_t {
                     void* solution_bound,
                     void* user_data) override
   {
-    PyObject* numpy_matrix = get_numpy_array(data, n_variables);
-    PyObject* numpy_array  = get_numpy_array(objective_value, 1);
-    PyObject* numpy_bound  = get_numpy_array(solution_bound, 1);
-    PyObject* py_user_data = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
-    PyObject* res          = PyObject_CallMethod(this->pyCallbackClass,
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    PyObject* numpy_matrix  = get_numpy_array(data, n_variables);
+    PyObject* numpy_array   = get_numpy_array(objective_value, 1);
+    PyObject* numpy_bound   = get_numpy_array(solution_bound, 1);
+    PyObject* py_user_data  = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
+    PyObject* res           = PyObject_CallMethod(this->pyCallbackClass,
                                         "set_solution",
                                         "(OOOO)",
                                         numpy_matrix,
@@ -84,6 +87,7 @@ class default_set_solution_callback_t : public set_solution_callback_t {
     Py_DECREF(numpy_array);
     Py_DECREF(numpy_bound);
     if (res != nullptr) { Py_DECREF(res); }
+    PyGILState_Release(gstate);
   }
 
   PyObject* pyCallbackClass;

@@ -898,7 +898,7 @@ struct nondeterministic_policy_t : tree_update_policy_t<i_t, f_t> {
   {
   }
 
-  f_t upper_bound() const override { return bnb.upper_bound_.load(); }
+  f_t upper_bound() const override { return bnb.get_cutoff(); }
 
   void update_pseudo_costs(mip_node_t<i_t, f_t>* node, f_t leaf_obj) override
   {
@@ -1316,10 +1316,11 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
+  f_t cutoff = get_cutoff();
   if (original_lp_.objective_is_integral) {
-    lp_settings.cut_off = std::ceil(upper_bound_ - settings_.integer_tol) + settings_.dual_tol;
+    lp_settings.cut_off = std::ceil(cutoff - settings_.integer_tol) + settings_.dual_tol;
   } else {
-    lp_settings.cut_off = upper_bound_ + settings_.dual_tol;
+    lp_settings.cut_off = cutoff + settings_.dual_tol;
   }
   lp_settings.inside_mip    = 2;
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
@@ -1426,7 +1427,7 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
     // - The lower bound of the parent is lower or equal to its children
     worker->lower_bound = lower_bound;
 
-    if (lower_bound > upper_bound) {
+    if (lower_bound > get_cutoff()) {
       search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree_.update(node_ptr, node_status_t::FATHOMED);
       worker->recompute_basis  = true;
@@ -1536,7 +1537,7 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
     f_t rel_gap         = user_relative_gap(original_lp_, upper_bound, lower_bound);
     worker->lower_bound = lower_bound;
 
-    if (node_ptr->lower_bound > upper_bound) {
+    if (node_ptr->lower_bound > get_cutoff()) {
       worker->recompute_basis  = true;
       worker->recompute_bounds = true;
       continue;
@@ -1675,7 +1676,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound) {
+        if (get_cutoff() < start_node.value()->lower_bound) {
           // This node was put on the heap earlier but its lower bound is now greater than the
           // current upper bound
           search_tree_.graphviz_node(
@@ -1699,7 +1700,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound ||
+        if (get_cutoff() < start_node.value()->lower_bound ||
             start_node.value()->depth < diving_settings.min_node_depth) {
           continue;
         }
@@ -1767,7 +1768,7 @@ void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
     std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
     if (!start_node.has_value()) { continue; }
-    if (upper_bound_ < start_node.value()->lower_bound) {
+    if (get_cutoff() < start_node.value()->lower_bound) {
       // This node was put on the heap earlier but its lower bound is now greater than the
       // current upper bound
       search_tree_.graphviz_node(
@@ -2189,12 +2190,12 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         return mip_status_t::NUMERICAL;
       }
 
-      if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) {
+      if (settings_.reduced_cost_strengthening >= 1 && get_cutoff() < last_upper_bound) {
         mutex_upper_.lock();
-        last_upper_bound = upper_bound_.load();
+        last_upper_bound = get_cutoff();
         std::vector<f_t> lower_bounds;
         std::vector<f_t> upper_bounds;
-        find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds);
+        find_reduced_cost_fixings(get_cutoff(), lower_bounds, upper_bounds);
         mutex_upper_.unlock();
         mutex_original_lp_.lock();
         original_lp_.lower = lower_bounds;
@@ -2372,10 +2373,10 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     return solver_status_;
   }
 
-  if (settings_.reduced_cost_strengthening >= 2 && upper_bound_.load() < last_upper_bound) {
+  if (settings_.reduced_cost_strengthening >= 2 && get_cutoff() < last_upper_bound) {
     std::vector<f_t> lower_bounds;
     std::vector<f_t> upper_bounds;
-    i_t num_fixed = find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds);
+    i_t num_fixed = find_reduced_cost_fixings(get_cutoff(), lower_bounds, upper_bounds);
     if (num_fixed > 0) {
       std::vector<bool> bounds_changed(original_lp_.num_cols, true);
       std::vector<char> row_sense;
@@ -2479,7 +2480,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound) {
+        if (get_cutoff() < start_node.value()->lower_bound) {
           // This node was put on the heap earlier but its lower bound is now greater than the
           // current upper bound
           search_tree_.graphviz_node(
@@ -3321,7 +3322,7 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sort_replay_events(
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::deterministic_prune_worker_nodes_vs_incumbent()
 {
-  f_t upper_bound = upper_bound_.load();
+  f_t upper_bound = get_cutoff();
 
   for (auto& worker : *deterministic_workers_) {
     // Check nodes in plunge stack - filter in place
@@ -3457,14 +3458,14 @@ void branch_and_bound_t<i_t, f_t>::deterministic_populate_diving_heap()
   const int num_diving                  = deterministic_diving_workers_->size();
   constexpr int target_nodes_per_worker = 10;
   const int target_total                = num_diving * target_nodes_per_worker;
-  f_t upper_bound                       = upper_bound_.load();
+  f_t cutoff                            = get_cutoff();
 
   // Collect candidate nodes from BFS worker backlog heaps
   std::vector<std::pair<mip_node_t<i_t, f_t>*, f_t>> candidates;
 
   for (auto& worker : *deterministic_workers_) {
     for (auto* node : worker.backlog.data()) {
-      if (node->lower_bound < upper_bound) {
+      if (node->lower_bound < cutoff) {
         f_t score = node->objective_estimate;
         if (score >= inf) { score = node->lower_bound; }
         candidates.push_back({node, score});

@@ -108,6 +108,14 @@ class branch_and_bound_t {
 
   bool stop_for_time_limit(mip_solution_t<i_t, f_t>& solution);
 
+  // Set a cutoff bound from an external source (e.g., early FJ during presolve).
+  // Used for node pruning and reduced cost strengthening but NOT for gap computation.
+  // Unlike upper_bound_, this does not imply a verified incumbent solution exists.
+  void set_initial_cutoff(f_t bound) { initial_cutoff_ = bound; }
+
+  // Effective cutoff for node pruning: min of verified incumbent and external cutoff.
+  f_t get_cutoff() const { return std::min(upper_bound_.load(), initial_cutoff_); }
+
   // Repair a low-quality solution from the heuristics.
   bool repair_solution(const std::vector<f_t>& leaf_edge_norms,
                        const std::vector<f_t>& potential_solution,
@@ -169,9 +177,12 @@ class branch_and_bound_t {
   // Mutex for upper bound
   omp_mutex_t mutex_upper_;
 
-  // Global variable for upper bound
+  // Verified incumbent bound (only set when B&B has an actual integer-feasible solution).
   omp_atomic_t<f_t> upper_bound_;
 
+  // External cutoff from early heuristics (for pruning only, no verified solution).
+  f_t initial_cutoff_{std::numeric_limits<f_t>::infinity()};
+
   // Global variable for incumbent. The incumbent should be updated with the upper bound
   mip_solution_t<i_t, f_t> incumbent_;
 

@@ -40,7 +40,9 @@ set(MIP_NON_LP_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve/trivial_presolve.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu)
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/early_cpufj.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/early_gpufj.cu)
 
 # Choose which files to include based on build mode
 if(BUILD_LP_ONLY)

@@ -267,9 +267,9 @@ void population_t<i_t, f_t>::invoke_get_solution_callback(
   f_t user_bound     = context.stats.get_solution_bound();
   solution_t<i_t, f_t> temp_sol(sol);
   problem_ptr->post_process_assignment(temp_sol.assignment);
-  if (context.settings.mip_scaling) {
+  if (context.settings.mip_scaling && context.scaling != nullptr) {
     rmm::device_uvector<f_t> dummy(0, temp_sol.handle_ptr->get_stream());
-    context.scaling.unscale_solutions(temp_sol.assignment, dummy);
+    context.scaling->unscale_solutions(temp_sol.assignment, dummy);
   }
   if (problem_ptr->has_papilo_presolve_data()) {
     problem_ptr->papilo_uncrush_assignment(temp_sol.assignment);
@@ -346,7 +346,9 @@ void population_t<i_t, f_t>::run_solution_callbacks(solution_t<i_t, f_t>& sol)
                  incumbent_assignment.size(),
                  sol.handle_ptr->get_stream());
 
-      if (context.settings.mip_scaling) { context.scaling.scale_solutions(incumbent_assignment); }
+      if (context.settings.mip_scaling && context.scaling != nullptr) {
+        context.scaling->scale_solutions(incumbent_assignment);
+      }
       bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment);
       if (!is_valid) { return; }
       cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(),

@@ -0,0 +1,109 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+
+#include <cuopt/linear_programming/mip/solver_settings.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <thrust/fill.h>
+
+#include <chrono>
+#include <functional>
+#include <limits>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename f_t>
+using early_incumbent_callback_t =
+  std::function<void(f_t solver_obj, f_t user_obj, const std::vector<f_t>& assignment)>;
+
+// CRTP base for early heuristics that run on the original (or papilo-presolved) problem
+// during presolve to find incumbents as early as possible.
+// Derived classes implement start() and stop().
+template <typename i_t, typename f_t, typename Derived>
+class early_heuristic_t {
+ public:
+  early_heuristic_t(const optimization_problem_t<i_t, f_t>& op_problem,
+                    const typename mip_solver_settings_t<i_t, f_t>::tolerances_t& tolerances,
+                    early_incumbent_callback_t<f_t> incumbent_callback)
+    : incumbent_callback_(std::move(incumbent_callback))
+  {
+    RAFT_CUDA_TRY(cudaGetDevice(&device_id_));
+
+    // Build and preprocess on the original handle, then copy onto our own handle
+    // so the derived solver can run on a dedicated stream (prevents graph capture conflicts).
+    problem_t<i_t, f_t> temp_problem(op_problem, tolerances, false);
+    temp_problem.preprocess_problem();
+    temp_problem.handle_ptr->sync_stream();
+    problem_ptr_ = std::make_unique<problem_t<i_t, f_t>>(temp_problem, &handle_);
+
+    solution_ptr_ = std::make_unique<solution_t<i_t, f_t>>(*problem_ptr_);
+    thrust::fill(handle_.get_thrust_policy(),
+                 solution_ptr_->assignment.begin(),
+                 solution_ptr_->assignment.end(),
+                 f_t{0});
+    solution_ptr_->clamp_within_bounds();
+  }
+
+  bool solution_found() const { return solution_found_; }
+  f_t get_best_objective() const { return best_objective_; }
+  void set_best_objective(f_t obj) { best_objective_ = obj; }
+  const std::vector<f_t>& get_best_assignment() const { return best_assignment_; }
+
+ protected:
+  ~early_heuristic_t() = default;
+
+  // NOT thread-safe. solver_obj is in solver-space (always minimization).
+  // Uses a private CUDA stream to avoid racing with the FJ solver's stream.
+  void try_update_best(f_t solver_obj, const std::vector<f_t>& assignment)
+  {
+    if (solver_obj >= best_objective_) { return; }
+    best_objective_ = solver_obj;
+
+    RAFT_CUDA_TRY(cudaSetDevice(device_id_));
+    auto stream = handle_.get_stream();
+    rmm::device_uvector<f_t> d_assignment(assignment.size(), stream);
+    raft::copy(d_assignment.data(), assignment.data(), assignment.size(), stream);
+    problem_ptr_->post_process_assignment(d_assignment, true, stream);
+    auto user_assignment = cuopt::host_copy(d_assignment, stream);
+
+    best_assignment_ = user_assignment;
+    solution_found_  = true;
+    f_t user_obj     = problem_ptr_->get_user_obj_from_solver_obj(solver_obj);
+    double elapsed =
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - start_time_).count();
+    CUOPT_LOG_INFO("Early heuristics (%s) lowered the primal bound. Objective %g. Time %.2f",
+                   Derived::name(),
+                   user_obj,
+                   elapsed);
+    if (incumbent_callback_) { incumbent_callback_(solver_obj, user_obj, user_assignment); }
+  }
+
+  int device_id_{0};
+
+  // handle_ must be declared before problem_ptr_/solution_ptr_ so it outlives them
+  // (C++ destroys members in reverse declaration order)
+  raft::handle_t handle_;
+
+  std::unique_ptr<problem_t<i_t, f_t>> problem_ptr_;
+  std::unique_ptr<solution_t<i_t, f_t>> solution_ptr_;
+
+  bool solution_found_{false};
+  f_t best_objective_{std::numeric_limits<f_t>::infinity()};
+  std::vector<f_t> best_assignment_;
+
+  early_incumbent_callback_t<f_t> incumbent_callback_;
+  std::chrono::steady_clock::time_point start_time_;
+};
+
+}  // namespace cuopt::linear_programming::detail